# Feature engineering

**Purpose of script:**

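Create additional feature columns (grid row/column, date, a neighbourhood mean of the microwave signal, and elevation) for each plain dataframe.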

- In: dataframe_plain
- Out: dataframe_extended (with additional feature columns)

In [1]:
import xarray
import pandas as pd
import numpy as np
from tqdm import tqdm

from os import listdir
from os.path import isfile, join

##### Relevant paths

In [2]:
# Input: directory holding the "plain" dataframes (parquet files).
path_dataframe_plain = "../Data/combined/dataframe_plain/"
# Output: directory the extended dataframes are written to.
out_path = "../Data/combined/dataframe_extended/"
# Input: directory holding the interpolated microwave rasters
# (file names start with the date).
mw_path = "../Data/microwave-rs/mw_interpolated/"
# Input: elevation raster (.tif).
path_elevation = "../Data/elevation_data/gimpdem_1km_compressed.tif"

## Features

##### Row and column

In [3]:
def add_row_and_col(df):
    """Attach integer grid indices derived from the ``x`` and ``y`` columns.

    ``col`` numbers the distinct ``x`` values in ascending order, ``row``
    numbers the distinct ``y`` values in descending order (the largest ``y``
    becomes row 0). The frame is modified in place and also returned.
    """
    column_ids = df.groupby("x").ngroup()  # original note: xshape 2663
    row_ids = df.groupby("y").ngroup(ascending=False)  # original note: yshape 1462
    df['col'] = column_ids
    df['row'] = row_ids
    return df

##### Date

In [4]:
def add_date(df, date='2019-07-01'):
    """Add a ``date`` column (datetime dtype) holding the same date in every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    date : str or datetime-like, optional
        Date to stamp on every row. Defaults to ``'2019-07-01'``, the value
        that used to be hard-coded, so existing callers are unaffected.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the ``date`` column added or overwritten.
    """
    df['date'] = date
    # Convert the broadcast value to a proper datetime column.
    df['date'] = pd.to_datetime(df['date'])
    # TODO: optionally derive a month feature: df['month'] = df['date'].dt.month
    return df

##### Aggregated/pooled values

In [5]:
# def get_neighbors(matrix, a, b):
#     neighbors = [matrix[i][j] if (i > -1 and j > -1 and j < len(matrix[0]) and i < len(matrix)) else np.nan for i in range(a-1, a+2) for j in range(b-1, b+2) ]
#     return neighbors

# def get_neighbours_df(data):
#     index_list = [(i,j) for i in range(data.shape[1]) for j in range(data.shape[2])]
#     value_list = []
#     values = data.values[0]

#     for i in tqdm(index_list):
#         neighbor = get_neighbors(values, *i)
#         neighbor += [i[0], i[1]]
#         value_list.append(neighbor)

#     df_neighbors = pd.DataFrame(value_list, columns = ['v1', 'v2', 'v3','v4', 'v5', 'v6','v7', 'v8', 'v9', 'row', 'col'])   
#     return df_neighbors

# def add_aggregated(df):
#     cols = ['v1', 'v2', 'v3', 'v4', 'v5', 'v6', 'v7', 'v8', 'v9'] # delete v5? same as mw value
#     df[cols] = df[cols].replace(-1, np.NaN) # to skip -1 when calculating mean
#     df['mean'] = df[cols].mean(axis = 1) # mean value of 9 pixels around
#     return df

In [132]:
from typing import Union
from typing import Tuple
from scipy.stats import mode
from scipy.signal import convolve2d

def get_window(image: np.ndarray, window_size: int, center: Tuple[int, int]) -> np.ndarray:
    """Return the square neighbourhood of ``center``, clipped to the image.

    The window reaches ``window_size // 2`` pixels in every direction from
    ``center`` (given as ``(row, col)``); parts that would fall outside the
    image are cut off, so windows at the border are smaller. The result is
    a slice of ``image``, not a copy.
    """
    reach = window_size // 2
    n_rows, n_cols = image.shape[0], image.shape[1]
    row_span = slice(max(center[0] - reach, 0), min(center[0] + reach + 1, n_rows))
    col_span = slice(max(center[1] - reach, 0), min(center[1] + reach + 1, n_cols))
    return image[row_span, col_span]


# TODO (need to fix?): only compute a value where the centre pixel itself is not NaN;
# otherwise all-NaN pixels next to 0/1 pixels also receive a value.
# Note: -1 is converted to NaN before aggregating.

def _nan_mean_filter(image, window_size):
    """Return the mean of the non-NaN values in the window around each pixel."""
    kernel = np.ones((window_size, window_size))
    # Count and sum of the valid (non-NaN) pixels inside each window; positions
    # outside the raster are zero-filled and therefore contribute nothing.
    counts = convolve2d(~np.isnan(image), kernel, mode='same', boundary='fill', fillvalue=0)
    sums = convolve2d(np.nan_to_num(image), kernel, mode='same', boundary='fill', fillvalue=0)
    # Windows without a single valid pixel stay NaN.
    result = np.full(image.shape, np.nan, dtype=np.float64)
    has_data = counts > 0
    result[has_data] = sums[has_data] / counts[has_data]
    return result


def _windowed_reduce(image, window_size, reducer):
    """Apply ``reducer`` to the non-NaN values of the window around each pixel."""
    result = np.full(image.shape, np.nan, dtype=np.float64)
    for i in range(image.shape[0]):
        for j in range(image.shape[1]):
            window = get_window(image, window_size, (i, j))
            valid = window[~np.isnan(window)]
            # An all-NaN window keeps the NaN the result was initialised with.
            if valid.size:
                result[i, j] = reducer(valid)
    return result


def _most_frequent(values):
    """Return the most frequent value as a scalar."""
    # Depending on the SciPy version, mode(...)[0] is a scalar or a length-1
    # array; atleast_1d(...)[0] yields a scalar in both cases.
    return np.atleast_1d(mode(values)[0])[0]


def convolve(image, window_size, convolution_fn: str):
    """Aggregate a raster over a square moving window, ignoring missing values.

    Parameters
    ----------
    image : indexable
        ``image[0].values`` must yield the 2-D raster to aggregate.
        Cells equal to ``-1`` are treated as missing.
    window_size : int
        Edge length of the window. Expected to be odd: for even sizes the
        'mean' branch and the other branches cover different neighbourhoods.
    convolution_fn : str
        One of 'mean', 'min', 'max', 'sum', 'median', 'mode'.

    Returns
    -------
    numpy.ndarray or None
        float64 array with the shape of the raster; NaN where the window holds
        no valid value. For an unknown ``convolution_fn`` a message is printed
        and ``None`` is returned.
    """
    # Work on a float copy: NaN cannot be stored in an integer raster, and the
    # caller's data must not be modified as a side effect.
    image = np.array(image[0].values, dtype=np.float64)
    image[image == -1] = np.nan

    if convolution_fn == 'mean':
        return _nan_mean_filter(image, window_size)

    # The reducers only ever see NaN-free values, so the plain NumPy
    # reductions are sufficient.
    reducers = {
        'max': np.max,
        'min': np.min,
        'sum': np.sum,
        'median': np.median,
        'mode': _most_frequent,
    }
    reducer = reducers.get(convolution_fn)
    if reducer is None:
        print('not available function')
        return None
    return _windowed_reduce(image, window_size, reducer)

In [123]:
def convolution_to_df(convolution_raster, value_name='mean'):
    """Flatten a 2-D raster into a long table with one row per pixel.

    Parameters
    ----------
    convolution_raster : numpy.ndarray
        2-D array of per-pixel values.
    value_name : str, optional
        Name of the value column. Defaults to ``'mean'`` (the name that used
        to be hard-coded) so existing callers are unaffected; pass another
        name when the raster holds a different aggregate.

    Returns
    -------
    pandas.DataFrame
        Columns ``col``, ``row`` and ``value_name``, in row-major pixel order.
    """
    nrows, ncols = convolution_raster.shape
    # Row and column index of every pixel, in the same row-major order that
    # flatten() uses for the values.
    row_idx, col_idx = np.indices((nrows, ncols))
    return pd.DataFrame({
        'col': col_idx.flatten(),
        'row': row_idx.flatten(),
        value_name: convolution_raster.flatten(),
    })


##### Elevation data

In [8]:
def add_elevation(data):
    """Turn the elevation data into a table with ``x``, ``y``, ``elevation_data``.

    ``data`` must provide ``to_dataframe()``; the resulting frame's index is
    moved into columns, only ``x``, ``y`` and ``band_data`` are kept, and
    ``band_data`` is renamed to ``elevation_data``.
    """
    flat = data.to_dataframe().reset_index()
    elevation = flat[['x', 'y', 'band_data']]
    return elevation.rename(columns={'band_data': 'elevation_data'})

##### Distance from margin/shore

In [9]:
# TODO (tbd): add an "is coast" column - set when at least one, but not all,
# of the values considered (presumably the neighbouring pixels) are NaN.

## Main:

In [11]:
def main(mw_path, path_dataframe_plain, path_elevation, out_path):
    """Build one extended dataframe per plain dataframe and write it to parquet.

    For every file in ``path_dataframe_plain`` the date is taken from
    characters 5-14 of the file name (names are expected to look like
    ``melt_<YYYY-MM-DD>...``). Every microwave file in ``mw_path`` whose name
    starts with that date is processed: row/column indices, the 3x3
    neighbourhood mean of the microwave raster, a date column and the
    elevation are added, rows with ``mw_value == -1`` are dropped, and the
    result is written to ``out_path`` as ``melt_<date>_extended.parquet.gzip``.

    Parameters
    ----------
    mw_path : str
        Directory containing the microwave raster files.
    path_dataframe_plain : str
        Directory containing the plain parquet dataframes.
    path_elevation : str
        Path of the elevation raster.
    out_path : str
        Directory the extended dataframes are written to.
    """
    # get plain files:
    df_plain_files = [f for f in listdir(path_dataframe_plain) if isfile(join(path_dataframe_plain, f))]
    # microwave files:
    mw_files = [f for f in listdir(mw_path) if isfile(join(mw_path, f))]

    # The elevation table is the same for every date, so build it once
    # instead of once per file; the raster handle is closed right after.
    with xarray.open_dataarray(path_elevation) as data_elevation:
        df_elevation = add_elevation(data_elevation)

    for df_file in df_plain_files:
        melt_date = df_file[5:15]
        print(melt_date)
        for mw_file in mw_files:
            if not mw_file.startswith(melt_date):
                continue
            # get convolution (the raster is only needed for this step):
            with xarray.open_dataarray(join(mw_path, mw_file)) as data_mw:
                convolution = convolve(data_mw, 3, 'mean')
            df = pd.read_parquet(join(path_dataframe_plain, df_file))
            # add row and column features:
            df = add_row_and_col(df)
            # convolution to df:
            df_conv = convolution_to_df(convolution)
            # merge convolution:
            df_combined = pd.merge(df, df_conv, how='left', on=['row', 'col'])
            # remove water in mw; copy so that adding columns afterwards does
            # not write into a slice (avoids pandas' SettingWithCopyWarning):
            df_combined = df_combined.loc[df_combined['mw_value'] != -1].copy()
            # add date:
            # NOTE(review): add_date stamps its fixed default date, not
            # melt_date - confirm that this is intended.
            df = add_date(df_combined)
            # merge elevation data (left: smaller microwave frame):
            df_with_elevation = pd.merge(df, df_elevation, how='left', on=['y', 'x'])
            # write to parquet:
            out_file = join(out_path, 'melt_' + melt_date + '_extended.parquet.gzip')
            df_with_elevation.to_parquet(out_file, index=False)
    return
                

Main

In [12]:
# Run the feature-engineering pipeline with the paths defined above.
main(
    mw_path=mw_path,
    path_dataframe_plain=path_dataframe_plain,
    path_elevation=path_elevation,
    out_path=out_path,
)

2019-06-08


100%|██████████| 3893306/3893306 [00:33<00:00, 117824.02it/s]


## Testing

In [135]:
# Manual test of the pipeline steps on a single date (2019-06-08).
# The elevation/date/write steps are left commented out, so nothing is
# written to disk by this cell.
melt_date =  '2019-06-08'
#data_elevation = xarray.open_dataarray(path_elevation)
data_mw = xarray.open_dataarray(mw_path + '2019-06-08_mw.tif')
df = pd.read_parquet(path_dataframe_plain + 'melt_2019-06-08.parquet.gzip')

# add row and column features:
df = add_row_and_col(df)
# # get neighbours:
# df_neighbors = get_neighbours_df(data_mw)
# print(data_mw)
# print(df_neighbors)

# 3x3 neighbourhood mean of the microwave raster, flattened to a table and
# joined onto the plain dataframe via the row/col indices:
test1 = convolve(data_mw, 3, 'mean')
df_conv = convolution_to_df(test1)
df_combined = pd.merge(df, df_conv, how = 'left', on = ['row', 'col'])
# # merge neighbours:
# df_combined = pd.merge(df, df_neighbors, how = 'left', on = ['row', 'col'])
# # remove water in mw:
# df_combined = df_combined.loc[df_combined['mw_value'] != -1] # suppress warning?
# # add date:
# df = add_date(df_combined)
# # add aggregations:
# df = add_aggregated(df)


# # add elevation data:
# df_elevation = add_elevation(data_elevation)
# # merge elevation data:
# df_with_elevation = pd.merge(df, df_elevation, how = 'left', on = ['y', 'x']) # left smaller mw, right - opt

# # write to parquet:
# df_with_elevation.to_parquet(out_path + 'melt_'+ melt_date + '_extended.parquet.gzip', index= False)

In [103]:
# for testing purposes:
# Show a 6x6 patch (rows 74-79, columns 622-627) of the first 2-D slice of
# the microwave data loaded in the testing cell.

tt = data_mw.values
# indices = np.where(tt == 1)
tt[0][74:80, 622:628]

In [39]:
# matrix1 = np.array([
#     [1, 2,      np.nan,   4, 5],
#     [6, np.nan,  8,       9, 10],
#     [11,        12, 13,  14, 15],
#     [16, np.nan, 18,    19, 20],
#     [21,  22,    23, np.nan, 25]
# ])

# import numpy as np
# from typing import Tuple
# from scipy.signal import convolve2d

# def convolve_by_mean(image, window_size):
#     kernel = np.ones((window_size, window_size))  # kernel for mean convolution
#     result = np.zeros_like(image, dtype=np.float64)
#     # Compute the sum and count of non-NaN values in the kernel window
#     counts = convolve2d(~np.isnan(image), kernel, mode='same', boundary='fill', fillvalue=0)
#     sums = convolve2d(np.nan_to_num(image), kernel, mode='same', boundary='fill', fillvalue=0)
#     # Calculate the mean, ignoring NaN values
#     result[counts > 0] = sums[counts > 0] / counts[counts > 0]
#     # Set the output to NaN where all values in the kernel window are NaN
#     result[counts == 0] = np.nan
#     return result

# def convolve_by_min(image: np.ndarray, window_size: int) -> np.ndarray:
#     result = np.zeros_like(image, dtype=np.float64)
#     for i in range(image.shape[0]):
#         for j in range(image.shape[1]):
#             window = get_window(image, window_size, (i, j))
#             non_nan_values = window[~np.isnan(window)]
#             if len(non_nan_values) == 0:
#                 result[i, j] = np.nan
#             else:
#                 result[i, j] = np.nanmin(non_nan_values)
#     return result

# def convolve_by_max(image: np.ndarray, window_size: int) -> np.ndarray:
#     result = np.zeros_like(image, dtype=np.float64)
#     for i in range(image.shape[0]):
#         for j in range(image.shape[1]):
#             window = get_window(image, window_size, (i, j))
#             non_nan_values = window[~np.isnan(window)]
#             if len(non_nan_values) == 0:
#                 result[i, j] = np.nan
#             else:
#                 result[i, j] = np.nanmax(non_nan_values)
#     return result

# def convolve_by_sum(image: np.ndarray, window_size: int) -> np.ndarray:
#     result = np.zeros_like(image, dtype=np.float64)
#     for i in range(image.shape[0]):
#         for j in range(image.shape[1]):
#             window = get_window(image, window_size, (i, j))
#             non_nan_values = window[~np.isnan(window)]
#             if len(non_nan_values) == 0:
#                 result[i, j] = np.nan
#             else:
#                 result[i, j] = np.nansum(non_nan_values)
#     return result

# def convolve_by_median(image: np.ndarray, window_size: int) -> np.ndarray:
#     result = np.zeros_like(image, dtype=np.float64)
#     for i in range(image.shape[0]):
#         for j in range(image.shape[1]):
#             window = get_window(image, window_size, (i, j))
#             non_nan_values = window[~np.isnan(window)]
#             if len(non_nan_values) == 0:
#                 result[i, j] = np.nan
#             else:
#                 result[i, j] = np.nanmedian(non_nan_values)
#     return result

# def convolve_by_mode(image: np.ndarray, window_size: int) -> np.ndarray:
#     result = np.zeros_like(image, dtype=np.float64)
#     for i in range(image.shape[0]):
#         for j in range(image.shape[1]):
#             window = get_window(image, window_size, (i, j))
#             non_nan_values = window[~np.isnan(window)]
#             if len(non_nan_values) == 0:
#                 result[i, j] = np.nan
#             else:
#                 result[i, j] = mode(non_nan_values)[0]
#     return result

# def get_window(image: np.ndarray, window_size: int, center: Tuple[int, int]) -> np.ndarray:
#     top = max(center[0] - window_size // 2, 0)
#     bottom = min(center[0] + window_size // 2 + 1, image.shape[0])
#     left = max(center[1] - window_size // 2, 0)
#     right = min(center[1] + window_size // 2 + 1, image.shape[1])
#     window = image[top:bottom, left:right]
#     return window
