# Feature engineering

**Purpose of script:**

Creating new features 

- In: dataframe_plain
- Out: dataframe_extended (with additional feature columns)

In [33]:
import xarray
import rasterio
import gemgis as gg
import pandas as pd
import numpy as np
from tqdm import tqdm
from astral.sun import sun
from astral import LocationInfo
import pyproj
from math import asin, cos, atan, tan, pi, acos, sin 

import datetime as dt

from os import listdir
from os.path import isfile, join
import scipy

##### Relevant paths

In [2]:
path_dataframe_plain = r"../Data/combined/dataframe_plain/"
mw_path = r"../Data/microwave-rs/mw_interpolated/"
path_elevation =  r"../Data/elevation_data/gimpdem_1km_compressed.tif"
out_path = r"../Data/combined/dataframe_extended/"

## Features

##### Row and column

In [3]:
def add_row_and_col(df):
        # add row and column features:
        df['col'] = df.groupby("x").ngroup() # xshape 2663 
        df['row'] = df.groupby("y").ngroup(ascending=False) # yshape 1462
        return df

##### Date

In [4]:
def add_date_year(df, melt_date):
    date = pd.to_datetime(melt_date).date()
    df['date'] = date
    df['year'] = date.year
    return df

##### Aggregated/pooled values

In [5]:
from typing import Union
from typing import Tuple
from scipy.stats import mode
from scipy.signal import convolve2d

def get_window(image: np.ndarray, window_size: int, center: Tuple[int, int]) -> np.ndarray:
    top = max(center[0] - window_size // 2, 0)
    bottom = min(center[0] + window_size // 2 + 1, image.shape[0])
    left = max(center[1] - window_size // 2, 0)
    right = min(center[1] + window_size // 2 + 1, image.shape[1])
    window = image[top:bottom, left:right]
    return window


# need to fix? : only calculate if the middle value is not nan - else all nan columns around 1 and 0 are going to have a value.

def convolve(image, window_size, convolution_fn: Union['mean', 'min', 'max', 'sum']):
    image = image[0].values
    image[image == -1] = np.nan
    
    if convolution_fn == 'mean':
        kernel = np.ones((window_size, window_size))  # kernel for mean convolution
        result = np.zeros_like(image, dtype=np.float64)
        # Compute the sum and count of non-NaN values in the kernel window
        counts = convolve2d(~np.isnan(image), kernel, mode='same', boundary='fill', fillvalue=0)
        sums = convolve2d(np.nan_to_num(image), kernel, mode='same', boundary='fill', fillvalue=0)
        # Calculate the mean, ignoring NaN values
        result[counts > 0] = sums[counts > 0] / counts[counts > 0]
        # Set the output to NaN where all values in the kernel window are NaN
        result[counts == 0] = np.nan
        return result
        
    elif convolution_fn == 'max':
        result = np.zeros_like(image, dtype=np.float64)
        for i in range(image.shape[0]):
            for j in range(image.shape[1]):
                window = get_window(image, window_size, (i, j))
                non_nan_values = window[~np.isnan(window)]
                if len(non_nan_values) == 0:
                    result[i, j] = np.nan
                else:
                    result[i, j] = np.nanmax(non_nan_values)

    elif convolution_fn == 'min':
        result = np.zeros_like(image, dtype=np.float64)
        for i in range(image.shape[0]):
            for j in range(image.shape[1]):
                window = get_window(image, window_size, (i, j))
                non_nan_values = window[~np.isnan(window)]
                if len(non_nan_values) == 0:
                    result[i, j] = np.nan
                else:
                    result[i, j] = np.nanmin(non_nan_values)
        return result

    elif convolution_fn == 'sum':
        result = np.zeros_like(image, dtype=np.float64)
        for i in range(image.shape[0]):
            for j in range(image.shape[1]):
                window = get_window(image, window_size, (i, j))
                non_nan_values = window[~np.isnan(window)]
                if len(non_nan_values) == 0:
                    result[i, j] = np.nan
                else:
                    result[i, j] = np.nansum(non_nan_values)
        return result
        
    else: 
        print('not available function')
    return

In [6]:
def convolution_to_df(convolution_raster, column_name):
    nrows, ncols = convolution_raster.shape
    # create an array of x and y positions
    x = np.tile(np.arange(ncols), nrows)
    y = np.repeat(np.arange(nrows), ncols)
    # create a DataFrame with x, y, and pixel values as columns
    df = pd.DataFrame({'col': x, 'row': y, column_name: convolution_raster.flatten()})
    return df 

##### Elevation data

In [7]:
def add_elevation(data):
    df = data.to_dataframe()
    df = df.reset_index()
    df = df[['x', 'y', 'band_data']]
    df.rename({'band_data': 'elevation_data'}, axis=1, inplace=True)
    return df

##### Slope

Slope is given as degree of incline angle: 0 means flat (no slope == horizontal), 90 means (most possible slope == vertical)

In [8]:
def get_slope(data):
    slope = gg.raster.calculate_slope(data)
    nrows, ncols = slope.shape
    # create an array of x and y positions
    x = np.tile(np.arange(ncols), nrows)
    y = np.repeat(np.arange(nrows), ncols)
    # create a DataFrame with x, y, and pixel values as columns
    df_slope = pd.DataFrame({'col': x, 'row': y, 'slope_data': slope.flatten()})
    return df_slope

##### Aspect

Aspect is given as cosine radian: 0 and 360 degree = 1, 180 degree = -1

In [9]:
def get_aspect(data):
    aspect = gg.raster.calculate_aspect(data)
    nrows, ncols = aspect.shape
    # create an array of x and y positions
    x = np.tile(np.arange(ncols), nrows)
    y = np.repeat(np.arange(nrows), ncols)
    # create a DataFrame with x, y, and pixel values as columns
    df_aspect = pd.DataFrame({'col': x, 'row': y, 'aspect_data': aspect.flatten()})
    df_aspect["aspect_data"] = np.cos(df_aspect["aspect_data"] * np.pi / 180.)
    return df_aspect

##### Distance from margin/shore

In [10]:
def distance_to_margin():
    data_microwave = xarray.open_dataarray(mw_path + '2019-06-08_mw.tif') # any microwave file
    mw_val_masked = data_microwave[0].values
    mw_val_masked = np.copy(mw_val_masked)
    mw_val_masked[mw_val_masked==1]=0
    dist_in_pixels = scipy.ndimage.morphology.distance_transform_edt(mw_val_masked==0, return_distances= True)
    return dist_in_pixels

##### Array to DF

In [11]:
def array_to_df(convolution_raster, column_name):
    nrows, ncols = convolution_raster.shape
    # create an array of x and y positions
    x = np.tile(np.arange(ncols), nrows)
    y = np.repeat(np.arange(nrows), ncols)
    # create a DataFrame with x, y, and pixel values as columns
    df = pd.DataFrame({'col': x, 'row': y, column_name: convolution_raster.flatten()})
    return df 

## Main:

In [12]:
def get_files(mw_path, path_dataframe_plain):
    # get plain files:
    df_plain_files = [f for f in listdir(path_dataframe_plain) if isfile(join(path_dataframe_plain, f))]
    # microwave files:
    mw_files = [f for f in listdir(mw_path) if isfile(join(mw_path, f))]
    return  mw_files, df_plain_files

In [13]:
def main(mw_files_list, df_plain_files_list, path_elevation, out_path, write = False):
    # get plain files:
    df_plain_files = df_plain_files_list
    # microwave files:
    mw_files = mw_files_list
    # load elevation data:
    data_elevation_xarray = xarray.open_dataarray(path_elevation)
    data_elevation_rasterio = rasterio.open(path_elevation)
    # calculate distance to margin:
    distance_margin = distance_to_margin()

    for df_file in df_plain_files:
        melt_date =  df_file[5:15]
        print(melt_date)
        for mw_file in mw_files:
            if mw_file.startswith(melt_date):
                data_mw = xarray.open_dataarray(mw_path + mw_file)
                df = pd.read_parquet(path_dataframe_plain + df_file)
                # add row and column features:
                df = add_row_and_col(df)
                # get convolutions:
                df_conv_mean_3 = array_to_df(convolve(data_mw, 3, 'mean'), 'mean_3')
                df_conv_mean_9 = array_to_df(convolve(data_mw, 9, 'mean'), 'mean_9')
                df_conv_sum_5 = array_to_df(convolve(data_mw, 5, 'sum'), 'sum_5')
                # merge convolution:
                df_combined = pd.merge(df, df_conv_mean_3, how = 'left', on = ['row', 'col'])
                df_combined = pd.merge(df_combined, df_conv_mean_9, how = 'left', on = ['row', 'col'])
                df_combined = pd.merge(df_combined, df_conv_sum_5, how = 'left', on = ['row', 'col'])
                # remove water in mw:
                df_combined = df_combined.loc[df_combined['mw_value'] != -1]
                # add date:
                df = add_date_year(df_combined, melt_date)
                # add and merge elevation data:
                df_elevation = add_elevation(data_elevation_xarray)
                df = pd.merge(df, df_elevation, how = 'left', on = ['y', 'x'])
                # get and merge slope data:
                df_slope = get_slope(data_elevation_rasterio)
                df = pd.merge(df, df_slope[["slope_data"]], how="left", right_index=True, left_index=True)
                # get and merge aspect data:
                df_aspect = get_aspect(data_elevation_rasterio)
                df = pd.merge(df, df_aspect[["aspect_data"]], how="left", right_index=True, left_index=True) 
                # add and merge distance to margin data:
                df_distance = array_to_df(distance_margin, 'distance_to_margin')
                df = pd.merge(df, df_distance, how = 'left', on = ['row', 'col'])
                
                # write to parquet:
                if write == True:
                    df.to_parquet(out_path + 'melt_'+ melt_date + '_extended.parquet.gzip', index= False)                    
    return df
                
                

Main

In [None]:
#main(mw_path, path_dataframe_plain, path_elevation, out_path)
main(*get_files(mw_path, path_dataframe_plain), path_elevation, out_path)# , write = True)

## Testing

### Testing Lina

In [14]:
df = main(['2019-06-08_mw.tif'], ['melt_2019-06-08.parquet.gzip'], path_elevation, out_path, write= True)  

2019-06-08


In [35]:
t = df.copy()

Unnamed: 0,x,y,mw_value,opt_value,col,row,mean_3,mean_9,sum_5,date,year,elevation_data,slope_data,aspect_data,distance_to_margin
0,-636500.00,-662500.00,0.00,-1.00,0,0,0.00,0.00,0.00,2019-06-08,2019,14.00,0.00,1.00,150.00
1,-635500.00,-662500.00,0.00,-1.00,1,0,0.00,0.00,0.00,2019-06-08,2019,14.00,0.00,1.00,150.00
2,-634500.00,-662500.00,0.00,-1.00,2,0,0.00,0.00,0.00,2019-06-08,2019,14.00,0.00,1.00,150.00
3,-633500.00,-662500.00,0.00,-1.00,3,0,0.00,0.00,0.00,2019-06-08,2019,14.00,0.00,1.00,150.00
4,-632500.00,-662500.00,0.00,-1.00,4,0,0.00,0.00,0.00,2019-06-08,2019,14.00,0.00,1.00,150.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2278720,58500.00,-3324500.00,0.00,-1.00,695,2662,0.00,0.00,0.00,2019-06-08,2019,44.00,0.15,0.93,5.00
2278721,59500.00,-3324500.00,0.00,-1.00,696,2662,0.00,0.00,0.00,2019-06-08,2019,44.00,0.13,0.89,4.00
2278722,60500.00,-3324500.00,0.00,-1.00,697,2662,0.00,0.00,0.00,2019-06-08,2019,44.00,0.06,0.89,3.00
2278723,61500.00,-3324500.00,0.00,-1.00,698,2662,0.00,0.00,0.00,2019-06-08,2019,45.00,0.06,-0.45,2.00


In [103]:
# # for testing values around the 0-1 change in the data:

# tt = data_mw.values
# # indices = np.where(tt == 1)
# tt[0][74:80, 622:628]

In [36]:
# Define the source and destination coordinate reference systems
src_crs = pyproj.CRS.from_epsg(3413)  # WGS84 (longitude, latitude)
dst_crs = pyproj.CRS.from_epsg(4326)  # Web Mercator (used by most online maps)

# Define the transformer object
transformer = pyproj.Transformer.from_crs(src_crs, dst_crs)
# Convert all coordinates at once
lats, longs = transformer.transform(df["x"], df["y"])

lats

array([81.53389148, 81.54025102, 81.54660541, ..., 59.98367852,
       59.98352011, 59.98335911])

In [38]:
df["lats"] = lats
df

Unnamed: 0,x,y,mw_value,opt_value,col,row,mean_3,mean_9,sum_5,date,year,elevation_data,slope_data,aspect_data,distance_to_margin,lats
0,-636500.00,-662500.00,0.00,-1.00,0,0,0.00,0.00,0.00,2019-06-08,2019,14.00,0.00,1.00,150.00,81.53
1,-635500.00,-662500.00,0.00,-1.00,1,0,0.00,0.00,0.00,2019-06-08,2019,14.00,0.00,1.00,150.00,81.54
2,-634500.00,-662500.00,0.00,-1.00,2,0,0.00,0.00,0.00,2019-06-08,2019,14.00,0.00,1.00,150.00,81.55
3,-633500.00,-662500.00,0.00,-1.00,3,0,0.00,0.00,0.00,2019-06-08,2019,14.00,0.00,1.00,150.00,81.55
4,-632500.00,-662500.00,0.00,-1.00,4,0,0.00,0.00,0.00,2019-06-08,2019,14.00,0.00,1.00,150.00,81.56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2278720,58500.00,-3324500.00,0.00,-1.00,695,2662,0.00,0.00,0.00,2019-06-08,2019,44.00,0.15,0.93,5.00,59.98
2278721,59500.00,-3324500.00,0.00,-1.00,696,2662,0.00,0.00,0.00,2019-06-08,2019,44.00,0.13,0.89,4.00,59.98
2278722,60500.00,-3324500.00,0.00,-1.00,697,2662,0.00,0.00,0.00,2019-06-08,2019,44.00,0.06,0.89,3.00,59.98
2278723,61500.00,-3324500.00,0.00,-1.00,698,2662,0.00,0.00,0.00,2019-06-08,2019,45.00,0.06,-0.45,2.00,59.98
