# Feature engineering

**Purpose of script:**

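Creates new feature columns (row/column index, date, mean of the 3x3 microwave neighbourhood, elevation) for each plain dataframe and writes the extended dataframes to disk.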

- In: dataframe_plain
- Out: dataframe_extended (with additional feature columns)

In [2]:
import xarray
import pandas as pd
import numpy as np
from tqdm import tqdm

from os import listdir
from os.path import isfile, join

Relevant paths

In [3]:
# Directory that main() scans for the plain dataframe files (read with pd.read_parquet).
path_dataframe_plain = r"../Data/combined/dataframe_plain/"
# Directory that main() scans for the microwave files (opened with xarray.open_dataarray).
mw_path = r"../Data/microwave-rs/mw_interpolated/"
# Single elevation raster file (opened with xarray.open_dataarray).
path_elevation =  r"../Data/elevation_data/gimpdem_1km_compressed.tif"
# Directory into which main() writes the extended parquet files.
out_path = r"../Data/combined/dataframe_extended/"

## Features

Row and column

In [4]:
def add_row_and_col(df):
    """Add integer grid-index columns ``col`` and ``row`` to ``df``.

    ``col`` numbers the distinct ``x`` values in ascending order (smallest
    ``x`` -> 0); ``row`` numbers the distinct ``y`` values in reverse order
    (largest ``y`` -> 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``x`` and ``y`` columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying ``col`` and ``row``.
    """
    column_number = df.groupby("x").ngroup()  # xshape 2663
    row_number = df.groupby("y").ngroup(ascending=False)  # yshape 1462
    df['col'] = column_number
    df['row'] = row_number
    return df

Date

In [5]:
def add_date(df, date='2019-07-01'):
    """Add a constant ``date`` column (datetime dtype) to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    date : str, optional
        Date to stamp on every row, in a format ``pd.to_datetime`` can parse.
        Defaults to ``'2019-07-01'``, the value that used to be hard-coded,
        so existing single-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``date`` column set.
    """
    df['date'] = date
    df["date"] = pd.to_datetime(df["date"])
    # A month feature (df["date"].dt.month) is currently disabled.
    return df

Aggregated/pooled values

In [6]:
def get_neighbors(matrix, a, b):
    """Return the 3x3 neighbourhood of ``matrix[a][b]`` as a flat list.

    The nine values are listed row by row (top-left to bottom-right) and
    include the centre element itself. Positions outside the matrix are
    filled with ``np.nan``.

    Parameters
    ----------
    matrix : 2-D indexable (list of lists or array)
        Grid of values.
    a, b : int
        Row and column index of the centre element.

    Returns
    -------
    list
        Nine entries: matrix values or ``np.nan`` for out-of-bounds cells.
    """
    window = []
    for i in range(a - 1, a + 2):
        for j in range(b - 1, b + 2):
            # Negative indices are rejected explicitly, otherwise they would wrap around.
            if i < 0 or j < 0 or j >= len(matrix[0]) or i >= len(matrix):
                window.append(np.nan)
            else:
                window.append(matrix[i][j])
    return window

In [7]:
def get_neighbours_df(data):
    """Build a dataframe with the 3x3 neighbourhood of every pixel in ``data``.

    Only the first entry along the leading axis (``data.values[0]``) is used;
    ``data.shape[1]`` and ``data.shape[2]`` give the number of rows and columns.

    Parameters
    ----------
    data : array-like object exposing ``.shape`` and ``.values``
        Three-dimensional data, e.g. the object returned by
        ``xarray.open_dataarray``.

    Returns
    -------
    pandas.DataFrame
        One row per pixel with columns ``v1`` .. ``v9`` (the neighbourhood as
        returned by ``get_neighbors``, ``v5`` being the pixel itself) followed
        by the pixel's ``row`` and ``col`` index.
    """
    pixel_indices = [
        (row, col)
        for row in range(data.shape[1])
        for col in range(data.shape[2])
    ]
    grid = data.values[0]

    records = []
    for row, col in tqdm(pixel_indices):
        # nine neighbourhood values followed by the pixel position
        records.append(get_neighbors(grid, row, col) + [row, col])

    column_names = ['v1', 'v2', 'v3', 'v4', 'v5', 'v6', 'v7', 'v8', 'v9', 'row', 'col']
    return pd.DataFrame(records, columns=column_names)

In [8]:
def add_aggregated(df):
    """Add a ``mean`` column with the average of the neighbourhood values.

    Entries equal to ``-1`` in ``v1`` .. ``v9`` are replaced by NaN first so
    they are skipped by the mean; the replacement is written back to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``v1`` .. ``v9``. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with cleaned ``v*`` columns and a ``mean``
        column (NaN where all nine values are missing).
    """
    cols = ['v1', 'v2', 'v3', 'v4', 'v5', 'v6', 'v7', 'v8', 'v9']  # delete v5? same as mw value
    # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
    df[cols] = df[cols].replace(-1, np.nan)  # to skip -1 when calculating mean
    df['mean'] = df[cols].mean(axis=1)  # mean value of the 9 pixels (centre included)
    return df

Elevation data

In [9]:
def add_elevation(data):
    """Convert the elevation data into a flat ``x`` / ``y`` / ``elevation_data`` frame.

    Despite its name this function does not add anything to an existing
    frame; it only produces the table that is merged in later.

    Parameters
    ----------
    data : object exposing ``to_dataframe()``
        Elevation data whose dataframe form has ``x`` and ``y`` in its index
        or columns and a ``band_data`` value column.

    Returns
    -------
    pandas.DataFrame
        Columns ``x``, ``y`` and ``elevation_data`` (renamed from ``band_data``).
    """
    elevation = (
        data.to_dataframe()
        .reset_index()[['x', 'y', 'band_data']]
        .rename(columns={'band_data': 'elevation_data'})
    )
    return elevation

Distance from margin/shore

In [10]:
# TODO: not implemented yet.
# Idea: add an "is coast" column - set if at least one, but not all, of the values is NA.

## Main:

In [11]:
def main(mw_path, path_dataframe_plain, path_elevation, out_path):
    """Create and store the extended feature dataframe for every plain dataframe.

    For each file in ``path_dataframe_plain`` the date is taken from characters
    5-14 of the file name (``df_file[5:15]``). Every file in ``mw_path`` whose
    name starts with that date is combined with the plain dataframe: row/column
    indices, the 3x3 microwave neighbourhood, the date, the neighbourhood mean
    and the elevation are added, rows with ``mw_value == -1`` are dropped, and
    the result is written to ``out_path`` as
    ``melt_<date>_extended.parquet.gzip``.

    Parameters
    ----------
    mw_path : str
        Directory containing the microwave files.
    path_dataframe_plain : str
        Directory containing the plain dataframes (parquet).
    path_elevation : str
        Path of the elevation raster file.
    out_path : str
        Directory the extended dataframes are written to.

    Returns
    -------
    None
    """
    # get plain files (sorted so the processing order is reproducible):
    df_plain_files = sorted(
        f for f in listdir(path_dataframe_plain) if isfile(join(path_dataframe_plain, f))
    )
    # microwave files:
    mw_files = sorted(f for f in listdir(mw_path) if isfile(join(mw_path, f)))

    # elevation data: identical for every date, so it is converted once up
    # front instead of once per file; the raster is closed as soon as the
    # dataframe has been built.
    with xarray.open_dataarray(path_elevation) as data_elevation:
        df_elevation = add_elevation(data_elevation)

    for df_file in df_plain_files:
        melt_date = df_file[5:15]
        print(melt_date)
        for mw_file in mw_files:
            if not mw_file.startswith(melt_date):
                continue

            # join() works with or without a trailing separator on the directories
            df = pd.read_parquet(join(path_dataframe_plain, df_file))

            # add row and column features:
            df = add_row_and_col(df)
            # get neighbours (the microwave file is closed once they are extracted):
            with xarray.open_dataarray(join(mw_path, mw_file)) as data_mw:
                df_neighbors = get_neighbours_df(data_mw)
            # merge neighbours:
            df_combined = pd.merge(df, df_neighbors, how='left', on=['row', 'col'])
            # remove water in mw; copy() so the columns added below are written
            # to an independent frame and pandas raises no SettingWithCopyWarning:
            df_combined = df_combined.loc[df_combined['mw_value'] != -1].copy()
            # add date:
            df = add_date(df_combined)
            # add_date() called with one argument stamps a fixed placeholder
            # date; store the date this file actually belongs to instead.
            df['date'] = pd.to_datetime(melt_date)
            # add aggregations:
            df = add_aggregated(df)
            # merge elevation data (left: smaller mw frame, right: elevation):
            df_with_elevation = pd.merge(df, df_elevation, how='left', on=['y', 'x'])

            # write to parquet:
            df_with_elevation.to_parquet(
                join(out_path, 'melt_' + melt_date + '_extended.parquet.gzip'),
                index=False,
            )
                

Main

In [12]:
# Run the feature-engineering pipeline with the paths configured at the top of the notebook.
main(mw_path, path_dataframe_plain, path_elevation, out_path)

2019-06-08


100%|██████████| 3893306/3893306 [00:33<00:00, 117824.02it/s]
