# Feature engineering

**Purpose of script:**

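Create new features (date, aggregated neighboring-pixel values, elevation) for each input data frame and write the extended data frames to disk.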

In [1]:
import pandas as pd
import numpy as np

import xarray
import rasterio

from os import listdir
from os.path import isfile, join

## Features:

In script 05:
- Column and row numbers
- Neighboring pixels
- Water pixels from mw removed 



Date

In [2]:
def add_date(df, date='2019-07-01'):
    """Add a datetime ``date`` column holding the same date in every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place and also returned.
    date : str or datetime-like, optional
        Date written to every row. Defaults to ``'2019-07-01'``, the value
        that was previously hard-coded, so existing callers are unaffected.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the additional ``date`` column.
    """
    # Assign first, then convert the whole column, so the resulting dtype is
    # whatever pd.to_datetime produces for a column of this value.
    df['date'] = date
    df['date'] = pd.to_datetime(df['date'])
    # df['month'] = df['date'].dt.month
    return df

Aggregated/pooled values

In [3]:
def add_aggregated(df):
    """Add a ``mean`` column pooling the nine columns ``v1`` .. ``v9``.

    Values of -1 in those columns are replaced by NaN so that they are
    skipped when the row-wise mean is computed. The replacement is written
    back into ``df``, so the ``v*`` columns of the returned frame contain NaN
    wherever they held -1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``v1`` .. ``v9``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the additional ``mean`` column. Rows in
        which all nine values are missing get a NaN mean.
    """
    # NOTE(review): original author asks whether v5 should be deleted because
    # it is the same as the mw value - still open.
    cols = ['v1', 'v2', 'v3', 'v4', 'v5', 'v6', 'v7', 'v8', 'v9']
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    df[cols] = df[cols].replace(-1, np.nan)  # skip -1 when calculating the mean
    df['mean'] = df[cols].mean(axis=1)  # mean value of the 9 pixels around
    return df

Elevation data

In [4]:
# for exploration and testing:
# data_elevation = xarray.open_dataarray(path_elevation)
# with rasterio.open(path_elevation) as dataset_elev:
#     print(dataset_elev.crs)
# reference system matches opt and mw data

# print(data_elevation.shape) # shape seems same as optical data

# plot:
# import matplotlib.pyplot as plt
# fig, axes = plt.subplots(ncols=1, figsize=(4,5))
# data_elevation.plot(ax=axes) # , cmap = cmap
# plt.draw()

In [5]:
def add_elevation(data):
    """Flatten the elevation array into a table of coordinates and values.

    Parameters
    ----------
    data : object exposing ``to_dataframe()``
        In this notebook, the array returned by ``xarray.open_dataarray``.
        The resulting frame must provide ``x``, ``y`` and ``band_data`` once
        its index has been reset.

    Returns
    -------
    pandas.DataFrame
        Columns ``x``, ``y`` and ``elevation_data`` (the renamed
        ``band_data``), with a fresh integer index.
    """
    wanted = ['x', 'y', 'band_data']
    flat = data.to_dataframe().reset_index()
    return flat[wanted].rename(columns={'band_data': 'elevation_data'})

Distance from margin/shore

In [6]:
# TODO: add a coast-flag column - set when at least one, but not all, of the values is NA

Geohash

In [7]:
# TODO: convert x and y into useful spatial sections of the geodata (geohash)

## Main:

In [8]:
def main(df_path, path_elevation, out_path):
    """Extend every data frame in ``df_path`` and write the result to ``out_path``.

    For each regular file in ``df_path`` the parquet file is read, the
    ``date`` and ``mean`` columns are added, the elevation values are joined
    on the ``y``/``x`` coordinates, and the result is written as
    ``melt_<date>_extended.parquet.gzip``.

    Parameters
    ----------
    df_path : str
        Folder containing the input parquet files. Characters 5-14 of each
        file name are used as the date part of the output file name.
    path_elevation : str
        Path of the elevation raster opened with ``xarray.open_dataarray``.
    out_path : str
        Folder receiving the extended parquet files.

    Returns
    -------
    None
    """
    # The elevation table is identical for every input file, so build it once
    # instead of once per loop iteration, and release the raster handle as
    # soon as the values have been copied into the data frame.
    with xarray.open_dataarray(path_elevation) as data_elevation:
        df_elevation = add_elevation(data_elevation)

    # Regular files only; sorted so the processing order is reproducible.
    files = sorted(f for f in listdir(df_path) if isfile(join(df_path, f)))

    for file in files:
        melt_date = file[5:15]
        # NOTE(review): melt_date is only used for the output file name;
        # add_date is called without it, so the 'date' column does not follow
        # the date in the file name - confirm whether that is intended.

        # join() instead of string concatenation, so the folder arguments work
        # with or without a trailing separator (consistent with the listing).
        df = pd.read_parquet(join(df_path, file))
        df = add_date(df)
        df = add_aggregated(df)

        # Left join keeps every row of df; elevation is attached where the
        # coordinates match.
        df_with_elevation = pd.merge(df, df_elevation, how='left', on=['y', 'x'])

        out_name = 'melt_' + melt_date + '_extended.parquet.gzip'
        df_with_elevation.to_parquet(join(out_path, out_name), index=False)

In [9]:
# elevation raster
path_elevation = r"../Data/elevation_data/gimpdem_1km_compressed.tif"

# folder with the input data frames and folder receiving the extended ones
df_path = r"../Data/combined/to_pandas/"
out_path = r"../Data/combined/pandas_extended/"

# run the feature engineering over every file in df_path
main(df_path=df_path, path_elevation=path_elevation, out_path=out_path)