In [5]:
import pandas as pd
import numpy as np
from pathlib import Path
from bisect import bisect

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning for the remainder of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and dtype warnings
# raised by pandas/NumPy — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'whitegrid' style for all subsequent plots.
sns.set_style('whitegrid')
# Raise pandas' display limits so frames are shown with up to 999 columns/rows.
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

In [6]:
def prepare_data(df):
    """Return a copy of ``df`` extended with the engineered feature columns.

    The input frame is never modified: all derived columns are added to a
    copy, appended after the original columns in a fixed order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Elevation``, ``Aspect``, ``Slope``,
        ``Horizontal_Distance_To_Hydrology``, ``Vertical_Distance_To_Hydrology``,
        ``Horizontal_Distance_To_Roadways``, ``Horizontal_Distance_To_Fire_Points``,
        ``Hillshade_9am``, ``Hillshade_Noon`` and ``Hillshade_3pm``.
        ``Aspect`` and ``Slope`` are passed through ``np.radians``, i.e. they
        are treated as angles in degrees.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df`` and 27 additional columns:
        distance combinations, hillshade combinations, elevation transforms,
        sine/cosine of aspect and slope, and ``Cardinal``.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    # Work on a copy so the caller's (raw) frame stays intact and the
    # function can be re-run on the same input safely.
    out = df.copy()

    # Straight-line distance to hydrology from its vertical and horizontal parts.
    out['Euclidean_distance_to_hydro'] = (
        out['Vertical_Distance_To_Hydrology']**2
        + out['Horizontal_Distance_To_Hydrology']**2
    )**.5

    road = 'Horizontal_Distance_To_Roadways'
    fire = 'Horizontal_Distance_To_Fire_Points'
    hydro = 'Euclidean_distance_to_hydro'
    all_distances = [road, fire, hydro]

    # Mean / sum over all three distances, then over each pair.
    out['distance_mean']       = out[all_distances].mean(axis=1)
    out['distance_sum']        = out[all_distances].sum(axis=1)
    out['distance_road_fire']  = out[[road, fire]].mean(axis=1)
    out['distance_hydro_fire'] = out[[fire, hydro]].mean(axis=1)
    out['distance_road_hydro'] = out[[road, hydro]].mean(axis=1)

    out['distance_sum_road_fire']  = out[[road, fire]].sum(axis=1)
    out['distance_sum_hydro_fire'] = out[[fire, hydro]].sum(axis=1)
    out['distance_sum_road_hydro'] = out[[road, hydro]].sum(axis=1)

    # Signed pairwise differences between the distances.
    out['distance_dif_road_fire']  = out[road] - out[fire]
    out['distance_dif_hydro_road'] = out[hydro] - out[road]
    out['distance_dif_hydro_fire'] = out[hydro] - out[fire]

    # Hillshade: consecutive / overall differences plus sum and mean.
    shades = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']

    out['shade_noon_diff'] = out['Hillshade_9am'] - out['Hillshade_Noon']
    out['shade_3pm_diff']  = out['Hillshade_Noon'] - out['Hillshade_3pm']
    out['shade_all_diff']  = out['Hillshade_9am'] - out['Hillshade_3pm']
    out['shade_sum']       = out[shades].sum(axis=1)
    out['shade_mean']      = out[shades].mean(axis=1)

    # Elevation offset by the hydrology distances; the 0.25 and 0.19 weights
    # are fixed coefficients carried over unchanged.
    out['ElevationHydro'] = out['Elevation'] - 0.25 * out[hydro]
    out['ElevationV'] = out['Elevation'] - out['Vertical_Distance_To_Hydrology']
    out['ElevationH'] = out['Elevation'] - 0.19 * out['Horizontal_Distance_To_Hydrology']

    out['Elevation2'] = out['Elevation']**2
    out['ElevationLog'] = np.log1p(out['Elevation'])

    # Trigonometric encodings of the two angular columns (degrees -> radians).
    aspect_rad = np.radians(out['Aspect'])
    slope_rad = np.radians(out['Slope'])
    out['Aspect_sin'] = np.sin(aspect_rad)
    out['Aspect_cos'] = np.cos(aspect_rad)
    out['Slope_sin'] = np.sin(slope_rad)
    out['Slope_cos'] = np.cos(slope_rad)

    # Compass sector of the aspect, encoded as N -> 0, E -> 1, S -> 0, W -> -1.
    # Sector edges sit at 45/135/225/315; side='right' mirrors bisect(), so
    # 45 falls in E and 315 (and anything above) wraps back to N.
    sector_edges = np.arange(45, 361, 90)
    sector_code = np.array([0, 1, 0, -1], dtype=np.int64)  # N, E, S, W
    sector = np.searchsorted(sector_edges, out['Aspect'].to_numpy(), side='right') % 4
    out['Cardinal'] = sector_code[sector]

    return out

In [11]:
# Directories: raw CSVs are read from the first, engineered CSVs go to the second.
input_data, interim_data = Path('../data/input/'), Path('../data/interim/')

# File names, paired per split as (source CSV, engineered CSV).
train_file, train_output_file = Path('train.csv'), Path('train_feature_engineering_v1.csv')
test_file, test_output_file = Path('test.csv'), Path('test_feature_engineering_v1.csv')

In [8]:
# Read both raw splits; the first CSV column is used as the index.
train, test = (
    pd.read_csv(input_data / csv_name, index_col=0)
    for csv_name in (train_file, test_file)
)

In [9]:
# Run the same feature engineering on the train split, then on the test split.
train, test = prepare_data(train), prepare_data(test)

In [12]:
# to_csv does not create missing parent directories, and the interim folder
# may not exist yet (e.g. on a fresh checkout), so create it before writing.
interim_data.mkdir(parents=True, exist_ok=True)

# Persist both engineered splits under the interim directory.
train.to_csv(interim_data/train_output_file)
test.to_csv(interim_data/test_output_file)