<a href="https://colab.research.google.com/github/jayashree-codes/Applied_Machine_Learning_Species/blob/main/Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Feature Engineering - adding climate trend features - Train Dataset
import pandas as pd
import numpy as np

# Workbook and sheet holding the merged training data; the same sheet is
# read here and overwritten later in this cell.
excel_path = "/Users/jayashreehariharan/Desktop/AML_Project/Species-2/species/Merged_train_data.xlsx"
sheet_name = "merged_data"
df = pd.read_excel(excel_path, sheet_name=sheet_name)

# Years covered by the yearly climate columns; used as the x-axis of the
# per-row trend regression.
years = np.array([2020, 2021, 2022, 2023, 2024])

# Yearly mean column names are derived from `years` so the column lists and
# the regression x-axis cannot drift apart.
tavg_cols = [f"tavg_{year}_mean" for year in years.tolist()]
prec_cols = [f"prec_{year}_mean" for year in years.tolist()]

# One row per record, one column per year (same order as `years`).
tavg_values = df[tavg_cols].values

# NOTE: no separate np.polyfit pass is run here. Its result was never read;
# the trend columns written to the sheet come from compute_slope().

def compute_slope(values, years):
    """Return the least-squares slope of every row of `values` against `years`.

    Uses the closed-form simple-regression slope
    sum((y - mean(y)) * (x - mean(x))) / sum((x - mean(x)) ** 2),
    evaluated for all rows at once.

    Args:
        values: 2-D NumPy array; each row is one series and its columns must
            line up with the entries of `years`.
        years: 1-D NumPy array of x-coordinates shared by every row.

    Returns:
        1-D NumPy array with one slope per row of `values`.
    """
    # Centre the shared x-axis once; it is reused for numerator and denominator.
    year_offsets = years - years.mean()
    # Centre each row around its own mean (keepdims keeps the shape broadcastable).
    row_offsets = values - values.mean(axis=1, keepdims=True)
    cross_products = (row_offsets * year_offsets).sum(axis=1)
    return cross_products / (year_offsets ** 2).sum()

# Precipitation values as a plain array (rows = records, columns = years),
# extracted once and reused for every precipitation feature below.
prec_values = df[prec_cols].values

# Trend: per-row least-squares slope across the yearly columns.
df['tavg_trend'] = compute_slope(tavg_values, years)
df['prec_trend'] = compute_slope(prec_values, years)

# Year-to-year variability (NumPy's default population standard deviation).
df['tavg_std'] = tavg_values.std(axis=1)
df['prec_std'] = prec_values.std(axis=1)

# Temperature extremes over the yearly columns and their spread.
df['tavg_max'] = tavg_values.max(axis=1)
df['tavg_min'] = tavg_values.min(axis=1)
df['tavg_range'] = df['tavg_max'] - df['tavg_min']

# Precipitation extremes over the yearly columns and their spread.
df['prec_max'] = prec_values.max(axis=1)
df['prec_min'] = prec_values.min(axis=1)
df['prec_range'] = df['prec_max'] - df['prec_min']

# Coefficient of variation (std / mean). An existing '<var>_mean' column is
# preferred; otherwise the mean of the yearly columns is used.
# NOTE(review): the ratio is undefined for a zero mean and flips sign for a
# negative mean -- confirm this is acceptable for the temperature units used.
if 'tavg_mean' in df.columns:
    tavg_cv = df['tavg_std'] / df['tavg_mean']
else:
    tavg_cv = df['tavg_std'] / tavg_values.mean(axis=1)
df['tavg_cv'] = tavg_cv

if 'prec_mean' in df.columns:
    prec_cv = df['prec_std'] / df['prec_mean']
else:
    prec_cv = df['prec_std'] / prec_values.mean(axis=1)
df['prec_cv'] = prec_cv

# Write the augmented frame back over the sheet it was read from; the
# workbook is opened in append mode and the existing sheet is replaced.
writer_options = dict(engine="openpyxl", mode="a", if_sheet_exists="replace")
with pd.ExcelWriter(excel_path, **writer_options) as workbook:
    df.to_excel(workbook, sheet_name=sheet_name, index=False)

In [None]:
#Feature Engineering - adding climate trend features - Test Dataset
import pandas as pd
import numpy as np

# Workbook and sheet holding the test locations; the same sheet is read here
# and overwritten later in this cell.
excel_path = "/Users/jayashreehariharan/Desktop/AML_Project/Species-2/species/species_test.xlsx"
sheet_name = "test_locs"
df = pd.read_excel(excel_path, sheet_name=sheet_name)

# Years covered by the yearly climate columns; used as the x-axis of the
# per-row trend regression.
years = np.array([2020, 2021, 2022, 2023, 2024])

# Yearly mean column names are derived from `years` so the column lists and
# the regression x-axis cannot drift apart.
tavg_cols = [f"tavg_{year}_mean" for year in years.tolist()]
prec_cols = [f"prec_{year}_mean" for year in years.tolist()]

# One row per record, one column per year (same order as `years`).
tavg_values = df[tavg_cols].values

# NOTE: no separate np.polyfit pass is run here. Its result was never read;
# the trend columns written to the sheet come from compute_slope().

def compute_slope(values, years):
    """Return the least-squares slope of every row of `values` against `years`.

    Uses the closed-form simple-regression slope
    sum((y - mean(y)) * (x - mean(x))) / sum((x - mean(x)) ** 2),
    evaluated for all rows at once.

    Args:
        values: 2-D NumPy array; each row is one series and its columns must
            line up with the entries of `years`.
        years: 1-D NumPy array of x-coordinates shared by every row.

    Returns:
        1-D NumPy array with one slope per row of `values`.
    """
    # Centre the shared x-axis once; it is reused for numerator and denominator.
    year_offsets = years - years.mean()
    # Centre each row around its own mean (keepdims keeps the shape broadcastable).
    row_offsets = values - values.mean(axis=1, keepdims=True)
    cross_products = (row_offsets * year_offsets).sum(axis=1)
    return cross_products / (year_offsets ** 2).sum()

# Precipitation values as a plain array (rows = records, columns = years),
# extracted once and reused for every precipitation feature below.
prec_values = df[prec_cols].values

# Trend: per-row least-squares slope across the yearly columns.
df['tavg_trend'] = compute_slope(tavg_values, years)
df['prec_trend'] = compute_slope(prec_values, years)

# Year-to-year variability (NumPy's default population standard deviation).
df['tavg_std'] = tavg_values.std(axis=1)
df['prec_std'] = prec_values.std(axis=1)

# Temperature extremes over the yearly columns and their spread.
df['tavg_max'] = tavg_values.max(axis=1)
df['tavg_min'] = tavg_values.min(axis=1)
df['tavg_range'] = df['tavg_max'] - df['tavg_min']

# Precipitation extremes over the yearly columns and their spread.
df['prec_max'] = prec_values.max(axis=1)
df['prec_min'] = prec_values.min(axis=1)
df['prec_range'] = df['prec_max'] - df['prec_min']

# Coefficient of variation (std / mean). An existing '<var>_mean' column is
# preferred; otherwise the mean of the yearly columns is used.
# NOTE(review): the ratio is undefined for a zero mean and flips sign for a
# negative mean -- confirm this is acceptable for the temperature units used.
if 'tavg_mean' in df.columns:
    tavg_cv = df['tavg_std'] / df['tavg_mean']
else:
    tavg_cv = df['tavg_std'] / tavg_values.mean(axis=1)
df['tavg_cv'] = tavg_cv

if 'prec_mean' in df.columns:
    prec_cv = df['prec_std'] / df['prec_mean']
else:
    prec_cv = df['prec_std'] / prec_values.mean(axis=1)
df['prec_cv'] = prec_cv

# Write the augmented frame back over the sheet it was read from; the
# workbook is opened in append mode and the existing sheet is replaced.
writer_options = dict(engine="openpyxl", mode="a", if_sheet_exists="replace")
with pd.ExcelWriter(excel_path, **writer_options) as workbook:
    df.to_excel(workbook, sheet_name=sheet_name, index=False)

In [3]:
#Feature Engineering - encoding spatial continuity - Train Dataset
import pandas as pd
import numpy as np
from scipy.stats import linregress

# Workbook and sheet holding the merged training data; the sheet is read,
# extended with the encoded coordinates, and written back in place.
excel_path = "/Users/jayashreehariharan/Desktop/AML_Project/Species-2/species/Merged_train_data.xlsx"
sheet_name = "merged_data"
df = pd.read_excel(excel_path, sheet_name=sheet_name)

# Scratch columns: the coordinates converted from degrees to radians.
for degrees_col, radians_col in (('latitude', 'lat_rad'), ('longitude', 'lon_rad')):
    df[radians_col] = np.radians(df[degrees_col])

# Sine/cosine pair for each coordinate, so the encoded values vary smoothly
# instead of jumping where the raw angle wraps around.
for radians_col, sin_col, cos_col in (
    ('lat_rad', 'lat_sin', 'lat_cos'),
    ('lon_rad', 'lon_sin', 'lon_cos'),
):
    df[sin_col] = np.sin(df[radians_col])
    df[cos_col] = np.cos(df[radians_col])

# The radian columns were only intermediates; keep them out of the sheet.
df = df.drop(columns=['lat_rad', 'lon_rad'])

# Replace the original sheet with the augmented frame.
writer_options = dict(engine="openpyxl", mode="a", if_sheet_exists="replace")
with pd.ExcelWriter(excel_path, **writer_options) as workbook:
    df.to_excel(workbook, sheet_name=sheet_name, index=False)

In [4]:
#Feature Engineering - encoding spatial continuity - Test Dataset
import pandas as pd
import numpy as np
from scipy.stats import linregress

# Workbook and sheet holding the test locations; the sheet is read, extended
# with the encoded coordinates, and written back in place.
excel_path = "/Users/jayashreehariharan/Desktop/AML_Project/Species-2/species/species_test.xlsx"
sheet_name = "test_locs"
df = pd.read_excel(excel_path, sheet_name=sheet_name)

# Scratch columns: the coordinates converted from degrees to radians.
for degrees_col, radians_col in (('latitude', 'lat_rad'), ('longitude', 'lon_rad')):
    df[radians_col] = np.radians(df[degrees_col])

# Sine/cosine pair for each coordinate, so the encoded values vary smoothly
# instead of jumping where the raw angle wraps around.
for radians_col, sin_col, cos_col in (
    ('lat_rad', 'lat_sin', 'lat_cos'),
    ('lon_rad', 'lon_sin', 'lon_cos'),
):
    df[sin_col] = np.sin(df[radians_col])
    df[cos_col] = np.cos(df[radians_col])

# The radian columns were only intermediates; keep them out of the sheet.
df = df.drop(columns=['lat_rad', 'lon_rad'])

# Replace the original sheet with the augmented frame.
writer_options = dict(engine="openpyxl", mode="a", if_sheet_exists="replace")
with pd.ExcelWriter(excel_path, **writer_options) as workbook:
    df.to_excel(workbook, sheet_name=sheet_name, index=False)