In [36]:
import sklearn
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [38]:
PENGUIN_DATASET = '../data/penguin/penguins_final_with_era5.csv'
penguin_df = pd.read_csv(PENGUIN_DATASET)

penguin_df.columns

Index(['Unnamed: 0', 'track_id', 'date_gmt', 'latitude_mean', 'longitude_mean',
       'lat_colony_mean', 'lon_colony_mean', 'km_to_colony_mean',
       'km_since_last_measure_mean', 'delta_km_north_mean',
       'delta_km_south_mean', 'delta_km_east_mean', 'delta_km_west_mean',
       'minutes_since_last_measure_mean', 'latitude_std', 'longitude_std',
       'lat_colony_std', 'lon_colony_std', 'km_to_colony_std',
       'km_since_last_measure_std', 'delta_km_north_std', 'delta_km_south_std',
       'delta_km_east_std', 'delta_km_west_std',
       'minutes_since_last_measure_std', 'latitude_min', 'longitude_min',
       'lat_colony_min', 'lon_colony_min', 'km_to_colony_min',
       'km_since_last_measure_min', 'delta_km_north_min', 'delta_km_south_min',
       'delta_km_east_min', 'delta_km_west_min',
       'minutes_since_last_measure_min', 'latitude_max', 'longitude_max',
       'lat_colony_max', 'lon_colony_max', 'km_to_colony_max',
       'km_since_last_measure_max', 'delta_km_nort

In [39]:
# filter df for adelie penguins and king george island
df = penguin_df[(penguin_df['common_name'] == 'Adelie Penguin') & (penguin_df['colony_name'] == 'King George Island')]
#df = penguin_df[(penguin_df['common_name'] == 'Adelie Penguin')]

In [14]:
df.columns

Index(['Unnamed: 0', 'track_id', 'date_gmt', 'latitude_mean', 'longitude_mean',
       'lat_colony_mean', 'lon_colony_mean', 'km_to_colony_mean',
       'km_since_last_measure_mean', 'delta_km_north_mean',
       'delta_km_south_mean', 'delta_km_east_mean', 'delta_km_west_mean',
       'minutes_since_last_measure_mean', 'latitude_std', 'longitude_std',
       'lat_colony_std', 'lon_colony_std', 'km_to_colony_std',
       'km_since_last_measure_std', 'delta_km_north_std', 'delta_km_south_std',
       'delta_km_east_std', 'delta_km_west_std',
       'minutes_since_last_measure_std', 'latitude_min', 'longitude_min',
       'lat_colony_min', 'lon_colony_min', 'km_to_colony_min',
       'km_since_last_measure_min', 'delta_km_north_min', 'delta_km_south_min',
       'delta_km_east_min', 'delta_km_west_min',
       'minutes_since_last_measure_min', 'latitude_max', 'longitude_max',
       'lat_colony_max', 'lon_colony_max', 'km_to_colony_max',
       'km_since_last_measure_max', 'delta_km_nort

In [40]:
df['date_gmt'] = pd.to_datetime(df['date_gmt'])
df = df.sort_values(by='date_gmt')

# Feature Engineering
df['day_of_year'] = df['date_gmt'].dt.dayofyear
df['week_of_year'] = df['date_gmt'].dt.isocalendar().week
df['month'] = df['date_gmt'].dt.month

# Create lag features for 'tp' and 't2m'
for lag in range(1, 4):
    df[f'tp_lag_{lag}'] = df['tp'].shift(lag)
    df[f't2m_lag_{lag}'] = df['t2m'].shift(lag)

# Cyclical features for temporal variables
df['day_of_year_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365.0)
df['day_of_year_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365.0)
df['week_of_year_sin'] = np.sin(2 * np.pi * df['week_of_year'] / 52.0)
df['week_of_year_cos'] = np.cos(2 * np.pi * df['week_of_year'] / 52.0)

# Add lag features for additional variables
for lag in range(1, 4):
    df[f'sst_lag_{lag}'] = df['sst'].shift(lag)
    df[f'siconc_lag_{lag}'] = df['siconc'].shift(lag)
    df[f'sd_lag_{lag}'] = df['sd'].shift(lag)
    df[f'rsn_lag_{lag}'] = df['rsn'].shift(lag)
    df[f'avg_smr_lag_{lag}'] = df['avg_smr'].shift(lag)

# Rolling averages and cumulative sums for additional variables
df['sst_rolling_7'] = df['sst'].rolling(window=7, min_periods=1).mean()
df['siconc_rolling_7'] = df['siconc'].rolling(window=7, min_periods=1).mean()
df['sd_cumsum'] = df['sd'].cumsum()
df['rsn_cumsum'] = df['rsn'].cumsum()

# Drop rows with missing values
df = df.dropna()

# Define feature set and target variable
features = [
    'tp', 't2m', 'day_of_year_sin', 'day_of_year_cos',
    'week_of_year_sin', 'week_of_year_cos', 'sst', 'siconc', 
    'tp_lag_1', 'tp_lag_2', 'tp_lag_3',
    't2m_lag_1', 't2m_lag_2', 't2m_lag_3', 'sst_lag_1', 'sst_lag_2', 
    'sst_lag_3', 'siconc_lag_1', 'siconc_lag_2', 'siconc_lag_3'
]
target = 'km_to_colony_mean'

X = df[features]
y = df[target]

# Standardize features
scaler = StandardScaler()
scaled_features = ['sst', 'siconc', 'tp', 't2m']
#X[scaled_features] = scaler.fit_transform(X[scaled_features])
#X_scaled = scaler.fit_transform(X)

# Ridge and Lasso models
ridge_model = Ridge(alpha=1.0)
lasso_model = Lasso(alpha=0.1)

# TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

# Lists to store evaluation metrics
ridge_r2_scores = []
ridge_rmse_scores = []
lasso_r2_scores = []
lasso_rmse_scores = []

# Model training and evaluation
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Ridge Regression
    ridge_model.fit(X_train, y_train)
    ridge_y_pred = ridge_model.predict(X_test)
    ridge_r2_scores.append(r2_score(y_test, ridge_y_pred))
    ridge_rmse_scores.append(np.sqrt(mean_squared_error(y_test, ridge_y_pred)))

    # Lasso Regression
    lasso_model.fit(X_train, y_train)
    lasso_y_pred = lasso_model.predict(X_test)
    lasso_r2_scores.append(r2_score(y_test, lasso_y_pred))
    lasso_rmse_scores.append(np.sqrt(mean_squared_error(y_test, lasso_y_pred)))

# Print results
print(f"Ridge Average R^2 Score: {np.mean(ridge_r2_scores):.3f}")
print(f"Ridge Average RMSE: {np.mean(ridge_rmse_scores):.3f}")
print(f"Lasso Average R^2 Score: {np.mean(lasso_r2_scores):.3f}")
print(f"Lasso Average RMSE: {np.mean(lasso_rmse_scores):.3f}")

# Save and print model coefficients
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Ridge Coef': ridge_model.coef_,
    'Lasso Coef': lasso_model.coef_
})
#print(coefficients)

coefficients['Ridge Importance'] = coefficients['Ridge Coef'].abs()
coefficients['Lasso Importance'] = coefficients['Lasso Coef'].abs()

# Sort by importance (descending order)
ridge_sorted = coefficients.sort_values(by='Ridge Importance', ascending=False)
lasso_sorted = coefficients.sort_values(by='Lasso Importance', ascending=False)

# Print sorted coefficients
print("Ridge Coefficients Sorted by Importance:")
print(ridge_sorted[['Feature', 'Ridge Coef', 'Ridge Importance']])

print("\nLasso Coefficients Sorted by Importance:")
print(lasso_sorted[['Feature', 'Lasso Coef', 'Lasso Importance']])

Ridge Average R^2 Score: 0.514
Ridge Average RMSE: 71.107
Lasso Average R^2 Score: 0.520
Lasso Average RMSE: 70.840
Ridge Coefficients Sorted by Importance:
             Feature  Ridge Coef  Ridge Importance
7             siconc  184.139353        184.139353
3    day_of_year_cos  136.326006        136.326006
4   week_of_year_sin   89.486006         89.486006
6                sst  -67.053640         67.053640
5   week_of_year_cos   57.084609         57.084609
1                t2m  -36.731634         36.731634
19      siconc_lag_3   26.205986         26.205986
18      siconc_lag_2   23.283406         23.283406
13         t2m_lag_3   19.167972         19.167972
11         t2m_lag_1   16.383206         16.383206
10          tp_lag_3  -13.406884         13.406884
9           tp_lag_2  -12.478428         12.478428
12         t2m_lag_2   11.234550         11.234550
14         sst_lag_1  -10.912839         10.912839
16         sst_lag_3  -10.315583         10.315583
8           tp_lag_1   -9.3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_gmt'] = pd.to_datetime(df['date_gmt'])


In [41]:
rf_model = RandomForestRegressor(
    n_estimators=200,  # Number of trees
    max_depth=20,      # Maximum depth of trees
    random_state=42    # Reproducibility
)

# To store evaluation metrics
rf_r2_scores = []
rf_rmse_scores = []

for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train Random Forest
    rf_model.fit(X_train, y_train)
    rf_y_pred = rf_model.predict(X_test)
    
    # Evaluate Random Forest
    rf_r2_scores.append(r2_score(y_test, rf_y_pred))
    rf_rmse_scores.append(np.sqrt(mean_squared_error(y_test, rf_y_pred)))

# Print Random Forest results
print(f"Random Forest Average R^2 Score: {np.mean(rf_r2_scores):.3f}")
print(f"Random Forest Average RMSE: {np.mean(rf_rmse_scores):.3f}")

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': rf_model.feature_importances_,
}).sort_values(by='Importance', ascending=False)
print(feature_importance)


Random Forest Average R^2 Score: 0.584
Random Forest Average RMSE: 65.839
             Feature  Importance
7             siconc    0.808389
2    day_of_year_sin    0.109237
6                sst    0.028842
3    day_of_year_cos    0.010010
1                t2m    0.008970
0                 tp    0.005244
14         sst_lag_1    0.003037
15         sst_lag_2    0.002878
12         t2m_lag_2    0.002564
11         t2m_lag_1    0.002464
10          tp_lag_3    0.002368
8           tp_lag_1    0.002163
16         sst_lag_3    0.002150
4   week_of_year_sin    0.001980
17      siconc_lag_1    0.001951
13         t2m_lag_3    0.001867
9           tp_lag_2    0.001839
19      siconc_lag_3    0.001676
18      siconc_lag_2    0.001662
5   week_of_year_cos    0.000709


In [24]:
xgb_model = XGBRegressor(
    n_estimators=100,  # Number of trees
    max_depth=6,       # Maximum depth of each tree
    learning_rate=0.1, # Learning rate
    random_state=42    # Seed for reproducibility
)

# To store evaluation metrics
xgb_r2_scores = []
xgb_rmse_scores = []

# TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train XGBoost model
    xgb_model.fit(X_train, y_train)
    
    # Predictions
    y_pred = xgb_model.predict(X_test)
    
    # Evaluate model
    xgb_r2_scores.append(r2_score(y_test, y_pred))
    xgb_rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))

# Print XGBoost results
print(f"XGBoost Average R^2 Score: {np.mean(xgb_r2_scores):.3f}")
print(f"XGBoost Average RMSE: {np.mean(xgb_rmse_scores):.3f}")

XGBoost Average R^2 Score: 0.067
XGBoost Average RMSE: 198.498
