In [3]:
import sys
sys.path.append('..')

import yaml
from baseline.utilities import *

# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

TARGET = 'UHI Index'

train_new = pd.read_parquet('../pipeline/data/processed/train/train_data.parquet')

# Wind-derived columns that are left out of the feature list. For each of the
# `bronx` / `manhattan` suffixes this covers:
#   - `bearing_<suffix>`
#   - `wind_influence_<HH:MM:SS>_<suffix>` for every 5-minute timestamp from
#     15:00:00 to 16:00:00 (inclusive)
#   - `pct_change_wind_influence_<HH:MM:SS>_<suffix>` for every 5-minute
#     timestamp from 15:05:00 to 16:00:00 (inclusive)
_suffixes = ("bronx", "manhattan")
_timestamps = [f"{15 + minutes // 60}:{minutes % 60:02d}:00" for minutes in range(0, 65, 5)]
excluded_features = (
    {f"bearing_{suffix}" for suffix in _suffixes}
    | {f"wind_influence_{ts}_{suffix}" for suffix in _suffixes for ts in _timestamps}
    | {f"pct_change_wind_influence_{ts}_{suffix}" for suffix in _suffixes for ts in _timestamps[1:]}
)

# Read the configured feature names; the context manager guarantees the file
# handle is closed even if parsing fails.
with open('../pipeline/data/columns.yml', 'r') as columns_file:
    feature_list = [c for c in yaml.safe_load(columns_file)['features'] if c not in excluded_features]

# Remove coordinate/time/target columns, then restrict to the selected features.
X = train_new.drop(columns=['Longitude', 'Latitude', 'datetime', TARGET])[feature_list]
y = train_new[TARGET]

print(f"Original column length -> {len(X.columns)=}")

Original column length -> len(X.columns)=1030


In [4]:
# -----------------------------------------------------------------------------
# Drop columns whose absolute correlation with the target is below the threshold
# -----------------------------------------------------------------------------

# Correlation matrix over the candidate features plus the target; only the
# TARGET column is kept afterwards, ordered from the largest coefficient down.
correlations = train_new[[*feature_list, TARGET]].corr()
correlations = correlations.loc[:, TARGET].sort_values(ascending=False)

# Retain every entry whose absolute correlation reaches the threshold. The
# target correlates perfectly with itself, so it is part of `columns_to_keep`
# and is removed again when building X.
threshold = 0.05
columns_to_keep = correlations.loc[correlations.abs() >= threshold].index

y = train_new[TARGET]
X = train_new.loc[:, columns_to_keep].drop(columns=TARGET)
print(f"After correlation threshold removal -> {len(X.columns)=}")

After correlation threshold removal -> len(X.columns)=950


In [5]:
# -----------------------------------------------------------------------------
# Collinearity problem
# -----------------------------------------------------------------------------
from scipy.stats import pearsonr
import numpy as np

# Absolute pairwise correlation between the remaining features.
corr_matrix = X.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal); everything
# else becomes NaN. A column of `upper` therefore only holds its correlations
# with the columns that come *before* it in X.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# A feature is dropped when it is correlated above this level with any earlier
# feature; the earlier feature of each such pair is the one that is kept.
COLLINEARITY_THRESHOLD = 0.9
is_collinear = (upper > COLLINEARITY_THRESHOLD).any(axis=0)  # NaN compares as False
to_drop = [column for column, flagged in is_collinear.items() if flagged]
print(f"{to_drop=}")

X = X.drop(columns=to_drop)
print(f"After collinearity removal -> {len(X.columns)=}")
for col in X.columns:
    print(f"- {col}")

to_drop=['sntnl_buffer_band_5_900_std', 'sntnl_buffer_band_7_600_mean', 'sntnl_buffer_band_6_700_mean', 'sntnl_buffer_band_5_800_std', 'sntnl_buffer_band_6_800_mean', 'sntnl_buffer_band_6_900_mean', 'lndst_mean_lwir11_350m', 'lndst_mean_lwir11_300m', 'lndst_mean_lwir11_250m', 'lndst_mean_lwir11_400m', 'lndst_mean_lwir11_450m', 'lndst_mean_lwir11_500m', 'sntnl_buffer_band_6_1000_mean', 'lndst_mean_lwir11_600m', 'lndst_mean_lwir11_200m', 'sntnl_buffer_band_7_700_mean', 'lndst_mean_lwir11_700m', 'sntnl_buffer_band_5_700_std', 'sntnl_buffer_band_7_500_mean', 'sntnl_buffer_band_6_500_mean', 'sntnl_buffer_band_7_800_mean', 'lndst_mean_lwir11_150m', 'lndst_mean_lwir11_800m', 'sntnl_buffer_band_5_600_std', 'sntnl_buffer_band_2_1000_mean', 'sntnl_buffer_band_7_900_mean', 'sntnl_buffer_band_2_900_mean', 'sntnl_buffer_band_7_450_mean', 'sntnl_buffer_band_2_800_mean', 'sntnl_buffer_band_6_450_mean', 'sntnl_buffer_band_2_700_mean', 'sntnl_buffer_band_5_1000_mean', 'sntnl_buffer_band_2_600_mean', 'l

In [6]:
# -----------------------------------------------------------------------------
# RFECV with an ExtraTrees estimator
# -----------------------------------------------------------------------------

from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

# Recursive feature elimination with cross-validation: two features are removed
# per step and each candidate subset is scored by R2 over a shuffled 10-fold split.
# Alternative estimator: RandomForestRegressor(oob_score=True, random_state=SEED)
rfe_fs = RFECV(estimator=ExtraTreesRegressor(random_state=SEED),
               cv=KFold(n_splits=10, shuffle=True, random_state=SEED),
               scoring='r2', step=2, n_jobs=-1, verbose=1)
X_selected = rfe_fs.fit_transform(X, y)

print(f"{rfe_fs.ranking_=}")

# Diagnostics are best-effort: a failure here must not prevent the selected
# features from being saved below, but it is reported instead of being
# silently discarded. `except Exception` lets KeyboardInterrupt/SystemExit
# propagate, unlike a bare `except:`.
try:
    print(f"{rfe_fs.cv_results_.keys()=}")
    cv_results = rfe_fs.cv_results_
    idxmx = np.argmax(cv_results['mean_test_score'])
    print(f" -> {cv_results['mean_test_score'][idxmx]} +/- {cv_results['std_test_score'][idxmx]}")
    print(f"Best number of features: {cv_results['n_features'][idxmx]}")

    # The first two entries are left out of the plot.
    # NOTE(review): presumably to keep the y-axis readable — confirm.
    plt.figure()
    plt.errorbar(
        x=cv_results["n_features"][2:],
        y=cv_results["mean_test_score"][2:],
        yerr=cv_results["std_test_score"][2:],
    )
    plt.xlabel('Number of Features Selected')
    plt.ylabel('Cross-Validation Score (R2)')
    plt.title('RFECV - Number of Features vs. Cross-Validation Score')
    plt.show()
except Exception as exc:
    print(f"Skipping RFECV diagnostics: {exc!r}")

# Same selection as X_selected, but as a DataFrame that keeps the column names.
X_rfe = X.loc[:, rfe_fs.support_]

print(X_selected.var(axis=0))
print(X_rfe.columns)

print(f"After RFECV Feature Selection-> {X_selected.shape[1]=}")
X_rfe.to_parquet('../pipeline/data/processed/train/X_selected.parquet')
pd.DataFrame(y).to_parquet('../pipeline/data/processed/train/y_selected.parquet')
print("Saved selected features to parquet files")

Fitting estimator with 142 features.
Fitting estimator with 142 features.
Fitting estimator with 142 features.
Fitting estimator with 142 features.
Fitting estimator with 142 features.
Fitting estimator with 142 features.
Fitting estimator with 142 features.
Fitting estimator with 142 features.
Fitting estimator with 142 features.
Fitting estimator with 142 features.


KeyboardInterrupt: 