In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression

from scipy.stats import spearmanr, pearsonr

from project_module import regression_report
from project_module.feature_selection import SelectKBestByCoefficient

In [2]:
# load data
x_train = np.load('x_train.npy')
y_train = np.load('y_train.npy')
x_test = np.load('x_test.npy')
y_test = np.load('y_test.npy')

print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(1095, 89) (1095,) (365, 89) (365,)


In [3]:
def get_RF(params: dict) -> RandomForestRegressor:
    RF = RandomForestRegressor(**params)
    return RF

best_params = {
    'n_estimators': 68, 'min_samples_split': 2, 
    'min_impurity_decrease': 0.8304579924331754, 
    'max_depth': 16, 'criterion': 'mae', 
    'ccp_alpha': 2.3855551842621, 'n_jobs' : 4, 
    'warm_start': False
}

In [4]:
RF = get_RF(best_params)
RF.fit(x_train, y_train)

pred = RF.predict(x_test)
regression_report(y_test, pred)

mse = 1019625970.1458
mae = 17843.8148
rmse = 31931.5826
mape = 0.1015


## [Filter method](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html#sklearn.feature_selection.SelectKBest)
1. [Pearson coefficient](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html)
2. [Spearman coefficient](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html)

In [5]:
# pearson coefficient
pearson_selector = SelectKBestByCoefficient(k = 30, method = 'pearson').fit(x_train, y_train)
x_train_new = pearson_selector.transform(x_train)
x_test_new = pearson_selector.transform(x_test)

RF = get_RF(best_params)
RF.fit(x_train_new, y_train)

pred = RF.predict(x_test_new)

regression_report(y_test, pred)

mse = 995214990.7529
mae = 18384.7244
rmse = 31547.0282
mape = 0.1038


In [6]:
# Spearman coefficient
spearman_selector = SelectKBestByCoefficient(k = 30, method = 'spearman').fit(x_train, y_train)
x_train_new = spearman_selector.transform(x_train)
x_test_new = spearman_selector.transform(x_test)

RF = get_RF(best_params)
RF.fit(x_train_new, y_train)

pred = RF.predict(x_test_new)

regression_report(y_test, pred)

mse = 1055096706.8959
mae = 18188.4794
rmse = 32482.2522
mape = 0.1037


In [7]:
# Anova for regression
anova_selector = SelectKBest(score_func = f_regression, k = 30).fit(x_train, y_train)
x_train_new = anova_selector.transform(x_train)
x_test_new = anova_selector.transform(x_test)

RF = get_RF(best_params)
RF.fit(x_train_new, y_train)

pred = RF.predict(x_test_new)

regression_report(y_test, pred)

mse = 1286149684.8210
mae = 18814.8145
rmse = 35862.9291
mape = 0.1036
