In [2]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Path of the CSV file the notebook loads (relative to the working directory).
DF_MAIN = 'data/df_main_mod_noretires.csv'

# Raise pandas' display limits so wide/long frames are shown with up to
# 60 columns and 160 rows before being truncated.
pd.options.display.max_columns = 60
pd.options.display.max_rows = 160

# Machine learning
The model used is `RandomForestRegressor` from the scikit-learn (`sklearn`) package. Training is done with 5-fold cross-validation (the default number of folds in scikit-learn when `cv` is not specified).<br>

## Data preparation

We drop the identifier columns (region, subregion and powiat IDs, and the year) and separate the target from the features.

In [3]:
# Columns that are dropped before modelling (region / subregion / powiat IDs and year).
id_columns = ['ID_region', 'ID_subregion', 'ID_powiatu', 'year']
# Column used as the regression target.
target_column = 'p3350-stopień-wykorzystania-miejsc-noclegowych-w-kolejnym-roku'

# Load the dataset and discard the identifier columns in one step.
df = pd.read_csv(DF_MAIN).drop(columns=id_columns)

# Split into the target series and an independent copy of the feature frame.
y = df[target_column]
X = df.drop(columns=[target_column]).copy()

## Baseline run

We evaluate our regression model with the mean absolute error (MAE). scikit-learn reports it negated (`neg_mean_absolute_error`), so scores closer to zero are better. First we fit a model with default hyperparameters to obtain a baseline score that later models should improve upon.

In [4]:
# Baseline: a random forest with default hyperparameters and a fixed seed,
# scored with negated MAE (scikit-learn maximises scores, so closer to 0 is better).
model = RandomForestRegressor(random_state=0)
# cv is given explicitly: the library default was 3 folds before scikit-learn
# 0.22 and 5 since, so relying on it makes the score version-dependent.
model_base_scores = cross_val_score(model, X, y,
                                    scoring='neg_mean_absolute_error',
                                    cv=5)

In [5]:
# Average of the per-fold baseline scores. The scorer is negated MAE, so the
# value is <= 0 and closer to zero means a better model.
model_base_scores.mean()

-6.774514246575343

## Feature selection and validation

We might want to get rid of highly correlated features for the sake of performance. We'll use principal component analysis (PCA), which is known to work better when fed with normalised data; we obtain that with the help of the `StandardScaler` object. We introduce a pipeline to make things more convenient and perform a grid search for the optimal number of components (and the maximum tree depth).

In [12]:
# Scale -> PCA -> random forest, wrapped in a single pipeline so that the
# scaler and PCA are re-fitted on the training part of every CV fold.
# with_mean=False skips centring in the scaler; PCA centres its input itself.
# random_state is fixed (same seed as the baseline model) so the search is
# reproducible and differences between candidates are not just seed noise.
pipe = Pipeline([('normalisation', StandardScaler(with_mean=False)),
                 ('pca', PCA(random_state=0)),
                 ('rfr', RandomForestRegressor(random_state=0))])

# Search space: 16 component counts (24..39) x 19 tree depths (1..19),
# with the forest size held at 150 trees.
grid = {'pca__n_components'      : range(24, 40, 1),
        'rfr__n_estimators'      : [150],
        'rfr__max_depth'         : range(1, 20, 1)}

# cv=5 pins the fold count instead of relying on the version-dependent default;
# n_jobs=-1 spreads the 304 candidates x 5 folds over all available cores
# without changing the results.
best_fit_model = GridSearchCV(pipe,
                              param_grid=grid,
                              scoring='neg_mean_absolute_error',
                              cv=5,
                              n_jobs=-1)

In [None]:
# Run the grid search: every combination in `grid` (16 x 19 = 304 candidates)
# is cross-validated on X, y, so this cell is potentially slow.
best_fit_model.fit(X, y)

In [11]:
# Report the best parameter combination found by the search and its
# cross-validated score (negated MAE).
best_params = best_fit_model.best_params_
best_score = best_fit_model.best_score_
print(best_params, '\n', best_score)

{'pca__n_components': 26, 'rfr__max_depth': 3, 'rfr__n_estimators': 150} 
 -7.118056326951219
