In [None]:
import time
import sys
sys.path.insert(0, '../../../src/helper')


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

import zbp_visualizer

In [None]:
# Seed handed to stochastic estimators (as random_state) so repeated runs give the same model.
SEED = 42

# DATA INGESTION

In [None]:
file_path = '../../../src/data/temp/lagged_zbp_totals_with_features.csv'
data = pd.read_csv(file_path)

# Fail fast if the extract is missing a column the rest of the notebook relies on:
# 'year' drives the CV folds, 'zip' is one-hot encoded, 'est' is the target and the
# '*_nf' columns are dropped from the feature set by name.
REQUIRED_COLUMNS = ['zip', 'year', 'est', 'emp_nf', 'qp1_nf', 'ap_nf']
missing_columns = [col for col in REQUIRED_COLUMNS if col not in data.columns]
assert not missing_columns, f'{file_path} is missing expected columns: {missing_columns}'

data.head()

# CUSTOM TSCV

In [None]:
class CustomTimeSeriesSplit:
    """Expanding-window cross-validator keyed on the ``year`` column of ``X``.

    One fold is produced for every distinct year after the earliest one: the
    test set is all rows of that year and the training set is all rows from
    strictly earlier years.

    Parameters
    ----------
    n_splits : int or None, default=None
        Stored on the instance only; it does not limit the number of folds.
        ``split`` overwrites it with the fold count derived from the data.
    """

    def __init__(self, n_splits=None):
        self.n_splits = n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train, test)`` arrays of positional row indices, one pair per fold.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``year`` column.
            NOTE(review): assumes ``year`` has no missing values — confirm.
        y, groups : ignored
            Accepted for compatibility with the scikit-learn splitter signature.

        Yields
        ------
        train, test : numpy.ndarray of int
            Row *positions* (usable with ``.iloc`` / array indexing), in
            ascending order of the test year.
        """
        years = X['year'].to_numpy()
        year_range = np.sort(X['year'].unique())

        # Kept for callers that read the attribute after splitting.
        self.n_splits = max(len(year_range) - 1, 0)

        for test_year in year_range[1:]:
            # Positions, not index labels: scikit-learn indexes X and y
            # positionally, so labels are only correct for a default RangeIndex.
            train = np.flatnonzero(years < test_year)
            test = np.flatnonzero(years == test_year)

            yield train, test

    def get_n_splits(self, X, y=None, groups=None):
        """Return the number of folds ``split`` yields for ``X``.

        That is the number of distinct years minus one, and 0 when ``X`` has
        no rows.
        """
        year_range = np.sort(X['year'].unique())

        return max(len(year_range) - 1, 0)

# TRAIN-TEST SPLIT

In [None]:
# Use the last fold produced by the splitter as the hold-out split:
# its test rows become data_test, its training rows become data_train.
tscv = CustomTimeSeriesSplit()
split = [fold for fold in tscv.split(data, None)]
train_indicies, test_indicies = split[-1]

# Re-index both partitions from zero so later positional lookups line up.
data_train, data_test = (
    data.iloc[rows].reset_index(drop=True)
    for rows in (train_indicies, test_indicies)
)


# MODEL

In [None]:
# One-hot encode 'zip' (categories not seen during fit are ignored at transform time);
# every other column is passed through unchanged to the random forest.
preproc = ColumnTransformer(
    transformers=[
        ('onehots', OneHotEncoder(handle_unknown='ignore'), ['zip']),
    ],
    remainder='passthrough',
)
pl = Pipeline(
    steps=[
        ('preproc', preproc),
        ('reg', RandomForestRegressor(random_state=SEED, n_jobs=-1)),
    ]
)

In [None]:
%%time

# Predictors: every column except the target ('est') and the three '*_nf' columns.
included_feats = data.columns.drop(['est', 'emp_nf', 'qp1_nf', 'ap_nf'])
# Alternative hand-picked subset, kept for reference:
# included_feats = ['zip', 'year', 'naics_62_pct', 'total_retirement', 'total_population', 'naics_81_pct', 'ap', 'n1_4_pct', 'naics_23_pct', 'naics_54_pct']

X_train, y_train = data_train[included_feats], data_train['est']

# Single-candidate grid for the 'reg' step of the pipeline.
param_grid = dict(reg__n_estimators=[50], reg__max_depth=[50])

# Folds come from the year-based splitter; candidates are ranked by negative RMSE.
grid_search = GridSearchCV(
    estimator=pl,
    param_grid=param_grid,
    cv=CustomTimeSeriesSplit(),
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

display(grid_search.best_params_, grid_search.best_score_)

# TESTING

In [None]:
# Hold-out predictors and target, using the same feature columns as training.
X_test, y_test = data_test[included_feats], data_test['est']

In [None]:
# Pipeline refit by the grid search with its best parameters.
reg = grid_search.best_estimator_

# Show RMSE next to MSE and label each result: the grid search is scored with
# 'neg_root_mean_squared_error', so RMSE is the figure comparable to best_score_.
y_preds_train = reg.predict(X_train)
mse_train = mean_squared_error(y_train, y_preds_train)
display({'split': 'train', 'mse': mse_train, 'rmse': np.sqrt(mse_train)})

# `y_preds` still ends the cell holding the hold-out predictions, as before.
y_preds = reg.predict(X_test)
mse_test = mean_squared_error(y_test, y_preds)
display({'split': 'test', 'mse': mse_test, 'rmse': np.sqrt(mse_test)})

# FEATURE IMPORTANCE - BY MEAN DECREASE IN IMPURITY

In [None]:
top_x = 11  # slice end used below; with the slice starting at 1 this keeps 10 features

# Take the encoded column names from the preprocessor fitted inside the best
# pipeline, so they are guaranteed to line up with reg['reg'].feature_importances_
# and no separate transformer has to be refitted just to recover names.
feature_names = reg['preproc'].get_feature_names_out()
# Strip the ColumnTransformer step prefixes so labels show the bare column names.
feature_names = np.char.replace(feature_names.astype('str'), 'onehots__','')
feature_names = np.char.replace(feature_names, 'remainder__','')


importances = reg['reg'].feature_importances_
# Per-feature spread of the importance across the individual trees.
# NOTE(review): computed but never plotted — pass it as `xerr` or drop it.
std = np.std([tree.feature_importances_ for tree in reg['reg'].estimators_], axis=0)

# Rank descending, keep positions 1..top_x-1, then re-sort ascending so the
# largest bar is drawn at the top of the horizontal bar chart.
# NOTE(review): the slice starts at 1, so the single most important feature is
# left out of the chart — presumably deliberate; confirm.
forest_importances = pd.Series(importances, index=feature_names).sort_values(ascending=False)[1:top_x].sort_values(ascending=True)
display(forest_importances.index.to_numpy())

fig, ax = plt.subplots()
forest_importances.plot(kind='barh', ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_xlabel("Mean decrease in impurity");

# VISUALIZE PREDICTIONS

In [None]:
# NOTE: THE VISUALIZATION CODE BELOW IS NON-FUNCTIONAL FROM THE NOTEBOOK; IT ONLY WORKS WHEN RUN FROM RUN.PY.
# TODO: ADJUST THE VISUALIZER METHOD TO TAKE AN OUTPUT PATH SO THAT IT CAN FUNCTION HERE.

# preds = X_test.copy()
# preds['est_pred'] = pl.predict(X_test)
# last_year = preds['year'].max()
# preds_last_year = preds[preds['year'] == last_year][['zip','est_pred']]
# zbp_visualizer.generate_zbp_chloropleth(preds_last_year, 'zip', 'est_pred', f'rf_reg_{last_year}_preds')