In [1]:
#import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold, cross_val_score
import shap
import glob
import matplotlib.patches as mpatches
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor, StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import statistics as stats
from sklearn.preprocessing import MinMaxScaler

In [2]:
#load data
df = pd.read_csv('data/df_shadow_econ.csv')

In [7]:
#set models
level0 = list()
level0.append(('RF', RandomForestRegressor(bootstrap=False, criterion='poisson', max_depth=None, max_features='sqrt', 
                                           min_samples_leaf=1, min_samples_split=2, n_estimators=300)))
level0.append(('GB', GradientBoostingRegressor()))
level1 = LinearRegression(copy_X=True, fit_intercept=True)
models = [
    LinearRegression(copy_X=True, fit_intercept=True),
    Lasso(alpha=1e-05, copy_X=True, fit_intercept=True, max_iter=1000),
    Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000),
    ElasticNet(alpha=0.001, fit_intercept=True, l1_ratio=0.75, max_iter=1000, tol=0.0001),
    RandomForestRegressor(bootstrap=False, criterion='poisson', max_depth=None, max_features='sqrt', min_samples_leaf=1, 
                          min_samples_split=2, n_estimators=300),
    SVR(C=10.0, epsilon=0.1, gamma=10.0, kernel='rbf'),
    XGBRegressor(colsample_bytree=0.8, gamma=0.2, learning_rate=0.1, max_depth=5, n_estimators=300, subsample=1.0),
    LGBMRegressor(learning_rate=0.1, max_depth=10, n_estimators=300, num_leaves=30,verbosity=-1),
    CatBoostRegressor(depth=10, iterations=300, learning_rate=0.1, silent=True),
    BaggingRegressor(bootstrap=False, max_features=0.9, max_samples=0.9, n_estimators=100),
    StackingRegressor(estimators=level0, final_estimator=level1),
    ExtraTreesRegressor(bootstrap=False, max_features=0.35, min_samples_leaf=1, min_samples_split=2, n_estimators=100),
]

models_names = ['LR','Lasso','Ridge','ElasticNet','RF','SVR','XB','LGBM','CatBoost','Bagging','Stacking','ETR']

In [8]:
#prepare data
df_train, df_test = train_test_split(df, random_state=0, test_size=0.2)
X = df_train.copy()
X = X.drop(['Shadow Economy'], axis=1)
y = df_train['Shadow Economy']
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)

In [9]:
#scaling of independent variables
scaler = MinMaxScaler()
scaler.fit(X)
X = pd.DataFrame(scaler.transform(X))

In [10]:
%%capture
metrics = pd.DataFrame(columns=['Model','Metric','Mean','Std. dev.'])
for k in range (0,len(models)):
    model = models[k]
    mape = list()
    adjr2 = list()
    rmse = list()
    for train_index, test_index in cv.split(X):
        train_x, test_x = X.iloc[train_index], X.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]
        model.fit(train_x, train_y)
        pred_y = model.predict(test_x)
        mape.append(mean_absolute_percentage_error(test_y, pred_y))
        g = mean_squared_error(test_y, pred_y)
        rmse.append(g**0.5)
        adj_r2 = 1 - (1-r2_score(test_y, pred_y)) * (len(test_y)-1)/(len(test_y)-test_x.shape[1]-1)
        adjr2.append(adj_r2)
    row = pd.DataFrame([[models_names[k],'MAPE', stats.mean(mape), stats.stdev(mape)]], columns=metrics.columns)
    metrics = pd.concat([metrics, row], ignore_index=True)
    row = pd.DataFrame([[models_names[k],'RMSE', stats.mean(rmse), stats.stdev(rmse)]], columns=metrics.columns)
    metrics = pd.concat([metrics, row], ignore_index=True)
    row = pd.DataFrame([[models_names[k],'adjR2', stats.mean(adjr2), stats.stdev(adjr2)]], columns=metrics.columns)
    metrics = pd.concat([metrics, row], ignore_index=True)

In [12]:
#save data
metrics.to_csv('data/metrics.csv')

In [13]:
%%capture
#performance - oos dataset
X_test = df_test.copy()
X_test = X_test.drop(['Shadow Economy'], axis=1)
y_test = df_test['Shadow Economy']
scaler.fit(X_test)
X_test = pd.DataFrame(scaler.transform(X_test))
metrics_oos = pd.DataFrame(columns=['Model','Metric','Value'])
for k in range (0,len(models)):
    model = models[k]
    model.fit(X, y)
    pred_y = model.predict(X_test)
    mape = mean_absolute_percentage_error(y_test, pred_y)
    rmse = mean_squared_error(y_test, pred_y)
    adj_r2 = 1 - (1-r2_score(y_test, pred_y)) * (len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)
    
    row = pd.DataFrame([[models_names[k],'MAPE', mape]], columns=metrics_oos.columns)
    metrics_oos = pd.concat([metrics_oos, row], ignore_index=True)
    
    row = pd.DataFrame([[models_names[k],'RMSE', rmse**0.5]], columns=metrics_oos.columns)
    metrics_oos = pd.concat([metrics_oos, row], ignore_index=True)
    
    row = pd.DataFrame([[models_names[k],'adjR2', adj_r2]], columns=metrics_oos.columns)
    metrics_oos = pd.concat([metrics_oos, row], ignore_index=True)

In [15]:
#save data
metrics_oos.to_csv('data/metrics_oos.csv')

In [9]:
df = pd.read_csv('data/metrics_oos.csv')
del df['Unnamed: 0']
df = df.loc[df['Metric']=='adjR2']
df.round(2)

Unnamed: 0,Model,Metric,Value
2,LR,adjR2,0.46
5,Lasso,adjR2,0.46
8,Ridge,adjR2,0.47
11,ElasticNet,adjR2,0.48
14,RF,adjR2,0.86
17,SVR,adjR2,0.78
20,XB,adjR2,0.85
23,LGBM,adjR2,0.87
26,CatBoost,adjR2,0.87
29,Bagging,adjR2,0.85
