In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import os 

from data_cleanup import *
from feature_selection import *
from model_ import *

from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_squared_error as rmse
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, LassoLarsCV
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Corruption indices that serve as prediction targets throughout the notebook.
corr_cols = [
    'ti_cpi',
    'bci_bci',
    'ti_cpi_om',
    'wbgi_cce',
]

# Identifier / bookkeeping columns; they are handed to the split and model
# helpers as `meta_cols` (presumably so they are excluded from the features
# -- TODO confirm in data_cleanup / model_).
meta_cols = [
    'ccode', 'ccode_qog', 'ccodealp', 'ccodealp_year', 'ccodecow',
    'cname', 'cname_qog', 'cname_year',
    'version', 'year', 'region', 'sub-region',
]

df = load_reduced_df(corr_cols)

In [None]:
# Sanity check: show the shape of the frame returned by load_reduced_df.
display(df.shape)

### Feature selection

In [None]:
# Run the project helper that drops date columns before selecting features.
# NOTE: this rebinds `df`; re-running the cell applies the helper again.
df = drop_date_columns(df)

# Per-target results:
#   best_features_dict     -> candidates returned by pre_select
#   selected_features_dict -> candidates left after multicollinearity elimination
best_features_dict, selected_features_dict = dict(), dict()

for target_col in corr_cols:
    X_train, X_test, y_train, y_test = create_traintestsplit(
        df,
        corr_cols=corr_cols,
        meta_cols=meta_cols,
        target_col=target_col,
    )

    # Stage 1: pre-selection sees the training split only.
    best_features = pre_select(X_train, y_train)
    best_features_dict[target_col] = set(best_features)

    # Stage 2: the eliminator receives the candidate columns plus the target.
    # 0.85 is presumably the correlation threshold -- TODO confirm in
    # feature_selection.MultiCollinearityEliminator.
    df_train = X_train[best_features].copy()
    df_train[target_col] = y_train
    mce = MultiCollinearityEliminator(df_train, target_col, 0.85)
    df_reduced = mce.autoEliminateMulticollinearity()

    # The returned frame still contains the target column; strip it so only
    # feature names are stored.
    feaures_no_collinearity = list(df_reduced.columns)
    feaures_no_collinearity.remove(target_col)
    selected_features_dict[target_col] = set(feaures_no_collinearity)


#selected_features_dict

In [None]:
# Union / intersection of the pre-selected candidates over all target indices.
# sorted() pins the order: iterating a set of strings depends on Python's
# per-process hash randomisation, so list(set) would hand the models a
# different column order after every kernel restart.
_best_feature_sets = list(best_features_dict.values())
best_features_union = sorted(set.union(*_best_feature_sets))
best_features_intersection = sorted(set.intersection(*_best_feature_sets))

# Only the last expression of a cell is rendered, so show this one explicitly.
display(best_features_intersection)

# Missing-value count per candidate feature, worst first.
print(df[best_features_union].isna().sum().sort_values(ascending=False))


In [None]:
# Union / intersection of the features that survived multicollinearity
# elimination, over all target indices.
# sorted() pins the order: iterating a set of strings depends on Python's
# per-process hash randomisation, so list(set) would hand the models a
# different column order after every kernel restart.
_selected_feature_sets = list(selected_features_dict.values())
selected_features_union = sorted(set.union(*_selected_feature_sets))
selected_features_intersection = sorted(set.intersection(*_selected_feature_sets))

selected_features_intersection

## Modeling

Try Lasso and Random Forest next. Train models for different feature configurations 

- individually selected features for each particular index
- union of all good features across all indices
- intersection of the selected features across all indices

The r2 and rmse scores are reported. Comparisons are based on the r2 scores, because they are comparable across the different indices.
    

### Lasso
The Lasso implementation used here determines a good value for alpha via cross-validation.

The following script trains a Lasso model for each target index and then displays the r2 and rmse scores. (The feature-importance report inside the script is currently commented out.)

In [None]:
def lasso_info_script(features, name):
    """Fit a Lasso model per corruption index and display its r2 / rmse scores.

    Parameters
    ----------
    features : dict or list
        Either a mapping ``target -> collection of feature names`` (one
        feature set per index) or a single collection of feature names that
        is used for every index.
    name : str
        Label of the run. Only the disabled pickling code at the end of the
        function uses it.

    Notes
    -----
    Reads the notebook globals ``df``, ``corr_cols`` and ``meta_cols`` and
    delegates the fitting to ``apply_lassocv``. Nothing is returned; the score
    table is rendered with ``display``.
    """
    per_target_features = isinstance(features, dict)

    lasso_bf = {}
    df_score = pd.DataFrame(index=corr_cols, columns=['r2', 'rmse'])

    for target in corr_cols:
        # A dict carries one feature set per target; anything else is shared.
        feature_names = list(features[target]) if per_target_features else features
        outcome = apply_lassocv(df, target, feature_names, corr_cols, meta_cols, fprint=False)
        lasso_bf[target] = outcome
        df_score.loc[target,] = [outcome['r2'], outcome['rmse']]

    print('scores')
    display(df_score)

    # --- disabled: feature-importance report and pickling of the results ---
    # l_fi = [lasso_bf[target]['feat_importance'] for target in corr_cols]
    # df_fi = pd.concat(l_fi)

    # l_firk = [lasso_bf[target]['feat_importance_rank'] for target in corr_cols]
    # df_firk = pd.concat(l_firk)

    # print('feature importance')
    # display(df_fi)
    # df_fi.T.plot(kind='bar', figsize=(20,8))
    # print()

    # print('feature importance rank')
    # display(df_firk)
    # print()
    # file = os.path.join('pickle', name +'.obj')
    # f = open(file, 'wb')
    # pickle.dump(lasso_bf ,f)
    #f.close()


First we apply the script for the individually selected features for each corruption index.

In [None]:
# One individually selected feature set per corruption index.
lasso_info_script(
    features=selected_features_dict,
    name='lasso_selected_features_dict',
)

Next we use the same set of features for all corruption indices: the union of all features that were pre-selected as promising for any index.

In [None]:
# Same feature set for every index: union of all pre-selected candidates.
lasso_info_script(
    features=best_features_union,
    name='lasso_best_features_union',
)

In [None]:
# Inspect the dtypes of the candidate features and the distribution of the
# `br_mon` column. Only the last expression of a cell is rendered, so the
# dtypes need an explicit display() to show up at all.
display(df.dtypes[best_features_union])
df['br_mon'].describe()

Now we use only the features that are in all individually selected feature sets.

In [None]:
# Minimal feature set: features selected for every single index.
lasso_info_script(
    features=selected_features_intersection,
    name='lasso_selected_features_intersection',
)

Findings:

- wbgi_rle (Rule of Law) is by far the most important feature in almost all configurations
- most indices behave similarly across the three feature-set configurations, but
- ti_cpi differs the most: its score is very bad with the smallest feature set. Its most important feature is wbgi_pvs [Political Stability and Absence of Violence/Terrorism, Standard error] and not wbgi_rle
- vdem_jucorrdc is also affected more strongly by the choice of feature set, and its score is generally lower as well.
- all the other indices gain slightly from additional features, but they do not depend much on the chosen setup

### Random Forest

Next we do the same for a Random Forest Regressor. Here initially no cross validation is done. We just use a default setup at first.

In [None]:
def rf_info_script(features, name):
    """Fit a Random Forest per corruption index and display its r2 / rmse scores.

    Parameters
    ----------
    features : dict or list
        A mapping ``target -> collection of feature names`` selects a separate
        feature set per index; any other value is passed unchanged as the
        feature set for every index.
    name : str
        Label of the run. It is referenced only by the disabled pickling code
        at the bottom of the function.

    Notes
    -----
    Uses the notebook globals ``df``, ``corr_cols`` and ``meta_cols``; the
    model fitting itself happens in ``apply_rf``. The function returns nothing
    and renders the score table through ``display``.
    """
    rf_bf = {}
    score_rows = pd.DataFrame(index=corr_cols, columns=['r2', 'rmse'])
    features_by_target = isinstance(features, dict)

    for target in corr_cols:
        if features_by_target:
            chosen = list(features[target])
        else:
            chosen = features
        fitted = apply_rf(df, target, chosen, corr_cols, meta_cols, fprint=False)
        rf_bf[target] = fitted
        score_rows.loc[target,] = [fitted['r2'], fitted['rmse']]

    print('scores')
    display(score_rows)

    # --- disabled: feature-importance report and pickling of the results ---
    # l_fi = [rf_bf[target]['feat_importance'] for target in corr_cols]
    # df_fi = pd.concat(l_fi)

    # l_firk = [rf_bf[target]['feat_importance_rank'] for target in corr_cols]
    # df_firk = pd.concat(l_firk)

    # print('feature importance')
    # display(df_fi)

    # df_fi.T.plot(kind='bar', figsize=(20,8))
    # print()

    # print('feature importance rank')
    # display(df_firk)
    # print()
    # file = os.path.join('pickle', name +'.obj')
    # f = open(file, 'wb')
    # pickle.dump(rf_bf ,f)
    #f.close()



First we apply again the script for the individually selected features for each corruption index.

In [None]:
# One individually selected feature set per corruption index.
rf_info_script(
    features=selected_features_dict,
    name='rf_selected_features_dict',
)

Next we use the same set of features for all corruption indices: the union of all features that were pre-selected as promising for any index.

In [None]:
# Same feature set for every index: union of all pre-selected candidates.
rf_info_script(
    features=best_features_union,
    name='rf_best_features_union',
)

Now we use only the features that are in all individually selected feature sets.

In [None]:
# Minimal feature set: features selected for every single index.
# The run label must name the intersection; the previous copy-pasted label
# 'rf_best_features_union' belongs to the union run and would make the two
# runs share one result file if results are pickled under `name`.
rf_info_script(selected_features_intersection, 'rf_selected_features_intersection')

The general picture of the results with Random Forest is not that different to the one with Lasso. Some differences are

- ti_cpi is predicted very well now both in comparison with Lasso and with all other indices
- HOWEVER, if only the minimal feature set is used ti_cpi is even worse than with Lasso
- for vdem_execorr the vdem_egal (Egalitarian component index) is the most important feature
- vdem_jucorr is now by far the most difficult to predict index
- although feature importance is not directly comparable between Lasso (coefficient weights) and Random Forest (Gini), it seems that Random Forest discriminates more strongly between features

Random Forest performs similarly or better for most setups / indices, although no parameter optimization has been done so far. So we continue with Random Forest and next run a hyperparameter optimization for some specific settings to further improve the results.

### Grid Search: Random Forest

With cross validation / hyperparameter grid search better parameters are determined. With those optimizations then again models are trained, then the test set is predicted and scores are evaluated.

The script defined below produces a report similar to the ones above.

In [None]:
def rf_gridsearch_info_script(features, name):
    """Grid-search a Random Forest per corruption index and display the scores.

    Parameters
    ----------
    features : dict or list
        A mapping ``target -> collection of feature names`` selects a separate
        feature set per index; any other value is passed unchanged as the
        feature set for every index.
    name : str
        Label of the run. It is referenced only by the disabled pickling code
        at the bottom of the function.

    Returns
    -------
    dict
        ``target -> result`` as returned by ``apply_gridsearch_rf``. Callers
        in this notebook assign the return value, so the results are handed
        back instead of being discarded. End a bare call with ``;`` if the
        dict should not be echoed as cell output.

    Notes
    -----
    Uses the notebook globals ``df``, ``corr_cols`` and ``meta_cols``. The
    ``randomforestregressor__`` prefix of the grid keys presumably addresses
    the forest step of a pipeline built inside ``apply_gridsearch_rf`` --
    TODO confirm in model_.
    """
    rf_bf = dict()

    param_grid = {
        "randomforestregressor__max_depth": [2, 3, 5, 10, None],
        "randomforestregressor__min_samples_split": [2, 3, 5, 10],
        "randomforestregressor__max_features": ["log2", None]
        }

    features_by_target = isinstance(features, dict)

    df_score = pd.DataFrame(columns=['r2', 'rmse'], index=corr_cols)
    for target in corr_cols:
        # A dict carries one feature set per target; anything else is shared.
        target_features = list(features[target]) if features_by_target else features
        rf_bf[target] = apply_gridsearch_rf(df, target, target_features, param_grid, corr_cols, meta_cols, fprint=False)
        df_score.loc[target] = [rf_bf[target]['r2'], rf_bf[target]['rmse']]

    print('scores')
    display(df_score)

    # --- disabled: feature-importance report and pickling of the results ---
    # l_fi = [rf_bf[target]['feat_importance'] for target in corr_cols]
    # df_fi = pd.concat(l_fi)
    # rf_bf[target]
    # l_firk = [rf_bf[target]['feat_importance_rank'] for target in corr_cols]
    # df_firk = pd.concat(l_firk)
    # rf_bf[target]['params']
    # l_params = [rf_bf[target]['params'] for target in corr_cols]
    # df_params = pd.concat(l_params)

    # print('feature importance')
    # display(df_fi)

    # df_fi.T.plot(kind='bar', figsize=(20,8))
    # print()

    # print('feature importance rank')
    # display(df_firk)

    # file = os.path.join('pickle', name +'.obj')
    # f = open(file, 'wb')
    # pickle.dump(rf_bf ,f)
    #f.close()

    return rf_bf



Now we only use for each index the individually selected feature set as we saw above that the results are comparable (so the feature selection process works adequately).

In [None]:
# Grid search with the individually selected feature set of each index.
# The trailing semicolon limits the cell output to the score table that the
# function renders itself.
rf_gridsearch_info_script(
    features=selected_features_dict,
    name='rf_grid_selected_features_dict',
);

For most indices the hyperparameter optimization does not seem to significantly improve the r2-scores. But for vdem_jucorrdc it seems to improve. For vdem_pubcorr and wbgi_cce the improvement is minor.

The feature importance (figure) changes a lot more. Here we see for all indices but bci_bci that wbgi_rle is relatively less important than before. This is most likely due to `max_features` being set to log2 now. One could argue either that the original model, in which wbgi_rle is the main feature, is simpler and of the same quality, or that other features are able to replace wbgi_rle when combined.

In [None]:
# Grid search with one shared feature set: the union of the selected features.
rf_bf = rf_gridsearch_info_script(
    features=selected_features_union,
    name='rf_grid_selected_features_union',
)

In [None]:

# Reload previously pickled grid-search results for the union feature set.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load executes code embedded in the file -- only load
# result files that were produced by this project.
with open('pickle/rf_grid_selected_features_union.obj', 'rb') as f:
    rf_bf = pickle.load(f)

# Stack the per-target entries into one frame each (one entry per index in
# corr_cols).
l_fi = [rf_bf[target]['feat_importance'] for target in corr_cols]
df_fi = pd.concat(l_fi)
l_firk = [rf_bf[target]['feat_importance_rank'] for target in corr_cols]
df_firk = pd.concat(l_firk)
l_params = [rf_bf[target]['params'] for target in corr_cols]
df_params = pd.concat(l_params)

print('feature importance')
display(df_fi)

# Order the columns by their mean over all rows, largest mean first.
columns_by_mean_desc = df_fi.mean().sort_values().index[::-1]
df_sorted = df_fi.reindex(columns=columns_by_mean_desc)
display(df_sorted)

# Same bar chart twice: first with the sorted columns, then in original order.
for importance_frame in (df_sorted, df_fi):
    importance_frame.T.plot(kind='bar', figsize=(20,8))
print()

print('feature importance rank')
display(df_firk)
print()
print(df_sorted.columns)




# Sort the feature columns by mean importance (descending), then reshape to
# long format: one row per (feature, corruption index) pair for seaborn.
df_fi = df_fi.reindex(df_fi.mean().sort_values(ascending=False).index, axis=1)
col_names = df_fi.columns
df_fi = df_fi.T.melt(
    ignore_index=False,
    # Same targets the importance frames were collected for; reusing
    # corr_cols keeps the plot in sync with the configured index list.
    value_vars=corr_cols,
    value_name='feature_importance'
).reset_index().rename(columns={'index': 'feature', 'variable': 'corruption_index'})

plt.rcdefaults()
# 'normal' is not a font family known to matplotlib (it triggers
# "findfont: Font family 'normal' not found" warnings and a fallback);
# 'sans-serif' is the generic default family.
font = {'family': 'sans-serif',
        'size': 14}

plt.rc('font', **font)
plt.figure(figsize=(20,8))
sns.barplot(data=df_fi, x='feature', y='feature_importance', hue='corruption_index', palette='magma', width=0.6)
plt.xticks(rotation=90)
plt.legend(loc='upper right')
plt.grid()