In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import os 

from data_cleanup import *
from feature_selection import *
from model_ import *

from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_squared_error as rmse
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, LassoLarsCV
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Corruption indices used as prediction targets throughout the notebook.
corr_cols = [
    'ti_cpi',
    'bci_bci',
    'ti_cpi_om',
    'wbgi_cce',
]

# Identifier / bookkeeping columns handed to the project helpers as `meta_cols`
# (presumably so they are kept out of the feature matrix -- TODO confirm).
meta_cols = [
    'ccode', 'ccode_qog', 'ccodealp', 'ccodealp_year', 'ccodecow',
    'cname', 'cname_qog', 'cname_year',
    'version', 'year', 'region', 'sub-region',
]

# Project helper; receives the target columns and returns the working frame.
df = load_reduced_df(corr_cols)

In [None]:
# Sanity check: show the (rows, columns) shape of the loaded frame.
display(df.shape)

### Feature selection

In [None]:
# Project helper; presumably strips date-typed columns before feature ranking.
df = drop_date_columns(df)

# Per-target results:
#   best_features_dict     -> features returned by pre_select
#   selected_features_dict -> the subset left after the collinearity filter
best_features_dict = {}
selected_features_dict = {}

for target_col in corr_cols:
    X_train, X_test, y_train, y_test = create_traintestsplit(
        df,
        corr_cols=corr_cols,
        meta_cols=meta_cols,
        target_col=target_col,
    )

    best_features = pre_select(X_train, y_train)
    best_features_dict[target_col] = set(best_features)

    # The eliminator is given the candidate features together with the target.
    train_with_target = X_train[best_features].copy()
    train_with_target[target_col] = y_train

    # 0.85 is the threshold passed to the eliminator
    # (presumably a correlation cut-off -- TODO confirm).
    eliminator = MultiCollinearityEliminator(train_with_target, target_col, 0.85)
    features_no_collinearity = list(eliminator.autoEliminateMulticollinearity().columns)

    # The returned frame still contains the target column; drop it from the feature list.
    features_no_collinearity.remove(target_col)
    selected_features_dict[target_col] = set(features_no_collinearity)

selected_features_dict

In [None]:
# Combine the per-target pre-selected feature sets.
# sorted() instead of list(): iterating a set of strings yields an order that
# can change between kernel restarts (string hash randomization), which would
# silently change the column order fed to the models. Sorting keeps it stable.
best_features_union = sorted(set.union(*best_features_dict.values()))
best_features_intersection = sorted(set.intersection(*best_features_dict.values()))

# Features that were pre-selected for every target. A bare expression in the
# middle of a cell is not rendered, so display it explicitly.
display(best_features_intersection)

# Missing-value count per candidate feature, highest first.
print(df[best_features_union].isna().sum().sort_values(ascending=False))


In [None]:
# Combine the per-target feature sets that survived the collinearity filter.
# sorted() instead of list(): set iteration order for strings can differ
# between kernel restarts, so sorting keeps the feature/column order stable.
selected_features_union = sorted(set.union(*selected_features_dict.values()))
selected_features_intersection = sorted(set.intersection(*selected_features_dict.values()))

## Modeling

To model the data, we tried out a Lasso linear model and a random forest regressor. Based on our preselection process, we train our models on different feature configurations:

    - the individual selected features for a particular index
    - the intersection of all selected features for all indices
    - the union of selected features for all indices
    - the union of selected features for all indices without filtering based on collinearity

To report the accuracy of the models, we use r2 and rmse as metrics.
    
### Lasso
The Lasso implementation we use relies on cross-validation to determine a good value for alpha.

In [None]:
def lasso_info_script(features, name):
    """Run the Lasso helper for every corruption index and show the scores.

    Calls ``apply_lassocv`` once per target in the notebook-level ``corr_cols``
    (using the global ``df`` and ``meta_cols``), collects the ``'r2'`` and
    ``'rmse'`` entries of each result into a table, prints a header and
    displays the table.

    Parameters
    ----------
    features : dict or list
        Either a mapping ``target -> iterable of feature names`` (one feature
        set per target) or a single collection of feature names that is used
        for all targets.
    name : str
        Label appended to the printed ``'scores:'`` header.

    Returns
    -------
    None
    """
    results_by_target = {}
    score_table = pd.DataFrame(columns=['r2', 'rmse'], index=corr_cols)

    for target in corr_cols:
        # A dict carries a dedicated feature set per target; anything else is shared.
        if isinstance(features, dict):
            target_features = list(features[target])
        else:
            target_features = features

        outcome = apply_lassocv(df, target, target_features, corr_cols, meta_cols, fprint=False)
        results_by_target[target] = outcome
        score_table.loc[target,] = [outcome['r2'], outcome['rmse']]

    print('scores:' + name)
    display(score_table)
    
# Evaluate the Lasso model on each feature configuration, in the original order.
for feature_config, label in [
    (selected_features_dict, ' Selected Features'),
    (selected_features_intersection, 'Features intersection'),
    (selected_features_union, 'selected Features union'),
    (best_features_union, ' selected Features union without collinearity filter'),
]:
    lasso_info_script(feature_config, label)

# print(len(best_features_union))
# print(len(selected_features_intersection))

We can see here that all models perform very similarly, with a slight edge for the models with more features. This was to be expected, since the features were preselected. It can also be seen that collinearity does not seem to pose a problem for the model.

### Random Forest

Next we do the same for a Random Forest Regressor. Here initially no cross validation is done. We just use a default setup at first.

In [None]:
def rf_info_script(features, name):
    """Run the random-forest helper for every corruption index and show the scores.

    Calls ``apply_rf`` once per target in the notebook-level ``corr_cols``
    (using the global ``df`` and ``meta_cols``), collects the ``'r2'`` and
    ``'rmse'`` entries of each result into a table, prints a header and
    displays the table.

    Parameters
    ----------
    features : dict or list
        Either a mapping ``target -> iterable of feature names`` (one feature
        set per target) or a single collection of feature names that is used
        for all targets.
    name : str
        Label appended to the printed ``'scores:'`` header.

    Returns
    -------
    None
    """
    per_target_features = isinstance(features, dict)
    results_by_target = {}
    score_table = pd.DataFrame(columns=['r2', 'rmse'], index=corr_cols)

    for target in corr_cols:
        # Pick the target-specific feature set when one was supplied.
        chosen = list(features[target]) if per_target_features else features
        results_by_target[target] = apply_rf(df, target, chosen, corr_cols, meta_cols, fprint=False)

        outcome = results_by_target[target]
        score_table.loc[target,] = [outcome['r2'], outcome['rmse']]

    print('scores:' + name)
    display(score_table)

# Evaluate the default random forest on each feature configuration, in the original order.
for feature_config, label in [
    (selected_features_dict, ' Selected Features'),
    (selected_features_intersection, 'Features intersection'),
    (selected_features_union, 'selected Features union'),
    (best_features_union, ' selected Features union without collinearity filter'),
]:
    rf_info_script(feature_config, label)

Random Forest performs very similarly to the Lasso model, although no parameter optimization has been done so far. We therefore continue with Random Forest and next perform hyperparameter optimization for some specific settings to further improve the results.

### Grid Search: Random Forest

With cross-validation and a hyperparameter grid search, better parameters are determined. Models are then trained again with those optimized parameters, the test set is predicted, and the scores are evaluated. We were not able to set a random state for this part; to accommodate this, we saved the results via pickle so that we can work on a consistent interpretation of the data.

In [None]:
def rf_gridsearch_info_script(features, name):
    """Grid-search the random-forest helper per corruption index and show the scores.

    Calls ``apply_gridsearch_rf`` once per target in the notebook-level
    ``corr_cols`` (using the global ``df`` and ``meta_cols``), collects the
    ``'r2'`` and ``'rmse'`` entries of each result into a table, prints a
    header and displays the table.

    Parameters
    ----------
    features : dict or list
        Either a mapping ``target -> iterable of feature names`` (one feature
        set per target) or a single collection of feature names that is used
        for all targets.
    name : str
        Label for this feature configuration. Currently only referenced by the
        disabled persistence snippet at the end of the function.

    Returns
    -------
    dict
        Mapping ``target -> result of apply_gridsearch_rf``. The function
        previously returned nothing although its caller assigns the result,
        which left the caller holding ``None``.
    """
    # Keys follow the ``<step>__<parameter>`` form, so the helper presumably
    # searches over a pipeline containing a ``randomforestregressor`` step.
    param_grid = {
        "randomforestregressor__max_depth": [2, 3, 5, 10, None],
        "randomforestregressor__min_samples_split": [2, 3, 5, 10],
        "randomforestregressor__max_features": ["log2", None]
        }

    rf_bf = dict()
    df_score = pd.DataFrame(columns=['r2', 'rmse'], index=corr_cols)
    for target in corr_cols:
        # A dict carries a dedicated feature set per target; anything else is shared.
        target_features = list(features[target]) if isinstance(features, dict) else features
        rf_bf[target] = apply_gridsearch_rf(df, target, target_features, param_grid, corr_cols, meta_cols, fprint=False)
        df_score.loc[target,] = [rf_bf[target]['r2'], rf_bf[target]['rmse']]

    print('scores')
    display(df_score)

    # Persisting the results is currently disabled; to re-enable:
    # file = os.path.join('pickle', name + '.obj')
    # with open(file, 'wb') as f:
    #     pickle.dump(rf_bf, f)

    return rf_bf

# Grid search on the union of the collinearity-filtered feature sets.
# Note: rf_bf is rebound in the next cell from the pickled results.
rf_bf = rf_gridsearch_info_script(selected_features_union, 'selected Features union')

In [None]:

# Load the stored grid-search results so the interpretation below is based on
# one fixed set of numbers.
# NOTE(security): pickle.load can execute arbitrary code contained in the
# file -- only load pickle files produced by this project.
# The context manager closes the file even if unpickling raises.
with open('pickle/rf_grid_selected_features_union.obj', 'rb') as f:
    rf_bf = pickle.load(f)

# Rebuild the per-target score table from the loaded results.
df_score = pd.DataFrame(columns=['r2', 'rmse'], index=corr_cols)
for target in corr_cols:
    df_score.loc[target,] = [rf_bf[target]['r2'], rf_bf[target]['rmse']]

print('score: Grit Search Random Forrest selected Features union')
display(df_score)



In the scores table we can see that, after hyperparameter optimization, the metrics of our model improved again a bit. An r2 above 0.7 has been reached for three of the four corruption indices, and the rmse values seem reasonable as well with regard to the scales our corruption indices operate on:
- ti_cpi: 0-100
- bci_bci: 0-100
- ti_cpi_om: 0-10
- wbgi_cce: -2.5-2.5

In [None]:
# Stack the per-target feature-importance frames.
# The melt below selects the targets as columns after transposing, so the
# concatenated frame must be indexed by target name with one column per feature.
l_fi = [rf_bf[target]['feat_importance'] for target in corr_cols]
df_fi = pd.concat(l_fi)

# Order the features (columns) by their mean importance across targets, highest first.
df_fi = df_fi.reindex(df_fi.mean().sort_values(ascending=False).index, axis=1)
col_names = df_fi.columns

# Long format for seaborn: one row per (feature, corruption index) pair.
# Uses corr_cols instead of repeating the target names as literals.
df_fi = df_fi.T.melt(
    ignore_index=False,
    value_vars=corr_cols,
    value_name='feature_importance'
).reset_index().rename(columns={'index': 'feature', 'variable': 'corruption_index'})

plt.rcdefaults()
# 'normal' is not a font family known to matplotlib and only produces
# "findfont: Font family ['normal'] not found" warnings before falling back to
# the default; request the generic sans-serif family explicitly instead.
font = {'family': 'sans-serif',
        'size': 14}

plt.rc('font', **font)
plt.figure(figsize=(20, 8))
# Pass the frame by keyword; the first positional argument of barplot has not
# always been `data` across seaborn versions.
sns.barplot(data=df_fi, x='feature', y='feature_importance', hue='corruption_index', palette='magma', width=0.6)
plt.xticks(rotation=90)
plt.legend(loc='upper right')
plt.grid()
# Trailing semicolon suppresses the Text(...) repr in the cell output.
plt.title('Feature importance');

Looking at the feature importance in the plot above, we can see that the features have comparable levels of relevance to the different corruption indices.
The plot shows that our model sees political and civil freedom as well as life expectancy as good indicators of corruption levels. A full list of the most important features can be seen below.

| code        | Description                 |
| ----------------- | ------------------------------------------------ |
| fh_pr_1.0         | Political Rights Rating                          |
| fh_ipolity2       | Level of Democracy                               |
| ihme_lifexp_0104t | Life Expectancy, Both sexes, Age 1-4 years       |
| fh_cl_1.0         | Civil Liberties                                  |
| ihme_lifexp_0104m | Life Expectancy, Male, Age 1-4 years             |
| br_mon            | Is the country a monarchy                        |
| cpds_vper_0.0     | Share of votes: personalist                      |
| gd_ptss_1.0       | Political Terror Scale - US State Department     |
| kun_cluster_5.0   | Cluster memberships based on means               |
| wel_sys_1.0       | Political System Type                            |
| cpds_lall_0.0     | Share of seats in parliament: electoral alliance |
| fh_status_1.0     | Freedom Status                                   |
| ciri_injud_2.0    | Independence of the Judiciary                    |
| fhp_status5_1.0   | Freedom of the Press, Status (2001-2016)         |
| wel_scalezone_4.0 | Scalezone on Citizen Rights                      |
| cpds_chg_0.0      | Number of changes in government per year         |