In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from data_cleanup import *

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

In [None]:
# Load the working dataset; load_reduced_df is presumably provided by the
# `from data_cleanup import *` star import above — confirm.
df = load_reduced_df()

In [None]:
# Name of the column used as the regression target further down;
# 'wbgi_cce' is kept in the trailing comment as the alternative target.
target_col = 'bci_bci' # 'wbgi_cce'

In [None]:
# Show the loaded frame for a quick visual check.
df

- group by country
- divide in train and test datasets
- check that each country has enough points
- check how regions are balanced
- 2 versions: all cols and cols without nulls
- run feature selection on all cols and on cols without nulls
- run a simple model ? check how feature selection performs?
- check for highly correlated cols in feat selection?

### Check that each country has enough points

In [None]:
# Mean / min / max of the number of distinct years available per country.
df.groupby('cname').year.nunique().agg(['mean', 'min', 'max'])

In [None]:
# Countries that have fewer than 5 distinct years of data.
years_per_country = df.groupby('cname')['year'].nunique()
years_per_country[years_per_country < 5]

In [None]:
# Inspect the rows of a single country (Fiji) in detail.
df[df.cname == 'Fiji']

### Check how regions are balanced

In [None]:
# Number of non-null `cname` entries (i.e. samples) per sub-region.
region_count = (
    df.groupby('sub-region')['cname']
    .count()
    .rename('num of samples')
    .reset_index(drop=False)
)

# Number of distinct countries per sub-region.
region_country_count = (
    df.groupby('sub-region')['cname']
    .nunique()
    .rename('num of countries')
    .reset_index(drop=False)
)

# Both counts joined on sub-region and reshaped to long format.
region_info_df = pd.merge(region_count, region_country_count, on='sub-region').melt(
    id_vars=['sub-region'],
    value_vars=['num of samples', 'num of countries'],
)

In [None]:
# Two horizontal bar charts side by side: samples per region (left)
# and countries per region (right).
_, ax = plt.subplots(1, 2, figsize=(40, 10))
samples_ax, countries_ax = ax

sns.barplot(
    data=region_count,
    y='sub-region',
    x='num of samples',
    color='#567ab8',
    ax=samples_ax,
)
samples_ax.set_title('Regions by number of points')

sns.barplot(
    data=region_country_count,
    y='sub-region',
    x='num of countries',
    color='#567ab8',
    ax=countries_ax,
)
countries_ax.set_title('Regions by number of countries')

### Create test and train datasets

These columns contain dates and are not valid features, hence they are dropped

In [None]:
# Every column whose name contains 'date' is removed from the frame.
date_columns = [col for col in df.columns if 'date' in col]
df = df.drop(columns=date_columns)

In [None]:
# Seed handed to train_test_split below so the country split is repeatable.
random_state = 424242

In [None]:
# One row per country with a single sub-region value (the minimum within the group).
country_data = (
    df.groupby('cname')['sub-region']
    .min()
    .reset_index(drop=False)
)

In [None]:
# Show the country -> sub-region table built above.
country_data

Stratify by sub-region, so that each sub-region is represented in both the train and the test dataset

In [None]:
# The split is done on countries rather than on individual rows: the "y" returned
# here is just the country name, which is later used to pick rows out of df.
X_train, X_test, y_train, y_test = train_test_split(
    country_data,
    country_data['cname'],
    test_size=0.2,
    random_state=random_state,
    stratify=country_data['sub-region'],
)

In [None]:
# Number of distinct training countries in each sub-region.
X_train.groupby('sub-region').cname.nunique()

In [None]:
# Number of distinct test countries in each sub-region.
X_test.groupby('sub-region').cname.nunique()

Check that each country is in either the train or the test dataset, never both (the intersection below should be empty)

In [None]:
# Countries present in both splits; an empty set means the splits are disjoint.
set(X_train.cname.unique()).intersection(X_test.cname.unique())

In [None]:
# Positional offsets used below to slice `columns[start:-2]` into candidate features:
# one for the full frame and one for the frame with NaN-containing columns removed.
# NOTE(review): these are hard-coded positions; presumably the reduced frame loses one
# leading non-feature column to dropna, hence 9 vs 10 — confirm against the actual columns.
feat_col_start_full = 10
feat_col_start_reduced = 9

In [None]:
# Variant of df that keeps only the columns without any missing value.
df_cols_reduced = df.dropna(how='any', axis=1)

In [None]:
# Columns excluded from the feature lists; both candidate targets
# ('bci_bci' and 'wbgi_cce') are among them.
corr_cols = [
    'bci_bci',
    'ti_cpi',
    'vdem_corr',
    'vdem_execorr',
    'vdem_jucorrdc',
    'vdem_pubcorr',
    'wbgi_cce',
]

# Candidate columns are taken by position: from the start offset up to
# (but not including) the last two columns of each frame.
candidate_cols_full = df.columns[feat_col_start_full:-2]
candidate_cols_reduced = df_cols_reduced.columns[feat_col_start_reduced:-2]

feats_cols = [col for col in candidate_cols_full if col not in corr_cols]
feats_cols_reduced = [col for col in candidate_cols_reduced if col not in corr_cols]

In [None]:
def divide_into_test_train(df, target, feats_cols, corr_column=target_col):
    """Select the rows of a set of countries and split them into features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``cname`` column, the ``corr_column`` column and
        every column listed in ``feats_cols``.
    target : iterable
        Country names to keep; rows whose ``cname`` is not among them are dropped.
    feats_cols : list
        Names of the columns returned as the feature matrix.
    corr_column : str
        Name of the column returned as the target vector. The default is the
        value ``target_col`` had when this function was defined.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is the frame restricted to ``feats_cols`` and
        ``y`` is the ``corr_column`` series, both for the selected rows only.
    """
    # Boolean-mask indexing already produces a new frame, so the input is left
    # untouched without deep-copying the whole of ``df`` first.
    selected = df[df.cname.isin(target)]
    x = selected.loc[:, feats_cols]
    y = selected.loc[:, corr_column]
    return x, y

In [None]:
# Feature/target pairs built from the full frame (columns with NaNs included);
# y_train / y_test hold the country names assigned to each split.
x_train_full, y_train_full = divide_into_test_train(df, y_train, feats_cols, corr_column=target_col)
x_test_full, y_test_full = divide_into_test_train(df, y_test, feats_cols, corr_column=target_col)

In [None]:
# Same split, but built from the frame that only keeps NaN-free columns.
x_train_reduced, y_train_reduced = divide_into_test_train(df_cols_reduced, y_train, feats_cols_reduced, corr_column=target_col)
x_test_reduced, y_test_reduced = divide_into_test_train(df_cols_reduced, y_test, feats_cols_reduced, corr_column=target_col)

### Feature selection

Feature selection is run only on the reduced df (columns containing NaNs dropped), because the feature selection method cannot handle NaNs

In [None]:
# Number of features SelectKBest is asked to keep.
k = 20

In [None]:
# Univariate selector: scores each feature with f_regression and keeps the top k.
feat_selector = SelectKBest(f_regression, k=k)

In [None]:
# Fit on the training split only, so the test countries play no part in the selection.
feat_selector.fit(x_train_reduced, y_train_reduced)

"Best features" according to `SelectKBest`:

In [None]:
# Names of the selected columns, resolved against the training frame's column labels.
best_feats = feat_selector.get_feature_names_out(x_train_reduced.columns)
best_feats

Feats and what they mean:
- bci_bcistd - Standard deviation of The Bayesian Corruption Indicator
- fh_cl - Civil Liberties
- fh_pr - Political Rights
- fh_status - Freedom Status
- vdem_delibdem - Deliberative democracy index
- vdem_edcomp_thick - Electoral component index
- vdem_egal - Egalitarian component index
- vdem_egaldem - Egalitarian democracy index
- vdem_exbribe - Executive bribery and corrupt exchanges
- vdem_excrptps - Public sector corrupt exchanges
- vdem_exembez - Executive embezzlement and theft
- vdem_exthftps - Public sector theft
- vdem_libdem - Liberal democracy index
- vdem_liberal - Liberal component index
- vdem_mecorrpt - Media corrupt
- vdem_partipdem - Participatory democracy index
- vdem_polyarchy - Electoral democracy index
- wbgi_ccs - Control of Corruption, Standard Error[s]
- wbgi_gee - Government Effectiveness, Estimate[e]
- wbgi_pv[e][n][s] - Political Stability and Absence of Violence/Terrorism, Estimate[e], Number of Sources[n], Standard Error[s]
- wbgi_rle - Rule of Law, Estimate[e]
- wbgi_rqe - Regulatory Quality, Estimate[e]
- wbgi_vae - Voice and Accountability, Estimate[e]
- ihme_hle_0104[f][m][t] - Healthy Life Years 1-4 Years, Females[f], Males[m], Both sexes[t]
- ihme_lifexp_0104[f][m][t] - Life Expectancy 1-4 Years, Females[f], Males[m], Both sexes[t]

Kick out columns directly related to corruption:

In [None]:
# Narrower alternative that was tried:
# corruption_cols = ['vdem_exbribe', 'vdem_excrptps', 'vdem_exembez', 'vdem_exthftps', 'vdem_mecorrpt']
corruption_cols = [
    'bci_bcistd',
    'vdem_exbribe',
    'vdem_excrptps',
    'vdem_exembez',
    'vdem_exthftps',
    'vdem_mecorrpt',
    'wbgi_ccs',
    'wbgi_ccn',
]
# Remove the listed columns from the selected features, keeping the original order.
best_feats = [feat for feat in best_feats if feat not in corruption_cols]

In [None]:
# sns.pairplot(x_train_reduced, vars=best_feats)

In [None]:
# Pairwise correlations between the selected features on the training split.
feat_corr = x_train_reduced[best_feats].corr()

# Keep only the strict upper triangle so every pair appears exactly once
# and the diagonal is blanked out.
upper_triangle = np.triu(np.ones(feat_corr.shape), k=1).astype(bool)

# Reshape to one row per (first_col, second_col) pair with its correlation.
cm = (
    feat_corr.where(upper_triangle)
    .reset_index(drop=False)
    .melt(id_vars='index', var_name='second_col', value_name='corr')
)
cm.columns = ['first_col', 'second_col', 'corr_id']
cm = cm[cm.first_col != cm.second_col].dropna()

In [None]:
# Feature pairs whose absolute correlation exceeds 0.85; copied so that the
# later in-place edit of this table does not touch `cm`.
highly_corr = cm[cm.corr_id.abs() > 0.85].copy()

In [None]:
# Print the highly correlated pairs as a fixed-width table:
# both feature names left-aligned in 25-character columns, then the correlation.
col_width = 25
for pair in highly_corr.itertuples(index=False):
    print(f'{pair.first_col:<{col_width}}{pair.second_col:<{col_width}}{pair.corr_id:.2f}')

In [None]:
# From here on only the strength of the correlation matters, not its sign.
highly_corr['corr_id'] = highly_corr['corr_id'].abs()

# Per first feature: in how many highly correlated pairs it appears
# and the mean absolute correlation of those pairs.
highly_corr_agg = highly_corr.groupby('first_col').agg(
    second_col=('second_col', 'count'),
    corr_id=('corr_id', 'mean'),
)
highly_corr_agg

In [None]:
# Features that are the first member of more than one highly correlated pair are dropped.
redundant_feats = set(highly_corr_agg[highly_corr_agg.second_col > 1].index)
best_feats_reduced = [feat for feat in best_feats if feat not in redundant_feats]

In [None]:
# Correlation matrix of the remaining features, to check what redundancy is left.
x_train_reduced[best_feats_reduced].corr()

Feats and what they mean:
- vdem_egal - Egalitarian component index
- vdem_polyarchy - Electoral democracy index
- wbgi_pve - Political Stability and Absence of Violence/Terrorism, Estimate
- wbgi_rle - Rule of Law, Estimate
- wbgi_rqe - Regulatory Quality, Estimate
- wbgi_vae - Voice and Accountability, Estimate

In [None]:
# Pairwise scatter plots of the remaining features on the training split.
sns.pairplot(x_train_reduced, vars=best_feats_reduced)

### Feature selection evaluation

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_squared_error as rmse

In [None]:
# Feature-set variants compared below; these strings are the keys of `rms` and `feats`.
options = ['full', 'best', 'best_reduced']

In [None]:
# One independent, identically configured k-NN regressor per feature-set variant.
rms = {
    'full': KNeighborsRegressor(n_neighbors=20),
    'best': KNeighborsRegressor(n_neighbors=20),
    'best_reduced': KNeighborsRegressor(n_neighbors=20),
}

In [None]:
# Columns fed to the model for each feature-set variant.
feats = {
    'full': x_train_reduced.columns,
    'best': best_feats,
    'best_reduced': best_feats_reduced,
}

In [None]:
# Fit and score the model for a single feature-set variant.
opt = 'best_reduced'
rms[opt].fit(x_train_reduced[feats[opt]], y_train_reduced)
y_pred = rms[opt].predict(x_test_reduced[feats[opt]])
# `rmse` is only an alias of sklearn's mean_squared_error, which returns the MSE
# (that is what `squared=True` asked for). Take the square root explicitly so the
# printed value matches its "rmse" label; this also avoids the `squared` keyword,
# which newer scikit-learn releases deprecate and then remove.
print(f'rmse: {np.sqrt(rmse(y_test_reduced, y_pred))}')
print(f'r2: {r2(y_test_reduced, y_pred)}')

In [None]:
# Fit and score the same k-NN model on each feature-set variant
# (train on the training countries, evaluate on the held-out countries).
for opt in options:
    print(f'current model: {opt}')
    rms[opt].fit(x_train_reduced[feats[opt]], y_train_reduced)
    y_pred = rms[opt].predict(x_test_reduced[feats[opt]])
    # `rmse` is only an alias of sklearn's mean_squared_error, which returns the MSE
    # (that is what `squared=True` asked for). Take the square root explicitly so the
    # printed value matches its "rmse" label; this also avoids the `squared` keyword,
    # which newer scikit-learn releases deprecate and then remove.
    print(f'rmse: {np.sqrt(rmse(y_test_reduced, y_pred))}')
    print(f'r2: {r2(y_test_reduced, y_pred)}')
    print()