In [36]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from data_cleanup import *
from feature_selection import *

from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_squared_error as rmse
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, LassoLarsCV

In [2]:
df = load_reduced_df()

In [3]:
corr_cols = ['bci_bci', 'ti_cpi', 'vdem_corr', 'vdem_execorr', 'vdem_jucorrdc', 'vdem_pubcorr', 'wbgi_cce']

In [4]:
df = drop_date_columns(df)

best_features_dict = {}
selected_features_dict = {}


for target_col in corr_cols:
    X_train, X_test, y_train, y_test = create_traintestsplit(df, target_col=target_col)

    best_features = pre_select(X_train, y_train)
    best_features = filter_corruption(best_features)
    best_features_dict[target_col] = set(best_features)
    
    selected_features_dict[target_col] = set(filter_highly_correleated(X_train, best_features))


In [16]:
best_features_union=list(set.union(*list(best_features_dict.values())))
best_features_intersection=list(set.intersection(*list(best_features_dict.values())))

best_features_intersection

['wbgi_rle', 'wbgi_rqe', 'vdem_egal', 'wbgi_gee', 'vdem_egaldem', 'wbgi_vae']

In [15]:
selected_features_union=list(set.union(*list(selected_features_dict.values())))
selected_features_intersection=list(set.intersection(*list(selected_features_dict.values())))

selected_features_intersection

['wbgi_rle', 'wbgi_vae', 'vdem_egal', 'wbgi_rqe']

In [68]:
coeff_df = pd.DataFrame(columns=best_features_union, index=corr_cols)
for target_col in corr_cols:
    print(f'current target: {target_col}')
    X_train, X_test, y_train, y_test = create_traintestsplit(df, target_col=target_col)

    model = make_pipeline(StandardScaler(with_mean=False), LassoCV(cv=5, random_state=45678, max_iter=50000))
    model.fit(X_train[best_features_union], y_train)
    y_pred = model.predict(X_test[best_features_union])

    #print(f'rmse: {rmse(y_test, y_pred, squared=True)}')
    print(f'r2: {r2(y_test, y_pred)}')
    print()

    # Read out attributes
    coeff_df.loc[target_col] = model['lassocv'].coef_     # dense np.array
     # sparse matrix

    #coeffs = model.intercept_    # probably also relevant

coeff_rel_df = (coeff_df.div(coeff_df.abs().sum(axis=1),axis=0))
display(coeff_df)
display(coeff_rel_df)

current target: bci_bci
r2: 0.7050704238982577

current target: ti_cpi
r2: 0.47718256138836646

current target: vdem_corr
r2: 0.6869093451856315

current target: vdem_execorr
r2: 0.6173037800957077

current target: vdem_jucorrdc
r2: 0.45338248596657993

current target: vdem_pubcorr
r2: 0.6903799011898318

current target: wbgi_cce
r2: 0.8713863076036938



Unnamed: 0,vdem_egaldem,ihme_lifexp_0104m,wbgi_vae,vdem_polyarchy,vdem_partipdem,ihme_hle_0104t,wbgi_pve,wbgi_rqe,fh_pr,wbgi_rle,...,wbgi_pvn,ihme_hle_0104f,ihme_lifexp_0104t,vdem_edcomp_thick,fh_cl,vdem_egal,wbgi_pvs,ihme_hle_0104m,vdem_libdem,ihme_lifexp_0104f
bci_bci,-0.0,-1.753276,-0.0,1.66957,0.0,0.0,0.0,0.0,-0.0,-11.897976,...,2.611668,1.905516,-0.0,0.0,-1.786217,-0.0,1.658957,-0.0,-0.0,0.0
ti_cpi,-0.0,1.570257,5.225695,-0.0,-0.337436,-0.0,0.970847,-0.0,-0.0,6.847798,...,-5.810042,-1.323726,0.0,1.060295,3.097251,0.0,-17.889444,0.0,0.0,-0.0
vdem_corr,-0.0,0.0,0.0,0.065375,0.073207,0.0,-0.016052,0.038399,-0.003562,-0.087717,...,0.037101,0.0,0.009273,0.0,-0.032676,-0.038807,0.034286,0.0,-0.276377,0.004166
vdem_execorr,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.00523,0.003588,-0.0,-0.081417,...,0.0,0.0,0.0,0.002297,-0.03391,-0.042933,0.002684,-0.0,-0.07125,0.0
vdem_jucorrdc,0.0,0.0,0.328707,-0.902509,-0.138881,-0.097789,0.011556,-0.149674,0.084253,0.324884,...,-0.10896,0.0,-0.0,0.380789,0.187211,-0.063026,-0.104595,-0.0,1.198309,0.070394
vdem_pubcorr,-0.0,-0.011211,0.022328,-0.0,0.0,-0.0,-0.016731,0.009217,0.0,-0.057731,...,0.024656,-0.0,-0.0,-0.000524,-0.015654,-0.04569,0.015601,-0.0,-0.098345,-0.007262
wbgi_cce,0.021001,0.058957,0.031523,0.0,0.0,-0.0,0.015673,-0.028794,0.0,0.612789,...,-0.126117,-0.091891,0.0,0.0,0.0,0.0,-0.068968,0.0,0.023938,-0.0


Unnamed: 0,vdem_egaldem,ihme_lifexp_0104m,wbgi_vae,vdem_polyarchy,vdem_partipdem,ihme_hle_0104t,wbgi_pve,wbgi_rqe,fh_pr,wbgi_rle,...,wbgi_pvn,ihme_hle_0104f,ihme_lifexp_0104t,vdem_edcomp_thick,fh_cl,vdem_egal,wbgi_pvs,ihme_hle_0104m,vdem_libdem,ihme_lifexp_0104f
bci_bci,-0.0,-0.053934,-0.0,0.051359,0.0,0.0,0.0,0.0,-0.0,-0.366004,...,0.08034,0.058617,-0.0,0.0,-0.054947,-0.0,0.051033,-0.0,-0.0,0.0
ti_cpi,-0.0,0.034251,0.113985,-0.0,-0.00736,-0.0,0.021176,-0.0,-0.0,0.149367,...,-0.126731,-0.028874,0.0,0.023128,0.067558,0.0,-0.390211,0.0,0.0,-0.0
vdem_corr,-0.0,0.0,0.0,0.077665,0.086968,0.0,-0.019069,0.045617,-0.004231,-0.104206,...,0.044075,0.0,0.011016,0.0,-0.038819,-0.046102,0.040731,0.0,-0.328331,0.004949
vdem_execorr,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.013664,0.009374,-0.0,-0.212722,...,0.0,0.0,0.0,0.006002,-0.088598,-0.112174,0.007012,-0.0,-0.186157,0.0
vdem_jucorrdc,0.0,0.0,0.06882,-0.188954,-0.029077,-0.020474,0.002419,-0.031336,0.01764,0.068019,...,-0.022812,0.0,-0.0,0.079724,0.039195,-0.013196,-0.021899,-0.0,0.250884,0.014738
vdem_pubcorr,-0.0,-0.024137,0.048072,-0.0,0.0,-0.0,-0.036021,0.019844,0.0,-0.124292,...,0.053084,-0.0,-0.0,-0.001128,-0.033703,-0.098369,0.033589,-0.0,-0.211732,-0.015635
wbgi_cce,0.013527,0.037975,0.020304,0.0,0.0,-0.0,0.010095,-0.018546,0.0,0.394706,...,-0.081234,-0.059189,0.0,0.0,0.0,0.0,-0.044423,0.0,0.015419,-0.0


In [67]:
(coeff_df.abs().sum(axis=1))

bci_bci          32.507785
ti_cpi           45.845555
vdem_corr         0.841764
vdem_execorr      0.382739
vdem_jucorrdc     4.776351
vdem_pubcorr      0.464477
wbgi_cce          1.552520
dtype: float64

In [69]:
coeff_df = pd.DataFrame(columns=selected_features_union, index=corr_cols)
for target_col in corr_cols:
    print(f'current target: {target_col}')
    X_train, X_test, y_train, y_test = create_traintestsplit(df, target_col=target_col)

    model = make_pipeline(StandardScaler(with_mean=False), LassoCV(cv=5, random_state=45678, max_iter=50000))
    model.fit(X_train[selected_features_union], y_train)
    y_pred = model.predict(X_test[selected_features_union])

    #print(f'rmse: {rmse(y_test, y_pred, squared=True)}')
    print(f'r2: {r2(y_test, y_pred)}')
    print()

    # Read out attributes
    coeff_df.loc[target_col] = model['lassocv'].coef_     # dense np.array
     # sparse matrix

    #coeffs = model.intercept_    # probably also relevant

coeff_rel_df = (coeff_df.div(coeff_df.abs().sum(axis=1),axis=0))
display(coeff_df)
display(coeff_rel_df)

current target: bci_bci
r2: 0.6466641222520171

current target: ti_cpi
r2: 0.46360571794757555

current target: vdem_corr
r2: 0.6562220596242767

current target: vdem_execorr
r2: 0.6182698942138367

current target: vdem_jucorrdc
r2: 0.3371596605986745

current target: vdem_pubcorr
r2: 0.6718590848996285

current target: wbgi_cce
r2: 0.8488528048261852



Unnamed: 0,wbgi_rle,wbgi_pvn,wbgi_pve,ihme_lifexp_0104t,wbgi_rqe,vdem_egal,wbgi_pvs,vdem_egaldem,ihme_lifexp_0104m,wbgi_vae,vdem_polyarchy,vdem_liberal
bci_bci,-18.419837,1.967984,0.44484,0.0,-0.177808,-0.0,1.078584,-0.0,-0.902103,0.0,4.026468,0.418914
ti_cpi,7.535761,-6.434712,0.558037,-2.089446,-0.34313,0.0,-18.306357,-1.486965,2.648634,3.542843,-0.740902,-0.0
vdem_corr,-0.182805,0.032443,-0.011705,0.0,0.004748,0.0,0.026822,-0.145597,0.0,0.047873,0.057537,-0.082418
vdem_execorr,-0.1565,0.003027,-0.00293,-0.0,0.00582,-0.007664,0.004813,-0.107138,-0.008171,0.065691,0.025077,-0.117675
vdem_jucorrdc,0.621633,-0.002237,0.005962,0.0,0.0,0.0,-0.0,0.022866,0.0,0.0,0.0,0.192622
vdem_pubcorr,-0.149048,0.01543,-0.015988,-0.03177,-0.0,-0.017761,0.006877,-0.093146,-0.0,0.050207,-0.0,-0.071347
wbgi_cce,0.990559,-0.068077,0.005227,0.0,0.0,0.0,-0.01897,0.022868,0.01406,0.030485,0.0,-0.0


Unnamed: 0,wbgi_rle,wbgi_pvn,wbgi_pve,ihme_lifexp_0104t,wbgi_rqe,vdem_egal,wbgi_pvs,vdem_egaldem,ihme_lifexp_0104m,wbgi_vae,vdem_polyarchy,vdem_liberal
bci_bci,-0.671362,0.071729,0.016213,0.0,-0.006481,-0.0,0.039312,-0.0,-0.03288,0.0,0.146756,0.015268
ti_cpi,0.172495,-0.147292,0.012774,-0.047828,-0.007854,0.0,-0.419036,-0.034037,0.060628,0.081096,-0.016959,-0.0
vdem_corr,-0.308819,0.054808,-0.019774,0.0,0.00802,0.0,0.045311,-0.245962,0.0,0.080874,0.0972,-0.139233
vdem_execorr,-0.310205,0.005999,-0.005807,-0.0,0.011536,-0.015192,0.00954,-0.212362,-0.016196,0.13021,0.049706,-0.233249
vdem_jucorrdc,0.735383,-0.002646,0.007052,0.0,0.0,0.0,-0.0,0.02705,0.0,0.0,0.0,0.227869
vdem_pubcorr,-0.330064,0.034169,-0.035404,-0.070354,-0.0,-0.039332,0.015229,-0.206268,-0.0,0.111182,-0.0,-0.157997
wbgi_cce,0.861172,-0.059184,0.004544,0.0,0.0,0.0,-0.016492,0.019881,0.012223,0.026503,0.0,-0.0
