In [1]:
# Data manipulation imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# SciKit Learn Processing/ Modeling Imports
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.decomposition import PCA

## Import Data
Import data, train-test split, and begin modeling

In [2]:
gov = pd.read_csv('gov.csv')
gov.head()

Unnamed: 0,country,population,area_square_km,density_square_km,average_years_of_school,auto_demo_polity,gdp_per_capita,happiness_index,freedom_index,less_corruption_index,life_expectancy_years,inequality_gini,median_household_income_dollars,median_per_capita_income_dollars,median_individual_income_dollars
0,China,1444216.107,9706961,148.7815,2.77,-7.0,12009.443,5.14,0.51697,0.02781,75.928,38.5,6180.0,1786.0,10410.0
1,India,1393409.038,3287590,423.8391,1.24,9.0,5722.5186,4.565,0.39786,0.08492,68.607,35.7,3168.0,616.0,2130.0
2,United States,332915.073,9372610,35.52,10.61,10.0,53533.281,7.119,0.54604,0.1589,78.91,41.1,43585.0,15480.0,65760.0
3,Indonesia,276361.783,1904569,145.1046,2.26,9.0,9781.207,5.399,0.46611,0.0,70.768,37.8,2199.0,541.0,4050.0
4,Pakistan,225199.937,881912,255.3542,1.31,7.0,4815.9243,5.194,0.12102,0.10464,66.577,33.5,4060.0,480.0,1530.0


In [3]:
# Set X and y
X = gov.drop(columns = ['country', 'less_corruption_index'], axis = 1) #drop identifier and target
y = gov['less_corruption_index']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [4]:
# Scale data for modeling
ss = StandardScaler()

Z_train = ss.fit_transform(X_train)
Z_test = ss.transform(X_test)
Z_train = pd.DataFrame(Z_train, columns = X.columns)
Z_test = pd.DataFrame(Z_test, columns = X.columns)

Z_train.head() #to view that the transformation worked

Unnamed: 0,population,area_square_km,density_square_km,average_years_of_school,auto_demo_polity,gdp_per_capita,happiness_index,freedom_index,life_expectancy_years,inequality_gini,median_household_income_dollars,median_per_capita_income_dollars,median_individual_income_dollars
0,-0.322188,-0.416963,-0.165011,-0.174481,0.676517,-0.388489,-0.447857,-0.498715,0.674178,-0.601697,-0.364835,-0.366299,-0.673335
1,-0.280144,-0.417364,0.282065,-1.126251,-0.891519,-0.902056,-0.822614,-1.20198,-1.353181,0.436937,-0.709376,-0.626329,-0.350771
2,-0.278232,-0.362768,-0.203188,-1.086759,0.328064,-0.41708,-0.63481,-1.08736,0.399819,-0.654286,-0.240533,-0.418509,-0.350771
3,-0.312548,-0.319826,-0.28098,2.740067,0.850743,0.8758,1.529602,1.255405,1.155316,-0.207279,1.760649,1.723122,1.261537
4,-0.249361,-0.317265,-0.214983,0.046677,-0.020388,-0.449549,0.415529,0.29989,0.428651,1.002269,-0.399146,-0.411167,-0.629913


## Modeling
Fit 8 different models with default parameters and then tune the models with the best performing testing scores

In [5]:
np.random.seed(1) #set random seed

# Linear regression model
lr = LinearRegression().fit(Z_train, y_train)
lr_preds_train, lr_preds_test = lr.predict(Z_train), lr.predict(Z_test)

# Ridge model
ridge = Ridge().fit(Z_train, y_train)
ridge_preds_train, ridge_preds_test = ridge.predict(Z_train), ridge.predict(Z_test)

# Lasso model
lasso = Lasso().fit(Z_train, y_train)
lasso_preds_train, lasso_preds_test = lasso.predict(Z_train), lasso.predict(Z_test)

# Random Forest model
rf = RandomForestRegressor(random_state = 1).fit(Z_train, y_train) #set random_state for reprodcucability
rf_preds_train, rf_preds_test = rf.predict(Z_train), rf.predict(X_test)

# KNN model
knn = KNeighborsRegressor().fit(Z_train, y_train)
knn_preds_train, knn_preds_test = knn.predict(Z_train), knn.predict(Z_test)

# Bagging model
bagging = BaggingRegressor().fit(Z_train, y_train)
bagging_preds_train, bagging_preds_test = bagging.predict(Z_train), bagging.predict(Z_test)

# SVR model
svr = SVR().fit(Z_train, y_train)
svr_preds_train, svr_preds_test = svr.predict(Z_train), svr.predict(Z_test)

# PCA model
pca = PCA(random_state = 1).fit(Z_train, y_train)
Z_pca_train, Z_pca_test = pca.transform(Z_train), pca.transform(Z_test)
Z_pca_train = pd.DataFrame(Z_pca_train, columns = X.columns)

# Fit on Z_pca_train - credit below to PCA lecture
lr_pca = LinearRegression().fit(Z_pca_train, y_train)
lr_pca_preds_train, lr_pca_Preds_test = lr_pca.predict(Z_train), lr_pca.predict(Z_test)

## Score Models
Check r squared scores for each of the base models - cross validated scores were very low so not used as I think they are an unfair representation of what is happening. Start by figuring out which model works best with principal component analysis:

In [6]:
# Tuned PCA model gridsearch
pca_params = {
    'n_components' : [2, 5, None],
    'copy' : [True, False],
    'random_state' : [1],
    'tol' : [0.0, .1, 1]
}

pca_tuned = GridSearchCV(pca,
                         pca_params,
                         cv = 5,
                         verbose = 1)

pca_tuned = pca_tuned.fit(X_train, y_train)
Z_pca_tuned_train = pca_tuned.transform(Z_train)
Z_pca_tuned_test = pca_tuned.transform(Z_test)

#convert to df
Z_pca_tuned_train = pd.DataFrame(Z_pca_tuned_train, columns = X.columns)
Z_pca_tuned_test = pd.DataFrame(Z_pca_tuned_test, columns = X.columns)

# Fit on Z_train - credit below to GA PCA lecture
lr_pca_tuned = LinearRegression().fit(Z_pca_tuned_train, y_train)
lasso_pca_tuned = Lasso().fit(Z_pca_tuned_train, y_train)
ridge_pca_tuned = Ridge().fit(Z_pca_tuned_train, y_train)
rf_pca_tuned = RandomForestRegressor().fit(Z_pca_tuned_train, y_train)
knn_pca_tuned = KNeighborsRegressor().fit(Z_pca_tuned_train, y_train)
bagging_pca_tuned = BaggingRegressor().fit(Z_pca_tuned_train, y_train)
svr_pca_tuned = SVR().fit(Z_pca_tuned_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [7]:
# Tuned pca scoring - linear regression
print(f'Lr pca tuned training score: {round(lr_pca_tuned.score(Z_pca_tuned_train, y_train), 4)}')
print(f'Lr pca tuned testing score: {round(lr_pca_tuned.score(Z_pca_tuned_test, y_test), 4)} \n')

# Pca scoring - lasso scoring
print(f'Lasso pca tuned training score: {round(lasso_pca_tuned.score(Z_pca_tuned_train, y_train), 4)}')
print(f'Lasso pca tuned testing score: {round(lasso_pca_tuned.score(Z_pca_tuned_test, y_test), 4)} \n')

# Pca scoring - ridge scoring
print(f'Ridge pca tuned training score: {round(ridge_pca_tuned.score(Z_pca_tuned_train, y_train), 4)}')
print(f'Ridge pca tuned testing score: {round(ridge_pca_tuned.score(Z_pca_tuned_test, y_test), 4)} \n')

# Pca scoring - random forest
print(f'Rf pca tuned training score: {round(rf_pca_tuned.score(Z_pca_tuned_train, y_train), 4)}')
print(f'Rf pca tuned testing score: {round(rf_pca_tuned.score(Z_pca_tuned_test, y_test), 4)} \n')

# Pca scoring - knn
print(f'Knn pca tuned training score: {round(knn_pca_tuned.score(Z_pca_tuned_train, y_train), 4)}')
print(f'Knn pca tuned testing score: {round(knn_pca_tuned.score(Z_pca_tuned_test, y_test), 4)} \n')

# Pca scoring - bagging
print(f'Bagging pca tuned training score: {round(bagging_pca_tuned.score(Z_pca_tuned_train, y_train), 4)}')
print(f'Bagging pca tuned testing score: {round(bagging_pca_tuned.score(Z_pca_tuned_test, y_test), 4)} \n')

# Pca scoring - svr
print(f'Svr pca tuned training score: {round(svr_pca_tuned.score(Z_pca_tuned_train, y_train), 4)}')
print(f'Svr pca tuned testing score: {round(svr_pca_tuned.score(Z_pca_tuned_test, y_test), 4)}')

Lr pca tuned training score: 0.6263
Lr pca tuned testing score: 0.429 

Lasso pca tuned training score: 0.0
Lasso pca tuned testing score: -0.0225 

Ridge pca tuned training score: 0.6262
Ridge pca tuned testing score: 0.4269 

Rf pca tuned training score: 0.9261
Rf pca tuned testing score: 0.4617 

Knn pca tuned training score: 0.6329
Knn pca tuned testing score: 0.1443 

Bagging pca tuned training score: 0.8724
Bagging pca tuned testing score: 0.3612 

Svr pca tuned training score: -0.001
Svr pca tuned testing score: -0.0134


Random forest turns out to be the best base model for pca so use below:

In [8]:
# Linear regression scoring
lr_training_score = metrics.r2_score(y_train, lr_preds_train)
lr_testing_score = metrics.r2_score(y_test, lr_preds_test)
print(f'Lr training score: {lr_training_score}')
print(f'Lr testing score: {lr_testing_score} \n')

# Lasso scoring
lasso_training_score = metrics.r2_score(y_train, lasso_preds_train)
lasso_testing_score = metrics.r2_score(y_test, lasso_preds_test)
print(f'Lasso training score: {lasso_training_score}')
print(f'Lasso testing score: {lasso_testing_score} \n')

# Ridge scoring
ridge_training_score = metrics.r2_score(y_train, ridge_preds_train)
ridge_testing_score = metrics.r2_score(y_test, ridge_preds_test)
print(f'Ridge training score: {ridge_training_score}')
print(f'Ridge testing score: {ridge_testing_score} \n')

# Random forest scoring
rf_training_score = metrics.r2_score(y_train, rf_preds_train)
rf_testing_score = metrics.r2_score(y_test, rf_preds_test)
print(f'Rf training score: {rf_training_score}')
print(f'Rf testing score: {rf_testing_score} \n')

# Knn scoring
knn_training_score = metrics.r2_score(y_train, knn_preds_train)
knn_testing_score = metrics.r2_score(y_test, knn_preds_test)
print(f'Knn training score: {knn_training_score}')
print(f'Knn testing score: {knn_testing_score} \n')

# Bagging scoring
bagging_training_score = metrics.r2_score(y_train, bagging_preds_train)
bagging_testing_score = metrics.r2_score(y_test, bagging_preds_test)
print(f'Bagging training score: {bagging_training_score}')
print(f'Bagging testing score: {bagging_testing_score} \n')

# Svr scoring
svr_training_score = metrics.r2_score(y_train, svr_preds_train)
svr_testing_score = metrics.r2_score(y_test, svr_preds_test)
print(f'Svr training score: {svr_training_score}')
print(f'Svr testing score: {svr_testing_score} \n')

# Pca scoring
print(f'Rf pca tuned training score: {round(rf_pca_tuned.score(Z_pca_tuned_train, y_train), 4)}')
print(f'Rf pca tuned testing score: {round(rf_pca_tuned.score(Z_pca_tuned_test, y_test), 4)} \n')

Lr training score: 0.6263095320540817
Lr testing score: 0.4290331150363158 

Lasso training score: 0.0
Lasso testing score: -0.022450152580491656 

Ridge training score: 0.6262304029964222
Ridge testing score: 0.4269432645470165 

Rf training score: 0.9257112853994439
Rf testing score: -2.3997258440104674 

Knn training score: 0.6328655590543724
Knn testing score: 0.14426616982842 

Bagging training score: 0.9029603027750391
Bagging testing score: 0.452357415035469 

Svr training score: 0.6728948602561876
Svr testing score: 0.29947197676639625 

Rf pca tuned training score: 0.9261
Rf pca tuned testing score: 0.4617 



## Preliminary Model Perfromance
The three best testing data models were PCA with rf (.461), linear regression (.429), and Ridge (.427). Below parameters are tuned for optimal performance

In [9]:
# Tuned linear regression model gridsearch
lr_params = {
    'fit_intercept' : [True, False],
}

lr_tuned = GridSearchCV(lr,
                        lr_params,
                        verbose = 0)

lr_tuned = lr_tuned.fit(Z_train, y_train)

In [10]:
# Score lr_tuned
lr_tuned_preds_train, lr_tuned_preds_test = lr_tuned.predict(Z_train), lr_tuned.predict(Z_test)

lr_tuned_training_score = metrics.r2_score(y_train, lr_tuned_preds_train)
lr_tuned_testing_score = metrics.r2_score(y_test, lr_tuned_preds_test)
print(f'Lr training score: {lr_training_score}')
print(f'Lr testing score: {lr_testing_score} \n')

Lr training score: 0.6263095320540817
Lr testing score: 0.4290331150363158 



No improvement over default parameters for linear regression

In [11]:
# Tuned ridge model gridsearch
ridge_params = {
    'alpha' : [.01, .1, 1, 10, 100],
    'fit_intercept' : [True, False],
    'solver' : ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

ridge_tuned = GridSearchCV(ridge,
                           ridge_params,
                           verbose = 1)

ridge_tuned = ridge_tuned.fit(Z_train, y_train)

# Score ridge_tuned
ridge_tuned_preds_train, ridge_tuned_preds_test = ridge_tuned.predict(Z_train), ridge_tuned.predict(Z_test)

ridge_tuned_training_score = metrics.r2_score(y_train, ridge_tuned_preds_train)
ridge_tuned_testing_score = metrics.r2_score(y_test, ridge_tuned_preds_test)
print(f'Ridge training score: {ridge_training_score}')
print(f'Ridge testing score: {ridge_testing_score} \n')

Fitting 5 folds for each of 70 candidates, totalling 350 fits
Ridge training score: 0.6262304029964222
Ridge testing score: 0.4269432645470165 



No improvement over default parameters for ridge eiter and the rf pca model has already been tuned so rf pca is the best model

## Conclusions
There is no improvmenet on the linear regression or ridge models after tuning so PCA with random forest is the best performing model with the highest testing score of .463 - this will be the final production model. A neural network could likely beat this accuracy but these results are satisfactory within the context of predicintg for such a dififcult target as corruption and lend themselves better to inference. Since principal component analysis does not output meaningful weights for features the best interpretation of results will be correlations from the initial EDA until there is a more involved statistical investigation which is beyond the scope of this study for right now