In [1]:
# Data manipulation imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# SciKit Learn Processing/ Modeling Imports
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn import metrics

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.decomposition import PCA

## Import Data
Import data, train-test split, and begin modeling

In [2]:
gov = pd.read_csv('gov.csv')
gov.head()

Unnamed: 0,country,life_expectancy,gdp_per_cap,average_years_of_school,total_population,auto_demo,happiness,freedom,less_corruption
0,Afghanistan,63.377,12381.3665,0.35,34414000.0,-1.0,3.575,0.23414,0.09719
1,Albania,78.025,11228.951,3.31,2891000.0,9.0,4.959,0.35733,0.06413
2,Algeria,76.09,12077.444,0.78,39728000.0,2.0,5.605,0.28579,0.17383
3,Angola,59.398,5530.3374,3.26,27884000.0,-2.0,4.033,0.10384,0.07122
4,Argentina,76.068,16414.078,5.92,43075000.0,9.0,6.574,0.44974,0.08484


In [3]:
# Set X and y for each imputation
X = gov.drop(columns = ['country', 'less_corruption'], axis = 1) #drop identifier and target
y = gov['less_corruption']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [4]:
# Scale data for each imptation
ss = StandardScaler()

Z_train = ss.fit_transform(X_train)
Z_test = ss.transform(X_test)

## Modeling
Fit 8 different models with default parameters and then tune the models with the best performing testing scores

In [5]:
np.random.seed(1) #set random seed

# Linear regression model
lr = LinearRegression().fit(X_train, y_train)
lr_preds_train, lr_preds_test = lr.predict(X_train), lr.predict(X_test)

# Ridge model
ridge = Ridge().fit(X_train, y_train)
ridge_preds_train, ridge_preds_test = ridge.predict(X_train), ridge.predict(X_test)

# Lasso model
lasso = Lasso().fit(X_train, y_train)
lasso_preds_train, lasso_preds_test = lasso.predict(X_train), lasso.predict(X_test)

# Random Forest model
rf = RandomForestRegressor(random_state = 1).fit(X_train, y_train) #set random_state for reprodcucability
rf_preds_train, rf_preds_test = rf.predict(X_train), rf.predict(X_test)

# KNN model
knn = KNeighborsRegressor().fit(X_train, y_train)
knn_preds = knn.predict(X_test)
knn_preds_train, knn_preds_test = knn.predict(X_train), knn.predict(X_test)

# Bagging model
bagging = BaggingRegressor().fit(X_train, y_train)
bagging_preds_train, bagging_preds_test = bagging.predict(X_train), bagging.predict(X_test)

# SVR model
svr = SVR().fit(X_train, y_train)
svr_preds_train, svr_preds_test = svr.predict(X_train), svr.predict(X_test)

# PCA model
pca = PCA(random_state = 1).fit(X_train, y_train)
Z_pca_train = pca.transform(Z_train)
Z_pca_test = pca.transform(Z_test)

# Fit on Z_train - credit below to PCA lecture
lr_pca = LinearRegression().fit(Z_pca_train, y_train)

  return linalg.solve(A, Xy, sym_pos=True,


## Score Models
Check cross-validated performance and tune for the best models

In [6]:
# Linear regression scoring
lr_training_score = cross_val_score(lr, X_train, y_train, cv = 5).mean()
lr_testing_score = cross_val_score(lr, X_test, y_test, cv = 5).mean()
print(f'Lr training score: {lr_training_score}')
print(f'Lr testing score: {lr_testing_score} \n')

# Lasso scoring
lasso_training_score = cross_val_score(lasso, X_train, y_train, cv = 5).mean()
lasso_testing_score = cross_val_score(lasso, X_test, y_test, cv = 5).mean()
print(f'Lasso training score: {lasso_training_score}')
print(f'Lasso testing score: {lasso_testing_score} \n')

# Ridge scoring
ridge_training_score = cross_val_score(ridge, X_train, y_train, cv = 5).mean()
ridge_testing_score = cross_val_score(ridge, X_test, y_test, cv = 5).mean()
print(f'Ridge training score: {ridge_training_score}')
print(f'Ridge testing score: {ridge_testing_score} \n')

# Random forest scoring
rf_training_score = cross_val_score(rf, X_train, y_train, cv = 5).mean()
rf_testing_score = cross_val_score(rf, X_test, y_test, cv = 5).mean()
print(f'Rf training score: {rf_training_score}')
print(f'Rf testing score: {rf_testing_score} \n')

# Knn scoring
knn_training_score = cross_val_score(knn, X_train, y_train, cv = 5).mean()
knn_testing_score = cross_val_score(knn, X_test, y_test, cv = 5).mean()
print(f'Knn training score: {knn_training_score}')
print(f'Knn testing score: {knn_testing_score} \n')

# Bagging scoring
bagging_training_score = cross_val_score(bagging, X_train, y_train, cv = 5).mean()
bagging_testing_score = cross_val_score(bagging, X_test, y_test, cv = 5).mean()
print(f'Bagging training score: {bagging_training_score}')
print(f'Bagging testing score: {bagging_testing_score} \n')

# Svr scoring
svr_training_score = cross_val_score(svr, X_train, y_train, cv = 5).mean()
svr_testing_score = cross_val_score(svr, X_test, y_test, cv = 5).mean()
print(f'Svr training score: {svr_training_score}')
print(f'Svr testing score: {svr_testing_score} \n')

# Pca scoring
print(f'Pca training score: {round(lr_pca.score(Z_pca_train, y_train), 4)}')
print(f'Pca testing score: {round(lr_pca.score(Z_pca_test, y_test), 4)}')

Lr training score: 0.1823804019359104
Lr testing score: -1.1869449690695448 

Lasso training score: 0.09754218299594224
Lasso testing score: -0.8968066386780821 

Ridge training score: 0.16205899733188717
Ridge testing score: -1.08251461417098 



  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,


Rf training score: 0.29006186807437234
Rf testing score: -0.26355907996686473 

Knn training score: -0.5663857819850555
Knn testing score: -0.8977225052930189 

Bagging training score: 0.2597198925399785
Bagging testing score: -0.3777639341492791 

Svr training score: -0.29101650093995396
Svr testing score: -1.056734369686819 

Pca training score: 0.5443
Pca testing score: 0.2555


The models with the best performing testing scores are random forest (-.3946), bagging (-.6366), and pca (.2555). I will tune these models for a better fit

In [7]:
# Tuned random forest model gridsearch
rf_params = {
    'max_depth' : [2, 5, 8, None],
    'min_samples_split' : [2, 5, 8],
    'bootstrap': [True, False]
}

rf_tuned = GridSearchCV(rf,
                        rf_params,
                        cv = 5,
                        verbose = 1)

rf_tuned.fit(X_train, y_train)


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   11.2s finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=1),
             param_grid={'bootstrap': [True, False],
                         'max_depth': [2, 5, 8, None],
                         'min_samples_split': [2, 5, 8]},
             verbose=1)

In [8]:
# Tuned bagging model gridsearch
bagging_params = {
    'base_estimator' : [rf, None],
    'n_estimators' : [10, 100],
    'bootstrap' : [True, False]
}

bagging_tuned = GridSearchCV(bagging,
                             bagging_params,
                             cv = 5,
                             verbose = 1)

bagging_tuned.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  2.0min finished


GridSearchCV(cv=5, estimator=BaggingRegressor(),
             param_grid={'base_estimator': [RandomForestRegressor(random_state=1),
                                            None],
                         'bootstrap': [True, False],
                         'n_estimators': [10, 100]},
             verbose=1)

In [9]:
# Tuned PCA model gridsearch
pca_params = {
    'n_components' : [2, 5, None],
    'copy' : [True, False],
    'random_state' : [1],
    'tol' : [0.0, .1, 1]
}

pca_tuned = GridSearchCV(pca,
                         pca_params,
                         cv = 5,
                         verbose = 1)

pca_tuned = pca_tuned.fit(X_train, y_train)
Z_pca_tuned_train = pca_tuned.transform(Z_train)
Z_pca_tuned_test = pca_tuned.transform(Z_test)

# Fit on Z_train - credit below to PCA lecture
lr_pca_tuned = LinearRegression().fit(Z_pca_tuned_train, y_train)


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:    0.3s finished
  array_stds = np.sqrt(np.average((array -


In [10]:
# Random forest scoring
rf_tuned_training_score = cross_val_score(rf_tuned, X_train, y_train, cv = 5).mean()
rf_tuned_testing_score = cross_val_score(rf_tuned, X_test, y_test, cv = 5).mean()
print(f'Rf tuned training score: {rf_tuned_training_score}')
print(f'Rf tuned testing score: {rf_tuned_testing_score} \n')

# Bagging scoring
bagging_tuned_training_score = cross_val_score(bagging_tuned, X_train, y_train, cv = 5).mean()
bagging_tuned_testing_score = cross_val_score(bagging_tuned, X_test, y_test, cv = 5).mean()
print(f'Bagging tuned training score: {bagging_tuned_training_score}')
print(f'Bagging tuned testing score: {bagging_tuned_testing_score} \n')

# Pca scoring
print(f'Pca tuned training score: {round(lr_pca_tuned.score(Z_pca_tuned_train, y_train), 4)}')
print(f'Pca tuned testing score: {round(lr_pca_tuned.score(Z_pca_tuned_test, y_test), 4)}')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   13.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   12.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   11.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   12.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   12.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   11.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   10.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    9.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    9.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   10.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Rf tuned training score: 0.28968919736450094
Rf tuned testing score: -0.32810654809720485 

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  2.0min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  2.1min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  2.2min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  2.0min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  1.9min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  1.8min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  1.8min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  1.9min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  2.3min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Bagging tuned training score: 0.3067967000765127
Bagging tuned testing score: -0.26166997130372877 

Pca tuned training score: 0.5443
Pca tuned testing score: 0.2555


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  2.7min finished


Slight improvement on bagging and no improvement for random forest and pca - pca is the highest scoring testing model and the production model for this analysis. More detail to follow in technical write-up.