# Variable dependiente: Monto Corrupción Amplia

## 1. Load data and Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as  pd, numpy as np

In [3]:
# Cargar datos
path = r'..\..\output\data_preprocess\dfs_0_i_mca.csv'
data = pd.read_csv( path )

In [4]:
# Borrar columnas con missing values
data = data.dropna( axis = 1 )

In [5]:
data.shape

(697, 1059)

In [6]:
data

Unnamed: 0,tejgfun_ct05pgercon,dfgpimpiafun_ct05pgercon,devppimfun_ct05pgercon,dfgdevpiagfun_ct05opseg,devppimfun_ct05trab,tejgfun_ct05come,dfgpimpiafun_ct05come,dfgdevpiagfun_ct05turi,piagfun_ct05agro,pimgfun_ct05agro,...,tejgtotfun_f5viv,tdvgtotfun_f5viv,dfgpimpiatotfun_f5viv,devppimtotfun_f5viv,tipo_control,corrup_intensa,corrup_amplia,per_corrup1,per_corrup2,monto_corrup2
0,12.597107,0.089871,4.562103,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,2.0,1.0,1.0,1.791759,0.000000,15.851884
1,13.022700,0.229834,4.425361,0.000000,0.000000,0.000000,0.000000,0.006500,0.000000,0.000000,...,9.648660,9.648660,0.022878,4.230487,2.0,1.0,1.0,1.386294,0.000000,12.060607
2,12.158430,0.022234,4.444100,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8.699681,...,9.159152,9.159152,0.009500,4.615121,1.0,1.0,1.0,1.609438,0.000000,15.537098
3,14.141373,0.583070,4.384931,-0.057000,0.000000,0.000000,0.000000,0.010000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,1.0,1.0,1.0,2.079442,0.000000,15.031823
4,0.000000,0.187666,4.526504,0.245655,4.510813,0.000000,0.000000,0.000000,9.680406,0.000000,...,0.000000,12.583653,0.659707,3.811629,2.0,1.0,1.0,2.197225,1.609438,12.206308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
692,16.263140,3.943425,4.414352,0.892159,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,12.402012,12.402012,-0.701920,4.615106,2.0,0.0,1.0,0.000000,1.098612,19.670970
693,14.255436,0.244669,4.458600,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,11.852970,...,0.000000,0.000000,0.000000,0.000000,2.0,0.0,1.0,0.000000,2.197225,13.223206
694,14.991867,-0.004951,4.301544,-0.164810,0.000000,15.505672,0.607571,0.000000,0.000000,0.000000,...,12.350079,12.350079,0.050389,4.613931,2.0,0.0,0.0,0.000000,0.000000,0.000000
695,16.122050,3.888989,4.479585,0.398834,0.000000,0.000000,0.000000,0.053501,0.000000,0.000000,...,14.455608,14.455183,2.545754,4.323754,2.0,1.0,1.0,2.833213,0.000000,16.768403


## 2. Split variables

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
dep_var = [ 'monto_corrup2' ]
other_vars = [ 'monto_examinado', 'monto_auditado', 'monto_objeto_servicio', 
               'monto_corrup1', 'corrup_intensa', 'tipo_control', 'corrup_amplia',
               'per_corrup1', 'per_corrup2', '_monto', 'monto_' ]
pred_vars = [ col for col in data.columns if col not in dep_var and col not in other_vars ]

x_train, x_test, y_train, y_test = train_test_split( data[ pred_vars ], data[ 'monto_corrup2' ], test_size = 0.25 )

## 3. Random Forest

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score

In [13]:
# Random Forest Regression
rf_model = RandomForestRegressor( random_state = 0 )

# Parameters 
params = {
    'n_estimators': [ 500, 1000, 2000 ],
    'min_samples_split': [ 400, 200 ],
    'min_samples_leaf': [ 50 ]}

# K-Fold Cross Validation
k = 5

rf_search = GridSearchCV( estimator = rf_model, 
                          param_grid = params, 
                          cv = k,
                          scoring = 'r2',
                          return_train_score = True,
                          n_jobs = 5, 
                          verbose = 10 )

# Fit to data
rf_search.fit( x_train, y_train )

# Print best params and best score
print( rf_search.best_params_ )

# Select best params
rf_n_estimators = rf_search.best_params_[ 'n_estimators' ] 
rf_min_samples_split = rf_search.best_params_[ 'min_samples_split' ] 
rf_min_samples_leaf = rf_search.best_params_[ 'min_samples_leaf' ] 

Fitting 5 folds for each of 6 candidates, totalling 30 fits
{'min_samples_leaf': 50, 'min_samples_split': 200, 'n_estimators': 2000}


In [15]:
# Train the optimal model
rf_optimal_model = RandomForestRegressor( n_estimators = rf_n_estimators,
                                          min_samples_split = rf_min_samples_split, 
                                          min_samples_leaf = rf_min_samples_leaf )
rf_optimal_model.fit( x_train, y_train )

# Apply over test set
y_rf_pred = rf_optimal_model.predict( x_test )

In [19]:
# Calculating metrics

r2_rf = r2_score( y_test, y_rf_pred )
mse_rf = mean_squared_error( y_test, y_rf_pred )
evs_rf = explained_variance_score( y_test, y_rf_pred )

In [23]:
r2_rf

-0.026002646631369197

In [24]:
mse_rf

34.657539788010126

In [25]:
evs_rf

-0.008000810963804561