# Variable dependiente: Corrupción Intensa (dicotómica)

## 1. Load data and Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as  pd, numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [3]:
# Cargar datos
path = r'..\..\output\data_preprocess\dfs_0_i_base3_a.csv'
data = pd.read_csv( path )

In [4]:
# Borrar columnas con missing values
data = data.dropna( axis = 1 )

In [5]:
data.shape

(1969, 4697)

In [6]:
data[ 'corrup_intensa' ].value_counts( normalize = True )

1.0    0.576943
0.0    0.423057
Name: corrup_intensa, dtype: float64

## 2. Split variables

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
dep_var = [ 'corrup_intensa' ]
other_vars = [ 'monto_examinado', 'monto_auditado', 'monto_objeto_servicio', 
               'monto_corrup1', 'monto_corrup2', 'tipo_control', 'corrup_amplia',
               'per_corrup1', 'per_corrup2', '_monto', 'monto_' ]
pred_vars = [ col for col in data.columns if col not in dep_var and col not in other_vars ]

x_train, x_test, y_train, y_test = train_test_split( data[ pred_vars ], data[ 'corrup_intensa' ], test_size = 0.25 )

## 3. Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss, roc_auc_score
from sklearn.metrics import classification_report

In [10]:
%%time

# Implementing the model
lg_model = LogisticRegression().fit( x_train, y_train )

# Predict over test set
y_lg_pred_class = lg_model.predict( x_test )
y_lg_pred_prob = lg_model.predict_proba( x_test )[ :, 1 ]

Wall time: 1.42 s


In [11]:
# Calculating metrics
columns = [ 'no', 'si' ]
lg_report = classification_report(y_test, y_lg_pred_class, target_names = columns, output_dict = True )

lg_no_precision = lg_report[ 'no' ][ 'precision' ]
lg_no_recall = lg_report[ 'no' ][ 'recall' ]
lg_no_f1_score = lg_report[ 'no' ][ 'f1-score' ]

lg_si_precision = lg_report[ 'si' ][ 'precision' ]
lg_si_recall = lg_report[ 'si' ][ 'recall' ]
lg_si_f1_score = lg_report[ 'si' ][ 'f1-score' ]

accuracy_lg = accuracy_score( y_test, y_lg_pred_class )
log_loss_lg = log_loss( y_test, y_lg_pred_class )
roc_auc_lg = roc_auc_score( y_test, y_lg_pred_prob )

In [12]:
accuracy_lg

0.6531440162271805

## 4. Regularization Methods (Lasso, Ridge and Elastic Net)

In [13]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV

## 4.1. Lasso

In [14]:
%%time

# Implementing the model
lasso_model = LogisticRegressionCV( penalty = 'l1', solver = 'saga', cv = 10, random_state = 0 ).fit( x_train, y_train )

# Predict over test set
y_lasso_pred_class = lasso_model.predict( x_test )
y_lasso_pred_prob = lasso_model.predict_proba( x_test )[ :, 1 ]

Wall time: 36min 53s


In [15]:
# Calculating metrics
columns = [ 'no', 'si' ]
lasso_report = classification_report(y_test, y_lasso_pred_class, target_names = columns, output_dict = True )

lasso_no_precision = lasso_report[ 'no' ][ 'precision' ]
lasso_no_recall = lasso_report[ 'no' ][ 'recall' ]
lasso_no_f1_score = lasso_report[ 'no' ][ 'f1-score' ]

lasso_si_precision = lasso_report[ 'si' ][ 'precision' ]
lasso_si_recall = lasso_report[ 'si' ][ 'recall' ]
lasso_si_f1_score = lasso_report[ 'si' ][ 'f1-score' ]

accuracy_lasso = accuracy_score( y_test, y_lasso_pred_class )
log_loss_lasso = log_loss( y_test, y_lasso_pred_class )
roc_auc_lasso = roc_auc_score( y_test, y_lasso_pred_prob )

## 4.2. Ridge

In [16]:
%%time

# Implementing the model
ridge_model = LogisticRegressionCV( penalty = 'l2', solver = 'saga', cv = 10, random_state = 0 ).fit( x_train, y_train )

# Predict over test set
y_ridge_pred_class = ridge_model.predict( x_test )
y_ridge_pred_prob = ridge_model.predict_proba( x_test )[ :, 1 ]

Wall time: 29min 11s


In [17]:
# Calculating metrics
columns = [ 'no', 'si' ]
ridge_report = classification_report( y_test, y_ridge_pred_class, target_names = columns, output_dict = True )

ridge_no_precision = ridge_report[ 'no' ][ 'precision' ]
ridge_no_recall = ridge_report[ 'no' ][ 'recall' ]
ridge_no_f1_score = ridge_report[ 'no' ][ 'f1-score' ]

ridge_si_precision = ridge_report[ 'si' ][ 'precision' ]
ridge_si_recall = ridge_report[ 'si' ][ 'recall' ]
ridge_si_f1_score = ridge_report[ 'si' ][ 'f1-score' ]

accuracy_ridge = accuracy_score( y_test, y_ridge_pred_class )
log_loss_ridge = log_loss( y_test, y_ridge_pred_class )
roc_auc_ridge = roc_auc_score( y_test, y_ridge_pred_prob )

### 4.3. Elastic Net

In [18]:
%%time

# Implementing the model
elasticnet_model = LogisticRegressionCV( penalty = 'elasticnet', solver = 'saga', cv = 10, random_state = 0, l1_ratios = [ 0.5 ] ).\
                                  fit( x_train, y_train )

# Predict over test set
y_elasticnet_pred_class = elasticnet_model.predict( x_test )
y_elasticnet_pred_prob = elasticnet_model.predict_proba( x_test )[ :, 1 ]

Wall time: 32min 24s


In [19]:
# Calculating metrics
columns = [ 'no', 'si' ]
elasticnet_report = classification_report( y_test, y_elasticnet_pred_class, target_names = columns, output_dict = True )

elasticnet_no_precision = elasticnet_report[ 'no' ][ 'precision' ]
elasticnet_no_recall = elasticnet_report[ 'no' ][ 'recall' ]
elasticnet_no_f1_score = elasticnet_report[ 'no' ][ 'f1-score' ]

elasticnet_si_precision = elasticnet_report[ 'si' ][ 'precision' ]
elasticnet_si_recall = elasticnet_report[ 'si' ][ 'recall' ]
elasticnet_si_f1_score = elasticnet_report[ 'si' ][ 'f1-score' ]

accuracy_elasticnet = accuracy_score( y_test, y_elasticnet_pred_class )
log_loss_elasticnet = log_loss( y_test, y_elasticnet_pred_class )
roc_auc_elasticnet = roc_auc_score( y_test, y_elasticnet_pred_prob )

## 5. Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [21]:
%%time

# Set the model
rf_model = RandomForestClassifier( random_state = 0 )

# Define param grid
rf_param_grid = {
    'n_estimators': [ 500, 1000, 2000 ],
    'max_features': [ 'auto', 'sqrt', 'log2' ]
}

# Grid search
rf_search = GridSearchCV( estimator = rf_model,
                          param_grid = rf_param_grid )

# Fit to data
rf_search.fit( x_train, y_train )

# Print best params and best score
print( rf_search.best_params_ )

# Select best params
rf_max_features = rf_search.best_params_[ 'max_features' ] 
rf_n_estimators = rf_search.best_params_[ 'n_estimators' ] 

{'max_features': 'auto', 'n_estimators': 2000}
Wall time: 45min 51s


In [22]:
# Train the optimal model
rf_optimal_model = RandomForestClassifier( max_features = rf_max_features, 
                                           n_estimators = rf_n_estimators )
rf_optimal_model.fit( x_train, y_train )

# Apply over test set
y_rf_pred_class = rf_optimal_model.predict( x_test )
y_rf_pred_prob = rf_optimal_model.predict_proba( x_test )[ :, 1 ]

In [23]:
# Calculating metrics
columns = [ 'no', 'si' ]
rf_report = classification_report(y_test, y_rf_pred_class, target_names = columns, output_dict = True )

rf_no_precision = rf_report[ 'no' ][ 'precision' ]
rf_no_recall = rf_report[ 'no' ][ 'recall' ]
rf_no_f1_score = rf_report[ 'no' ][ 'f1-score' ]

rf_si_precision = rf_report[ 'si' ][ 'precision' ]
rf_si_recall = rf_report[ 'si' ][ 'recall' ]
rf_si_f1_score = rf_report[ 'si' ][ 'f1-score' ]

accuracy_rf = accuracy_score( y_test, y_rf_pred_class )
log_loss_rf = log_loss( y_test, y_rf_pred_class )
roc_auc_rf = roc_auc_score( y_test, y_rf_pred_prob )

## 6. Boosted Trees

In [30]:
from xgboost import XGBClassifier

In [31]:
%%time

# Set the model
xgb_model = XGBClassifier( use_label_encoder = False, objective = 'binary:logistic', verbosity = 0 )

# Define param grid
xgb_param_grid = {
    'n_estimators': [ 500, 1000, 2000 ],
    'learning_rate': [0.1, 0.5, 1]
}

# Grid search
xgb_search = GridSearchCV( estimator = xgb_model,
                           param_grid = xgb_param_grid )

# Fit to data
xgb_search.fit( x_train, y_train )

# Print best params and best score
print( xgb_search.best_params_ )

# Select best params
xgb_learning_rate = xgb_search.best_params_[ 'learning_rate' ] 
xgb_n_estimators = xgb_search.best_params_[ 'n_estimators' ] 

KeyboardInterrupt: 

In [32]:
# Train the optimal model
xgb_optimal_model = XGBClassifier( objective = 'binary:logistic', 
                                   verbosity = 0,
                                   learning_rate = xgb_learning_rate, 
                                   n_estimators = xgb_n_estimators )
xgb_optimal_model.fit( x_train, y_train )

# Apply over test set
y_xgb_pred_class = xgb_optimal_model.predict( x_test )
y_xgb_pred_prob = xgb_optimal_model.predict_proba( x_test )[ :, 1 ]

NameError: name 'xgb_learning_rate' is not defined

In [None]:
# Calculating metrics
columns = [ 'no', 'si' ]
xgb_report = classification_report(y_test, y_xgb_pred_class, target_names = columns, output_dict = True )

xgb_no_precision = xgb_report[ 'no' ][ 'precision' ]
xgb_no_recall = xgb_report[ 'no' ][ 'recall' ]
xgb_no_f1_score = xgb_report[ 'no' ][ 'f1-score' ]

xgb_si_precision = xgb_report[ 'si' ][ 'precision' ]
xgb_si_recall = xgb_report[ 'si' ][ 'recall' ]
xgb_si_f1_score = xgb_report[ 'si' ][ 'f1-score' ]

accuracy_xgb = accuracy_score( y_test, y_xgb_pred_class )
log_loss_xgb = log_loss( y_test, y_xgb_pred_class )
roc_auc_xgb = roc_auc_score( y_test, y_xgb_pred_prob )

## 4. Resultados

In [28]:
table = np.zeros( ( 5, 8 ) )
# table = np.zeros( ( 6, 8 ) )

table[ 0 ] = [ accuracy_lg, roc_auc_lg, lg_no_precision, lg_no_recall, 
               lg_no_f1_score, lg_si_precision, lg_si_recall, lg_si_f1_score ]

table[ 1 ] = [ accuracy_lasso, roc_auc_lasso, lasso_no_precision, lasso_no_recall, 
               lasso_no_f1_score, lasso_si_precision, lasso_si_recall, lasso_si_f1_score ]

table[ 2 ] = [ accuracy_ridge, roc_auc_ridge, ridge_no_precision, ridge_no_recall, 
               ridge_no_f1_score, ridge_si_precision, ridge_si_recall, ridge_si_f1_score ]

table[ 3 ] = [ accuracy_elasticnet, roc_auc_elasticnet, elasticnet_no_precision, elasticnet_no_recall, 
               elasticnet_no_f1_score, elasticnet_si_precision, elasticnet_si_recall, elasticnet_si_f1_score ]

table[ 4 ] = [ accuracy_rf, roc_auc_rf, rf_no_precision, rf_no_recall, 
               rf_no_f1_score, rf_si_precision, rf_si_recall, rf_si_f1_score ]

# table[ 5 ] = [ accuracy_xgb, roc_auc_xgb, xgb_no_precision, xgb_no_recall, 
               # xgb_no_f1_score, xgb_si_precision, xgb_si_recall, xgb_si_f1_score ]

colnames_table = [ "Overall_Accuracy", "Roc_Auc", "No_Precision", "No_Recall",
                   "No_F1_Score", "Si_Precision", "Si_Recall", "Si_F1_Score" ]
                  
rownames_table = [ "Logistic Regression", "Lasso",
                   "Ridge", "Elastic Net",
                   "Random Forest" ]

# rownames_table = [ "Logistic Regression", "Lasso",
                   # "Ridge", "Elastic Net",
                   # "Random Forest", "Boosted Trees" ]

table_pandas = pd.DataFrame( table, columns = colnames_table )
table_pandas.index = rownames_table

table_pandas = table_pandas.round(3)
table_pandas

Unnamed: 0,Overall_Accuracy,Roc_Auc,No_Precision,No_Recall,No_F1_Score,Si_Precision,Si_Recall,Si_F1_Score
Logistic Regression,0.653,0.719,0.606,0.575,0.59,0.686,0.713,0.699
Lasso,0.688,0.751,0.66,0.579,0.617,0.705,0.771,0.736
Ridge,0.696,0.758,0.695,0.533,0.603,0.696,0.821,0.753
Elastic Net,0.688,0.74,0.688,0.514,0.588,0.688,0.821,0.748
Random Forest,0.712,0.726,0.714,0.561,0.628,0.711,0.828,0.765


## 8. Feature Map

In [29]:
# Random Forest
fp_randomforest = pd.Series( rf_optimal_model.feature_importances_, index = pred_vars).\
                  sort_values( ascending = False )
fp_randomforest.head(10)

pimgge_r18ct05pobso    0.002336
piagge_r18ct05pobso    0.001908
pimgct_r09gstcr        0.001823
pimgge_r09ct05biser    0.001816
tdvgge_r18ct05pobso    0.001773
piagge_r09ct05biser    0.001758
tdvgfun_ct05opseg      0.001750
pimgft_redr            0.001742
pimgrb_redr            0.001739
piagft_redr            0.001657
dtype: float64

In [None]:
# Boosted Trees
fp_xgboost = pd.Series( xgb_optimal_model.feature_importances_, index = pred_vars).\
           sort_values( ascending = False )
fp_xgboost.head(10)