# Variable dependiente: Corrupción Amplia (dicotómica)

## 1. Load data and Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd, numpy as np

In [3]:
# Cargar datos
path = r'..\..\output\data_preprocess\dfs_0_i_ca.csv'
data = pd.read_csv( path )

In [4]:
# Borrar columnas con missing values
data = data.dropna( axis = 1 )

In [70]:
data[ 'corrup_amplia' ].value_counts()

1.0    582
0.0     70
Name: corrup_amplia, dtype: int64

## 2. Split variables

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
dep_var = [ 'corrup_amplia' ]
other_vars = [ 'monto_examinado', 'monto_objeto_servicio', 
               'corrup_intensa', 'per_corrup1', 'per_corrup2' ]
pred_vars = [ col for col in data.columns if col not in dep_var and col not in other_vars ]

x_train, x_test, y_train, y_test = train_test_split( data[ pred_vars ], data[ 'corrup_amplia' ], test_size = 0.25 )

## 3. Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss

In [8]:
%%time

# Implementing the model
lg_model = LogisticRegression().fit( x_train, y_train )

# Predict over test set
y_lg_pred = lg_model.predict( x_test )

# Calculating metrics
accuracy_lg = accuracy_score( y_test, y_lg_pred )
log_loss_lg = log_loss( y_test, y_lg_pred )

Wall time: 50.1 ms


## 4. Regularization Methods (Lasso, Ridge and Elastic Net)

In [9]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV

## 4.1. Lasso

In [10]:
%%time

# Implementing the model
lasso_model = LogisticRegressionCV( penalty = 'l1', solver = 'saga', cv = 10, random_state = 0 ).fit( x_train, y_train )

# Predict over test set
y_lasso_pred = lasso_model.predict( x_test )

# Calculating metrics
accuracy_lasso = accuracy_score( y_test, y_lasso_pred )
log_loss_lasso = log_loss( y_test, y_lasso_pred )

Wall time: 46.9 s


## 4.2. Ridge

In [11]:
%%time

# Implementing the model
ridge_model = LogisticRegressionCV( penalty = 'l2', solver = 'saga', cv = 10, random_state = 0 ).fit( x_train, y_train )

# Predict over test set
y_ridge_pred = ridge_model.predict( x_test )

# Calculating metrics
accuracy_ridge = accuracy_score( y_test, y_ridge_pred )
log_loss_ridge = log_loss( y_test, y_ridge_pred )

Wall time: 33.8 s


### 4.3. Elastic Net

In [12]:
%%time

# Implementing the model
elasticnet_model = LogisticRegressionCV( penalty = 'elasticnet', solver = 'saga', cv = 10, random_state = 0, l1_ratios = [ 0.5 ] ).\
                                  fit( x_train, y_train )

# Predict over test set
y_elasticnet_pred = elasticnet_model.predict( x_test )

# Calculating metrics
accuracy_elasticnet = accuracy_score( y_test, y_elasticnet_pred )
log_loss_elasticnet = log_loss( y_test, y_elasticnet_pred )

Wall time: 40.8 s


## 5. Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [14]:
%%time

# Set the model
rf_model = RandomForestClassifier( random_state = 0 )

# Define param grid
rf_param_grid = {
    'n_estimators': [ 500, 1000, 2000 ],
    'max_features': [ 'auto', 'sqrt', 'log2' ]
}

# Grid search
rf_search = GridSearchCV( estimator = rf_model,
                          param_grid = rf_param_grid )

# Fit to data
rf_search.fit( x_train, y_train )

# Print best params and best score
print( rf_search.best_params_ )

{'max_features': 'log2', 'n_estimators': 2000}
Wall time: 2min 6s


In [62]:
# Train the optimal model
rf_optimal_model = RandomForestClassifier( max_features = 'log2', 
                                           n_estimators =  2000 )
rf_optimal_model.fit( x_train, y_train )

# Apply over test set
y_rf_pred = rf_optimal_model.predict( x_test )

# Guardar indicadores
accuracy_random_forest = accuracy_score( y_test, y_rf_pred )
log_loss_random_forest = log_loss( y_test, y_rf_pred )

## 6. Boosted Trees

In [16]:
from xgboost import XGBClassifier

In [24]:
%%time

# Set the model
xgb_model = XGBClassifier( use_label_encoder = False, objective = 'binary:logistic', verbosity = 0 )

# Define param grid
xgb_param_grid = {
    'n_estimators': [ 500, 1000, 2000 ],
    'learning_rate': [0.1, 0.5, 1]
}

# Grid search
xgb_search = GridSearchCV( estimator = xgb_model,
                           param_grid = xgb_param_grid )

# Fit to data
xgb_search.fit( x_train, y_train )

# Print best params and best score
print( xgb_search.best_params_ )

{'learning_rate': 0.5, 'n_estimators': 500}
Wall time: 3min 50s


In [38]:
# Train the optimal model
xgb_optimal_model = XGBClassifier( objective = 'binary:logistic', 
                                   verbosity = 0,
                                   learning_rate = 0.5, 
                                   n_estimators = 3000 )
xgb_optimal_model.fit( x_train, y_train )

# Apply over test set
y_xgb_pred = xgb_optimal_model.predict( x_test )

# Guardar indicadores
accuracy_xgboost = accuracy_score( y_test, y_xgb_pred )
log_loss_xgboost = log_loss( y_test, y_xgb_pred )

## 4. Resultados

In [29]:
table = np.zeros( (6, 2) )

In [31]:
table[ 0 ] = [ accuracy_lg, log_loss_lg ]
table[ 1 ] = [ accuracy_lasso, log_loss_lasso ]
table[ 2 ] = [ accuracy_ridge, log_loss_ridge ]
table[ 3 ] = [ accuracy_elasticnet, log_loss_elasticnet ]
table[ 4 ] = [ accuracy_random_forest, log_loss_random_forest ]
table[ 5 ] = [ accuracy_xgboost, log_loss_xgboost ]

colnames_table = [ "Acccuracy_Score", "Log_Loss" ]
rownames_table = [ "Logistic Regression", "Lasso",
                   "Ridge", "Elastic Net",
                   "Random Forest", "Boosted Trees" ]

table_pandas = pd.DataFrame( table, columns = colnames_table )
table_pandas.index = rownames_table

table_pandas = table_pandas.round(3)
table_pandas

Unnamed: 0,Acccuracy_Score,Log_Loss
Logistic Regression,0.822,6.145
Lasso,0.865,4.662
Ridge,0.865,4.662
Elastic Net,0.865,4.662
Random Forest,0.871,4.45
Boosted Trees,0.871,4.45
