## 1. Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
# Cargar datos
path = r'..\..\output\data_preprocess\dfs_0_i_ca.csv'
data = pd.read_csv( path )

In [3]:
# Borrar otras predictoras
data = data.drop( [ 'monto_examinado', 'monto_objeto_servicio', 'monto_corrup2', 
                    'corrup_intensa', 'per_corrup1', 'per_corrup2' ], 
                axis = 1 )

In [4]:
# Borrar columnas con missing
data = data.dropna( axis = 1 )

## 2. Random Forest

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV, train_test_split

In [6]:
dep_var = ['corrup_amplia']
pred_vars = [col for col in data.columns if col not in dep_var]

In [7]:
# Split
x_train, x_test, y_train, y_test = train_test_split(data[pred_vars], data['corrup_amplia'], test_size = 0.3)

In [8]:
%%time

model = RandomForestClassifier()
param_grid = { 
    'n_estimators': [50, 100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [2,3,4,5,6,7],
    'criterion' :['gini', 'entropy']
}

search = GridSearchCV(estimator=model,
                      param_grid=param_grid,
                      cv= 5,
                      n_jobs = 5,
                      verbose = 10)
search.fit(x_train, y_train)
print(search.best_params_)
print(search.best_score_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
{'criterion': 'gini', 'max_depth': 7, 'max_features': 'sqrt', 'n_estimators': 50}
0.8947682752030579
Wall time: 1min 6s


In [9]:
# Mostrar los mejores parametros
search.best_params_

{'criterion': 'gini',
 'max_depth': 7,
 'max_features': 'sqrt',
 'n_estimators': 50}

In [10]:
# Entrenar el modelo optimo
modelo_optimo = RandomForestClassifier(max_features='auto', 
                                       n_estimators= 200, 
                                       max_depth = 7, 
                                       criterion = 'gini')
modelo_optimo.fit(x_train, y_train)

RandomForestClassifier(max_depth=7, n_estimators=200)

In [11]:
# Conjunto de prueba
y_pred_rf = modelo_optimo.predict(x_test)

In [12]:
# Guardar indicadores
accuracy_random_forest = accuracy_score(y_test, y_pred_rf)
log_loss_random_forest = log_loss(y_test, y_pred_rf)

In [13]:
# Ver las variables que más contribuyen

pd.DataFrame(modelo_optimo.feature_importances_, index=x_train.columns).sort_values(by=0, ascending=False)

Unnamed: 0,0
tejgge_r07ct05otgst,0.017821
tejgfun_f2ct05protsoc,0.012162
piagtotfun_f1pgercon,0.010204
tejgtotfun_f5r07opseg,0.009750
tejgfun_ct05protsoc,0.008754
...,...
piagfun_f5r08ct06energia,0.000000
dfgpimpiatotfun_f4edu,0.000000
devppimfun_f5r08ct06edu,0.000000
tejgtotfun_f4energia,0.000000


## 3. XGBoost

In [14]:
from xgboost import XGBClassifier

  from pandas import MultiIndex, Int64Index


In [15]:
%%time

xgb = XGBClassifier(use_label_encoder=False)

params = {
      'n_estimators': [300, 200, 100],
      'learning_rate': [0.1, 0.5, 1],
      'max_depth': [5],
      'binary':['logistic']
}

xgb_grid_search_cv = GridSearchCV(estimator = xgb, 
                          param_grid = params, 
                          cv= 5,
                          n_jobs = 5,
                          scoring = 'roc_auc',
                          verbose = 10)

xgrid_model_result = xgb_grid_search_cv.fit(x_train, y_train) 
print(xgrid_model_result.best_params_)
print(xgrid_model_result.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Parameters: { "binary" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


{'binary': 'logistic', 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
0.7700112233445566
Wall time: 23.9 s


In [16]:
# Mostrar los mejores parametros
xgrid_model_result.best_params_

{'binary': 'logistic',
 'learning_rate': 0.1,
 'max_depth': 5,
 'n_estimators': 100}

In [17]:
# Entrenar el modelo optimo
xgb_modelo_optimo = XGBClassifier(objective= 'binary:logistic', 
                                  learning_rate = 0.1, 
                                  max_depth = 5, 
                                  n_estimators = 300)
xgb_modelo_optimo.fit(x_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=300, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [18]:
y_pred_xgb = xgb_modelo_optimo.predict(x_test)

In [19]:
accuracy_xgboost = accuracy_score(y_test, y_pred_xgb)
log_loss_xgboost = log_loss(y_test, y_pred_xgb)

## 4. Resultados

In [20]:
table = np.zeros( (2, 2) )

In [21]:
table[0,0] = accuracy_random_forest
table[1,0] = accuracy_xgboost

In [22]:
table[0,1] = log_loss_random_forest
table[1,1] = log_loss_xgboost

In [23]:
colnames_table = ["Acccuracy_Score", "Log_Loss"]
rownames_table = ["Random Forest", "XGBoost"]

table_pandas = pd.DataFrame( table, columns = colnames_table )
table_pandas.index = rownames_table

table_pandas = table_pandas.round(3)
table_pandas

Unnamed: 0,Acccuracy_Score,Log_Loss
Random Forest,0.898,3.524
XGBoost,0.893,3.701
