# Variable dependiente: Acceso a Educación Técnica (dicotómica)

## 1. Cargar data

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as  pd, numpy as np
import variables as vb

In [3]:
# Cargar datos
path = r'..\..\output\data_preprocess\dfs_0_tec.csv'
data_original = pd.read_csv( path )

In [4]:
data_original.shape

(875, 397)

## 2. Scale only numeric vars

In [5]:
# https://stackoverflow.com/questions/38420847/apply-standardscaler-to-parts-of-a-data-set

from sklearn.preprocessing import StandardScaler

data = data_original.copy()

numeric_vars = [col for col in data.columns if col in vb.num_vars ]

cols = data[ numeric_vars ]
scaler = StandardScaler().fit( cols.values )
cols = scaler.transform( cols.values )

data[ numeric_vars ] = cols

## 3. Split variables

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
dep_var = [ 'e_educacion_tec' ]
pred_vars = [col for col in data.columns if col not in vb.dep_vars and col not in dep_var ]
x_train, x_test, y_train, y_test = train_test_split( data[ pred_vars ], data[ 'e_educacion_tec' ], test_size = 0.25 )

## 4. Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss, roc_auc_score, f1_score, classification_report

In [9]:
%%time

# Implementing the model
lg_model = LogisticRegression( max_iter = 10000 ).fit( x_train, y_train )

# Predict over test set
y_lg_pred_class = lg_model.predict( x_test )
y_lg_pred_prob = lg_model.predict_proba( x_test )[ :, 1 ]

Wall time: 370 ms


In [10]:
# Calculating metrics
columns = [ 'no', 'tec' ]
lg_report = classification_report(y_test, y_lg_pred_class, target_names = columns, output_dict = True )

lg_no_precision = lg_report[ 'no' ][ 'precision' ]
lg_no_recall = lg_report[ 'no' ][ 'recall' ]
lg_no_f1_score = lg_report[ 'no' ][ 'f1-score' ]

lg_tec_precision = lg_report[ 'tec' ][ 'precision' ]
lg_tec_recall = lg_report[ 'tec' ][ 'recall' ]
lg_tec_f1_score = lg_report[ 'tec' ][ 'f1-score' ]

accuracy_lg = accuracy_score( y_test, y_lg_pred_class )
log_loss_lg = log_loss( y_test, y_lg_pred_class )
roc_auc_lg = roc_auc_score( y_test, y_lg_pred_prob )

In [11]:
print( classification_report(y_test, y_lg_pred_class, target_names = columns ) )

              precision    recall  f1-score   support

          no       0.82      0.83      0.83       179
         tec       0.21      0.20      0.21        40

    accuracy                           0.72       219
   macro avg       0.52      0.52      0.52       219
weighted avg       0.71      0.72      0.71       219



Info:
* prod_proba according to https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html

## 5. Regularization Methods (Lasso, Ridge and Elastic Net)

In [12]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV

## 5.1. Lasso

In [13]:
%%time

# Implementing the model
lasso_model = LogisticRegressionCV( penalty = 'l1', solver = 'saga', cv = 10, random_state = 0, max_iter = 10000 ).\
                               fit( x_train, y_train )

# Predict over test set
y_lasso_pred_class = lasso_model.predict( x_test )
y_lasso_pred_prob = lasso_model.predict_proba( x_test )[ :, 1 ]

Wall time: 14min 16s


In [14]:
# Calculating metrics
columns = [ 'no', 'tec' ]
lasso_report = classification_report(y_test, y_lasso_pred_class, target_names = columns, output_dict = True )

lasso_no_precision = lasso_report[ 'no' ][ 'precision' ]
lasso_no_recall = lasso_report[ 'no' ][ 'recall' ]
lasso_no_f1_score = lasso_report[ 'no' ][ 'f1-score' ]

lasso_tec_precision = lasso_report[ 'tec' ][ 'precision' ]
lasso_tec_recall = lasso_report[ 'tec' ][ 'recall' ]
lasso_tec_f1_score = lasso_report[ 'tec' ][ 'f1-score' ]

accuracy_lasso = accuracy_score( y_test, y_lasso_pred_class )
log_loss_lasso = log_loss( y_test, y_lasso_pred_class )
roc_auc_lasso = roc_auc_score( y_test, y_lasso_pred_prob )

In [15]:
print( classification_report(y_test, y_lasso_pred_class, target_names = columns ) )

              precision    recall  f1-score   support

          no       0.82      1.00      0.90       179
         tec       0.00      0.00      0.00        40

    accuracy                           0.82       219
   macro avg       0.41      0.50      0.45       219
weighted avg       0.67      0.82      0.74       219



## 5.2. Ridge

In [16]:
%%time

# Implementing the model
ridge_model = LogisticRegressionCV( penalty = 'l2', solver = 'saga', cv = 10, random_state = 0, max_iter = 10000 ).\
                               fit( x_train, y_train )

# Predict over test set
y_ridge_pred_class = ridge_model.predict( x_test )
y_ridge_pred_prob = ridge_model.predict_proba( x_test )[ :, 1 ]

Wall time: 10min 40s


In [17]:
# Calculating metrics
columns = [ 'no', 'tec' ]
ridge_report = classification_report( y_test, y_ridge_pred_class, target_names = columns, output_dict = True )

ridge_no_precision = ridge_report[ 'no' ][ 'precision' ]
ridge_no_recall = ridge_report[ 'no' ][ 'recall' ]
ridge_no_f1_score = ridge_report[ 'no' ][ 'f1-score' ]

ridge_tec_precision = ridge_report[ 'tec' ][ 'precision' ]
ridge_tec_recall = ridge_report[ 'tec' ][ 'recall' ]
ridge_tec_f1_score = ridge_report[ 'tec' ][ 'f1-score' ]

accuracy_ridge = accuracy_score( y_test, y_ridge_pred_class )
log_loss_ridge = log_loss( y_test, y_ridge_pred_class )
roc_auc_ridge = roc_auc_score( y_test, y_ridge_pred_prob )

In [18]:
print( classification_report(y_test, y_ridge_pred_class, target_names = columns ) )

              precision    recall  f1-score   support

          no       0.82      1.00      0.90       179
         tec       0.00      0.00      0.00        40

    accuracy                           0.82       219
   macro avg       0.41      0.50      0.45       219
weighted avg       0.67      0.82      0.74       219



### 5.3. Elastic Net

In [19]:
%%time

# Implementing the model
elasticnet_model = LogisticRegressionCV( penalty = 'elasticnet', solver = 'saga', cv = 10, random_state = 0, l1_ratios = [ 0.5 ], max_iter = 10000 ).\
                                    fit( x_train, y_train )

# Predict over test set
y_elasticnet_pred_class = elasticnet_model.predict( x_test )
y_elasticnet_pred_prob = elasticnet_model.predict_proba( x_test )[ :, 1 ]

Wall time: 13min 53s


In [20]:
# Calculating metrics
columns = [ 'no', 'tec' ]
elasticnet_report = classification_report( y_test, y_elasticnet_pred_class, target_names = columns, output_dict = True )

elasticnet_no_precision = elasticnet_report[ 'no' ][ 'precision' ]
elasticnet_no_recall = elasticnet_report[ 'no' ][ 'recall' ]
elasticnet_no_f1_score = elasticnet_report[ 'no' ][ 'f1-score' ]

elasticnet_tec_precision = elasticnet_report[ 'tec' ][ 'precision' ]
elasticnet_tec_recall = elasticnet_report[ 'tec' ][ 'recall' ]
elasticnet_tec_f1_score = elasticnet_report[ 'tec' ][ 'f1-score' ]

accuracy_elasticnet = accuracy_score( y_test, y_elasticnet_pred_class )
log_loss_elasticnet = log_loss( y_test, y_elasticnet_pred_class )
roc_auc_elasticnet = roc_auc_score( y_test, y_elasticnet_pred_prob )

In [21]:
print( classification_report(y_test, y_elasticnet_pred_class, target_names = columns ) )

              precision    recall  f1-score   support

          no       0.82      1.00      0.90       179
         tec       0.00      0.00      0.00        40

    accuracy                           0.82       219
   macro avg       0.41      0.50      0.45       219
weighted avg       0.67      0.82      0.74       219



## 6. Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [23]:
%%time

# Set the model
rf_model = RandomForestClassifier( random_state = 0 )

# Define param grid
rf_param_grid = {
    'n_estimators': [ 500, 1000, 2000 ],
    'max_features': [ 'auto', 'sqrt', 'log2' ]
}

# Grid search
rf_search = GridSearchCV( estimator = rf_model,
                          param_grid = rf_param_grid )

# Fit to data
rf_search.fit( x_train, y_train )

# Print best params and best score
print( rf_search.best_params_ )

# Select best params
rf_max_features = rf_search.best_params_[ 'max_features' ] 
rf_n_estimators = rf_search.best_params_[ 'n_estimators' ] 

{'max_features': 'log2', 'n_estimators': 500}
Wall time: 4min 38s


In [24]:
# Train the optimal model
rf_optimal_model = RandomForestClassifier( max_features = rf_max_features, 
                                           n_estimators = rf_n_estimators )
rf_optimal_model.fit( x_train, y_train )

# Apply over test set
y_rf_pred_class = rf_optimal_model.predict( x_test )
y_rf_pred_prob = rf_optimal_model.predict_proba( x_test )[ :, 1 ]

In [25]:
# Calculating metrics
columns = [ 'no', 'tec' ]
rf_report = classification_report(y_test, y_rf_pred_class, target_names = columns, output_dict = True )

rf_no_precision = rf_report[ 'no' ][ 'precision' ]
rf_no_recall = rf_report[ 'no' ][ 'recall' ]
rf_no_f1_score = rf_report[ 'no' ][ 'f1-score' ]

rf_tec_precision = rf_report[ 'tec' ][ 'precision' ]
rf_tec_recall = rf_report[ 'tec' ][ 'recall' ]
rf_tec_f1_score = rf_report[ 'tec' ][ 'f1-score' ]

accuracy_rf = accuracy_score( y_test, y_rf_pred_class )
log_loss_rf = log_loss( y_test, y_rf_pred_class )
roc_auc_rf = roc_auc_score( y_test, y_rf_pred_prob )

In [26]:
print( classification_report(y_test, y_rf_pred_class, target_names = columns ) )

              precision    recall  f1-score   support

          no       0.82      0.99      0.90       179
         tec       0.67      0.05      0.09        40

    accuracy                           0.82       219
   macro avg       0.75      0.52      0.50       219
weighted avg       0.80      0.82      0.75       219



## 7. Boosted Trees

In [27]:
from xgboost import XGBClassifier

In [28]:
%%time

# Set the model
xgb_model = XGBClassifier( use_label_encoder = False, objective = 'binary:logistic', verbosity = 0 )

# Define param grid
xgb_param_grid = {
    'n_estimators': [ 500, 1000, 2000 ],
    'learning_rate': [0.1, 0.5, 1]
}

# Grid search
xgb_search = GridSearchCV( estimator = xgb_model,
                           param_grid = xgb_param_grid )

# Fit to data
xgb_search.fit( x_train, y_train )

# Print best params and best score
print( xgb_search.best_params_ )

# Select best params
xgb_learning_rate = xgb_search.best_params_[ 'learning_rate' ] 
xgb_n_estimators = xgb_search.best_params_[ 'n_estimators' ] 

{'learning_rate': 0.5, 'n_estimators': 500}
Wall time: 4min 39s


In [29]:
# Train the optimal model
xgb_optimal_model = XGBClassifier( objective = 'binary:logistic', 
                                   verbosity = 0,
                                   learning_rate = xgb_learning_rate, 
                                   n_estimators = xgb_n_estimators )
xgb_optimal_model.fit( x_train, y_train )

# Apply over test set
y_xgb_pred_class = xgb_optimal_model.predict( x_test )
y_xgb_pred_prob = xgb_optimal_model.predict_proba( x_test )[ :, 1 ]

In [30]:
# Calculating metrics
columns = [ 'no', 'tec' ]
xgb_report = classification_report(y_test, y_xgb_pred_class, target_names = columns, output_dict = True )

xgb_no_precision = xgb_report[ 'no' ][ 'precision' ]
xgb_no_recall = xgb_report[ 'no' ][ 'recall' ]
xgb_no_f1_score = xgb_report[ 'no' ][ 'f1-score' ]

xgb_tec_precision = xgb_report[ 'tec' ][ 'precision' ]
xgb_tec_recall = xgb_report[ 'tec' ][ 'recall' ]
xgb_tec_f1_score = xgb_report[ 'tec' ][ 'f1-score' ]

accuracy_xgb = accuracy_score( y_test, y_xgb_pred_class )
log_loss_xgb = log_loss( y_test, y_xgb_pred_class )
roc_auc_xgb = roc_auc_score( y_test, y_xgb_pred_prob )

## 7. Resultados

In [31]:
table = np.zeros( ( 6, 8 ) )

table[ 0 ] = [ accuracy_lg, roc_auc_lg, lg_no_precision, lg_no_recall, 
               lg_no_f1_score, lg_tec_precision, lg_tec_recall, lg_tec_f1_score ]

table[ 1 ] = [ accuracy_lasso, roc_auc_lasso, lasso_no_precision, lasso_no_recall, 
               lasso_no_f1_score, lasso_tec_precision, lasso_tec_recall, lasso_tec_f1_score ]

table[ 2 ] = [ accuracy_ridge, roc_auc_ridge, ridge_no_precision, ridge_no_recall, 
               ridge_no_f1_score, ridge_tec_precision, ridge_tec_recall, ridge_tec_f1_score ]

table[ 3 ] = [ accuracy_elasticnet, roc_auc_elasticnet, elasticnet_no_precision, elasticnet_no_recall, 
               elasticnet_no_f1_score, elasticnet_tec_precision, elasticnet_tec_recall, elasticnet_tec_f1_score ]

table[ 4 ] = [ accuracy_rf, roc_auc_rf, rf_no_precision, rf_no_recall, 
               rf_no_f1_score, rf_tec_precision, rf_tec_recall, rf_tec_f1_score ]

table[ 5 ] = [ accuracy_xgb, roc_auc_xgb, xgb_no_precision, xgb_no_recall, 
               xgb_no_f1_score, xgb_tec_precision, xgb_tec_recall, xgb_tec_f1_score ]

colnames_table = [ "Overall_Accuracy", "Roc_Auc", "Ninguna_Precision", "Ninguna_Recall",
                   "Ninguna_F1_Score", "Tec_Precision", "Tec_Recall", "Tec_F1_Score" ]
                  
rownames_table = [ "Logistic Regression", "Lasso",
                   "Ridge", "Elastic Net",
                   "Random Forest", "Boosted Trees" ]

table_pandas = pd.DataFrame( table, columns = colnames_table )
table_pandas.index = rownames_table

table_pandas = table_pandas.round(3)
table_pandas

Unnamed: 0,Overall_Accuracy,Roc_Auc,Ninguna_Precision,Ninguna_Recall,Ninguna_F1_Score,Tec_Precision,Tec_Recall,Tec_F1_Score
Logistic Regression,0.717,0.58,0.823,0.832,0.828,0.211,0.2,0.205
Lasso,0.817,0.5,0.817,1.0,0.899,0.0,0.0,0.0
Ridge,0.817,0.553,0.817,1.0,0.899,0.0,0.0,0.0
Elastic Net,0.817,0.5,0.817,1.0,0.899,0.0,0.0,0.0
Random Forest,0.822,0.556,0.824,0.994,0.901,0.667,0.05,0.093
Boosted Trees,0.795,0.524,0.822,0.955,0.884,0.273,0.075,0.118


## 8. Feature map

In [32]:
# Random Forest
fp_randomforest = pd.Series( rf_optimal_model.feature_importances_, index = pred_vars).\
                  sort_values( ascending = False )
fp_randomforest.head(10)

c_g05hd_       0.013945
e_p311t1_      0.013654
c_gru31hd_     0.013523
c_gru34hd_     0.013119
e_p311b_1_     0.013094
c_gru81hd_     0.013012
e_p311b_7_     0.012936
c_gashog2d_    0.012861
c_ingindhd_    0.012589
c_gru21hd_     0.012527
dtype: float64

In [33]:
# Boosted Trees
fp_xgboost = pd.Series( xgb_optimal_model.feature_importances_, index = pred_vars).\
           sort_values( ascending = False )
fp_xgboost.head(10)

e_p4025_       0.057144
j_p556t1_      0.045508
c_isecauhd_    0.040821
j_p558f_       0.032994
c_insedthd_    0.027649
c_paesechd_    0.025286
e_p4031_       0.023634
e_p40314_      0.023363
e_p558c_       0.023209
e_p312t1_      0.023059
dtype: float32