# Variable dependiente: Acceso a Educación Superior (dicotómica)

## 1. Cargar data

In [1]:
import pandas as  pd, numpy as np
import variables as vb

In [2]:
# Cargar datos
path = r'..\..\output\data_preprocess\dfs_1_sup.csv'
data_original = pd.read_csv( path )

In [3]:
data_original.shape

(392, 285)

## 2. Scale only numeric vars

In [4]:
# https://stackoverflow.com/questions/38420847/apply-standardscaler-to-parts-of-a-data-set

from sklearn.preprocessing import StandardScaler

data = data_original.copy()

numeric_vars = [col for col in data.columns if col in vb.num_vars ]

cols = data[ numeric_vars ]
scaler = StandardScaler().fit( cols.values )
cols = scaler.transform( cols.values )

data[ numeric_vars ] = cols

## 3. Split variables

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
dep_var = [ 'e_educacion_sup' ]
pred_vars = [col for col in data.columns if col not in vb.dep_vars and col not in dep_var ]
x_train, x_test, y_train, y_test = train_test_split( data[ pred_vars ], data[ 'e_educacion_sup' ], test_size = 0.20 )

## 4. Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss, roc_auc_score

In [8]:
%%time

# Implementing the model
lg_model = LogisticRegression( max_iter = 10000 ).fit( x_train, y_train )

# Predict over test set
y_lg_pred = lg_model.predict( x_test )

# Calculating metrics
accuracy_lg = accuracy_score( y_test, y_lg_pred )
log_loss_lg = log_loss( y_test, y_lg_pred )
roc_auc_lg = roc_auc_score( y_test, y_lg_pred )

Wall time: 114 ms


## 5. Regularization Methods (Lasso, Ridge and Elastic Net)

In [9]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV

## 5.1. Lasso

In [10]:
%%time

# Implementing the model
lasso_model = LogisticRegressionCV( penalty = 'l1', solver = 'saga', cv = 10, random_state = 0, max_iter = 10000 ).\
                               fit( x_train, y_train )

# Predict over test set
y_lasso_pred = lasso_model.predict( x_test )

# Calculating metrics
accuracy_lasso = accuracy_score( y_test, y_lasso_pred )
log_loss_lasso = log_loss( y_test, y_lasso_pred )
roc_auc_lasso = roc_auc_score( y_test, y_lasso_pred )

Wall time: 2min


## 5.2. Ridge

In [11]:
%%time

# Implementing the model
ridge_model = LogisticRegressionCV( penalty = 'l2', solver = 'saga', cv = 10, random_state = 0, max_iter = 10000 ).\
                               fit( x_train, y_train )

# Predict over test set
y_ridge_pred = ridge_model.predict( x_test )

# Calculating metrics
accuracy_ridge = accuracy_score( y_test, y_ridge_pred )
log_loss_ridge = log_loss( y_test, y_ridge_pred )
roc_auc_ridge = roc_auc_score( y_test, y_ridge_pred )

Wall time: 1min 21s


### 5.3. Elastic Net

In [12]:
%%time

# Implementing the model
elasticnet_model = LogisticRegressionCV( penalty = 'elasticnet', solver = 'saga', cv = 10, random_state = 0, l1_ratios = [ 0.5 ], max_iter = 10000 ).\
                                    fit( x_train, y_train )

# Predict over test set
y_elasticnet_pred = elasticnet_model.predict( x_test )

# Calculating metrics
accuracy_elasticnet = accuracy_score( y_test, y_elasticnet_pred )
log_loss_elasticnet = log_loss( y_test, y_elasticnet_pred )
roc_auc_elasticnet = roc_auc_score( y_test, y_elasticnet_pred )

Wall time: 1min 52s


In [13]:
roc_auc_elasticnet = roc_auc_score( y_test, y_elasticnet_pred )

## 6. Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [15]:
%%time

# Set the model
rf_model = RandomForestClassifier( random_state = 0 )

# Define param grid
rf_param_grid = {
    'n_estimators': [ 500, 1000, 2000 ],
    'max_features': [ 'auto', 'sqrt', 'log2' ]
}

# Grid search
rf_search = GridSearchCV( estimator = rf_model,
                          param_grid = rf_param_grid )

# Fit to data
rf_search.fit( x_train, y_train )

# Print best params and best score
print( rf_search.best_params_ )

{'max_features': 'auto', 'n_estimators': 500}
Wall time: 1min 35s


In [23]:
# Train the optimal model
rf_optimal_model = RandomForestClassifier( max_features = 'auto', 
                                           n_estimators =  500 )
rf_optimal_model.fit( x_train, y_train )

# Apply over test set
y_rf_pred = rf_optimal_model.predict( x_test )

# Guardar indicadores
accuracy_random_forest = accuracy_score( y_test, y_rf_pred )
log_loss_random_forest = log_loss( y_test, y_rf_pred )
roc_auc_random_forest = roc_auc_score( y_test, y_rf_pred )

## 7. Boosted Trees

In [17]:
from xgboost import XGBClassifier

  from pandas import MultiIndex, Int64Index


In [24]:
%%time

# Set the model
xgb_model = XGBClassifier( use_label_encoder = False, objective = 'binary:logistic', verbosity = 0 )

# Define param grid
xgb_param_grid = {
    'n_estimators': [ 500, 1000, 2000 ],
    'learning_rate': [0.1, 0.5, 1]
}

# Grid search
xgb_search = GridSearchCV( estimator = xgb_model,
                           param_grid = xgb_param_grid )

# Fit to data
xgb_search.fit( x_train, y_train )

# Print best params and best score
print( xgb_search.best_params_ )

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

{'learning_rate': 0.1, 'n_estimators': 2000}
Wall time: 1min 24s


In [25]:
# Train the optimal model
xgb_optimal_model = XGBClassifier( objective = 'binary:logistic', 
                                   verbosity = 0,
                                   learning_rate = 0.1, 
                                   n_estimators = 2000 )
xgb_optimal_model.fit( x_train, y_train )

# Apply over test set
y_xgb_pred = xgb_optimal_model.predict( x_test )

# Guardar indicadores
accuracy_xgboost = accuracy_score( y_test, y_xgb_pred )
log_loss_xgboost = log_loss( y_test, y_xgb_pred )
roc_auc_xgboost = roc_auc_score( y_test, y_xgb_pred )



## 7. Resultados

In [26]:
table = np.zeros( (6, 3) )

table[ 0 ] = [ accuracy_lg, log_loss_lg, roc_auc_lg ]
table[ 1 ] = [ accuracy_lasso, log_loss_lasso, roc_auc_lasso ]
table[ 2 ] = [ accuracy_ridge, log_loss_ridge, roc_auc_ridge ]
table[ 3 ] = [ accuracy_elasticnet, log_loss_elasticnet, roc_auc_elasticnet ]
table[ 4 ] = [ accuracy_random_forest, log_loss_random_forest, roc_auc_random_forest ]
table[ 5 ] = [ accuracy_xgboost, log_loss_xgboost, roc_auc_xgboost ]

colnames_table = [ "Acccuracy_Score", "Log_Loss", "Roc_Auc" ]
rownames_table = [ "Logistic Regression", "Lasso",
                   "Ridge", "Elastic Net",
                   "Random Forest", "Boosted Trees" ]

table_pandas = pd.DataFrame( table, columns = colnames_table )
table_pandas.index = rownames_table

table_pandas = table_pandas.round(3)
table_pandas

Unnamed: 0,Acccuracy_Score,Log_Loss,Roc_Auc
Logistic Regression,0.443,19.237,0.443
Lasso,0.595,13.991,0.565
Ridge,0.506,17.051,0.513
Elastic Net,0.582,14.428,0.579
Random Forest,0.557,15.302,0.55
Boosted Trees,0.481,17.925,0.476


## 8. Feature map

In [27]:
# Random Forest
fp_randomforest = pd.Series( rf_optimal_model.feature_importances_, index = pred_vars).\
                  sort_values( ascending = False )
fp_randomforest.head(10)

c_gru71hd_     0.027523
c_gru61hd_     0.024995
c_gru31hd_     0.024946
c_gru41hd_     0.023363
c_gru81hd_     0.021558
e_p311b_7_     0.019745
e_p311b_1_     0.019128
c_gru21hd_     0.019032
c_inghog2d_    0.018969
e_p311t1_      0.018703
dtype: float64

In [28]:
# Boosted Trees
fp_xgboost = pd.Series( xgb_optimal_model.feature_importances_, index = pred_vars).\
           sort_values( ascending = False )
fp_xgboost.head(10)

e_p314b_7_     0.042951
e_p547_        0.039522
j_p407g2_      0.037675
j_p5112_       0.035886
e_p414_09_     0.026080
c_insedlhd_    0.026014
e_p3121b_      0.023626
j_p5111_       0.022729
j_p558g5_      0.021999
e_p311a5_7_    0.021934
dtype: float32