En esta notebook entrenaremos y evaluaremos diferentes modelos para predecir la variable target "survived"

In [1]:
# importo todas las librerías necesarias
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pickle

In [2]:
data='./db/titanic/train_clean.csv'
train = pd.read_csv(data ,sep='\t', encoding='utf-8')
train.head(10)

Unnamed: 0.1,Unnamed: 0,survived,age,fare,pclass_2,pclass_3,sex_male,sibsp_1,sibsp_2,sibsp_3,sibsp_4,sibsp_5,parch_1,parch_2,parch_3,parch_4,parch_5,parch_6,embarked_Q,embarked_S
0,0,0,22.0,7.25,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1
1,1,1,38.0,71.2833,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,2,1,26.0,7.925,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,3,1,35.0,53.1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
4,4,0,35.0,8.05,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1
5,6,0,54.0,51.8625,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
6,7,0,2.0,21.075,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,1
7,8,1,27.0,11.1333,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1
8,9,1,14.0,30.0708,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
9,10,1,4.0,16.7,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1


Ahora separamos en train y test el df_train para entrenar y evaluar modelos.

In [3]:
train.columns

Index(['Unnamed: 0', 'survived', 'age', 'fare', 'pclass_2', 'pclass_3',
       'sex_male', 'sibsp_1', 'sibsp_2', 'sibsp_3', 'sibsp_4', 'sibsp_5',
       'parch_1', 'parch_2', 'parch_3', 'parch_4', 'parch_5', 'parch_6',
       'embarked_Q', 'embarked_S'],
      dtype='object')

In [4]:
X = train.drop(columns=['Unnamed: 0', 'survived'],axis=1)
y = train.survived

In [5]:
y.value_counts(normalize=True)

0    0.596343
1    0.403657
Name: survived, dtype: float64

Vemos que hay una pequeño desbalanceo de clases, por lo que cuando separamos en train y test lo hacemos con un stratify para compenzar esa diferencia.

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y, random_state=42)

Comenzamos con un modelo de regresión logística.
Utilizamos skalearn para standarizar la matriz de featues.

In [7]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [54]:
clf = LogisticRegression(C=1e10)

In [57]:
params = {'C': [1, 10, 100, 1000],
     'penalty': ['l1', 'l2','None','elasticnet'],
     'solver': ['saga'],
     'max_iter':[1,10,100,1000]}
folds=StratifiedKFold(n_splits=10, random_state=19, shuffle=True)

In [58]:
grid_clf = GridSearchCV(estimator=clf, param_grid=params, scoring='accuracy', cv=folds, n_jobs=-1)

In [59]:
fit_clf = grid_clf.fit(X_train,y_train)

320 fits failed out of a total of 640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
160 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Joaquin\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Joaquin\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Joaquin\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 441, in _check_solver
    raise ValueError(
ValueError: Logistic Regression supports only penalties in ['l1', 'l2', 'elasticnet', 'none'], got No

In [12]:
fit_clf.best_params_

{'C': 10, 'penalty': 'l1', 'solver': 'saga'}

In [13]:
fit_clf.best_score_

0.8049266247379455

In [14]:
y_predict = fit_clf.predict(X_test)

Evaluamos al modelo.

In [15]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       0.84      0.81      0.83       106
           1       0.74      0.78      0.76        72

    accuracy                           0.80       178
   macro avg       0.79      0.79      0.79       178
weighted avg       0.80      0.80      0.80       178



In [16]:
confusion_matrix(y_test, y_predict)

array([[86, 20],
       [16, 56]], dtype=int64)

Guardamos el modelo con Pickle, para luego abrirlo con la notebook de predicciones.

In [17]:
model_clf = fit_clf.best_estimator_
# save
with open('./models/model_clf.pkl','wb') as f:
    pickle.dump(model_clf,f)

Ahora probaremos un modelo de k neighborns

In [18]:
k_params = {'n_neighbors': range(1,200),
     'weights' : ['uniform', 'distance'],
     'p' : [1, 2, 3]}


In [19]:
kneig = KNeighborsClassifier()

In [20]:
grid_kneig = GridSearchCV(estimator=kneig, param_grid=k_params, scoring='accuracy', cv=folds, n_jobs=-1)

In [21]:
fit_kn = grid_kneig.fit(X_train,y_train)

In [22]:
fit_kn.best_score_

0.8141509433962264

In [23]:
y_predict = fit_kn.predict(X_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Evaluamos al modelo.

In [24]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       0.75      0.90      0.82       106
           1       0.78      0.56      0.65        72

    accuracy                           0.76       178
   macro avg       0.77      0.73      0.73       178
weighted avg       0.76      0.76      0.75       178



In [25]:
confusion_matrix(y_test, y_predict)

array([[95, 11],
       [32, 40]], dtype=int64)

Guardamos el modelo con Pickle, para luego abrirlo con la notebook de predicciones.

In [26]:
model_kn = fit_kn.best_estimator_
# save
with open('./models/model_kn.pkl','wb') as f:
    pickle.dump(model_kn,f)

Ahora probaremos con modelos de clasificación (árboles)

In [27]:
num_leafs = [1, 5, 10, 20, 50, 100]
depths = np.arange(1, 21)
criterion = ['gini','entropy']

In [28]:
param = [{'criterion' :criterion,'max_depth':depths,
              'min_samples_leaf':num_leafs}]

In [29]:

tree_model = tree.DecisionTreeClassifier(random_state=1)

In [30]:
gs = GridSearchCV(estimator = tree_model, param_grid=param, scoring='accuracy', cv=folds,n_jobs=-1)

In [31]:
fit_tree = gs.fit(X_train,y_train)

In [32]:
gs.best_score_

0.8238294898672258

In [33]:
y_predict = gs.predict(X_test)

Evaluamos al modelo.

In [34]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84       106
           1       0.78      0.69      0.74        72

    accuracy                           0.80       178
   macro avg       0.79      0.78      0.79       178
weighted avg       0.80      0.80      0.80       178



In [35]:
confusion_matrix(y_test, y_predict)

array([[92, 14],
       [22, 50]], dtype=int64)

Guardamos el modelo con Pickle, para luego abrirlo con la notebook de predicciones.

In [36]:
model_tree = fit_tree.best_estimator_
# save
with open('./models/model_tree.pkl','wb') as f:
    pickle.dump(model_tree,f)

XGBBoost

In [37]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [38]:
from xgboost.sklearn import XGBClassifier
model_xg = XGBClassifier(n_jobs=-1, use_label_encoder=False)



In [39]:
params_xgboost = {  
    "n_estimators": [50,100,500],      # Number of boosted trees to fit.
    "max_depth": [1,2,3,4],            # Maximum tree depth for base learners.
    "learning_rate": [0.01, 0.1],      # Boosting learning rate (xgb’s “eta”)
}

In [40]:
xgb = GridSearchCV(model_xg, param_grid=params_xgboost, cv=folds, verbose=1, n_jobs=-1)

In [41]:
xgb.fit(X_train,y_train)



Fitting 10 folds for each of 24 candidates, totalling 240 fits


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=19, shuffle=True),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_ty...
                                     max_cat_threshold=None,
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan

In [42]:
y_predict = xgb.predict(scaler.transform(X_test))



In [43]:

y_predict.mean()

0.9831460674157303

Evaluamos al modelo.

In [44]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       0.67      0.02      0.04       106
           1       0.41      0.99      0.57        72

    accuracy                           0.41       178
   macro avg       0.54      0.50      0.31       178
weighted avg       0.56      0.41      0.25       178



In [45]:
confusion_matrix(y_test, y_predict)

array([[  2, 104],
       [  1,  71]], dtype=int64)

In [46]:
xgb.best_score_


0.820020964360587

Guardamos al modelo con Pickle.

In [47]:
model_xgb = xgb.best_estimator_
# save
with open('./models/model_xgb.pkl','wb') as f:
    pickle.dump(model_xgb,f)

LightGBM

In [48]:
pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [49]:
import lightgbm as lgb
model_lgb = lgb.LGBMClassifier()

In [50]:
params_lightgbm = {  
    "n_estimators": [50,100,500],      # Number of boosted trees to fit.
    "max_depth": [1,2,3,4],            # Maximum tree depth for base learners.
    "learning_rate": [0.01, 0.1],      # Boosting learning rate (xgb’s “eta”)
}

In [51]:
grid_lgb = GridSearchCV(model_lgb, param_grid=params_lightgbm, cv=folds, verbose=1, n_jobs=-1)

In [52]:
grid_lgb.fit(X_train,y_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=19, shuffle=True),
             estimator=LGBMClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.1],
                         'max_depth': [1, 2, 3, 4],
                         'n_estimators': [50, 100, 500]},
             verbose=1)

In [53]:
grid_lgb.best_score_

0.8144304682040531