En esta notebook entrenaremos y evaluaremos diferentes modelos para predecir la variable target "survived"

In [1]:
# importo todas las librerías necesarias
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pickle

In [2]:
data='./db/titanic/train_clean.csv'
train = pd.read_csv(data ,sep='\t', encoding='utf-8')
train.head(10)

Unnamed: 0.1,Unnamed: 0,survived,age,fare,pclass_2,pclass_3,sex_male,sibsp_1,sibsp_2,sibsp_3,sibsp_4,sibsp_5,parch_1,parch_2,parch_3,parch_4,parch_5,parch_6,embarked_Q,embarked_S
0,0,0,22.0,7.25,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1
1,1,1,38.0,71.2833,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,2,1,26.0,7.925,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,3,1,35.0,53.1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
4,4,0,35.0,8.05,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1
5,6,0,54.0,51.8625,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
6,7,0,2.0,21.075,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,1
7,8,1,27.0,11.1333,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1
8,9,1,14.0,30.0708,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
9,10,1,4.0,16.7,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1


Ahora separamos en train y test el df_train para entrenar y evaluar modelos.

In [3]:
train.columns

Index(['Unnamed: 0', 'survived', 'age', 'fare', 'pclass_2', 'pclass_3',
       'sex_male', 'sibsp_1', 'sibsp_2', 'sibsp_3', 'sibsp_4', 'sibsp_5',
       'parch_1', 'parch_2', 'parch_3', 'parch_4', 'parch_5', 'parch_6',
       'embarked_Q', 'embarked_S'],
      dtype='object')

In [4]:
X = train.drop(columns=['Unnamed: 0', 'survived'],axis=1)
y = train.survived

In [5]:
y.value_counts(normalize=True)

0    0.596343
1    0.403657
Name: survived, dtype: float64

Vemos que hay una pequeño desbalanceo de clases, por lo que cuando separamos en train y test lo hacemos con un stratify para compenzar esa diferencia.

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y, random_state=42)

Comenzamos con un modelo de regresión logística.
Utilizamos skalearn para standarizar la matriz de featues.

In [7]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [27]:
clf = LogisticRegression(C=1e10,class_weight='balanced')

In [49]:
params = {'C': [1, 10, 100, 1000],
     'penalty': ['l2','l1'],
     'solver': ['liblinear'],
     'max_iter':[1,10,100,1000,10000]}
folds=StratifiedKFold(n_splits=10, random_state=19, shuffle=True)

In [50]:
grid_clf = GridSearchCV(estimator=clf, param_grid=params, scoring='accuracy', cv=folds, n_jobs=-1)

In [51]:
fit_clf = grid_clf.fit(X_train,y_train)

In [52]:
fit_clf.best_params_

{'C': 100, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}

In [53]:
fit_clf.best_score_

0.7975541579315165

In [54]:
y_predict = fit_clf.predict(X_test)

Evaluamos al modelo.

In [55]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       0.84      0.76      0.80       106
           1       0.70      0.79      0.74        72

    accuracy                           0.78       178
   macro avg       0.77      0.78      0.77       178
weighted avg       0.78      0.78      0.78       178



In [56]:
confusion_matrix(y_test, y_predict)

array([[81, 25],
       [15, 57]], dtype=int64)

Guardamos el modelo con Pickle, para luego abrirlo con la notebook de predicciones.

In [57]:
model_clf = fit_clf.best_estimator_
# save
with open('./models/model_clf.pkl','wb') as f:
    pickle.dump(model_clf,f)

Ahora probaremos un modelo de k neighborns

In [58]:
k_params = {'n_neighbors': range(1,200),
     'weights' : ['uniform', 'distance'],
     'p' : [1, 2, 3]}


In [59]:
kneig = KNeighborsClassifier()

In [60]:
grid_kneig = GridSearchCV(estimator=kneig, param_grid=k_params, scoring='accuracy', cv=folds, n_jobs=-1)

In [61]:
fit_kn = grid_kneig.fit(X_train,y_train)

In [62]:
fit_kn.best_score_

0.8141509433962264

In [63]:
y_predict = fit_kn.predict(X_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Evaluamos al modelo.

In [64]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       0.75      0.90      0.82       106
           1       0.78      0.56      0.65        72

    accuracy                           0.76       178
   macro avg       0.77      0.73      0.73       178
weighted avg       0.76      0.76      0.75       178



In [65]:
confusion_matrix(y_test, y_predict)

array([[95, 11],
       [32, 40]], dtype=int64)

Guardamos el modelo con Pickle, para luego abrirlo con la notebook de predicciones.

In [66]:
model_kn = fit_kn.best_estimator_
# save
with open('./models/model_kn.pkl','wb') as f:
    pickle.dump(model_kn,f)

Ahora probaremos con modelos de clasificación (árboles)

In [67]:
num_leafs = [1, 5, 10, 20, 50, 100]
depths = np.arange(1, 21)
criterion = ['gini','entropy']

In [68]:
param = [{'criterion' :criterion,'max_depth':depths,
              'min_samples_leaf':num_leafs}]

In [69]:

tree_model = tree.DecisionTreeClassifier(random_state=1)

In [70]:
gs = GridSearchCV(estimator = tree_model, param_grid=param, scoring='accuracy', cv=folds,n_jobs=-1)

In [71]:
fit_tree = gs.fit(X_train,y_train)

In [72]:
gs.best_score_

0.8238294898672258

In [73]:
y_predict = gs.predict(X_test)

Evaluamos al modelo.

In [74]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84       106
           1       0.78      0.69      0.74        72

    accuracy                           0.80       178
   macro avg       0.79      0.78      0.79       178
weighted avg       0.80      0.80      0.80       178



In [75]:
confusion_matrix(y_test, y_predict)

array([[92, 14],
       [22, 50]], dtype=int64)

Guardamos el modelo con Pickle, para luego abrirlo con la notebook de predicciones.

In [76]:
model_tree = fit_tree.best_estimator_
# save
with open('./models/model_tree.pkl','wb') as f:
    pickle.dump(model_tree,f)

XGBBoost

In [77]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [78]:
from xgboost.sklearn import XGBClassifier
model_xg = XGBClassifier(n_jobs=-1, use_label_encoder=False)



In [79]:
params_xgboost = {  
    "n_estimators": [50,100,500],      # Number of boosted trees to fit.
    "max_depth": [1,2,3,4],            # Maximum tree depth for base learners.
    "learning_rate": [0.01, 0.1],      # Boosting learning rate (xgb’s “eta”)
}

In [80]:
xgb = GridSearchCV(model_xg, param_grid=params_xgboost, cv=folds, verbose=1, n_jobs=-1)

In [81]:
xgb.fit(X_train,y_train)



Fitting 10 folds for each of 24 candidates, totalling 240 fits


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=19, shuffle=True),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_ty...
                                     max_cat_threshold=None,
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan

In [82]:
y_predict = xgb.predict(scaler.transform(X_test))



In [83]:

y_predict.mean()

0.9831460674157303

Evaluamos al modelo.

In [84]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       0.67      0.02      0.04       106
           1       0.41      0.99      0.57        72

    accuracy                           0.41       178
   macro avg       0.54      0.50      0.31       178
weighted avg       0.56      0.41      0.25       178



In [85]:
confusion_matrix(y_test, y_predict)

array([[  2, 104],
       [  1,  71]], dtype=int64)

In [86]:
xgb.best_score_


0.820020964360587

Guardamos al modelo con Pickle.

In [87]:
model_xgb = xgb.best_estimator_
# save
with open('./models/model_xgb.pkl','wb') as f:
    pickle.dump(model_xgb,f)

LightGBM

In [88]:
pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [89]:
import lightgbm as lgb
model_lgb = lgb.LGBMClassifier()

In [90]:
params_lightgbm = {  
    "n_estimators": [50,100,500],      # Number of boosted trees to fit.
    "max_depth": [1,2,3,4],            # Maximum tree depth for base learners.
    "learning_rate": [0.01, 0.1],      # Boosting learning rate (xgb’s “eta”)
}

In [91]:
grid_lgb = GridSearchCV(model_lgb, param_grid=params_lightgbm, cv=folds, verbose=1, n_jobs=-1)

In [92]:
grid_lgb.fit(X_train,y_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=19, shuffle=True),
             estimator=LGBMClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.1],
                         'max_depth': [1, 2, 3, 4],
                         'n_estimators': [50, 100, 500]},
             verbose=1)

In [93]:
grid_lgb.best_score_

0.8144304682040531

In [94]:
model_lgb = grid_lgb.best_estimator_
# save
with open('./models/model_lgb.pkl','wb') as f:
    pickle.dump(model_lgb,f)

Ahora intentaremos, mediante SMOTENC, resolver el problema de desbalanceo de clases.

In [98]:
X

Unnamed: 0,age,fare,pclass_2,pclass_3,sex_male,sibsp_1,sibsp_2,sibsp_3,sibsp_4,sibsp_5,parch_1,parch_2,parch_3,parch_4,parch_5,parch_6,embarked_Q,embarked_S
0,22.0,7.2500,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1
1,38.0,71.2833,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,26.0,7.9250,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,35.0,53.1000,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
4,35.0,8.0500,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
706,39.0,29.1250,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0
707,27.0,13.0000,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
708,19.0,30.0000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
709,26.0,30.0000,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [100]:
pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
     -------------------------------------- 226.0/226.0 kB 7.0 MB/s eta 0:00:00
Collecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
     ------------------------------------- 298.0/298.0 kB 19.2 MB/s eta 0:00:00
Installing collected packages: joblib, imbalanced-learn
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.0
    Uninstalling joblib-1.1.0:
      Successfully uninstalled joblib-1.1.0
Successfully installed imbalanced-learn-0.10.1 joblib-1.2.0
Note: you may need to restart the kernel to use updated packages.


In [122]:
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

smote=SMOTENC(categorical_features=[2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17],sampling_strategy='minority',random_state=123);
oversampler=RandomOverSampler(sampling_strategy='minority',random_state=123); # iguala las clases
undersampler=RandomUnderSampler(sampling_strategy='majority',random_state=123); # iguala las clases
xgb_clasificador = xgb.XGBClassifier()
dt = DecisionTreeClassifier(max_depth = 3, min_samples_split = 2, random_state = 11)
cv = StratifiedKFold(n_splits=5, random_state=983, shuffle=True)

In [123]:
# Creo una clase para poder utilizar los estimadores en un gridsearch
class ClassifierWrapper(BaseEstimator):
    """
    A Custom Wrapper that can switch between classifiers.
    """ 
    def __init__(self, estimator=None):
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)

In [124]:
# # Ahora con un pipeline y un gridsearch exploramos la mejor combinación de hiperparámetros para la variable objetivo 'recall

# # Hago un grid search en un pipeline completo

pipe_full = Pipeline([('feature_selection',SelectKBest(chi2, k = 10)),
    ('feature_engineering',SelectKBest(chi2, k = 10)),
        ('preprocesamiento', StandardScaler()),
    ('switchable', ClassifierWrapper())])


hyperparameters = [
   
    {
        # CLasificador Logistic regresion
        'feature_selection': [SelectKBest(chi2,k=10),None],
        'feature_engineering':[SelectKBest(chi2, k = 10),smote,oversampler,undersampler, None],
        'preprocesamiento':[StandardScaler(), MinMaxScaler(),None],
        'switchable__estimator': [GaussianNB()],
        'switchable__estimator__var_smoothing': np.logspace(0,-9, num=100),
     },
    {
        # CLasificador Logistic regresion
        'feature_selection': [SelectKBest(chi2,k=10),None],
        'feature_engineering':[SelectKBest(chi2, k = 10),smote,oversampler,undersampler, None],
        'preprocesamiento':[StandardScaler(), MinMaxScaler(),None],
        'switchable__estimator': [LogisticRegression()],
        'switchable__estimator__C': [1, 10, 100, 1000],
        'switchable__estimator__penalty': ['l1', 'l2',],
        'switchable__estimator__solver': ['saga'],
     },
    {
         # GradientBoostClassifier
        'feature_selection': [SelectKBest(chi2,k=10),None],
        'feature_engineering':[SelectKBest(chi2, k = 10),smote,oversampler,undersampler, None],
        'preprocesamiento':[StandardScaler(), MinMaxScaler(),None],
        'switchable__estimator': [GradientBoostingClassifier()],
        'switchable__estimator__n_estimators':[5,50,250,500],
        'switchable__estimator__max_depth':[1,3,5,7,9],
        'switchable__estimator__learning_rate':[0.01,0.1,0.2,0.3,0.4,0.5]
     },
    {
         # RandomforestClassifier
        'feature_selection': [SelectKBest(chi2,k=10),None],
        'feature_engineering':[SelectKBest(chi2, k = 10),smote,oversampler,undersampler, None],
        'preprocesamiento':[StandardScaler(), MinMaxScaler(),None],
        'switchable__estimator': [RandomForestClassifier(random_state = 5)],
        'switchable__estimator__n_estimators': [1,100,200, 500,1000],
        'switchable__estimator__max_features': ['auto', 'sqrt', 'log2'],
        'switchable__estimator__max_depth' : [4,5,6,7,8,9,10,11,12],
        'switchable__estimator__criterion' :['gini', 'entropy']
     },
     {
         # XGBoost Classifier
        'feature_selection': [SelectKBest(chi2,k=10),None],
        'feature_engineering':[SelectKBest(chi2, k = 10),smote,oversampler,undersampler, None],
        'preprocesamiento':[StandardScaler(), MinMaxScaler(),None],
        'switchable__estimator': [xgb_clasificador],
        "switchable__estimator__n_estimators": [50,100,500,1000],      
        "switchable__estimator__max_depth": [1,2,3,4,5],           
        "switchable__estimator__learning_rate": [0.01, 0.1,0.2,0.3,0.4]   
   
     },

 ]


grid_pipe_recall = GridSearchCV(pipe_full, hyperparameters, cv=cv, scoring='recall',verbose=1,n_jobs=-1)

# # grid_pipe_recall.fit(X_train, y_train);

# # # Guardo el mejor modelo

In [125]:
grid_pipe_recall.fit(X_train, y_train)

Fitting 5 folds for each of 17940 candidates, totalling 89700 fits


53820 fits failed out of a total of 89700.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
53820 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Joaquin\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Joaquin\anaconda3\lib\site-packages\imblearn\pipeline.py", line 293, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "c:\Users\Joaquin\anaconda3\lib\site-packages\imblearn\pipeline.py", line 240, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\Users\Joaquin\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=983, shuffle=True),
             estimator=Pipeline(steps=[('feature_selection',
                                        SelectKBest(score_func=<function chi2 at 0x000002223253BA60>)),
                                       ('feature_engineering',
                                        SelectKBest(score_func=<function chi2 at 0x000002223253BA60>)),
                                       ('preprocesamiento', StandardScaler()),
                                       ('switchable', ClassifierWrapper())]),
             n_jobs=-1,
             p...
                                                                  max_leaves=None,
                                                                  min_child_weight=None,
                                                                  missing=nan,
                                                                  monotone_constraints=None,
                                                

In [133]:
grid_pipe_recall.fit(X_train, y_train)


Fitting 5 folds for each of 17940 candidates, totalling 89700 fits


KeyboardInterrupt: 

In [126]:
grid_pipe_recall.best_estimator_

Pipeline(steps=[('feature_selection', None),
                ('feature_engineering',
                 RandomOverSampler(random_state=123,
                                   sampling_strategy='minority')),
                ('preprocesamiento', StandardScaler()),
                ('switchable',
                 ClassifierWrapper(estimator=GaussianNB(var_smoothing=1.519911082952933e-09)))])

In [127]:
grid_pipe_recall.best_score_

0.9953488372093023

In [129]:
model = grid_pipe_recall.best_estimator_

with open('./models/model.pkl', 'wb') as f:
        pickle.dump(model, f)

In [130]:
df = pd.read_csv('./db/titanic/test_with_age.csv',encoding='utf-8')
df.drop(columns=['Unnamed: 0','passengerid','name','ticket','cabin'],axis=1,inplace=True)
df = pd.get_dummies(df, columns=['pclass','sex','sibsp','parch','embarked'],drop_first=True)
df.drop(columns=['sibsp_8','parch_9'],axis=1,inplace=True)
df.fare.fillna(value=0,inplace=True)

In [132]:
X_ = scaler.transform(df)

In [None]:


y_prediction = model.predict(X_)

data_ = {'PassengerId':data_to_get_id.passengerid,'Survived':y_prediction}
out = pd.DataFrame(data_)
out
filename = './db/titanic/Titanic Predictions 1'+'model'+'.csv'
out.to_csv(filename,index=False)
print('Saved file: ' + filename)