In [38]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

In [26]:
df = pd.read_csv('E:/Github/Reto-3006C-equipo5/retro/M4_Reto/Data/train_clean.csv')

In [27]:
X = df.drop("Survived", axis=1)
y = df["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape

((708, 15), (178, 15))

#### Random forest

In [61]:
# Haremos una prueba con un random forest
rf = RandomForestClassifier(random_state=42)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

testing_accuracy = accuracy_score(y_test, y_pred)
training_accuracy = accuracy_score(y_train, rf.predict(X_train))

print('Random Forest: {:.3f}'.format(testing_accuracy))
print('Random Forest: {:.3f}'.format(training_accuracy))

Random Forest: 0.764
Random Forest: 0.986


En esta prueba dado que el modelo esta teniendo muchos mejores resultados en el training que en el testing, podemos concluir que esta haciendo overfitting por lo que intentaremos utilizar un modelo con regularizacion para ver si podemos mejorar los resultados

In [84]:
clf = RandomForestClassifier()

param_grid = {
    'n_estimators': [50, 100, 200],      
    'max_depth': [None, 10, 20, 30],      
    'min_samples_split': [2, 5, 10],      
    'min_samples_leaf': [1, 2, 4],        
    'bootstrap': [True, False],          
    'criterion': ['gini', 'entropy'] 
}

grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',       
    cv=5,                     
    verbose=1,               
    n_jobs=-1                   
)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

best_clf = grid_search.best_estimator_

testing_accuracy = accuracy_score(y_test, best_clf.predict(X_test))
training_accuracy = accuracy_score(y_train, best_clf.predict(X_train))

print('Random Forest: {:.3f}'.format(testing_accuracy))
print('Random Forest: {:.3f}'.format(training_accuracy))

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
{'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 50}
Random Forest: 0.809
Random Forest: 0.893


Despues de utilizar la optimizacion de parametros podemos ver que los resultados son mucho mejores, sin embargo, el modelo sigue haciendo overfitting.

#### Logistic Regression

In [91]:
# Haremos una prueba con un logistic regression
lr = LogisticRegression(random_state=42)

lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

testing_accuracy = accuracy_score(y_test, y_pred)
training_accuracy = accuracy_score(y_train, lr.predict(X_train))

print('Logistic Regression: {:.3f}'.format(testing_accuracy))
print('Logistic Regression: {:.3f}'.format(training_accuracy))

Logistic Regression: 0.775
Logistic Regression: 0.847


En esta prueba dado que el modelo esta teniendo muchos mejores resultados en el training que en el testing, podemos concluir que esta haciendo overfitting por lo que intentaremos utilizar un modelo con regularizacion para ver si podemos mejorar los resultados

In [89]:
clf = LogisticRegression(solver='saga')

param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet'],     
    'C': [0.001, 0.01, 0.1, 1, 2, 5, 10, 100],          
    'l1_ratio': [0, 0.1, 0.2, 0.5, 1],
    'max_iter': [500, 1000, 2000, 3000, 5000, 10000]
}

grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',         
    cv=5,                       
    verbose=1,                 
    n_jobs=-1                   
)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

best_clf = grid_search.best_estimator_

testing_accuracy = accuracy_score(y_test, best_clf.predict(X_test))
training_accuracy = accuracy_score(y_train, best_clf.predict(X_train))

print('Logistic Regression: {:.3f}'.format(testing_accuracy))
print('Logistic Regression: {:.3f}'.format(training_accuracy))

Fitting 5 folds for each of 720 candidates, totalling 3600 fits
{'C': 2, 'l1_ratio': 0.5, 'max_iter': 5000, 'penalty': 'elasticnet'}
Logistic Regression: 0.775
Logistic Regression: 0.833


Ahora despues de aplicar regularizacion podemos ver que los resultados son mucho mejores.

#### XGBoost

In [64]:
# Haremos una prueba con un xgboost
xgb = XGBClassifier(random_state=42)

xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)

testing_accuracy = accuracy_score(y_test, y_pred)
training_accuracy = accuracy_score(y_train, xgb.predict(X_train))

print('XGBoost: {:.3f}'.format(testing_accuracy))
print('XGBoost: {:.3f}'.format(training_accuracy))

XGBoost: 0.787
XGBoost: 0.973


En esta prueba dado que el modelo esta teniendo muchos mejores resultados en el training que en el testing, podemos concluir que esta haciendo overfitting por lo que intentaremos utilizar grid search para optimizar los parametros del modelo y ver si podemos mejorar los resultados

In [79]:
clf = XGBClassifier()

param_grid = {
    'alpha': [0.1, 0.2, 0.5, 1, 2, 10],             
    'lambda': [0.05, 0.1, 0.2, 0.5, 1, 2, 10],            
    'gamma': [0, 0.1, 0.5, 1],             
    'max_depth': [1, 2, 3, 4, 5],
    'min_child_weight': [1, 2, 5, 10, 15, 20]   
}

grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,                     
    verbose=1,            
    n_jobs=-1        
)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

best_clf = grid_search.best_estimator_

testing_accuracy = accuracy_score(y_test, best_clf.predict(X_test))
training_accuracy = accuracy_score(y_train, best_clf.predict(X_train))

print('XGBoost: {:.3f}'.format(testing_accuracy))
print('XGBoost: {:.3f}'.format(training_accuracy))

Fitting 5 folds for each of 5040 candidates, totalling 25200 fits
{'alpha': 0.5, 'gamma': 0.5, 'lambda': 0.1, 'max_depth': 2, 'min_child_weight': 1}
XGBoost: 0.758
XGBoost: 0.884


Despues de utilizar la optimizacion de parametros podemos ver que los resultados son mucho mejores, sin embargo, el modelo sigue haciendo overfitting.

#### Voting Classifier

In [86]:
# Se crea una pipeline para cada modelo

# Random Forest a esta no se le aplica un scaler ya que al ser un modelo de arboles por steps todas las variables se reciben como independientes
rf_pipeline = Pipeline([
    ("rf", RandomForestClassifier(bootstrap=True, criterion='entropy', max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=50, random_state=42))
])

# Logistic Regression estamos utilizando los parametros obtenidos en las pruebas individuales de este modelo
lr_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(penalty='elasticnet', C=1.0, solver='saga', l1_ratio=0.5, max_iter=1000, random_state=42))
])

xgb_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("xgb", XGBClassifier(alpha=0.1, gamma=0.5, max_depth=2, min_child_weight=1, random_state=42))
])

# Se crea un Voting Classifier con los modelos anteriores
voting_classifier = VotingClassifier(estimators=[
    ("rf", rf_pipeline),
    ("lr", lr_pipeline),
    ("xgb", xgb_pipeline)
], voting="soft")

# Se entrena el Voting Classifier
voting_classifier.fit(X_train, y_train)

testing_accuracy = accuracy_score(y_test, y_pred)
training_accuracy = accuracy_score(y_train, voting_classifier.predict(X_train))

print('Voting Classifier: {:.3f}'.format(testing_accuracy))
print('Voting Classifier: {:.3f}'.format(training_accuracy))

Voting Classifier: 0.764
Voting Classifier: 0.879


Despues de entrenar al voting classifier podemos ver que tiene mas overfitting que el modelo de logistic regression sin embargo el modelo de Logisitc Regression tiene cambios muy altos entre cada iteracion si dejamos que sea aleatorio, por lo que sacrificaremos un poco de overfitting para tener un modelo mas estable.