# Competencia de Kaggle


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, classification_report, precision_score, recall_score, f1_score

# Modelos de clasificación
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier, StackingClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# Ignorar warnings
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

## Datasets


In [3]:
# Cargamos el dataset de entrenamiento
URL_TRAIN = 'https://raw.githubusercontent.com/DiploDatos/AprendizajeSupervisado/master/Pr%C3%A1ctico/diabetes_prediction_dataset_train-labeled.csv'
TRAIN_DF = pd.read_csv(URL_TRAIN)

# Cargamos el dataset de test
URL_TEST = 'https://raw.githubusercontent.com/DiploDatos/AprendizajeSupervisado/master/Pr%C3%A1ctico/diabetes_prediction_dataset_test.csv'
TEST_DF = pd.read_csv(URL_TEST)

## Funciones auxiliares


In [21]:
# PROBAR MODELOS CON PARÁMETROS POR DEFECTO Y AJUSTADOS

models = {
    'Decision Tree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {}
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {}
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'loss': ['log_loss', 'exponential'],
            'learning_rate': [0.1,  0.01],
            'n_estimators': [100, 200],
            'criterion': ['friedman_mse', 'squared_error'],
            'max_depth': [3, 5]
        }
    },
    'AdaBoost': {
        'model': AdaBoostClassifier(random_state=42),
        'params': {}
    },
    'Bagging': {
        'model': BaggingClassifier(random_state=42),
        'params': {}
    },
    'HistGradientBoosting': {
        'model': HistGradientBoostingClassifier(random_state=42),
        'params': {}
    },
    'SVC': {
        'model': SVC(random_state=42),
        'params': {}
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {}
    },
    'Logistic Regression': {
        'model': LogisticRegression(random_state=42),
        'params': {}
    },
    'MLP': {
        'model': MLPClassifier(random_state=42),
        'params': {}
    },
    'XGBoost': {
        'model': XGBClassifier(random_state=42),
        'params': {}
    }
}


def try_default_models(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    trained_models = pd.DataFrame(columns=['model', 'train_acc', 'train_prec',
                                  'train_rec', 'train_f1', 'test_acc', 'test_prec', 'test_rec', 'test_f1'])
    for name, model in models.items():
        print(f'Modelo {name}')

        clf = model['model']
        clf.fit(X_train, y_train)

        y_train_pred = clf.predict(X_train)
        y_test_pred = clf.predict(X_test)

        trained_models = trained_models._append({
            'model': name,
            'train_acc': accuracy_score(y_train, y_train_pred),
            'train_prec': precision_score(y_train, y_train_pred),
            'train_rec': recall_score(y_train, y_train_pred),
            'train_f1': f1_score(y_train, y_train_pred),
            'test_acc': accuracy_score(y_test, y_test_pred),
            'test_prec': precision_score(y_test, y_test_pred),
            'test_rec': recall_score(y_test, y_test_pred),
            'test_f1': f1_score(y_test, y_test_pred)
        }, ignore_index=True)

    return trained_models


def try_adjusted_model(X, y, clf_name):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    clf_params = models[clf_name]['params']
    clf = models[clf_name]['model']
    gv = GridSearchCV(clf, clf_params, cv=5, n_jobs=-1)
    gv.fit(X_train, y_train)

    y_train_pred = gv.predict(X_train)
    y_test_pred = gv.predict(X_test)

    return {
        'model': clf_name,
        'best_params': gv.best_params_,
        'train_acc': accuracy_score(y_train, y_train_pred),
        'train_prec': precision_score(y_train, y_train_pred),
        'train_rec': recall_score(y_train, y_train_pred),
        'train_f1': f1_score(y_train, y_train_pred),
        'test_acc': accuracy_score(y_test, y_test_pred),
        'test_prec': precision_score(y_test, y_test_pred),
        'test_rec': recall_score(y_test, y_test_pred),
        'test_f1': f1_score(y_test, y_test_pred)
    }

In [23]:
# CÁLCULO DE ACCURACY DE UN MODELO PARA K FOLDS

def get_accuracy(clf, X, y, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits)
    accuracies = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))

    return accuracies

In [5]:
# GENERAMOS OUTPUT DE SALIDA

def generate_output(clf, filename=None):
    if filename is None:
        filename = clf.__class__.__name__

    # Cargamos el dataset de test
    test_df = TEST_DF.copy()

    # Preprocesamiento
    patient_id = test_df['patient']
    X_test = test_df.drop(columns=['diabetes', 'patient'])

    for col in ["gender", "smoking_history"]:
        X_test[col] = X_test[col].astype(str)
        X_test[col] = LabelEncoder().fit_transform(X_test[col])

    cols = X_test.columns
    X_test = StandardScaler().fit_transform(X_test)
    X_test = pd.DataFrame(X_test, columns=cols)

    # Predicción
    test_pred = np.int64(clf.predict(X_test))
    submission = pd.DataFrame(list(zip(patient_id, test_pred)), columns=[
                              'patient', 'diabetes'])
    submission.to_csv(f'output/{filename}.csv', index=False, header=True)

## Desarrollo


In [6]:
train_df = TRAIN_DF.copy()

# Separamos la variable objetivo
patientId = train_df.patient
y = train_df.diabetes
X = train_df.drop(columns=['diabetes', 'patient'])

# Encoding de variables categóricas
for col in ["gender", "smoking_history"]:
    X[col] = X[col].astype(str)
    X[col] = LabelEncoder().fit_transform(X[col])

# Escalamos las variables
X_names = X.columns
X = StandardScaler().fit_transform(X)
X = pd.DataFrame(X, columns=X_names)

# Probamos los modelos por defecto
trained_models = try_default_models(X, y)
trained_models = trained_models.sort_values(
    by='test_acc', ascending=False).reset_index(drop=True)
display(trained_models)

Modelo Decision Tree
Modelo Random Forest
Modelo Gradient Boosting
Modelo AdaBoost
Modelo Bagging
Modelo HistGradientBoosting
Modelo SVC
Modelo KNN
Modelo Logistic Regression
Modelo MLP
Modelo XGBoost


Unnamed: 0,model,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
0,HistGradientBoosting,0.973474,0.992069,0.694586,0.817093,0.972053,0.980216,0.681676,0.804131
1,AdaBoost,0.972013,0.97617,0.688724,0.807633,0.972,0.973381,0.686054,0.804842
2,Gradient Boosting,0.972237,0.987731,0.683017,0.807587,0.971947,0.981917,0.679174,0.802957
3,MLP,0.972329,0.98153,0.68857,0.809355,0.971684,0.965759,0.68793,0.803506
4,XGBoost,0.975803,0.989873,0.723739,0.83614,0.971632,0.959272,0.692308,0.804214
5,Random Forest,0.999197,0.999378,0.991208,0.995276,0.970158,0.954225,0.677924,0.792687
6,Bagging,0.995105,0.996103,0.946321,0.970574,0.969368,0.924103,0.692933,0.791994
7,SVC,0.964303,0.980132,0.593552,0.73936,0.963947,0.966327,0.592245,0.734393
8,KNN,0.969684,0.94316,0.685948,0.794249,0.961526,0.878049,0.630394,0.733892
9,Logistic Regression,0.960184,0.868315,0.628567,0.729241,0.960895,0.867698,0.631645,0.731089


In [22]:
# Probamos ajustar el modelo de Gradient Boosting
gb_model = try_adjusted_model(X, y, 'Gradient Boosting')
display(pd.DataFrame(gb_model, index=[0]))

print(f'Mejores parámetros para Gradient Boosting: {gb_model["best_params"]}')

Unnamed: 0,model,best_params,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
0,Gradient Boosting,,0.972237,0.987731,0.683017,0.807587,0.971947,0.981917,0.679174,0.802957


Mejores parámetros para Gradient Boosting: {'criterion': 'friedman_mse', 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 3, 'n_estimators': 100}


In [28]:
# Miremos la accuracy que obtenemos con el modelo ajustado de Gradient Boosting para KFold con 10 splits
accuracies = get_accuracy(GradientBoostingClassifier(
    **gb_model['best_params']), X, y, n_splits=10)
print(f'La accuracy para Gradient Boosting con 10 splits es: {
      np.mean(accuracies)} +/- {np.std(accuracies)}')

La accuracy para Gradient Boosting con 10 splits es: 0.9719684210526316 +/- 0.0019174371168184138


In [27]:
# Generamos el output
gb_model_adj = GradientBoostingClassifier(**gb_model['best_params'])
gb_model_adj.fit(X, y)
generate_output(gb_model_adj)