# Detectando Fumadores - Selección y Entrenamiento del Modelo

En este caso nos encontramos con un problema de clasificación binario ya que queremos detectar, según los biomarcadores, si los sujetos son fumadores o no para poder realizar intervenciones preventivas de promoción de la salud.

## Importando Bibliotecas

In [1]:
# Bibliotecas básicas de análisis de datos
import numpy as np
import pandas as pd

# Bibliotecas de visualización
import matplotlib.pyplot as plt
import seaborn as sns

# Herramientas de preprocesamiento
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

# Modelos de clasificación
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Deep Learning
from keras.models import Sequential
from keras.layers import Dense

# Métricas de evaluación
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

# Para balancear clases desbalanceadas
from imblearn.over_sampling import SMOTE

# Otros
import joblib

# Configuración de visualización
%matplotlib inline
sns.set_style("whitegrid")

print('Bibliotecas importadas correctamente')

2024-04-09 15:46:15.725968: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Bibliotecas importadas correctamente


## Carga de Datos

Con el fin de preparar un único script para la ejecución del modelo, vamos a cargar los datos en bruto y con el conocimiento adquirido en el análisis realizado en el notebook anterior vamos a optimizar la limpieza y la adecuación de los datos en el siguiente apartado.

In [2]:
train = pd.read_csv('../data/raw/train.csv')
test =pd.read_csv('../data/raw/test.csv')
print('Archivos cargados')

Archivos cargados


## Limpieza y adecuación de datos

In [3]:
# Eliminamos la columna de id, ya que no aporta nada para el entrenamiento
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)
print("ID's eliminados")

'''Creamos tres nuevas características con biomarcadores relacionados 
para ver si podemos reducir la dimensionalidad del conjunto de datos más adelante
relacionado con indices de obesidad que hemos visto en el EDA que afectan'''

def crear_caract(train):
    train['IMC'] = train['weight(kg)'] / ((train['height(cm)'] / 100) ** 2)#Indice de masa muscular
    train['HW_Ratio'] = train['height(cm)'] / train['waist(cm)']#Ratio entre altura y tamaño de la cintura
    train['HA_Ratio'] = train['height(cm)'] / train['age']#Ratio entre altura y edad
    return train

train = crear_caract(train)
test = crear_caract(test)

# Vemos el describe para asegurarnos que se ha realizado correctamente
descripcion = train.describe(include='all') # El include='all' para ver todas las columnas
descripcion.transpose() # Resulta más cómodo de visualizar al tener tantas columnas

ID's eliminados


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,159256.0,44.306626,11.842286,20.0,40.0,40.0,55.0,85.0
height(cm),159256.0,165.266929,8.81897,135.0,160.0,165.0,170.0,190.0
weight(kg),159256.0,67.143662,12.586198,30.0,60.0,65.0,75.0,130.0
waist(cm),159256.0,83.00199,8.957937,51.0,77.0,83.0,89.0,127.0
eyesight(left),159256.0,1.005798,0.402113,0.1,0.8,1.0,1.2,9.9
eyesight(right),159256.0,1.000989,0.392299,0.1,0.8,1.0,1.2,9.9
hearing(left),159256.0,1.023974,0.152969,1.0,1.0,1.0,1.0,2.0
hearing(right),159256.0,1.023421,0.151238,1.0,1.0,1.0,1.0,2.0
systolic,159256.0,122.503648,12.729315,77.0,114.0,121.0,130.0,213.0
relaxation,159256.0,76.874071,8.994642,44.0,70.0,78.0,82.0,133.0


In [4]:
descripcion_test = test.describe(include='all') 
descripcion_test.transpose() 

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,106171.0,44.426538,11.897138,20.0,40.0,40.0,55.0,85.0
height(cm),106171.0,165.221322,8.837065,135.0,160.0,165.0,170.0,190.0
weight(kg),106171.0,67.125618,12.586569,30.0,60.0,65.0,75.0,130.0
waist(cm),106171.0,82.999892,8.946584,51.0,77.0,83.0,89.0,127.7
eyesight(left),106171.0,1.004776,0.39769,0.1,0.8,1.0,1.2,9.9
eyesight(right),106171.0,0.999483,0.385752,0.1,0.8,1.0,1.2,9.9
hearing(left),106171.0,1.024216,0.153719,1.0,1.0,1.0,1.0,2.0
hearing(right),106171.0,1.02398,0.152988,1.0,1.0,1.0,1.0,2.0
systolic,106171.0,122.475403,12.765542,71.0,114.0,121.0,130.0,213.0
relaxation,106171.0,76.820676,9.018782,40.0,70.0,78.0,82.0,140.0


## Pruebas de Modelos

In [5]:
#Preparamos los datos para las pruebas de modelos
X = train.drop('smoking', axis=1)
y = train['smoking']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Regresión Logística

In [7]:
# Entrena y evalúa Regresión Logística
log_model = LogisticRegression()
log_model.fit(X_train_scaled, y_train)
y_pred_log = log_model.predict(X_test_scaled)
print(confusion_matrix(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))

[[13508  4275]
 [ 3675 10394]]
              precision    recall  f1-score   support

           0       0.79      0.76      0.77     17783
           1       0.71      0.74      0.72     14069

    accuracy                           0.75     31852
   macro avg       0.75      0.75      0.75     31852
weighted avg       0.75      0.75      0.75     31852



### Árbol de Decisión 

In [8]:
# Entrena y evalúa Árbol de Decisión
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)  # No es necesario escalar para árboles de decisión
y_pred_tree = tree_model.predict(X_test)
print(confusion_matrix(y_test, y_pred_tree))
print(classification_report(y_test, y_pred_tree))

[[12967  4816]
 [ 4975  9094]]
              precision    recall  f1-score   support

           0       0.72      0.73      0.73     17783
           1       0.65      0.65      0.65     14069

    accuracy                           0.69     31852
   macro avg       0.69      0.69      0.69     31852
weighted avg       0.69      0.69      0.69     31852



### Random Forest

In [9]:
# Entrena y evalúa Random Forest
forest_model = RandomForestClassifier()
forest_model.fit(X_train, y_train)
y_pred_forest = forest_model.predict(X_test)
print(confusion_matrix(y_test, y_pred_forest))
print(classification_report(y_test, y_pred_forest))

[[13347  4436]
 [ 2811 11258]]
              precision    recall  f1-score   support

           0       0.83      0.75      0.79     17783
           1       0.72      0.80      0.76     14069

    accuracy                           0.77     31852
   macro avg       0.77      0.78      0.77     31852
weighted avg       0.78      0.77      0.77     31852



### KNN

In [10]:
# KNN
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
print("Evaluando KNN:")
print(knn.score(X_test, y_test))

Evaluando KNN:
0.7053874168027126


### Naive Bayes

In [11]:
# Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
print("Evaluando Naive Bayes:")
print(nb.score(X_test, y_test))

Evaluando Naive Bayes:
0.7197350244882582


### Support Vector Machine (Classifier)

In [12]:
from sklearn.svm import LinearSVC

svc = LinearSVC(dual=False)
svc.fit(X_train, y_train)

### Random Forest

In [13]:
# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
print("Evaluando Random Forest:")
print(rf.score(X_test, y_test))

Evaluando Random Forest:
0.7717882707522291


### Deep Learning

In [14]:
# Deep Learning
# Definir la arquitectura de la red
model = Sequential()
model.add(Dense(12, input_dim=X.shape[1], activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compilar el modelo
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Ajustar el modelo
model.fit(X_train, y_train, epochs=150, batch_size=10, verbose=1)

print("Evaluando Deep Learning:")
# Aquí se evaluará la precisión del modelo
_, accuracy = model.evaluate(X_test, y_test)
print('Accuracy: %.2f' % (accuracy*100))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/150
[1m12741/12741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2ms/step - accuracy: 0.6767 - loss: 0.9364
Epoch 2/150
[1m12741/12741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 2ms/step - accuracy: 0.7376 - loss: 0.5218
Epoch 3/150
[1m12741/12741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.7447 - loss: 0.5027
Epoch 4/150
[1m12741/12741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.7493 - loss: 0.4944
Epoch 5/150
[1m12741/12741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 2ms/step - accuracy: 0.7525 - loss: 0.4884
Epoch 6/150
[1m12741/12741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 2ms/step - accuracy: 0.7520 - loss: 0.4883
Epoch 7/150
[1m12741/12741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 2ms/step - accuracy: 0.7518 - loss: 0.4857
Epoch 8/150
[1m12741/12741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 3ms/step - accuracy: 0.7536 - loss:

In [15]:
# Imprimimos resultados detallados
# KNN
y_pred_knn = knn.predict(X_test)
print("Matriz de Confusión para KNN:")
print(confusion_matrix(y_test, y_pred_knn))
print("Classification Report para KNN:")
print(classification_report(y_test, y_pred_knn))

# Naive Bayes
y_pred_nb = nb.predict(X_test)
print("Matriz de Confusión para Naive Bayes:")
print(confusion_matrix(y_test, y_pred_nb))
print("Classification Report para Naive Bayes:")
print(classification_report(y_test, y_pred_nb))

# Random Forest
y_pred_rf = rf.predict(X_test)
print("Matriz de Confusión para Random Forest:")
print(confusion_matrix(y_test, y_pred_rf))
print("Classification Report para Random Forest:")
print(classification_report(y_test, y_pred_rf))

# Deep Learning
# Aquí deberás primero transformar las probabilidades en clases binarias, asumiendo una clase positiva si la probabilidad es mayor a 0.5
y_pred_dl = (model.predict(X_test) > 0.5).astype("int32")
print("Matriz de Confusión para Deep Learning:")
print(confusion_matrix(y_test, y_pred_dl))
print("Classification Report para Deep Learning:")
print(classification_report(y_test, y_pred_dl))

Matriz de Confusión para KNN:
[[12876  4907]
 [ 4477  9592]]
Classification Report para KNN:
              precision    recall  f1-score   support

           0       0.74      0.72      0.73     17783
           1       0.66      0.68      0.67     14069

    accuracy                           0.71     31852
   macro avg       0.70      0.70      0.70     31852
weighted avg       0.71      0.71      0.71     31852

Matriz de Confusión para Naive Bayes:
[[12029  5754]
 [ 3173 10896]]
Classification Report para Naive Bayes:
              precision    recall  f1-score   support

           0       0.79      0.68      0.73     17783
           1       0.65      0.77      0.71     14069

    accuracy                           0.72     31852
   macro avg       0.72      0.73      0.72     31852
weighted avg       0.73      0.72      0.72     31852

Matriz de Confusión para Random Forest:
[[13395  4388]
 [ 2881 11188]]
Classification Report para Random Forest:
              precision    reca

## Prueba con menos datos

In [19]:
train2 = pd.read_csv('../data/raw/train.csv')
test2 =pd.read_csv('../data/raw/test.csv')
print('Archivos cargados')

bad_columns=['eyesight_left', 'eyesight_right', 'hearing_left', 'hearing_right', 'id']

train2 = train2.drop(bad_columns, axis=1, errors='ignore')
test2 = test2.drop(bad_columns, axis=1, errors='ignore')

train2 = crear_caract(train2)
test2 = crear_caract(test2)

X2 = train2.drop('smoking', axis=1)
y2 = train2['smoking']

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled2 = scaler.fit_transform(X_train2)
X_test_scaled2 = scaler.transform(X_test2)

# Entrena y evalúa Regresión Logística
log_model2 = LogisticRegression()
log_model2.fit(X_train_scaled2, y_train2)
y_pred_log2 = log_model2.predict(X_test_scaled2)
print('Regresión Logística 2')
print('-----------------------------')
print(confusion_matrix(y_test2, y_pred_log2))
print(classification_report(y_test2, y_pred_log2))

# Entrena y evalúa Árbol de Decisión
tree_model2 = DecisionTreeClassifier()
tree_model2.fit(X_train2, y_train2)
y_pred_tree2 = tree_model2.predict(X_test2)
print('Árbol de decisión 2')
print('-----------------------------')
print(confusion_matrix(y_test2, y_pred_tree2))
print(classification_report(y_test2, y_pred_tree2))

# Entrena y evalúa Random Forest
forest_model2 = RandomForestClassifier()
forest_model2.fit(X_train2, y_train2)
y_pred_forest2 = forest_model2.predict(X_test2)
print('Random Forest 2')
print('-----------------------------')
print(confusion_matrix(y_test2, y_pred_forest2))
print(classification_report(y_test2, y_pred_forest2))

Archivos cargados
Regresión Logística 2
-----------------------------
[[13508  4275]
 [ 3675 10394]]
              precision    recall  f1-score   support

           0       0.79      0.76      0.77     17783
           1       0.71      0.74      0.72     14069

    accuracy                           0.75     31852
   macro avg       0.75      0.75      0.75     31852
weighted avg       0.75      0.75      0.75     31852

Árbol de decisión 2
-----------------------------
[[13020  4763]
 [ 5011  9058]]
              precision    recall  f1-score   support

           0       0.72      0.73      0.73     17783
           1       0.66      0.64      0.65     14069

    accuracy                           0.69     31852
   macro avg       0.69      0.69      0.69     31852
weighted avg       0.69      0.69      0.69     31852

Random Forest 2
-----------------------------
[[13362  4421]
 [ 2839 11230]]
              precision    recall  f1-score   support

           0       0.82      0.7

In [20]:
# KNN
knn2 = KNeighborsClassifier(n_neighbors=3)
knn2.fit(X_train2, y_train2)
print("Evaluando KNN:")
print(knn2.score(X_test2, y_test2))
y_pred_knn2 = knn2.predict(X_test2)
print("Matriz de Confusión para KNN:")
print(confusion_matrix(y_test2, y_pred_knn2))
print("Classification Report para KNN:")
print(classification_report(y_test2, y_pred_knn2))

Evaluando KNN:
0.7053874168027126
Matriz de Confusión para KNN:
[[12876  4907]
 [ 4477  9592]]
Classification Report para KNN:
              precision    recall  f1-score   support

           0       0.74      0.72      0.73     17783
           1       0.66      0.68      0.67     14069

    accuracy                           0.71     31852
   macro avg       0.70      0.70      0.70     31852
weighted avg       0.71      0.71      0.71     31852



In [21]:
# Naive Bayes
nb2 = GaussianNB()
nb2.fit(X_train2, y_train2)
print("Evaluando Naive Bayes:")
print(nb2.score(X_test2, y_test2))
y_pred_nb2 = nb2.predict(X_test2)
print("Matriz de Confusión para Naive Bayes:")
print(confusion_matrix(y_test2, y_pred_nb2))
print("Classification Report para Naive Bayes:")
print(classification_report(y_test2, y_pred_nb2))

Evaluando Naive Bayes:
0.7197350244882582
Matriz de Confusión para Naive Bayes:
[[12029  5754]
 [ 3173 10896]]
Classification Report para Naive Bayes:
              precision    recall  f1-score   support

           0       0.79      0.68      0.73     17783
           1       0.65      0.77      0.71     14069

    accuracy                           0.72     31852
   macro avg       0.72      0.73      0.72     31852
weighted avg       0.73      0.72      0.72     31852



In [22]:
# Deep Learning
model2 = Sequential()
model2.add(Dense(12, input_dim=X.shape[1], activation='relu'))
model2.add(Dense(8, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))

model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model2.fit(X_train2, y_train2, epochs=150, batch_size=10, verbose=1)

print("Evaluando Deep Learning:")
_, accuracy = model2.evaluate(X_test2, y_test2)
print('Accuracy: %.2f' % (accuracy*100))

y_pred_dl2 = (model2.predict(X_test2) > 0.5).astype("int32")
print("Matriz de Confusión para Deep Learning:")
print(confusion_matrix(y_test2, y_pred_dl2))
print("Classification Report para Deep Learning:")
print(classification_report(y_test2, y_pred_dl2))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/150
[1m12741/12741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 964us/step - accuracy: 0.6894 - loss: 0.7387
Epoch 2/150
[1m12741/12741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 951us/step - accuracy: 0.7328 - loss: 0.5276
Epoch 3/150
[1m12741/12741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 977us/step - accuracy: 0.7426 - loss: 0.5104
Epoch 4/150
[1m12741/12741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 971us/step - accuracy: 0.7483 - loss: 0.4981
Epoch 5/150
[1m12741/12741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 1ms/step - accuracy: 0.7525 - loss: 0.4894
Epoch 6/150
[1m12741/12741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 990us/step - accuracy: 0.7539 - loss: 0.4854
Epoch 7/150
[1m12741/12741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 952us/step - accuracy: 0.7583 - loss: 0.4799
Epoch 8/150
[1m12741/12741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 948us/step - accuracy: 

In [22]:
train = pd.read_csv('../data/raw/train.csv')
test =pd.read_csv('../data/raw/test.csv')
print('Archivos cargados')

columnas_no_necesarias=['eyesight_left', 'eyesight_right', 'hearing_left', 'hearing_right', 'id', ]

X = train.drop('smoking', axis=1)
y = train['smoking']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

train = train.drop(columnas_no_necesarias, axis=1, errors='ignore')
test = test.drop(columnas_no_necesarias, axis=1, errors='ignore')

forest_model = RandomForestClassifier()
forest_model.fit(X_train, y_train)
y_pred_forest = forest_model.predict(X_test)
print('Random Forest')
print('-----------------------------')
print(confusion_matrix(y_test, y_pred_forest))
print(classification_report(y_test, y_pred_forest))

Archivos cargados
Random Forest
-----------------------------
[[13311  4472]
 [ 2882 11187]]
              precision    recall  f1-score   support

           0       0.82      0.75      0.78     17783
           1       0.71      0.80      0.75     14069

    accuracy                           0.77     31852
   macro avg       0.77      0.77      0.77     31852
weighted avg       0.77      0.77      0.77     31852



In [23]:
X = train.drop('smoking', axis=1)
y = train['smoking']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

gbm_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                       max_depth=3, random_state=42)

gbm_model.fit(X_train_scaled, y_train)

y_pred_gbm = gbm_model.predict(X_test_scaled)

print('Gradient Boosting Machine')
print('-----------------------------')
print(confusion_matrix(y_test, y_pred_gbm))
print(classification_report(y_test, y_pred_gbm))

Gradient Boosting Machine
-----------------------------
[[13182  4601]
 [ 2667 11402]]
              precision    recall  f1-score   support

           0       0.83      0.74      0.78     17783
           1       0.71      0.81      0.76     14069

    accuracy                           0.77     31852
   macro avg       0.77      0.78      0.77     31852
weighted avg       0.78      0.77      0.77     31852



## Selección de modelo

Decidiremos según las puntuaciones que saquen con los mejores hiperparámetros

In [24]:
gbm = GradientBoostingClassifier()

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0],
    'min_samples_split': [2, 4, 6]
}

grid_search = GridSearchCV(estimator=gbm, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print(f"Mejores hiperparámetros: {grid_search.best_params_}")

best_gbm = grid_search.best_estimator_
y_pred = best_gbm.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.8; total time=  37.2s
[CV] END learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.8; total time=  37.3s
[CV] END learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.8; total time=  37.3s
[CV] END learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.8; total time=  37.3s[CV] END learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.8; total time=  37.3s

[CV] END learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.9; total time=  41.5s
[CV] END learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.9; total time=  41.6s
[CV] END learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.9; total time=  41.7s
[CV] END learnin

In [25]:
train = pd.read_csv('../data/raw/train.csv')

columnas_no_necesarias = ['eyesight_left', 'eyesight_right', 'hearing_left', 'hearing_right', 'id']
train.drop(columns=columnas_no_necesarias, axis=1, errors='ignore', inplace=True)

X = train.drop('smoking', axis=1)
y = train['smoking']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 6, 8],
    'criterion': ['gini', 'entropy']
}

forest_model = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=forest_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train_scaled, y_train)

print(f"Mejores hiperparámetros: {grid_search.best_params_}")

best_forest = grid_search.best_estimator_
y_pred = best_forest.predict(X_test_scaled)

print('Random Forest optimizado')
print('-----------------------------')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV] END criterion=gini, max_depth=4, max_features=auto, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=4, max_features=auto, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=4, max_features=auto, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=4, max_features=auto, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=4, max_features=auto, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=4, max_features=auto, n_estimators=200; total time=   0.1s
[CV] END criterion=gini, max_depth=4, max_features=auto, n_estimators=200; total time=   0.1s
[CV] END criterion=gini, max_depth=4, max_features=auto, n_estimators=200; total time=   0.1s
[CV] END criterion=gini, max_depth=4, max_features=auto, n_estimators=200; total time=   0.0s
[CV] END criterion=gini, max_depth=4, max_features=auto, n_estimators=300; total time=   0.0

### Validación cruzada

Centrandonos en el Recall ya que como hemos dicho, nos interesa descubrir a más fumadores y no pasa importa si no lo son ya que hablamos de prevención y promoción de la salud.

In [27]:
mejor_forest = RandomForestClassifier(criterion='entropy', max_depth=8, max_features='sqrt', n_estimators=300, random_state=42)

scores_recall = cross_val_score(mejor_forest, X_train_scaled, y_train, cv=5, scoring='recall', n_jobs=-1)

print("Recall de la validación cruzada para cada pliegue: ", scores_recall)
print("Recall promedio de la validación cruzada: ", np.mean(scores_recall))

Recall de la validación cruzada para cada pliegue:  [0.83070972 0.83367815 0.83997481 0.83925519 0.82718604]
Recall promedio de la validación cruzada:  0.834160784133096


In [28]:
mejor_gbm = GradientBoostingClassifier(learning_rate=0.1, max_depth=5, min_samples_split=6, n_estimators=300, subsample=0.8, random_state=42)

scores_recall_gbm = cross_val_score(mejor_gbm, X_train, y_train, cv=5, scoring='recall', n_jobs=-1)

print("Recall de la validación cruzada para cada pliegue en GBC: ", scores_recall_gbm)
print("Recall promedio de la validación cruzada en GBC: ", np.mean(scores_recall_gbm))


Recall de la validación cruzada para cada pliegue en GBC:  [0.80966088 0.80705226 0.81190969 0.80768193 0.80514574]
Recall promedio de la validación cruzada en GBC:  0.8082900988708441


## Entrenamiento y guardado del modelo

Una vez decidido que entrenaremos el Random Forest Classifier por sus mejores puntuaciones y que los tiempos de computación son muy inferiores, lo entrenamos y lo guardamos para poder llevarlo a producción.

In [37]:
train = pd.read_csv('../data/raw/train.csv')

columnas_no_necesarias = ['eyesight_left', 'eyesight_right', 'hearing_left', 'hearing_right', 'id']
train.drop(columns=columnas_no_necesarias, axis=1, errors='ignore', inplace=True)

X = train.drop('smoking', axis=1)
y = train['smoking']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

joblib.dump(scaler, 'scaler.joblib')

mejor_forest = RandomForestClassifier(criterion='entropy', max_depth=8, max_features='sqrt', n_estimators=300, random_state=42)

mejor_forest.fit(X_scaled, y)

joblib.dump(mejor_forest, 'random_forest_model.joblib')

print("Modelo entrenado y guardado exitosamente.")

Modelo entrenado y guardado exitosamente.


## Prueba de Predicción

In [32]:
test = pd.read_csv('../data/raw/test.csv')

test_ids = test['id']

columnas_no_necesarias = ['eyesight_left', 'eyesight_right', 'hearing_left', 'hearing_right', 'id']
test.drop(columns=columnas_no_necesarias, axis=1, errors='ignore', inplace=True)

scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(test)

mejor_forest = RandomForestClassifier(criterion='entropy', max_depth=8, max_features='sqrt', n_estimators=300, random_state=42)
mejor_forest.fit(X_train_scaled, y_train)  

probabilidades = mejor_forest.predict_proba(X_test_scaled)[:, 1]  # Probabilidades de la clase positiva
predicciones = ['Fumador' if prob >= 0.6 else 'No Fumador' for prob in probabilidades]

resultado_df = pd.DataFrame({'id': test_ids, 'Predicción': predicciones})

resultado_df.to_csv('../data/results/predicciones_fumadores.csv', index=False)

print("Archivo de predicciones generado con éxito.")

Archivo de predicciones generado con éxito.


In [48]:
# Preparación de archivos para pruebas de producción

df_test = pd.read_csv('../data/raw/test.csv')  

for i in range(1, 11):
    df_sample = df_test.sample(n=10)
    df_sample.to_csv(f'../data/processed/prueba_{i}.csv', index=False)

print("Archivos generados y guardados correctamente.")

Archivos generados y guardados correctamente.


<div style="text-align: center;">
    <a href="../app/Detector_Fumadores_full.py">
        <button style="padding: 10px 20px; font-size: 16px; cursor: pointer;">Paso a Producción</button>
    </a>
</div>