In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, roc_auc_score


In [3]:
# Load the labeled training data from disk
file_path = 'diabetes_prediction_dataset_train-labeled.csv'
dataset = pd.read_csv(file_path)

# Target variable, and the feature matrix with the target and the
# 'patient' column removed
y = dataset['diabetes']
X = dataset.drop(['diabetes', 'patient'], axis=1)

# Column groups: the two listed columns are categorical; every other
# remaining feature column is treated as numerical
categorical_features = ['gender', 'smoking_history']
numerical_features = X.columns.difference(categorical_features)

# Preprocessing for numerical columns: standard scaling
numerical_pipeline = Pipeline([('scaler', StandardScaler())])

# Preprocessing for categorical columns: one-hot encoding; with
# handle_unknown='ignore', categories not seen during fit do not raise
categorical_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own preprocessing pipeline
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])


In [4]:
# Display the feature matrix to inspect its columns and sample rows
X

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
0,Male,4.0,0,0,never,19.28,3.5,155
1,Female,50.0,0,0,former,27.32,5.7,159
2,Female,43.0,0,0,never,21.54,4.5,145
3,Male,64.0,0,0,current,28.22,6.0,130
4,Male,10.0,0,0,No Info,15.46,6.1,140
...,...,...,...,...,...,...,...,...
94995,Female,56.0,0,0,never,22.30,6.2,158
94996,Female,24.0,0,0,never,22.50,6.1,126
94997,Female,32.0,0,0,never,24.92,6.6,145
94998,Female,21.0,0,0,current,20.59,4.5,155


In [5]:
# Display the target series ('diabetes' labels) to inspect its values and length
y

0        0
1        0
2        0
3        0
4        0
        ..
94995    0
94996    0
94997    0
94998    0
94999    0
Name: diabetes, Length: 95000, dtype: int64

In [6]:
# Assemble the end-to-end model: the shared preprocessing step followed by
# a neural-network classifier with a single hidden layer of 50 units.
# random_state is fixed so repeated runs are reproducible.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    (
        'classifier',
        MLPClassifier(
            hidden_layer_sizes=(50,),
            max_iter=300,
            random_state=42,
        ),
    ),
])

# Train on the full feature matrix and target; kept as the last expression
# so the notebook displays the fitted pipeline
model_pipeline.fit(X, y)
