In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, roc_auc_score, accuracy_score

url = 'https://gitlab.com/francisco.arduh/datasets/-/raw/main/Titanic-Dataset.csv'
data = pd.read_csv(url)

#En la variable de características borro las columnas irrelevantes
X=data.drop(['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
Y=data['Survived']

# Codificar variables categóricas
le = LabelEncoder()
X['Sex'] = le.fit_transform(X['Sex'])
X['Embarked'] = le.fit_transform(X['Embarked'].fillna('S'))

# Eliminar filas con datos faltantes
X = X.dropna()

# Dividir en conjunto de entrenamiento y prueba
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Definir los modelos a evaluar
log_clf = LogisticRegression(random_state=42)
forest_clf = RandomForestClassifier(random_state=42)

# Crear un pipeline para cada modelo
log_pipeline = make_pipeline(
    StandardScaler(),
    log_clf
)
forest_pipeline = make_pipeline(
    StandardScaler(),
    forest_clf
)

# Entrenar y evaluar los modelos con validación cruzada
log_auc_scores = cross_val_score(log_pipeline, x_train, y_train, cv=3, scoring='roc_auc')
forest_auc_scores = cross_val_score(forest_pipeline, x_train, y_train, cv=3, scoring='roc_auc')

# Elegir el modelo con mejor AUC-ROC
if log_auc_scores.mean() > forest_auc_scores.mean():
    best_model = log_pipeline
    print("Mejor modelo: Logistic Regression")
else:
    best_model = forest_pipeline
    print("Mejor modelo: Random Forest Classifier")

# Entrenar el mejor modelo con todos los datos de entrenamiento
best_model.fit(x_train, y_train)

# Predecir las clases y los puntajes para la región de prueba
y_pred = best_model.predict(x_test)
y_scores = best_model.predict_proba(x_test)[:, 1]

# Calcular y mostrar las métricas
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred))
print("AUC-ROC:", roc_auc_score(y_test, y_scores))

ValueError: Found input variables with inconsistent numbers of samples: [714, 891]