In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Plot styling applied to every seaborn/matplotlib figure drawn below.
sns.set_theme(style="whitegrid")

# Name of the seaborn example dataset used throughout the notebook.
dataset_name = "titanic"
df = sns.load_dataset(dataset_name)

# Preview the first rows to confirm the load worked.
df.head()


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

sns.set_theme(style="whitegrid")

# Target: the column the models will learn to predict.
y = df["survived"]

# Features (no leakage): a hand-picked list of predictor columns.
features = [
    "pclass",
    "sex",
    "age",
    "sibsp",
    "parch",
    "fare",
    "embarked",
]
X = df.loc[:, features]



In [None]:
# Dtypes that decide which preprocessing branch a column is routed to.
numeric_dtypes = ["int64", "float64"]
categorical_dtypes = ["object", "category", "bool"]

numeric_features = X.select_dtypes(include=numeric_dtypes).columns
categorical_features = X.select_dtypes(include=categorical_dtypes).columns

# Display both column indexes for a quick visual check.
(numeric_features, categorical_features)


In [None]:
# Target column.
y = df["survived"]

# Drop the target AND "alive": in seaborn's titanic dataset "alive" is the same
# label as "survived" written as "yes"/"no". Leaving it in X leaks the answer
# into the features and makes every downstream metric meaningless.
X = df.drop(columns=["survived", "alive"])

# Optional: also drop columns that are unusual or mostly missing.
# X = X.drop(columns=["deck"])  # (deck has many NaN values)

# Re-derive the column groups, because X now holds a different set of columns
# than when these names were first computed.
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object", "category", "bool"]).columns

# Display both groups so the reader can confirm no label column slipped through.
numeric_features, categorical_features


In [None]:
# Numeric branch: fill missing values with the column median.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own branch.
preprocess = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# Show the assembled (still unfitted) transformer.
preprocess


In [None]:
# Hold out 20% of the rows for evaluation. The split is seeded so it is
# repeatable, and stratified on y so both parts keep the same class ratio.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Show the resulting train/test dimensions.
(X_train.shape, X_test.shape)


In [None]:
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Baseline model: shared preprocessing recipe followed by logistic regression.
baseline_clf = Pipeline(steps=[
    # Use an unfitted copy of the preprocessor. Pipeline.fit fits its steps in
    # place, so embedding the `preprocess` object itself would let any other
    # pipeline built from it overwrite this pipeline's fitted state.
    ("preprocess", clone(preprocess)),
    ("model", LogisticRegression(max_iter=1000))
])

baseline_clf.fit(X_train, y_train)
y_pred_base = baseline_clf.predict(X_test)

print("BASELINE (Logistic Regression)")
print("Accuracy:", accuracy_score(y_test, y_pred_base))
# Concatenate instead of passing two print() arguments: the separator space
# print() inserts would shift the report's header row out of alignment.
print("\nClassification report:\n" + classification_report(y_test, y_pred_base))


In [None]:
# Confusion counts for the baseline predictions on the held-out set.
cm = confusion_matrix(y_test, y_pred_base)

# Draw the counts as an annotated heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt="d", ax=ax)
ax.set_title("Confusion Matrix - Baseline (Logistic Regression)")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


In [None]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier

# Advanced model: shared preprocessing recipe followed by a random forest.
advanced_clf = Pipeline(steps=[
    # Use an unfitted copy of the preprocessor. Pipeline.fit fits its steps in
    # place, so embedding the `preprocess` object itself would make this fit
    # overwrite the preprocessor held by any other pipeline built from it.
    ("preprocess", clone(preprocess)),
    ("model", RandomForestClassifier(
        n_estimators=300,
        random_state=42,  # seeded so the forest is repeatable
        class_weight="balanced"
    ))
])

advanced_clf.fit(X_train, y_train)
y_pred_adv = advanced_clf.predict(X_test)

print("ADVANCED (Random Forest)")
print("Accuracy:", accuracy_score(y_test, y_pred_adv))
# Concatenate instead of passing two print() arguments: the separator space
# print() inserts would shift the report's header row out of alignment.
print("\nClassification report:\n" + classification_report(y_test, y_pred_adv))


In [None]:
# Confusion counts for the random forest predictions on the held-out set.
cm_adv = confusion_matrix(y_test, y_pred_adv)

# Draw the counts as an annotated heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm_adv, annot=True, fmt="d", ax=ax)
ax.set_title("Confusion Matrix - Advanced (Random Forest)")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


In [None]:
from sklearn.metrics import f1_score

# Test-set predictions keyed by the label each model gets in the table.
predictions = {
    "LogisticRegression": y_pred_base,
    "RandomForest": y_pred_adv,
}

# One row per model, with accuracy and F1 computed against the same y_test.
results = pd.DataFrame({
    "model": list(predictions),
    "accuracy": [accuracy_score(y_test, pred) for pred in predictions.values()],
    "f1": [f1_score(y_test, pred) for pred in predictions.values()],
})

# Best F1 first.
results.sort_values(by="f1", ascending=False)


In [None]:
# Closing status message summarising what the notebook assembled.
print("Built a full preprocessing + modeling pipeline (imputation + one-hot encoding).")


In [None]:
# Display the feature columns currently held in X.
X.columns


In [None]:
# Display the shapes of the feature matrix and the target as a tuple.
X.shape, y.shape
