In [89]:
import pandas as pd
import numpy as np
import sklearn as sk
from IPython.display import display
DATASET = pd.read_csv('Titanic-Dataset.csv')
display(DATASET.head(3))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [90]:
display(DATASET.isna().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [91]:
DATASET.fillna({'Embarked': 'Unknown'}, inplace=True)
print(f"пропусков в Embarked: {int(DATASET['Embarked'].isna().sum())}")

пропусков в Embarked: 0


In [92]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import zlib  # для стабильного хэша 

class CabinFeaturizer(BaseEstimator, TransformerMixin):
    def __init__(self, cabin_col: str = "Cabin", add_hash: bool = False,
                 n_buckets: int = 128, drop_original: bool = False):
        self.cabin_col = cabin_col
        self.add_hash = add_hash
        self.n_buckets = n_buckets
        self.drop_original = drop_original

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        X = X.copy()

        col = self.cabin_col
        if col not in X.columns:   # проверяем именно входной X
            return X

        s = X[col].fillna("").astype(str).str.strip()
        tokens = s.str.split()
        first_tok = tokens.str[0].fillna("")

        # 1) есть ли каюта
        X["cabin_present"] = (s != "").astype(int)

        # 2) палуба: первая буква, иначе Unknown
        deck_letter = first_tok.str.extract(r"^([A-Za-z])", expand=False)
        X["deck"] = deck_letter.fillna("Unknown")

        # 3) сколько токенов (кают) указано
        X["cabin_count"] = tokens.apply(lambda x: len(x) if isinstance(x, list) else 0).astype(int)

        # 4) есть ли цифры в первом токене
        X["cabin_num_exists"] = first_tok.str.contains(r"\d", regex=True).astype(int)  # <-- с 's'

        # 5) стабильный хэш (опция)
        if self.add_hash:
            X["cabin_hash"] = first_tok.apply(
                lambda t: zlib.crc32(t.encode("utf-8")) % self.n_buckets
            ).astype(int)

        if self.drop_original:
            X = X.drop(columns=[col])

        return X  # всегда возвращаем


In [93]:
target_col = 'Survived'
y = DATASET[target_col].astype(int)
X = DATASET.drop(columns=[target_col])
cat_cols = ["Sex", "Embarked", "deck"]
num_cols = [ "Pclass", "Age", "SibSp", "Parch", "Fare",
    "cabin_present", "cabin_count", "cabin_num_exists"]


In [97]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

num_pipeline = Pipeline([
    ('imp', SimpleImputer(strategy='median')), 
    ('sc', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

prep = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)
    ],
    remainder='drop'
)

pipe_logreg = Pipeline([
    ('cabin_feat', CabinFeaturizer(cabin_col='Cabin', add_hash=True, drop_original=True)),
    ('prep', prep),
    ('clf', LogisticRegression(max_iter=2000))
])

CV = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
    'roc_auc': 'roc_auc'
}

res = cross_validate(
    pipe_logreg,
    X, y,
    cv=CV,
    scoring=scoring,
    n_jobs=1,
    return_train_score=False
)

print("LogReg 5-fold CV")
print("  accuracy: %.3f ± %.3f" % (res["test_accuracy"].mean(), res["test_accuracy"].std()))
print("  f1      : %.3f ± %.3f" % (res["test_f1"].mean(),       res["test_f1"].std()))
print("  roc_auc : %.3f ± %.3f" % (res["test_roc_auc"].mean(),  res["test_roc_auc"].std()))

svm_pipeline = Pipeline([
    ('cabin_feat', CabinFeaturizer(cabin_col="Cabin", add_hash=False, drop_original=True)),
    ('prep', prep),
     ("clf", SVC(kernel="rbf", probability=True))
])

res_svm = cross_validate(
    svm_pipeline,
    X,y,
    cv=CV,
    scoring=scoring,
    return_train_score=False
)
print("SVM (RBF) 5-fold CV")
print("  accuracy: %.3f ± %.3f" % (res_svm["test_accuracy"].mean(), res_svm["test_accuracy"].std()))
print("  f1      : %.3f ± %.3f" % (res_svm["test_f1"].mean(),       res_svm["test_f1"].std()))
print("  roc_auc : %.3f ± %.3f" % (res_svm["test_roc_auc"].mean(),  res_svm["test_roc_auc"].std()))

pipe_knn = Pipeline([
    ("cabin_feat", CabinFeaturizer(cabin_col="Cabin", add_hash=False, drop_original=True)),
    ("prep", prep),  # тот же ColumnTransformer (импутация + скейл + OHE)
    ("clf", KNeighborsClassifier(n_neighbors=5, weights="uniform", metric="minkowski", p=2))
])

res_knn = cross_validate(
    pipe_knn, X, y, cv=CV, scoring=scoring, n_jobs=-1, return_train_score=False
)
print("KNN (k=5) 5-fold CV")
print("  accuracy: %.3f ± %.3f" % (res_knn["test_accuracy"].mean(), res_knn["test_accuracy"].std()))
print("  f1      : %.3f ± %.3f" % (res_knn["test_f1"].mean(),       res_knn["test_f1"].std()))
print("  roc_auc : %.3f ± %.3f" % (res_knn["test_roc_auc"].mean(),  res_knn["test_roc_auc"].std()))

LogReg 5-fold CV
  accuracy: 0.801 ± 0.019
  f1      : 0.734 ± 0.031
  roc_auc : 0.851 ± 0.005
SVM (RBF) 5-fold CV
  accuracy: 0.815 ± 0.047
  f1      : 0.748 ± 0.063
  roc_auc : 0.847 ± 0.047
KNN (k=5) 5-fold CV
  accuracy: 0.802 ± 0.033
  f1      : 0.733 ± 0.047
  roc_auc : 0.842 ± 0.032
