In [5]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix,classification_report

# 1. Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# 2. Feature engineering
def preprocess_base(df):
    df = df.copy()
    # Family features
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

    # Title from Name
    df["Title"] = df["Name"].str.extract(" ([A-Za-z]+)\\.", expand=False)
    df["Title"] = df["Title"].replace(
        ["Lady","Countess","Capt","Col","Don","Dr","Major","Rev","Sir","Jonkheer","Dona"],
        "Rare"
    )
    df["Title"] = df["Title"].replace(["Mlle","Ms"], "Miss")
    df["Title"] = df["Title"].replace("Mme", "Mrs")

    # Child flag (often important on Titanic)
    df["IsChild"] = (df["Age"] < 16).astype(int)

    # Ticket group size (how many share same ticket)
    df["TicketGroupSize"] = df.groupby("Ticket")["Ticket"].transform("count")

    return df

train_pp = preprocess_base(train)
test_pp = preprocess_base(test)

# Features and target
features = ["Pclass","Sex","Age","SibSp","Parch","Fare",
            "Embarked","FamilySize","IsAlone","Title","IsChild","TicketGroupSize"]
num_features = ["Age","SibSp","Parch","Fare","FamilySize",
                "IsAlone","Pclass","IsChild","TicketGroupSize"]
cat_features = ["Sex","Embarked","Title"]
X = train_pp[features]
y = train_pp["Survived"]

X_test_final = test_pp[features]

# 3. Preprocessing
num_features = ["Age","SibSp","Parch","Fare","FamilySize","IsAlone","Pclass"]
cat_features = ["Sex","Embarked","Title"]

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median"))]
)

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]
)

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features),
    ]
)

# 4. Model (Gradient Boosting)
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(
    n_estimators=400,
    learning_rate=0.03,
    max_depth=3,
    subsample=0.9, 
    random_state=42
)

clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", model)
])

# 5. Cross-validation accuracy (uses only train.csv)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(clf, X, y, cv=cv, scoring="accuracy")
print("CV mean accuracy:", cv_scores.mean())
print("CV std:", cv_scores.std())

# 6. Train on full train data and predict test.csv
clf.fit(X, y)
test_preds = clf.predict(X_test_final)

# 7. Create submission / output file
submission = pd.DataFrame({
    "PassengerId": test_pp["PassengerId"],
    "Survived": test_preds
})
submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")

#8. Confusion Matrics
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

clf.fit(X_train, y_train)
val_pred = clf.predict(X_val)

print("CONFUSION MATRIX")
print(confusion_matrix(y_val, val_pred))

print("\nCLASSIFICATION REPORT")
print(classification_report(y_val, val_pred))



CV mean accuracy: 0.8451070240411775
CV std: 0.010564690867159745
Saved submission.csv
CONFUSION MATRIX
[[99 11]
 [20 49]]

CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.83      0.90      0.86       110
           1       0.82      0.71      0.76        69

    accuracy                           0.83       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.83      0.83      0.82       179

