In [1]:
from pathlib import Path 
import pandas as pd
import numpy as np

def load_titanic_data_train():
    return pd.read_csv(Path("titanic/train.csv"))

def load_titanic_data_test():
    return pd.read_csv(Path("titanic/test.csv"))

titanic_train = load_titanic_data_train()

titanic_train_X = titanic_train.drop("Survived", axis=1)
titanic_train_y = titanic_train["Survived"].copy()

titanic_train_y = pd.DataFrame(titanic_train_y)
titanic_train_y

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


In [2]:
titanic_train_X["Title"] = titanic_train_X["Name"].str.extract(r' ([A-Za-z]+)\.')

# Replace rare titles
title_mapping = {
    "Mr": "Mr", "Mrs": "Mrs", "Miss": "Miss", "Master": "Master",
    "Dr": "RareTitle", "Rev": "RareTitle", "Col": "RareTitle", "Major": "RareTitle", "Capt": "RareTitle",
    "Don": "Noble", "Sir": "Noble", "Lady": "Noble", "Countess": "Noble",
    "Mme": "Miss", "Mlle": "Miss"  # French equivalents
}

titanic_train_X["Title"] = titanic_train_X["Title"].map(lambda x: title_mapping.get(x, "RareTitle"))

# Check title distribution
print(titanic_train_X["Title"].value_counts())

Title
Mr           517
Miss         185
Mrs          125
Master        40
RareTitle     20
Noble          4
Name: count, dtype: int64


In [3]:
titanic_train_X = titanic_train_X.drop(columns=["PassengerId", "Name", "SibSp", "Parch", "Ticket", "Cabin", "Embarked"])

In [4]:
titanic_train_X.columns

Index(['Pclass', 'Sex', 'Age', 'Fare', 'Title'], dtype='object')

In [5]:
skewness = titanic_train_X.skew(numeric_only=True)
print(skewness)

Pclass   -0.630548
Age       0.389108
Fare      4.787317
dtype: float64


In [6]:
titanic_train_X["Fare"] = np.log1p(titanic_train_X["Fare"])

In [7]:
from sklearn.impute import KNNImputer

knn_imputer = KNNImputer(n_neighbors=5)

titanic_train_X["Age"] = knn_imputer.fit_transform(titanic_train_X[["Age"]])

In [8]:
bins = [-1, 12, 18, 35, 60, 100]  # Define age groups
labels = ["Child", "Teen", "YoungAdult", "Adult", "Senior"]

titanic_train_X["AgeGroup"] = pd.cut(titanic_train_X["Age"], bins, labels=labels)

# Convert to numerical codes (for ML models)
titanic_train_X["AgeGroup"] = titanic_train_X["AgeGroup"].astype("category").cat.codes

In [9]:
fare_bins = [-1, 1.5, 3, 4.5, 6.5]  # Define fare groups
fare_labels = ["Low", "Medium", "High", "VeryHigh"]

titanic_train_X["FareGroup"] = pd.cut(titanic_train_X["Fare"], bins=fare_bins, labels=fare_labels)

# Convert to numerical codes
titanic_train_X["FareGroup"] = titanic_train_X["FareGroup"].astype("category").cat.codes


In [10]:
titanic_train_X

Unnamed: 0,Pclass,Sex,Age,Fare,Title,AgeGroup,FareGroup
0,3,male,22.000000,2.110213,Mr,2,1
1,1,female,38.000000,4.280593,Mrs,3,2
2,3,female,26.000000,2.188856,Miss,2,1
3,1,female,35.000000,3.990834,Mrs,2,2
4,3,male,35.000000,2.202765,Mr,2,1
...,...,...,...,...,...,...,...
886,2,male,27.000000,2.639057,RareTitle,2,1
887,1,female,19.000000,3.433987,Miss,2,2
888,3,female,29.699118,3.196630,Miss,2,2
889,1,male,26.000000,3.433987,Mr,2,2


In [11]:
titanic_train_X = titanic_train_X.drop(columns=["Age", "Fare"])

In [12]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
titanic_train_X["Title"] = label_encoder.fit_transform(titanic_train_X["Title"])


In [13]:
titanic_train_X["Sex"] = titanic_train_X["Sex"].map({"male": 1, "female": 0})

In [14]:
titanic_train_X

Unnamed: 0,Pclass,Sex,Title,AgeGroup,FareGroup
0,3,1,2,2,1
1,1,0,3,3,2
2,3,0,1,2,1
3,1,0,3,2,2
4,3,1,2,2,1
...,...,...,...,...,...
886,2,1,5,2,1
887,1,0,1,2,2
888,3,0,1,2,2
889,1,1,2,2,2


In [15]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd

class TitleExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X["Title"] = X["Name"].str.extract(r' ([A-Za-z]+)\.')

        title_mapping = {
            "Mr": "Mr", "Mrs": "Mrs", "Miss": "Miss", "Master": "Master",
            "Dr": "RareTitle", "Rev": "RareTitle", "Col": "RareTitle", "Major": "RareTitle", "Capt": "RareTitle",
            "Don": "Noble", "Sir": "Noble", "Lady": "Noble", "Countess": "Noble",
            "Mme": "Miss", "Mlle": "Miss"
        }
        X["Title"] = X["Title"].map(lambda x: title_mapping.get(x, "RareTitle"))
        return X

class AgeFareBinner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        # Age bins
        age_bins = [-1, 12, 18, 35, 60, 100]
        age_labels = ["Child", "Teen", "YoungAdult", "Adult", "Senior"]
        X["AgeGroup"] = pd.cut(X["Age"], bins=age_bins, labels=age_labels).astype("category").cat.codes

        # Fare bins
        fare_bins = [-1, 1.5, 3, 4.5, 6.5]
        fare_labels = ["Low", "Medium", "High", "VeryHigh"]
        X["FareGroup"] = pd.cut(X["Fare"], bins=fare_bins, labels=fare_labels).astype("category").cat.codes

        return X.drop(columns=["Age", "Fare"])

preprocessor = Pipeline([
    ("title_extractor", TitleExtractor()), 
    ("column_transform", ColumnTransformer([
        ("drop_columns", "drop", ["PassengerId", "Name", "SibSp", "Parch", "Ticket", "Cabin", "Embarked"]),
        ("log_fare", FunctionTransformer(np.log1p, validate=False), ["Fare"]),
        ("impute_age", KNNImputer(n_neighbors=5), ["Age"]),
        ("binning", AgeFareBinner(), ["Age", "Fare"]),
        ("onehot", OneHotEncoder(handle_unknown="ignore", drop="first"), ["Title", "Sex"])  
    ], remainder="passthrough"))
])

pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=300, random_state=40))
])

# Hyperparameter tuning
pipeline.set_params(
    classifier__n_estimators=200,     
    classifier__max_depth=10,         
    classifier__min_samples_split=10,  
    classifier__min_samples_leaf=4,   
    classifier__max_features="sqrt",  
    classifier__random_state=42     
)


In [16]:
titanic_train = load_titanic_data_train()

titanic_train_X = titanic_train.drop("Survived", axis=1)
titanic_train_y = titanic_train["Survived"].copy()

In [17]:
pipeline.fit(titanic_train_X, titanic_train_y)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [18]:
from sklearn.model_selection import cross_val_score

cross_val_score(pipeline, titanic_train_X, titanic_train_y, cv=3, scoring="accuracy")

array([0.8013468 , 0.86195286, 0.81144781])

In [19]:
titanic_test = load_titanic_data_test()

titanic_pred_y = pipeline.predict(titanic_test)

output_df = pd.DataFrame({
    "PassengerId": titanic_test["PassengerId"],  
    "Survived": titanic_pred_y
})

output_df.to_csv("titanic_predictions_main.csv", index=False)

In [20]:
# Grid search for hyperparameter tuning

# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     "classifier__n_estimators": [100, 200, 300],
#     "classifier__max_depth": [5, 10, 15, None],
#     "classifier__min_samples_split": [2, 5, 10],
#     "classifier__min_samples_leaf": [1, 2, 4],
#     "classifier__max_features": ["sqrt", "log2"]
# }

# grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=2)
# grid_search.fit(titanic_train_X, titanic_train_y)

# # Print the best parameters and accuracy
# print("Best Parameters:", grid_search.best_params_)
# print("Best Score:", grid_search.best_score_)


In [21]:
import dill

with open('pipeline.pkl', 'wb') as f:
    dill.dump(pipeline, f)