# Titanic - Machine Learning from Disaster

# Model Building and Evaluation

This notebook focuses on building machine learning models to predict passenger survival on the Titanic based on insights derived from the EDA.
The workflow includes data preprocessing, pipeline creation, model training, evaluation, and selection of the best-performing model.

**Steps Covered:**
- Data preprocessing and feature engineering
- Train-test split
- Model training and hyperparameter tuning
- Model evaluation 
- Saving the final model for deployment

In [3]:
import joblib
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC


In [4]:
# Read train.csv (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer='train.csv')
# Show the first five rows as the cell output.
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [23]:
# CREATE CUSTOM TRANSFORMER
class TitleExtractor(BaseEstimator, TransformerMixin):
    """Replace the ``Name`` column with a ``Title`` column.

    The title is the first run of ASCII letters in ``Name`` that is preceded
    by a space and followed by a period, e.g. ``Mr`` in
    ``"Braund, Mr. Owen Harris"``. Names with no such token get a missing
    ``Title``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``Title`` added and ``Name`` dropped.

        ``X`` must be a DataFrame with a string ``Name`` column; the input
        frame itself is not modified.
        """
        X = X.copy()
        # Raw string: in a plain literal '\.' is an invalid escape sequence,
        # which Python reports with a warning when the cell is compiled.
        X['Title'] = X['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
        # Fold alternate spellings into their common equivalents.
        X['Title'] = X['Title'].replace({'Mlle':'Miss', 'Ms':'Miss', 'Mme':'Mrs'})
        # Collapse the listed uncommon titles into a single 'Rare' bucket.
        rare_titles = ['Dr','Rev','Col','Major','Don','Lady','Sir','Capt','Countess','Jonkheer','Dona']
        X['Title'] = X['Title'].replace(rare_titles, 'Rare')
        X = X.drop(columns=['Name'])
        return X

class FamilySizeTransformer(BaseEstimator, TransformerMixin):
    """Replace ``SibSp`` and ``Parch`` with a binned ``FamilySize`` column.

    Family size is computed as ``SibSp + Parch + 1`` and then bucketed into
    three ordered categories: ``Alone`` (size 1), ``Small`` (sizes 2-4) and
    ``Large`` (size 5 and above).
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``FamilySize`` added and ``SibSp``/``Parch`` dropped.

        ``X`` must be a DataFrame with numeric ``SibSp`` and ``Parch`` columns;
        the input frame itself is not modified. Rows where either count is
        missing get a missing ``FamilySize``.
        """
        X = X.copy()
        X['FamilySize'] = X['SibSp'] + X['Parch'] + 1
        X = X.drop(columns=['SibSp','Parch'])
        # The top bin is open-ended. With a finite upper edge (previously 20)
        # any larger family fell outside every bin and silently became NaN.
        X['FamilySize'] = pd.cut(
            X['FamilySize'],
            bins=[0, 1, 4, float('inf')],
            labels=['Alone','Small','Large'],
        )
        return X

class FareLogTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies ``log(1 + x)`` to the ``Fare`` column."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to fit; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``Fare`` column is replaced by ``np.log1p(Fare)``."""
        frame = X.copy()
        raw_fare = frame['Fare']
        frame['Fare'] = np.log1p(raw_fare)
        return frame


  X['Title'] = X['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


In [24]:
# BUILD A PREPROCESSING PIPELINE
# Feature columns, grouped by how they are preprocessed.
# 'Title' and 'FamilySize' do not exist in the raw CSV; they are created by
# the TitleExtractor and FamilySizeTransformer steps.
# NOTE(review): columns not listed here (e.g. 'Pclass') are discarded, because
# ColumnTransformer drops unlisted columns by default - confirm that is intended.
numeric_cols = ['Age', 'Fare']
categorical_cols = ['Sex', 'Embarked', 'Title', 'FamilySize']

# Numeric features: fill missing values with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode with the first level of each feature dropped.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])


In [25]:
# FULL PIPELINE WITH A LOGISTIC REGRESSION MODEL
# The feature-engineering steps come first so that 'Title' and 'FamilySize'
# exist by the time the column-wise preprocessor selects them.
pipeline = Pipeline(steps=[
    ('title', TitleExtractor()),
    ('family', FamilySizeTransformer()),
    ('fare_log', FareLogTransformer()),
    ('preprocess', preprocessor),
    ('clf', LogisticRegression(max_iter=1000)),
])

# Separate the target column from the features.
y = data['Survived']
X = data.drop(columns=['Survived'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit every step (feature engineering, preprocessing, classifier) on the training split only.
pipeline.fit(X_train, y_train)

# Report the pipeline's score on both splits.
print("Train Accuracy:", pipeline.score(X_train, y_train))
print("Test Accuracy:", pipeline.score(X_test, y_test))


Train Accuracy: 0.8314606741573034
Test Accuracy: 0.8156424581005587


In [26]:
# Persist the fitted pipeline (feature engineering, preprocessing and classifier)
# as a single file. joblib is imported in the notebook's imports cell.
joblib.dump(pipeline,"titanic_model.pkl")
print("Pipeline saved as titanic_model.pkl")

Pipeline saved as titanic_model.pkl


In [27]:
# Reload the pipeline that was written to titanic_model.pkl.
# Despite its name, `loaded_data` holds the fitted pipeline object, not a dataset.
# joblib.load unpickles the file, so only load files that come from a trusted source.
loaded_data = joblib.load(filename="titanic_model.pkl")

In [28]:
# Read test.csv (path is relative to the notebook's working directory).
test_df = pd.read_csv(filepath_or_buffer='test.csv')
# Show the first five rows as the cell output.
test_df.head(n=5)

In [29]:
# Keep the passenger IDs in their own variable.
passenger_ids = test_df["PassengerId"]
# dropping unnecessary columns
# NOTE(review): TitleExtractor.transform reads X['Name'], so this reduced frame
# cannot be passed through a pipeline that contains that step.
test_X = test_df.drop(['PassengerId','Ticket','Name'], axis=1)
test_X.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked'], dtype='object')

In [None]:
# predict
# `loaded_data` is the pipeline reloaded from titanic_model.pkl. The complete
# test_df is passed because the pipeline's title step needs the 'Name' column.
predictions = loaded_data.predict(test_df)
# Creating the submission: one row per passenger ID with its predicted label.
submission = pd.DataFrame({"PassengerId": passenger_ids}).assign(Survived=predictions)
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv created")

✅ submission.csv created
