# Titanic Survival Predictions 

The Objective of this competition was to predict whether passengers aboard the Titanic survived or not based on a set of features

In [3]:
# Load Required Libraries
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import pandas as pd
import numpy as np

In [4]:
# Load Data
train_df = pd.read_csv("titanic_train.csv")
test_df = pd.read_csv("titanic_test.csv")

In [5]:
# View features
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# High Level view of data 
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### Feature Engineering
1. Names will be dropped but would be interesting to keep titles (rare titles may correspond to higher survivial)
2. No need to have siblings and parents in two seperate features -> Combine these
3. Letter in Cabin is more informative about level on the ship
4. Use the first part of the ticket number

In [9]:
# Extract Title from Name using Regex
train_df["Title"] = train_df["Name"].str.extract(" ([A-Za-z]+)\.", expand=False)
test_df["Title"] = test_df["Name"].str.extract(" ([A-Za-z]+)\.", expand=False)

  train_df["Title"] = train_df["Name"].str.extract(" ([A-Za-z]+)\.", expand=False)
  test_df["Title"] = test_df["Name"].str.extract(" ([A-Za-z]+)\.", expand=False)


In [10]:
# Replace titles 'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona' with Rare
test_df["Title"].unique(), train_df["Title"].unique()

(array(['Mr', 'Mrs', 'Miss', 'Master', 'Ms', 'Col', 'Rev', 'Dr', 'Dona'],
       dtype=object),
 array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
        'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess',
        'Jonkheer'], dtype=object))

In [11]:
# Simplify Rare titles
for df in [train_df, test_df]:
    df["Title"] = df["Title"].replace(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
         'Sir', 'Jonkheer', 'Dona'], 'Rare'
    )
    df["Title"] = df["Title"].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Combine SibSp + Parch → FamilySize
for df in [train_df, test_df]:
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1

# Extract Cabin Letter
for df in [train_df, test_df]:
    df["CabinLetter"] = df["Cabin"].astype(str).str[0]
    df["CabinLetter"] = df["CabinLetter"].replace('n', np.nan)

# Extract Ticket Prefix
for df in [train_df, test_df]:
    df["TicketPrefix"] = df["Ticket"].apply(lambda x: x.split()[0] if not x.split()[0].isdigit() else 'None')

### Handle Missing Values

In [13]:
# Fill Embarked with mode as only two values missing
for df in [train_df, test_df]:
    df["Embarked"].fillna(train_df["Embarked"].mode()[0], inplace=True)

# Fill missing Fare in test set with median of same Pclass
test_df["Fare"].fillna(test_df.groupby("Pclass")["Fare"].transform("median"), inplace=True)

# Group-based imputation for Age (by Sex & Pclass) as 20% of Age values missing - too many to drop
for df in [train_df, test_df]:
    df["Age"] = df.groupby(["Sex", "Pclass"])["Age"].transform(lambda x: x.fillna(x.median()))

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Embarked"].fillna(train_df["Embarked"].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df["Fare"].fillna(test_df.groupby("Pclass")["Fare"].transform("median"), inplace=True)


In [14]:
# Drop Redundant Features
drop_cols = ["PassengerId", "Name", "Ticket", "Cabin", "SibSp", "Parch"]
train_df.drop(columns=drop_cols, inplace=True)
test_df.drop(columns=drop_cols, inplace=True)

### Create the Model and fit

In [16]:
# Split Features/Target
X = train_df.drop("Survived", axis=1)
y = train_df["Survived"]

# Define Feature Types
categorical_features = ["Sex", "Embarked", "Title", "CabinLetter", "TicketPrefix"]
numeric_features = ["Age", "Fare", "Pclass", "FamilySize"]

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

# Define Base Learners
base_model = [
    ("rf", RandomForestClassifier(n_estimators=200, random_state=42)),
    ("gb", GradientBoostingClassifier(n_estimators=200, random_state=42)),
    ("svc", SVC(probability=True, random_state=42))
]

# Define Stacking Classifier
stacking_model = StackingClassifier(
    estimators=base_model,
    final_estimator=LogisticRegression(max_iter=10000),
    cv=5
)

# Build Full Pipeline
model = Pipeline(steps=[
    ("preprocessor", preprocessor),     # encodes categorical features
    ("model", stacking_model)
])

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train Model
model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,estimators,"[('rf', ...), ('gb', ...), ...]"
,final_estimator,LogisticRegre...ax_iter=10000)
,cv,5
,stack_method,'auto'
,n_jobs,
,passthrough,False
,verbose,0

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,200
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,10000


### Evaluate Model and Generate Predictions for Test Data

In [18]:
# Evaluate
y_hat = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_hat))
print("Precision:", precision_score(y_test, y_hat))
print("Recall:", recall_score(y_test, y_hat))
print("F1 Score:", f1_score(y_test, y_hat))

# Predict on Test Data
test_pred = model.predict(test_df)

# Save predictions with PassengerId
submission = pd.DataFrame({
    "PassengerId": pd.read_csv("titanic_test.csv")["PassengerId"],
    "Survived": test_pred})

submission.to_csv("titanic_submission.csv", index=False)

Accuracy: 0.8324022346368715
Precision: 0.7894736842105263
Recall: 0.8108108108108109
F1 Score: 0.8
