# Titanic - Machine Learning from Disaster
## Problem Statement
Predict whether a passenger survived the Titanic shipwreck based on features like age, gender, class, and more.

# Import Libraries and Set Up

In [None]:
# Import essential libraries for data handling, visualization, and modeling
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [None]:
# Walk the Kaggle input directory and print the full path of every file it contains
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Load and Prepare Data

In [None]:
# Read the Kaggle train/test CSVs
train = pd.read_csv('/kaggle/input/titanic-machine-learning-from-disaster-dataset/train.csv')
test = pd.read_csv('/kaggle/input/titanic-machine-learning-from-disaster-dataset/test.csv')

# Flag where each row came from (1 = train, 0 = test) so the frames can be split apart
# again after feature engineering; test rows get a NaN placeholder for 'Survived'
train = train.assign(train_test=1)
test = test.assign(train_test=0, Survived=np.nan)

# Stack both frames so preprocessing is applied to them in one pass
# (row labels of each frame are kept as-is, so labels repeat across the two parts)
all_data = pd.concat([train, test])

# EDA (Exploratory Data Analysis)

In [None]:
# Print each training column's dtype and non-null count to spot missing values
train.info()

In [None]:
# Summary statistics of the training set's numeric columns.
# A first look at central tendency and spread before plotting anything.
train.describe()

In [None]:
# Preview the first rows of the training set
train.head()

In [None]:
# Quick way to list the numeric columns: take the column index of the describe() summary
train.describe().columns

In [None]:
# Two views of the training data: numeric columns (histograms, correlations)
# and categorical columns plus the target (count plots)
df_num = train.loc[:, ['Age', 'SibSp', 'Parch', 'Fare']]
df_cat = train.loc[:, ['Survived', 'Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked']]

In [None]:
# One 20-bin histogram per numeric feature
for feature, values in df_num.items():
    plt.hist(values, bins=20)
    plt.title(feature)
    plt.show()

In [None]:
# Pairwise correlations between the numeric features: printed as a table,
# then drawn as an annotated heatmap
numeric_corr = df_num.corr()
print(numeric_corr)
sns.heatmap(numeric_corr, annot=True)

In [None]:
# Aggregate each numeric feature per value of Survived (pivot_table's default aggregation)
train.pivot_table(index='Survived', values=['Age', 'SibSp', 'Parch', 'Fare'])

In [None]:
# One bar chart of category frequencies per categorical feature
for feature in df_cat.columns:
    counts = df_cat[feature].value_counts()
    sns.barplot(x=counts.index, y=counts.values)
    plt.title(feature)
    plt.xticks(rotation=45)
    plt.show()
    

In [None]:
# Passenger counts per (Survived, category) for each categorical feature,
# with a separator line between consecutive tables
for position, feature in enumerate(['Pclass', 'Sex', 'Embarked']):
    if position > 0:
        print("----------------------------")
    print(train.pivot_table(index='Survived', columns=feature, values='Ticket', aggfunc='count'))


# Feature Engineering

**Feature engineering questions**
1) Cabin - Simplify the cabin field: does the cabin letter (cabin_adv) or the purchase of tickets across multiple cabins (cabin_multiple) impact survival?
2) Tickets - Do different ticket types impact survival rates?
3) Name - Does a person's title relate to survival rates?

In [None]:
# cabin_multiple: number of space-separated cabins listed for the passenger (0 when Cabin is missing)
train['cabin_multiple'] = train['Cabin'].apply(
    lambda cabin: len(cabin.split(' ')) if pd.notna(cabin) else 0
)
train['cabin_multiple'].value_counts()

In [None]:
# Passenger counts for each (Survived, cabin_multiple) combination
train.pivot_table(index='Survived', columns='cabin_multiple', values='Ticket', aggfunc='count')

In [None]:
# cabin_adv: first character of the cabin value's string form
# (a missing cabin is NaN, whose string form starts with 'n')
train['cabin_adv'] = train['Cabin'].apply(lambda cabin: str(cabin)[0])

In [None]:
# Frequency of each cabin letter, then passenger counts per (Survived, cabin letter)
cabin_letter_counts = train['cabin_adv'].value_counts()
print(cabin_letter_counts)
train.pivot_table(index='Survived', columns='cabin_adv', values='Name', aggfunc='count')

In [None]:
# Ticket features:
#   numeric_ticket - 1 when the whole ticket string is numeric, else 0
#   ticket_letters - ticket prefix, or 0 when the ticket has no prefix
def _ticket_prefix(ticket):
    """Return the lower-cased text before the last space-separated token, without '.' or '/'.

    Returns the integer 0 when the ticket consists of a single token.
    """
    prefix_parts = ticket.split(' ')[:-1]
    if not prefix_parts:
        return 0
    return ''.join(prefix_parts).replace('.', '').replace('/', '').lower()


train['numeric_ticket'] = train['Ticket'].apply(lambda ticket: int(ticket.isnumeric()))
train['ticket_letters'] = train['Ticket'].apply(_ticket_prefix)

In [None]:
# How many tickets are fully numeric (1) versus not (0)
train['numeric_ticket'].value_counts()

In [None]:
# Remove pandas' row-display limit so every ticket prefix is shown (scrollable output).
# This display option stays set for the following cells as well.
pd.options.display.max_rows = None
train['ticket_letters'].value_counts()

In [None]:
# Passenger counts per (Survived, numeric_ticket)
train.pivot_table(index='Survived', columns='numeric_ticket', values='Ticket', aggfunc='count')

In [None]:
# Passenger counts per (Survived, ticket prefix)
train.pivot_table(index='Survived', columns='ticket_letters', values='Ticket', aggfunc='count')

In [None]:
# name_title: the piece of Name after the first comma, cut at its first period and stripped
# (mr., ms., master. etc)
train['name_title'] = train['Name'].apply(
    lambda full_name: full_name.split(',')[1].split('.')[0].strip()
)

In [None]:
# Frequency of each extracted title in the training set
train['name_title'].value_counts()

In [None]:
# Repeat the feature engineering explored on `train`, this time on the combined frame
# so train and test rows receive identical columns

# Cabin: number of cabins listed (0 when missing) and first character of the cabin value
all_data['cabin_multiple'] = all_data['Cabin'].apply(
    lambda cabin: len(cabin.split(' ')) if pd.notna(cabin) else 0
)
all_data['cabin_adv'] = all_data['Cabin'].apply(lambda cabin: str(cabin)[0])

# Ticket: fully-numeric flag and cleaned prefix (0 when the ticket has no prefix)
all_data['numeric_ticket'] = all_data['Ticket'].apply(lambda ticket: int(ticket.isnumeric()))
all_data['ticket_letters'] = all_data['Ticket'].apply(
    lambda ticket: (
        ''.join(ticket.split(' ')[:-1]).replace('.', '').replace('/', '').lower()
        if ticket.split(' ')[:-1]
        else 0
    )
)

# Name: title between the first comma and the following period
all_data['name_title'] = all_data['Name'].apply(
    lambda full_name: full_name.split(',')[1].split('.')[0].strip()
)

In [None]:
# Fill missing Age and Fare with the medians of the training set,
# then remove the rows that have no Embarked value
age_median = train['Age'].median()
fare_median = train['Fare'].median()
all_data['Age'] = all_data['Age'].fillna(age_median)
all_data['Fare'] = all_data['Fare'].fillna(fare_median)
all_data.dropna(subset=['Embarked'], inplace=True)

In [None]:
# Log-transform (log(x + 1)) the SibSp and Fare columns into norm_sibsp / norm_fare
for raw_col, norm_col in (('SibSp', 'norm_sibsp'), ('Fare', 'norm_fare')):
    all_data[norm_col] = np.log(all_data[raw_col] + 1)

In [None]:
# Cast Pclass to string so get_dummies one-hot encodes it instead of keeping it numeric
all_data['Pclass'] = all_data['Pclass'].astype(str)

# Columns used for modeling; train_test is carried along only to re-split the rows later
model_columns = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'norm_fare', 'Embarked',
    'cabin_adv', 'cabin_multiple', 'numeric_ticket', 'name_title', 'train_test',
]
all_dummies = pd.get_dummies(all_data[model_columns])

In [None]:
# Split the encoded frame back into train/test using the train_test flag,
# dropping the flag itself from the feature matrices
is_train_row = all_dummies['train_test'] == 1
is_test_row = all_dummies['train_test'] == 0
X_train = all_dummies[is_train_row].drop(columns=['train_test'])
X_test = all_dummies[is_test_row].drop(columns=['train_test'])

# Target values of the training rows
y_train = all_data.loc[all_data['train_test'] == 1, 'Survived']

# Scaling Data

In [None]:
from sklearn.preprocessing import StandardScaler

# Continuous columns to standardize; the remaining columns are left unscaled
numeric_features = ['Age', 'SibSp', 'Parch', 'norm_fare']

scale = StandardScaler()
all_dummies_scaled = all_dummies.copy()

# Learn mean/std from the TRAINING rows only, so no test-set statistics leak into
# the model, then apply that same transform to every row (train and test).
# A positional boolean array is used because the combined frame's row labels repeat.
train_rows = (all_dummies_scaled['train_test'] == 1).to_numpy()
scale.fit(all_dummies_scaled.loc[train_rows, numeric_features])
all_dummies_scaled[numeric_features] = scale.transform(all_dummies_scaled[numeric_features])

# Re-split the scaled frame, dropping the helper flag from the feature matrices
X_train_scaled = all_dummies_scaled[all_dummies_scaled['train_test'] == 1].drop(['train_test'], axis=1)
X_test_scaled = all_dummies_scaled[all_dummies_scaled['train_test'] == 0].drop(['train_test'], axis=1)

 # Baseline Models

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [None]:
def evaluate_model(model, X, y, name="Model", scaled=True):
    """Print the mean 5-fold cross-validation score of ``model`` on ``(X, y)``.

    Args:
        model: Estimator passed to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.
        name: Label printed in front of the score.
        scaled: When true, " (scaled)" is appended to the label. This only affects
            the printed text; the data in ``X`` is used as given.
    """
    mean_accuracy = cross_val_score(model, X, y, cv=5).mean()
    suffix = ""
    if scaled:
        suffix = " (scaled)"
    print(f"{name} CV Mean Accuracy{suffix}: {mean_accuracy:.4f}")

In [None]:
# Baseline estimators keyed by display name; insertion order is the evaluation order
models = dict([
    ("Naive Bayes", GaussianNB()),
    ("Logistic Regression", LogisticRegression(max_iter=2000)),
    ("Decision Tree", tree.DecisionTreeClassifier(random_state=1)),
    ("KNN", KNeighborsClassifier()),
    ("Random Forest", RandomForestClassifier(random_state=1)),
    # probability=True so the SVC can take part in soft voting
    ("SVC", SVC(probability=True)),
    ("XGBoost", XGBClassifier(random_state=1)),
])

In [None]:
# Cross-validate every baseline on both feature sets.
# The `scaled` flag only controls the " (scaled)" tag in the printed label, so it must
# match the data actually passed in (the flags were previously swapped).
for name, model in models.items():
    evaluate_model(model, X_train_scaled, y_train, name, scaled=True)
    evaluate_model(model, X_train, y_train, name, scaled=False)

# Voting Classifier

In [None]:
# A Voting Classifier combines predictions from multiple models.
#
# - "Hard" voting: each model votes for a class and the majority wins.
#   (Tip: use an odd number of models to reduce the chance of a tie.)
#
# - "Soft" voting: each model provides a probability for each class and the
#   probabilities are averaged; the class with the highest average is predicted.

from sklearn.ensemble import VotingClassifier

# Every baseline model takes part; estimator names are the display names
# lower-cased with spaces replaced by underscores
voting_clf = VotingClassifier(
    estimators=[(name.lower().replace(' ', '_'), model) for name, model in models.items()],
    voting='soft'
)

# The `scaled` flag only tags the printed label, so it must match the data passed in
# (the flags were previously swapped).
evaluate_model(voting_clf, X_train_scaled, y_train, "Voting Classifier", scaled=True)
evaluate_model(voting_clf, X_train, y_train, "Voting Classifier", scaled=False)

In [None]:
# Train the baseline voting ensemble on the scaled training set, predict the scaled
# test set, and cast the predicted labels to int for the submission file
y_hat_base_vc = voting_clf.fit(X_train_scaled, y_train).predict(X_test_scaled).astype(int)

# Kaggle submission layout: PassengerId plus the predicted Survived label
submission = test[['PassengerId']].assign(Survived=y_hat_base_vc)
submission.to_csv('base_submission.csv', index=False)

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV 
from scipy.stats import uniform, randint

In [None]:
# Helper function for reporting
def clf_performance(classifier, model_name):
    """Print a short report for a fitted hyperparameter search.

    Args:
        classifier: Object exposing ``best_score_`` and ``best_params_``.
        model_name: Heading printed on the first line of the report.
    """
    print(model_name)
    best_score = classifier.best_score_
    print(f'Best Score: {best_score:.4f}')
    best_params = classifier.best_params_
    print(f'Best Parameters: {best_params}')

In [None]:
# Logistic regression: exhaustive search over penalty type and 20 log-spaced values of C
param_grid = dict(
    max_iter=[2000],
    penalty=['l1', 'l2'],
    C=np.logspace(-4, 4, 20),
    solver=['liblinear'],
)

clf_lr = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    cv=5,
    verbose=True,
    n_jobs=-1,
)
best_clf_lr = clf_lr.fit(X_train_scaled, y_train)
clf_performance(best_clf_lr, 'Logistic Regression')

In [None]:
# KNN: exhaustive search over neighbour count, weighting scheme, search algorithm
# and the Minkowski power parameter p
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree'],
    p=[1, 2],
)

clf_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=5,
    verbose=True,
    n_jobs=-1,
)
best_clf_knn = clf_knn.fit(X_train_scaled, y_train)
clf_performance(best_clf_knn, 'KNN')


In [None]:
# SVC tuning
# One search space per kernel; values are sampled at random rather than enumerated.
# Note: scipy's uniform(loc, scale) samples from [loc, loc + scale], not [loc, scale].
param_distributions = [
    {
        'kernel': ['rbf'],
        'gamma': uniform(0.1, 5),  # Uniform sampling on [0.1, 5.1]
        'C': uniform(0.1, 100)     # Uniform sampling on [0.1, 100.1]
    },
    {
        'kernel': ['linear'],
        'C': uniform(0.1, 100)
    },
    {
        'kernel': ['poly'],
        'degree': randint(2, 4),   # Upper bound is exclusive: only degree 2 or 3
        'C': uniform(0.1, 100)
    }
]

# Randomized search instead of an exhaustive grid search
clf_svc = RandomizedSearchCV(
    SVC(probability=True, random_state=42),
    param_distributions,
    n_iter=30,              # Evaluate 30 sampled parameter settings
    cv=3,                   # 3-fold CV (the other searches use 5) to speed up
    verbose=2,
    n_jobs=-1,
    random_state=42
)

best_clf_svc = clf_svc.fit(X_train_scaled, y_train)
clf_performance(best_clf_svc, 'SVC (Randomized)')

In [None]:
# Due to the large feature space, I first used a randomized search to quickly explore a wide range of hyperparameters.
# After identifying a promising set of parameters, I performed a more focused, granular search to fine-tune the model.

# rf = RandomForestClassifier(random_state = 1)
# param_grid =  {'n_estimators': [100,500,1000], 
#                                   'bootstrap': [True,False],
#                                   'max_depth': [3,5,10,20,50,75,100,None],
#                                   'max_features': ['auto','sqrt'],
#                                   'min_samples_leaf': [1,2,4,10],
#                                   'min_samples_split': [2,5,10]}
                                  
# clf_rf_rnd = RandomizedSearchCV(rf, param_distributions = param_grid, n_iter = 100, cv = 5, verbose = True, n_jobs = -1)
# best_clf_rf_rnd = clf_rf_rnd.fit(X_train_scaled,y_train)
# clf_performance(best_clf_rf_rnd,'Random Forest')

In [None]:
# Random forest: focused grid search around a previously identified promising region
param_grid = dict(
    n_estimators=[400, 450, 500, 550],
    criterion=['gini', 'entropy'],
    bootstrap=[True],
    max_depth=[15, 20, 25],
    max_features=['sqrt', 10],
    min_samples_leaf=[2, 3],
    min_samples_split=[2, 3],
)

clf_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=1),
    param_grid=param_grid,
    cv=5,
    verbose=True,
    n_jobs=-1,
)
best_clf_rf = clf_rf.fit(X_train_scaled, y_train)
clf_performance(best_clf_rf, 'Random Forest')

In [None]:
# Plot the 20 largest feature importances of the tuned random forest
best_rf = best_clf_rf.best_estimator_
feat_importances = pd.Series(data=best_rf.feature_importances_, index=X_train_scaled.columns)
top_rf_features = feat_importances.nlargest(20)
top_rf_features.plot(kind='barh')
plt.title('Top 20 Feature Importances - Random Forest')
plt.show()

In [None]:
# Due to the large feature space, I first used a randomized search to quickly explore a wide range of hyperparameters.
# After identifying a promising set of parameters, I performed a more focused, granular search to fine-tune the model.

# xgb = XGBClassifier(random_state = 1)

# param_grid = {
#     'n_estimators': [20, 50, 100, 250, 500,1000],
#     'colsample_bytree': [0.2, 0.5, 0.7, 0.8, 1],
#     'max_depth': [2, 5, 10, 15, 20, 25, None],
#     'reg_alpha': [0, 0.5, 1],
#     'reg_lambda': [1, 1.5, 2],
#     'subsample': [0.5,0.6,0.7, 0.8, 0.9],
#     'learning_rate':[.01,0.1,0.2,0.3,0.5, 0.7, 0.9],
#     'gamma':[0,.01,.1,1,10,100],
#     'min_child_weight':[0,.01,0.1,1,10,100],
#     'sampling_method': ['uniform', 'gradient_based']
# }

# clf_xgb_rnd = RandomizedSearchCV(xgb, param_distributions = param_grid, n_iter = 1000, cv = 5, verbose = True, n_jobs = -1)
# best_clf_xgb_rnd = clf_xgb_rnd.fit(X_train_scaled,y_train)
# clf_performance(best_clf_xgb_rnd,'XGB')

In [None]:
# XGBoost: grid search over tree count, column/row subsampling, depth and learning rate
xgb = XGBClassifier(random_state=1)

param_grid = dict(
    n_estimators=[50, 100, 250, 500],
    colsample_bytree=[0.5, 0.7, 0.8, 1],
    max_depth=[3, 5, 7, 10],
    learning_rate=[0.01, 0.05, 0.1, 0.3],
    subsample=[0.5, 0.7, 0.8, 1],
)

clf_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=5,
    verbose=True,
    n_jobs=-1,
)
best_clf_xgb = clf_xgb.fit(X_train_scaled, y_train)
clf_performance(best_clf_xgb, 'XGBoost')

In [None]:
# Feature importance for XGBoost.
# The grid search above uses GridSearchCV's default refit behaviour, so best_estimator_
# is already fit on the full scaled training set; it is used directly rather than
# being fit a second time (same approach as the random forest importance plot).
best_xgb = best_clf_xgb.best_estimator_
xgb_feat_importances = pd.Series(best_xgb.feature_importances_, index=X_train_scaled.columns)

# Plot the 20 largest importances as a horizontal bar chart
xgb_feat_importances.nlargest(20).plot(kind='barh')
plt.title('Top 20 Feature Importances - XGBoost')
plt.show()

# Final Ensemble Model
Now that we've tuned individual models, let's create an ensemble based on the best models.

In [None]:
# Extract best estimators from the hyperparameter search results
best_estimators = {
    'lr': best_clf_lr.best_estimator_,
    'knn': best_clf_knn.best_estimator_,
    'svc': best_clf_svc.best_estimator_,
    'rf': best_clf_rf.best_estimator_,
    'xgb': best_clf_xgb.best_estimator_
}

# knn / rf / svc form the core of every ensemble; each classifier gets its own list
core_estimators = [(key, best_estimators[key]) for key in ('knn', 'rf', 'svc')]

# Define different voting classifiers
voting_clf_hard = VotingClassifier(estimators=list(core_estimators), voting='hard')

voting_clf_soft = VotingClassifier(estimators=list(core_estimators), voting='soft')

voting_clf_all = VotingClassifier(
    estimators=core_estimators + [('lr', best_estimators['lr'])],
    voting='soft'
)

voting_clf_xgb = VotingClassifier(
    estimators=core_estimators + [('xgb', best_estimators['xgb']), ('lr', best_estimators['lr'])],
    voting='soft'
)

# Evaluate classifiers.
# The member models were tuned on X_train_scaled and the ensembles are later fit and
# used for prediction on scaled data, so cross-validation uses the scaled features too
# (previously this scored them on the unscaled X_train).
for name, clf in {
    'Voting Classifier (Hard)': voting_clf_hard,
    'Voting Classifier (Soft)': voting_clf_soft,
    'Voting Classifier (Soft + LR)': voting_clf_all,
    'Voting Classifier (Soft + XGB + LR)': voting_clf_xgb
}.items():
    scores = cross_val_score(clf, X_train_scaled, y_train, cv=5)
    print(f"{name} CV Scores: {scores}")
    print(f"{name} Mean CV Accuracy: {scores.mean():.4f}\n")

In [None]:
# Soft voting allows weighting classifiers differently.
# Grid-search a handful of weight vectors for the soft-voting ensemble. No new model
# types are introduced, but the search does refit the ensemble for every weight
# vector and CV fold.
weight_options = {
    'weights': [
        [1, 1, 1],
        [1, 2, 1],
        [1, 1, 2],
        [2, 1, 1],
        [2, 2, 1],
        [1, 2, 2],
        [2, 1, 2],
    ]
}

vote_weight_search = GridSearchCV(voting_clf_soft, weight_options, cv=5, verbose=2, n_jobs=-1)

best_weighted_clf = vote_weight_search.fit(X_train_scaled, y_train)
clf_performance(best_weighted_clf, 'Voting Classifier (Weighted)')

# Test-set predictions from the best weighted ensemble
voting_clf_sub_predictions = best_weighted_clf.best_estimator_.predict(X_test_scaled)

In [None]:
# Fit every ensemble variant and the tuned random forest on the scaled training data
models_to_fit = [voting_clf_hard, voting_clf_soft, voting_clf_all, voting_clf_xgb, best_estimators['rf']]
for model in models_to_fit:
    model.fit(X_train_scaled, y_train)

# Integer class predictions for the scaled test set, keyed by a short model label
# (labels are listed in the same order as models_to_fit)
prediction_labels = ['vc_hard', 'vc_soft', 'vc_all', 'vc_xgb', 'rf']
y_hat = {
    label: fitted.predict(X_test_scaled).astype(int)
    for label, fitted in zip(prediction_labels, models_to_fit)
}

In [None]:
# One submission frame (PassengerId + Survived) per model, keyed by output file stem
submissions = {
    file_stem: pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': y_hat[label]})
    for file_stem, label in (
        ('submission_rf', 'rf'),
        ('submission_vc_hard', 'vc_hard'),
        ('submission_vc_soft', 'vc_soft'),
        ('submission_vc_all', 'vc_all'),
        ('submission_vc_xgb', 'vc_xgb'),
    )
}

# Side-by-side frame of every model's predictions for comparison
comparison = pd.DataFrame({'PassengerId': test.PassengerId})
for column, label in (
    ('Survived_vc_hard', 'vc_hard'),
    ('Survived_rf', 'rf'),
    ('Survived_vc_soft', 'vc_soft'),
    ('Survived_vc_all', 'vc_all'),
    ('Survived_vc_xgb', 'vc_xgb'),
):
    comparison[column] = y_hat[label]


In [None]:
# Flag (1/0) the passengers where a model disagrees with the hard-voting ensemble
for diff_col, other_col in (
    ('diff_rf_vs_vc_hard', 'Survived_rf'),
    ('diff_vc_soft_vs_hard', 'Survived_vc_soft'),
    ('diff_vc_all_vs_hard', 'Survived_vc_all'),
):
    disagrees = comparison[other_col] != comparison['Survived_vc_hard']
    comparison[diff_col] = disagrees.astype(int)

In [None]:
# Count how many test predictions differ (1) or agree (0) between the
# soft + LR ensemble and the hard-voting ensemble
comparison['diff_vc_all_vs_hard'].value_counts()

In [None]:
# Write each submission frame to '<key>.csv' without the index column
for file_stem, frame in submissions.items():
    frame.to_csv(f'{file_stem}.csv', index=False)

# Conclusion
- Performed extensive EDA and feature engineering.
- Created multiple models including Logistic Regression, KNN, SVC, Random Forest, XGBoost.
- Tuned hyperparameters using GridSearchCV (and RandomizedSearchCV for the SVC).
- Built an ensemble soft-voting classifier combining the best models.
- Generated final submission ready for Kaggle.