In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Load and combine the DATA

In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd

# Load datasets
train = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")

# Save PassengerId for final submission
test_passenger_ids = test['PassengerId']

# Add Survived column to test (for concatenation)
test['Survived'] = np.nan

# Combine datasets
full_data = pd.concat([train, test], sort=False).reset_index(drop=True)

# Preview
full_data.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Features to create:
Title from name (Mr, Mrs, etc.)  

FamilySize (SibSp + Parch + 1) 

IsAlone 

Deck from Cabin 

TicketGroupSize from duplicate ticket counts 

Binned Fare and Age 

Interaction: Sex * Pclass 



In [3]:
# Title from Name
full_data['Title'] = full_data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
full_data['Title'] = full_data['Title'].replace(['Lady', 'Countess','Capt','Col','Don', 'Dr', 
                                                 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
full_data['Title'] = full_data['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Family Size and IsAlone
full_data['FamilySize'] = full_data['SibSp'] + full_data['Parch'] + 1
full_data['IsAlone'] = (full_data['FamilySize'] == 1).astype(int)

# Deck from Cabin
full_data['Deck'] = full_data['Cabin'].str[0]
full_data['Deck'] = full_data['Deck'].fillna('U')  # Unknown

# Ticket Group Size (count how many have the same ticket)
ticket_counts = full_data['Ticket'].value_counts()
full_data['TicketGroupSize'] = full_data['Ticket'].map(ticket_counts)

# Fare Binning
full_data['FareBin'] = pd.qcut(full_data['Fare'], 4, labels=False)

# Age Binning (temporarily fill NA with median to bin; we’ll impute better later)
age_median = full_data['Age'].median()
full_data['AgeBin'] = pd.qcut(full_data['Age'].fillna(age_median), 4, labels=False)

# Sex x Pclass interaction
full_data['Sex_Pclass'] = full_data['Sex'].astype(str) + "_" + full_data['Pclass'].astype(str)


Title shows age, social status, and possibly gender.

FamilySize + IsAlone helps capture survival group influence.

Deck relates to cabin position (safety chances).

TicketGroupSize captures hidden groups/families.

Binning handles outliers and non-linear relations.

Sex_Pclass captures socioeconomic differences between classes.

# Preview new features

In [4]:
full_data[['Name', 'Title', 'FamilySize', 'IsAlone', 'Deck', 'TicketGroupSize', 'FareBin', 'AgeBin', 'Sex_Pclass']].head()


Unnamed: 0,Name,Title,FamilySize,IsAlone,Deck,TicketGroupSize,FareBin,AgeBin,Sex_Pclass
0,"Braund, Mr. Owen Harris",Mr,2,0,U,1,0.0,0,male_3
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Mrs,2,0,C,2,3.0,3,female_1
2,"Heikkinen, Miss. Laina",Miss,1,1,U,1,1.0,1,female_3
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Mrs,2,0,C,2,3.0,2,female_1
4,"Allen, Mr. William Henry",Mr,1,1,U,1,1.0,2,male_3


# Handling Missing Values

Fill Embarked

In [5]:
# Fill Embarked with mode
full_data['Embarked'] = full_data['Embarked'].fillna(full_data['Embarked'].mode()[0])


Fill Fare (with median of Pclass + Embarked group)

In [6]:
# Fill Fare based on Pclass and Embarked groups
full_data['Fare'] = full_data.groupby(['Pclass', 'Embarked'])['Fare'].transform(lambda x: x.fillna(x.median()))


Predict Age using Random Forest Regressor

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Features to predict Age
age_features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title', 'FamilySize']

# Encode categorical features temporarily
age_df = full_data[age_features + ['Age']].copy()
age_df['Sex'] = age_df['Sex'].map({'male': 0, 'female': 1})
age_df['Embarked'] = age_df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
age_df['Title'] = age_df['Title'].astype('category').cat.codes

# Split into known and unknown Age
known_age = age_df[age_df['Age'].notnull()]
unknown_age = age_df[age_df['Age'].isnull()]

# Train RF Regressor
rfr = RandomForestRegressor(n_estimators=100, random_state=42)
rfr.fit(known_age.drop('Age', axis=1), known_age['Age'])

# Predict missing Age
predicted_ages = rfr.predict(unknown_age.drop('Age', axis=1))

# Fill Age back
full_data.loc[full_data['Age'].isnull(), 'Age'] = predicted_ages


confirmation

In [8]:
full_data.isnull().sum()


PassengerId           0
Survived            418
Pclass                0
Name                  0
Sex                   0
Age                   0
SibSp                 0
Parch                 0
Ticket                0
Fare                  0
Cabin              1014
Embarked              0
Title                 0
FamilySize            0
IsAlone               0
Deck                  0
TicketGroupSize       0
FareBin               1
AgeBin                0
Sex_Pclass            0
dtype: int64

**Features Improvement in cabin**

NumCabins: Number of cabinsassigned to a passenger

In [9]:
# Deck Already done:
# full_data['Deck'] = full_data['Cabin'].str[0].fillna('U')
full_data['NumCabins'] = full_data['Cabin'].apply(lambda x: len(str(x).split()) if pd.notnull(x) else 0)

CabinNumber: Extract Cabin numbers and treat them numerically

In [10]:
import re

def extract_cabin_num(cabin):
    if pd.isnull(cabin):
        return 0
    numbers = re.findall(r'(\d+)', cabin)
    return int(numbers[0]) if numbers else 0

full_data['CabinNumber'] = full_data['Cabin'].apply(extract_cabin_num)


Checking the cabin data again

In [11]:
full_data[['Cabin', 'Deck', 'NumCabins', 'CabinNumber']].head(10)


Unnamed: 0,Cabin,Deck,NumCabins,CabinNumber
0,,U,0,0
1,C85,C,1,85
2,,U,0,0
3,C123,C,1,123
4,,U,0,0
5,,U,0,0
6,E46,E,1,46
7,,U,0,0
8,,U,0,0
9,,U,0,0


Lets check for missing values again

In [12]:
full_data['Cabin'].isnull().sum()

1014

That’s expected, we never filled Cabin itself, but instead extracted meaningful proxy features from it.

We’ve already extracted all useful information (Deck, NumCabins, CabinNumber)

Cabin is now redundant

and it’s still 75% missing

so now it's safe and optimal to drop it.

In [13]:
full_data.drop(columns=['Cabin'], inplace=True)


Confirming Missing Values again

In [14]:
full_data.isnull().sum()


PassengerId          0
Survived           418
Pclass               0
Name                 0
Sex                  0
Age                  0
SibSp                0
Parch                0
Ticket               0
Fare                 0
Embarked             0
Title                0
FamilySize           0
IsAlone              0
Deck                 0
TicketGroupSize      0
FareBin              1
AgeBin               0
Sex_Pclass           0
NumCabins            0
CabinNumber          0
dtype: int64

checking for FareBin

In [15]:
full_data[full_data['Fare'].isnull()]


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Title,FamilySize,IsAlone,Deck,TicketGroupSize,FareBin,AgeBin,Sex_Pclass,NumCabins,CabinNumber


Lets fill it with Median

In [16]:
full_data['Fare'] = full_data['Fare'].fillna(full_data['Fare'].median())


Re-Check

In [17]:
full_data['FareBin'].isnull().sum()


1

Fare has no missing values, as Step 1 shows 0 rows for missing Fare

Yet FareBin still has 1 missing value, which means qcut() failed to assign a bin to at least one entry (likely a repeated value on the boundary)

Updated FareBin Logic

In [18]:
# Use rank to avoid duplicate bin edges
fare_ranks = full_data['Fare'].rank(method='min')

# Create equal-frequency Fare bins using rank
full_data['FareBin'] = pd.qcut(fare_ranks, 4, labels=False)


Final Check

In [19]:
full_data['FareBin'].isnull().sum()


0

# Encode Categorical Features and Split Train/Test for Modeling

Features to encode:

| Column       | Type        | Method         |
| ------------ | ----------- | -------------- |
| `Sex`        | Binary      | Label encode   |
| `Embarked`   | Nominal     | One-hot encode |
| `Title`      | Categorical | Label encode   |
| `Deck`       | Categorical | Label encode   |
| `Sex_Pclass` | Categorical | Label encode   |


Encode Categorical Features


In [20]:
from sklearn.preprocessing import LabelEncoder

# Label Encode binary/categorical fields
le = LabelEncoder()
for col in ['Sex', 'Title', 'Deck', 'Sex_Pclass']:
    full_data[col] = le.fit_transform(full_data[col])

# One-hot encode Embarked
full_data = pd.get_dummies(full_data, columns=['Embarked'], drop_first=True)


Drop Columns Not Needed for Modeling

In [21]:
drop_cols = ['PassengerId', 'Name', 'Ticket']
full_data.drop(columns=drop_cols, inplace=True)


Separate Train and Test

In [22]:
# Split back into train/test
train_final = full_data[full_data['Survived'].notnull()].copy()
test_final = full_data[full_data['Survived'].isnull()].copy()

# Separate target variable
X = train_final.drop('Survived', axis=1)
y = train_final['Survived'].astype(int)

# For prediction later
X_test = test_final.drop('Survived', axis=1)


Checking Shape and Features

In [23]:
print("Train shape:", X.shape)
print("Test shape:", X_test.shape)
print("Features used:", X.columns.tolist())


Train shape: (891, 18)
Test shape: (418, 18)
Features used: ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Title', 'FamilySize', 'IsAlone', 'Deck', 'TicketGroupSize', 'FareBin', 'AgeBin', 'Sex_Pclass', 'NumCabins', 'CabinNumber', 'Embarked_Q', 'Embarked_S']


# Training The MODEL

Goal:

Train XGBoost, LightGBM, and optionally CatBoost

Use cross-validation

Ensemble predictions for final boost

Installing Libraries

In [24]:
!pip install xgboost lightgbm catboost --quiet


Import and Prepare Cross Validation

In [25]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Cross-validation setup
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize predictions
oof_preds = np.zeros(X.shape[0])
test_preds = np.zeros(X_test.shape[0])


# XGBoost MODEL 

In [26]:
for train_idx, val_idx in kf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = xgb.XGBClassifier(
        n_estimators=1000,
        max_depth=4,
        learning_rate=0.01,
        subsample=0.8,
        colsample_bytree=0.8,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
        early_stopping_rounds=50
    )

    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              verbose=False)


    oof_preds[val_idx] = model.predict(X_val)
    test_preds += model.predict(X_test) / kf.n_splits


Evaluate

In [27]:
from sklearn.metrics import classification_report, confusion_matrix

print("OOF Accuracy:", accuracy_score(y, oof_preds))
print(confusion_matrix(y, oof_preds))
print(classification_report(y, oof_preds))


OOF Accuracy: 0.8327721661054994
[[496  53]
 [ 96 246]]
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       549
           1       0.82      0.72      0.77       342

    accuracy                           0.83       891
   macro avg       0.83      0.81      0.82       891
weighted avg       0.83      0.83      0.83       891



That is a very good accuracy but to reach 84+

lets code for LightGBM and CatBoost

And then ensemble all 3 model predictions (XGB + LGBM + CatBoost)

Importing lib for LGBM and CatBoost

In [28]:
from catboost import CatBoostClassifier
import lightgbm as lgb


# LightGBM Training

In [29]:
from lightgbm import early_stopping, log_evaluation
oof_lgb = np.zeros(X.shape[0])
test_lgb = np.zeros(X_test.shape[0])

for train_idx, val_idx in kf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = lgb.LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.01,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )

    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              callbacks=[early_stopping(50), log_evaluation(0)])


    oof_lgb[val_idx] = model.predict(X_val)
    test_lgb += model.predict(X_test) / kf.n_splits


[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003607 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 321
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[522]	valid_0's binary_logloss: 0.369699
[LightGBM] [Info] Number of positive: 274, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000195 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] T

# CatBoost Training

In [30]:
oof_cat = np.zeros(X.shape[0])
test_cat = np.zeros(X_test.shape[0])

for train_idx, val_idx in kf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.01,
        depth=4,
        eval_metric='Accuracy',
        verbose=False,
        random_seed=42
    )

    model.fit(X_train, y_train,
              eval_set=(X_val, y_val),
              early_stopping_rounds=50)

    oof_cat[val_idx] = model.predict(X_val)
    test_cat += model.predict(X_test) / kf.n_splits


# Final Ensemble and Evaluation

In [31]:
# Weighted Ensemble with weights 0.6 (XGB), 0.2 (LGBM), 0.2 (CatBoost)
oof_ensemble = (0.6 * oof_preds) + (0.2 * oof_lgb) + (0.2 * oof_cat)
test_ensemble = (0.6 * test_preds) + (0.2 * test_lgb) + (0.2 * test_cat)

final_oof = np.round(oof_ensemble)
final_test_preds = np.round(test_ensemble)

print("Weighted Ensemble OOF Accuracy:", accuracy_score(y, final_oof))


Weighted Ensemble OOF Accuracy: 0.8327721661054994


# Conclusion

The XGBoost model is the strongest single model.

The ensemble doesn't improve accuracy because other models add very little new information or even slight noise.

Because XGBoost predictions dominate with a weight of 0.6, and LightGBM + CatBoost predictions are somewhat similar but slightly weaker, so the weighted average mostly reflects XGBoost

# Submission

Prepare submission DataFrame

In [32]:
submission_xgb = pd.DataFrame({
    'PassengerId': test_passenger_ids,
    'Survived': np.round(test_preds).astype(int)
})

Save to CSV

In [33]:
submission_xgb['Survived'] = (test_preds > 0.55).astype(int)


In [34]:
submission_xgb.to_csv('submission.csv', index=False)
