In [19]:
    
# Titanic Survival Prediction - Advanced Pipeline

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import xgboost as xgb

# Load Data
# NOTE(review): these are absolute, machine-specific paths. Their values are
# kept exactly as they were so the cell still runs where it was written; they
# are only hoisted into constants so there is a single place to repoint them.
TRAIN_CSV = r'C:\Users\HP\Downloads\train.csv'
TEST_CSV = r'C:\Users\HP\Downloads\test (1).csv'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)
# Keep the ids aside: 'PassengerId' is dropped from the feature frame later on.
test_passenger_ids = test['PassengerId']

# Combine train & test for uniform preprocessing.
# ignore_index=True gives the stacked frame a unique 0..n-1 index. Without it
# the test rows reuse the train rows' labels (both files are read with a
# default index starting at 0), which makes any label-based lookup or
# alignment on `data` ambiguous. Row order is still train-then-test, so
# positional slicing by len(train) is unaffected.
data = pd.concat([train, test], sort=False, ignore_index=True)

# Feature Engineering
# Extract Title: the first run of letters that follows a space and ends in '.'
data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse spelling variants and infrequent titles into a few buckets.
# The pattern above captures a single word, so a name containing
# "the Countess." yields 'Countess' -- the key has to be 'Countess';
# 'the Countess' can never match an extracted value.
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
rare_titles = ['Capt', 'Col', 'Major', 'Dr', 'Rev', 'Jonkheer', 'Don', 'Sir',
               'Countess', 'Lady', 'Dona']
title_aliases.update(dict.fromkeys(rare_titles, 'Rare'))
data['Title'] = data['Title'].replace(title_aliases)
# Names the pattern cannot parse come back as NaN. Bucket them as 'Rare' so
# they are not excluded from the per-title grouping below and the column
# stays purely string-valued.
data['Title'] = data['Title'].fillna('Rare')

# Fill missing Age based on Title median
data['Age'] = data.groupby('Title')['Age'].transform(lambda ages: ages.fillna(ages.median()))
# A title whose ages are all missing has a NaN median; fall back to the overall median.
data['Age'] = data['Age'].fillna(data['Age'].median())

# Fill missing Embarked with mode
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# Fill missing Fare with median
data['Fare'] = data['Fare'].fillna(data['Fare'].median())

# Cabin Feature: Missing or Not (1 when a cabin value is present, else 0)
data['CabinKnown'] = data['Cabin'].notnull().astype(int)

# Family size: siblings/spouses + parents/children + the passenger themself
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

# Integer-code the categorical columns.
# A single encoder object is refit for each column in turn, so once this
# finishes `le` only holds the classes of the last column encoded.
label_cols = ['Sex', 'Embarked', 'Title']
le = LabelEncoder()
data = data.assign(**{name: le.fit_transform(data[name]) for name in label_cols})

# Remove the identifier / free-text columns that are not fed to the model
drop_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
data = data.drop(columns=drop_cols)

# Split back into train and test
# `data` is still in train-then-test row order, so a positional cut at
# len(train) recovers the two sets. .copy() detaches them from `data`.
n_train = len(train)
train_cleaned = data.iloc[:n_train].copy()
test_cleaned = data.iloc[n_train:].copy()

X = train_cleaned.drop(columns='Survived')
# 'Survived' comes out of the combined frame as float (the recorded report
# shows classes 0.0 / 1.0) -- presumably because the stacked test rows have no
# label. Cast back to int so the target is a clean 0/1 class label; this
# raises if a training label is unexpectedly missing instead of passing NaN on.
y = train_cleaned['Survived'].astype(int)
X_test = test_cleaned.drop(columns='Survived')

# Feature Scaling (important for some models)
# NOTE(review): the scaler is fit on every labelled row, i.e. before the
# validation split and CV folds are carved out further down, so held-out rows
# contribute to the scaling statistics. Move the scaler into a Pipeline if a
# scale-sensitive model is ever evaluated on these arrays.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Split training data for local validation (80% fit / 20% held out)
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Model: XGBoost (high performance)
# Hyperparameters live in one dict so the validated model and the final
# model below are guaranteed to be configured identically.
xgb_params = dict(n_estimators=100, learning_rate=0.05, max_depth=4, random_state=42)
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Validation: score the held-out 20% that `model` never saw during fitting
y_pred_val = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred_val))
print(confusion_matrix(y_val, y_pred_val))
print(classification_report(y_val, y_pred_val))

# Cross-validation: 5-fold accuracy over all labelled rows
cv_scores = cross_val_score(model, X_scaled, y, cv=5)
print("Cross-validation Accuracy: %.4f ± %.4f" % (cv_scores.mean(), cv_scores.std()))

# Predict on test set
# `model` was fit on only the 80% split. For the actual test predictions,
# fit a fresh, identically configured model on every labelled row so the
# validation rows are not wasted. `model` itself is left untouched so the
# validation numbers above still describe it.
final_model = xgb.XGBClassifier(**xgb_params)
final_model.fit(X_scaled, y)
# Cast to int so the predictions are 0/1 integers regardless of the label dtype.
test_predictions = final_model.predict(X_test_scaled).astype(int)






Validation Accuracy: 0.8156424581005587
[[90 15]
 [18 56]]
              precision    recall  f1-score   support

         0.0       0.83      0.86      0.85       105
         1.0       0.79      0.76      0.77        74

    accuracy                           0.82       179
   macro avg       0.81      0.81      0.81       179
weighted avg       0.81      0.82      0.82       179

Cross-validation Accuracy: 0.8283 ± 0.0229
