In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic-data/train.csv
/kaggle/input/titanic-data/test.csv


In [7]:
import pandas as pd
train = pd.read_csv('/kaggle/input/titanic-data/train.csv' ,index_col='PassengerId')
test = pd.read_csv('/kaggle/input/titanic-data/test.csv' ,index_col='PassengerId')


In [8]:
df = train.copy()

In [9]:
df.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [10]:
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

title_mapping = {
        'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
        'Dr': 'Rare', 'Rev': 'Rare', 'Col': 'Rare', 'Major': 'Rare',
        'Mlle': 'Miss', 'Ms': 'Miss', 'Lady': 'Rare', 'Sir': 'Rare',
        'Mme': 'Mrs', 'Don': 'Rare', 'Capt': 'Rare', 'Countess': 'Rare',
        'Jonkheer': 'Rare', 'Dona': 'Rare'
    }
df['Title'] = df['Title'].map(title_mapping).fillna('Rare')
df['Title'].value_counts()

Title
Mr        517
Miss      185
Mrs       126
Master     40
Rare       23
Name: count, dtype: int64

In [11]:
df['Age'] = df['Age'].fillna(df['Age'].median())
df['ischild'] = ((df['Age'] < 18) & df['Age'].notna()).astype(int)
df['ischild'].value_counts()

ischild
0    778
1    113
Name: count, dtype: int64

In [12]:
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
df['FarePerPerson'] = (df['Fare']/ df['FamilySize'])
df['IsAlone'].value_counts()


IsAlone
1    537
0    354
Name: count, dtype: int64

In [13]:
df['HasCabin'] = df['Cabin'].notna().astype(int)
df['Deck'] = df['Cabin'].str[0].fillna('Unknown')

deck_counts = df['Deck'].value_counts()
df['Deck'] = df['Deck'].replace(deck_counts[deck_counts < 20].index, 'Rare')

In [14]:
df['Deck'].value_counts()


Deck
Unknown    687
C           59
B           47
Rare        33
D           33
E           32
Name: count, dtype: int64

In [15]:
df.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)


In [16]:
X = df.drop('Survived', axis=1) 
y = df['Survived'] 

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [20]:
X.head

<bound method NDFrame.head of              Pclass     Sex   Age  SibSp  Parch     Fare Embarked Title  \
PassengerId                                                               
1                 3    male  22.0      1      0   7.2500        S    Mr   
2                 1  female  38.0      1      0  71.2833        C   Mrs   
3                 3  female  26.0      0      0   7.9250        S  Miss   
4                 1  female  35.0      1      0  53.1000        S   Mrs   
5                 3    male  35.0      0      0   8.0500        S    Mr   
...             ...     ...   ...    ...    ...      ...      ...   ...   
887               2    male  27.0      0      0  13.0000        S  Rare   
888               1  female  19.0      0      0  30.0000        S  Miss   
889               3  female  28.0      1      2  23.4500        S  Miss   
890               1    male  26.0      0      0  30.0000        C    Mr   
891               3    male  32.0      0      0   7.7500        Q    M

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

numeric_cols= X_train.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols= X_train.select_dtypes(include=['object','category']).columns.tolist()

num_transformer= Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
])

cat_transformer= Pipeline([
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(handle_unknown='ignore'))
])

preprocessor= ColumnTransformer([
    ('num',num_transformer,numeric_cols),
    ('cat',cat_transformer,cat_cols)
])



In [22]:
model1= Pipeline([
    ('preprocessor',preprocessor),
    ('lrs',LogisticRegression(max_iter=1000))
])

model2= Pipeline([
    ('preprocessor',preprocessor),
    ('xgb', XGBClassifier(
    n_estimators=500,
    learning_rate=0.1,
    random_state=40,
    colsample_bytree= 0.8,
    subsample= 0.8,
    max_depth = 3
))
])

model3= Pipeline([
    ('preprocessor',preprocessor),
    ('rfc',RandomForestClassifier(n_estimators = 100,
                                  max_depth = None,
                                  min_samples_split = 2))
])

In [23]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
param_grids = {
    'Logistic Regression': {
        'lrs__C': [0.01, 0.1, 1, 10],
        'lrs__penalty': ['l2']
    },
    'XGBoost': {
        'xgb__max_depth': [3, 5, 7],
        'xgb__subsample': [0.8, 1.0],
        'xgb__colsample_bytree': [0.8, 1.0]
    },
    'Random Forest': {
        'rfc__n_estimators': [100, 200],
        'rfc__max_depth': [10, 20, None],
        'rfc__min_samples_split': [2, 5, 10]
    }
}


In [24]:
models = {
    'Logistic Regression': model1,
    'XGBoost': model2,
    'Random Forest': model3
}
best_models = {}
for name, m in models.items():
    grid_search = GridSearchCV(
        m, param_grids[name], cv=5, scoring='accuracy', n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    
    best_models[name] = grid_search.best_estimator_
    
    print(f"{name} Best Params: {grid_search.best_params_}")
    print(f"{name} Best score: {grid_search.best_score_:.4f}")

Logistic Regression Best Params: {'lrs__C': 1, 'lrs__penalty': 'l2'}
Logistic Regression Best score: 0.8217
XGBoost Best Params: {'xgb__colsample_bytree': 0.8, 'xgb__max_depth': 3, 'xgb__subsample': 0.8}
XGBoost Best score: 0.8161
Random Forest Best Params: {'rfc__max_depth': 20, 'rfc__min_samples_split': 10, 'rfc__n_estimators': 200}
Random Forest Best score: 0.8358


In [25]:
results = []

for name, model_pipeline in best_models.items():
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = report['weighted avg']['precision']
    recall = report['weighted avg']['recall']
    f1 = report['weighted avg']['f1-score']
    
    results.append({
        'Model': name,
        'Accuracy': f"{accuracy:.4f}",
        'Precision': f"{precision:.4f}",
        'Recall': f"{recall:.4f}",
        'F1-Score': f"{f1:.4f}"
    })
    
    print(f"\n{name}:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-Score: {f1:.4f}")



Logistic Regression:
  Accuracy: 0.8380
  Precision: 0.8369
  Recall: 0.8380
  F1-Score: 0.8367

XGBoost:
  Accuracy: 0.8212
  Precision: 0.8198
  Recall: 0.8212
  F1-Score: 0.8195

Random Forest:
  Accuracy: 0.8045
  Precision: 0.8029
  Recall: 0.8045
  F1-Score: 0.8015
