<a href="https://colab.research.google.com/github/indracharan-png/Titanic-Machine-Learning-from-Disaster-kaggle-/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pandas as pd
import regex as re
import numpy as np
import random
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


RANDOM_STATE = 42

# 1. Data cleaning and preprocessing

def preprocess_data(df):
    # Convert the column names from camelCase to snake_case
    df.columns = [re.sub(r'(?<!^)(?=[A-Z])', '_', col_name).lower() for col_name in df.columns]

    df = df.copy()

    # Impute the 'age' feature using stratified random sampling based on 'pclass' and 'sex' features
    for (pclass, sex), group in df.groupby(['pclass', 'sex']):
        # Select indices in a group where 'age' is null
        missing_indices = group['age'].index[group['age'].isna()]

        missing_size = len(missing_indices)


        # Compute the donor data points from the current group
        donors = group['age'].dropna().values

        # Falback-1: If no donors are avaialble in the group, compute donors from other groups
        if donors.size == 0:
            donors = df.loc[(df['sex'] == sex) & (df['age'].notna()), 'age'].values

        # Fallback-2: Compute the donors from the global age data coloumn
        if donors.size == 0:
            donors = df.loc[df['age'].notna(), 'age'].values


        sampled_ages = np.random.choice(donors, size = missing_size, replace=True)
        df.loc[missing_indices, 'age'] = sampled_ages




    # Fill up the empty cells in 'cabin' feature with 'Unknown' string, which indicates missing information
    df['cabin'] = df['cabin'].fillna('Unknown')
    # Extract the deck information from the first letter of the 'cabin' feature, and create a new feature 'deck' ('U' for 'Unknown')
    df['deck'] = df['cabin'].str[0]

    # Built a 'title' feature from 'name' feature
    df['title'] = df['name'].str.extract(r',\s*([^\.]+)\.', expand=False).str.strip().str.lower()

    # Group titles into few buckets
    title_grouping_map = {}
    for title in df['title'].unique():
        if title not in ['mr', 'mrs', 'miss', 'master']:
            title_grouping_map[title] = 'rare'
        else:
            title_grouping_map[title] = title
    df['title'].replace(title_grouping_map)
    df['title'] = df['title'].fillna('Unknown')

    # Compute family size if exists for each person
    df['family_size'] = df['sib_sp'] + df['parch'] + 1
    df['is_alone'] = (df['family_size'] == 1).astype(int)

    # Extract the ticket prefix info
    df['ticket_prefix'] = (
        df['ticket'].astype(str)
        .str.replace(r'[^A-Za-z0-9]', ' ', regex=True)
        .str.split().str[0]
        .fillna('UNK')
    )

    return df


# 2. Load & Split

df = pd.read_csv('train.csv')
df = preprocess_data(df)

y = df['survived']
print(len(df[(df['survived'] == 1)]) / len(df))
X = df.drop(columns=['survived'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state = RANDOM_STATE
    )


# 3. Column groups

numeric_features = ['pclass', 'age', 'sib_sp', 'parch', 'family_size', 'is_alone']
categorical_features = ['sex', 'deck', 'deck', 'title', 'ticket_prefix']


# 4. Preprocessors

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scalar', StandardScaler(with_mean=True, with_std=True))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocess = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)


# Two pipelines (LogReg and RandomForest)

logreg_pipeline = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_STATE))
])

randfor_pipeline = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', RandomForestClassifier(n_estimators=500, random_state=RANDOM_STATE, n_jobs=1, class_weight='balanced'))
])


# Cross-Validation model selection

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

def cv_scores(model_pipeline, X, y):
    acc = cross_val_score(model_pipeline, X, y, cv=cv, scoring='accuracy').mean()
    f1 = cross_val_score(model_pipeline, X, y, cv = cv, scoring='f1').mean()
    roc_auc = cross_val_score(model_pipeline, X, y, cv=cv, scoring='roc_auc').mean()
    return {'accuracy': acc, 'f1' : f1, 'roc_auc' : roc_auc}

print(f"LogReg CV: {cv_scores(logreg_pipeline, X_train, y_train)}")
print(f"RandFor CV: {cv_scores(randfor_pipeline, X_train, y_train)}")







df = pd.read_csv('train.csv')
print(df.info())
# print(df.describe(include='all'))
print(df['Pclass'].unique())
print(df['Cabin'].head())
df = preprocess_data(df)
print(df.info())
print(df['pclass'].head())



0.3838383838383838
LogReg CV: {'accuracy': np.float64(0.8233082706766919), 'f1': np.float64(0.7770600645797454), 'roc_auc': np.float64(0.8874926092184652)}
RandFor CV: {'accuracy': np.float64(0.8248456963303781), 'f1': np.float64(0.7663390336669522), 'roc_auc': np.float64(0.873529206773014)}
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float6