## Introduction

Welcome to my Kaggle Titanic Machine Learning Challenge notebook, a journey into the world of data science and predictive modeling. In this competition, we aim to build a robust machine learning model that predicts the survival of passengers aboard the ill-fated RMS Titanic.

In [85]:
import pandas as pd
import numpy as np

data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Data Pre-processing

In [86]:
import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

def preprocess_df(df):
    def tokenize_name(x):
        return " ".join([v.strip(",()[].\"'") for v in x.split(" ")])

    def clean_and_convert(value):
        if isinstance(value, str):
            if value.strip():  # Check if the string is not empty after stripping whitespace
                # Use regular expression to remove non-numeric characters
                cleaned_value = re.sub(r'[^0-9.]', '', value)
                # Convert the cleaned value to float
                return float(cleaned_value)
            else:
                # Return NaN for empty strings
                return np.nan
        else:
            # If it's not a string, return it as is
            return value


    def ticket_number(x):
        last_part = x.split(" ")[-1]
        numeric_part = re.sub(r'[^0-9.]', '', last_part)  # Allow for decimal point
        if numeric_part == "":
            return float('nan')
        else:
            return float(numeric_part)
    def ticket_item(x):
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[0:-1])

    

    df['Name'] = df['Name'].apply(tokenize_name)
    # Extract surnames from the "Name" column
    df['Surname'] = df['Name'].apply(lambda x: x.split(',')[0].strip())
    
    # Create a new column for family size
    df['Family_Size'] = df.groupby('Surname')['Surname'].transform('count')

    # Add 1 to include the person with that surname itself
    df['Family_Size'] = df['Family_Size'] + 1
    
    # Add ticket number Column
    df["TicketNumber"] = df["Ticket"].apply(ticket_number)

    df["TicketItem"] = df["Ticket"].apply(ticket_item) 


    # Encode the Cabin Type as a Feature and One-hot encode Sex and Embarked columns
    df['CabinType'] = df['Cabin'].str[0]

    df['CabinNumber'] = df['Cabin'].str[1:]
    df['CabinNumber'] = df['CabinNumber'].apply(clean_and_convert)

    # Replace NaN values with 0 in the 'CabinNumber' column
    df['CabinNumber'].fillna(df['CabinNumber'].median(), inplace=True)

    # One-hot encode sex and embarking location of the catgeorical columns
    df = pd.get_dummies(df, columns=['Sex', 'Embarked'], dtype=int)
    
    # Also include a column representing if a passeneger has a column
    #df['HasCabin'] = np.where(df['Cabin'].isna(), 0, 1)


    # Age column has several NaN values. Augment these with median age instead of removing
    median_age = df['Age'].median()
    df['Age'].fillna(median_age, inplace=True)


    # One-hot encode the titles which occur three or more times into features
    titles_to_encode = ['Mr', 'Miss', 'Mrs', 'Master', 'Dr', 'Rev']

    # Create new columns for each title and encode as 1 if the title is present, 0 otherwise
    for title in titles_to_encode:
        df[title] = df['Name'].apply(lambda x: 1 if title in x else 0)

    
    # Drop irrelevant columns
    df.drop(['Name', 'Ticket', 'Cabin', 'PassengerId', 'Surname', 'TicketNumber', 'TicketItem'], axis=1, inplace=True)

    # Assuming 'CabinType' is the target column to impute
    target_column = 'CabinType'

    # Create a DataFrame for training set (rows with non-NaN values)
    nn_df = df.dropna(subset=[target_column])

    # Create a DataFrame for the target set (rows with NaN values)
    target_df = df[df[target_column].isna()]

    # Encode 'CabinType' values (assuming they are categorical)
    codes = nn_df['CabinType'].astype('category').cat.codes
    nn_df['CabinType'] = codes

    # Initialize and fit KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=1)  # You can adjust the number of neighbors
    knn.fit(nn_df.drop(columns=[target_column]), nn_df[target_column])

    # Predict 'CabinType' values for the rows with NaN values
    predicted_values = knn.predict(target_df.drop(columns=[target_column]))

    # Update the 'CabinType' values for the rows where it was not NaN
    df.loc[df[target_column].notnull(), target_column] = codes[df[target_column].notnull()]

    # Assign the predicted values to the corresponding rows where 'CabinType' is NaN
    df.loc[df[target_column].isna(), target_column] = predicted_values



    return df

train_df = preprocess_df(data)


# Step 1: Split the data into training and test sets
X = train_df.drop(columns=['Survived'])  # Features (all columns except 'Survived')
y = train_df['Survived']  # Target variable


# Split the data into 80% training and 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Apply Standard Scaling
scaler = StandardScaler()

# Fit the scaler on the training data and transform both the training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nn_df['CabinType'] = codes


In [87]:
train_df['CabinType']

0      3
1      2
2      3
3      2
4      3
      ..
886    3
887    1
888    2
889    2
890    3
Name: CabinType, Length: 891, dtype: object

## Train Classification Models

In [88]:

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Initialize classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'GBDT': GradientBoostingClassifier(random_state=42),  # Add GBDT
    'XGBoost': XGBClassifier(random_state=42)  # Add XGBoost
}

# Train and evaluate each classifier
for name, classifier in classifiers.items():
    classifier.fit(X_train_scaled, y_train)  # Train the model
    y_pred = classifier.predict(X_test_scaled)  # Make predictions on the test set
    accuracy = accuracy_score(y_test, y_pred)  # Calculate accuracy
    print(f'{name} Accuracy: {accuracy:.3f}')


Logistic Regression Accuracy: 0.810
Decision Tree Accuracy: 0.810


Random Forest Accuracy: 0.804
SVM Accuracy: 0.810
GBDT Accuracy: 0.810
XGBoost Accuracy: 0.793


## Grid Search

In [89]:
from sklearn.model_selection import GridSearchCV

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define a parameter grid for hyperparameter tuning
# Define a parameter grid for GBDT hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],

    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 2, 4]
}

# Create a Gradient Boosting model
rf_classifier = RandomForestClassifier(random_state=42)

# Create an instance of GridSearchCV
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=6, scoring='accuracy')

# Fit the GridSearchCV instance on the training data
grid_search.fit(X_scaled, y)

# Retrieve the best hyperparameters and the corresponding best estimator
best_params = grid_search.best_params_
print(best_params)
best_rf_classifier = grid_search.best_estimator_

{'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}


## Predict on Unseen Testing Set with Optimised Cross-Validated Model

In [90]:
test_df = pd.read_csv('test.csv')

# Fare column has several NaN values. Augment these with median fare instead of removing
median_fare = test_df['Fare'].median()
test_df['Fare'].fillna(median_fare, inplace=True)

test_df_processed = preprocess_df(test_df)

# Select features and scale unseen data
X_test = test_df_processed.copy()  # Features (all columns except 'Survived')

ordered_cols = X.columns

# Add columns that have been missed during one-hot encdoing due to the test data not containing certain categories etc
missing_cols = set(X.columns) - set(X_test.columns)
for col in missing_cols:
    X_test[col] = 0

# Re-order columns to be in the same order as for the training data
X_test = X_test[ordered_cols]

X_test_scaled = scaler.transform(X_test)

# Evaluate the model with the best hyperparameters on the test set
preds = best_rf_classifier.predict(X_test_scaled)

# Create Data Frame with Passenger ID and Prediction
final_predictions = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': preds})

final_predictions.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nn_df['CabinType'] = codes


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [91]:
# Save predictions as csv file
final_predictions.to_csv('predictions.csv', index=False)
