In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the raw training data.
data = pd.read_csv('train.csv')

# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly or its result is silently discarded.
display(data.head())

# Peek at the female passengers; .head() keeps the output to a few rows
# instead of dumping every matching record.
data[data['Sex'] == 'female'].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Data Pre-processing

In [3]:
def preprocess_df(df):
    """Turn a raw passenger table into a numeric feature table.

    The input frame is copied first, so the caller's DataFrame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger records. The columns 'Name', 'Sex', 'Age', 'Cabin',
        'Embarked', 'Ticket' and 'PassengerId' are required. Any other column
        (for example 'Survived') is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with:
        - 'Family_Size': number of rows sharing the passenger's surname, plus 1
        - one-hot columns for 'Sex', 'Embarked' and the first letter of 'Cabin'
        - 'HasCabin': 1 when 'Cabin' is present, else 0
        - 'Age' (and 'Fare', when that column exists) with gaps filled by the median
        - 'Title_<title>' indicator columns for a fixed list of titles
        and with 'Name', 'Ticket', 'Cabin', 'PassengerId' and the helper
        columns removed.
    """
    # Work on a copy so repeated calls on the same raw frame give the same result
    df = df.copy()

    # Extract surnames from the "Name" column (text before the first comma)
    df['Surname'] = df['Name'].str.split(',').str[0].str.strip()

    # Create a new column for family size: how many rows share this surname
    df['Family_Size'] = df.groupby('Surname')['Surname'].transform('count')

    # NOTE: the count above already includes the passenger, so this makes the
    # value one larger than the surname group. The offset is kept because the
    # same feature is built with +1 when unseen data is prepared for prediction.
    df['Family_Size'] = df['Family_Size'] + 1

    # Encode the Cabin Type as a Feature and One-hot encode Sex and Embarked columns
    df['CabinType'] = df['Cabin'].str[0]
    df = pd.get_dummies(df, columns=['Sex', 'Embarked', 'CabinType'], dtype=int)

    # Also include a column representing whether a passenger has a cabin
    df['HasCabin'] = np.where(df['Cabin'].isna(), 0, 1)

    # Age column has several NaN values. Fill these with the median age instead
    # of removing rows. Assign the result back: calling fillna(inplace=True) on
    # a column selection does not reliably update the parent frame.
    median_age = df['Age'].median()
    df['Age'] = df['Age'].fillna(median_age)

    # Same treatment for Fare when the column is present
    if 'Fare' in df.columns:
        df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # Extract titles from the "Name" column (the word directly before a period)
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # One-hot encode a fixed list of titles into features
    titles_to_encode = ['Mr', 'Miss', 'Mrs', 'Master', 'Dr', 'Rev']

    for title in titles_to_encode:
        df['Title_' + title] = (df['Title'] == title).astype(int)

    # Drop irrelevant columns
    df = df.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId', 'Title', 'Surname'])

    return df

# Build the model-ready table from the raw training data
train_df = preprocess_df(data)

# Separate the target column from the predictors
y = train_df['Survived']
X = train_df.drop('Survived', axis=1)

# Hold out 20% of the rows for evaluation; the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features. The scaler learns its statistics from the training
# portion only and then applies them to both portions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



## Train Classification Models

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Initialize classifiers with default hyperparameters. The tree-based models
# draw random numbers while fitting, so they are seeded to make the reported
# accuracies repeatable from run to run.
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC()
}

# Train and evaluate each classifier on the same scaled split
for name, classifier in classifiers.items():
    classifier.fit(X_train_scaled, y_train)  # Train the model
    y_pred = classifier.predict(X_test_scaled)  # Make predictions on the test set
    accuracy = accuracy_score(y_test, y_pred)  # Fraction of correct predictions
    print(f'{name} Accuracy: {accuracy:.2f}')

Logistic Regression Accuracy: 0.80
Decision Tree Accuracy: 0.79
Random Forest Accuracy: 0.82
SVM Accuracy: 0.81


## Tuning Best Model

In [None]:
# Seeded 80/20 split of the feature table and target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise using statistics learned from the training portion only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline: a random forest trained on every available feature
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_scaled, y_train)

# Score the baseline on the held-out rows
y_pred = rf_classifier.predict(X_test_scaled)
initial_accuracy = accuracy_score(y_test, y_pred)
print(f'Initial Accuracy: {initial_accuracy:.2f}')

# Pair each importance score with its column name, most important first
feature_importances = rf_classifier.feature_importances_
feature_ranking = pd.Series(
    feature_importances, index=X_train.columns
).sort_values(ascending=False)

# Keep the K highest-ranked features
K = 20  # Replace with the desired number of top features
selected_features = feature_ranking.head(K).index.tolist()

# Narrow both portions of the split to the selected columns
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Re-fit the scaler, since the set of columns has changed
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Same model, trained on the reduced feature set
rf_classifier_selected = RandomForestClassifier(random_state=42)
rf_classifier_selected.fit(X_train_scaled, y_train)

# Score the reduced model on the held-out rows
y_pred_selected = rf_classifier_selected.predict(X_test_scaled)
selected_accuracy = accuracy_score(y_test, y_pred_selected)
print(f'Accuracy with {K} selected features: {selected_accuracy:.2f}')

## Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; every combination is tried
param_grid = dict(
    n_estimators=[10, 50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Seeded base model that the search clones for each combination
rf_classifier = RandomForestClassifier(random_state=42)

# 5-fold cross-validated search, ranked by accuracy
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

# Run the search on the scaled training data
grid_search.fit(X_train_scaled, y_train)

# Winning combination and the model refitted with it
best_params = grid_search.best_params_
best_rf_classifier = grid_search.best_estimator_

# Score the tuned model on the held-out rows
y_pred_best = best_rf_classifier.predict(X_test_scaled)
best_accuracy = accuracy_score(y_test, y_pred_best)

print(f'Best Hyperparameters: {best_params}')
print(f'Accuracy with Best Hyperparameters: {best_accuracy:.2f}')

## Neural Network

## Train Best Model

In [None]:
# Restrict the training split to the selected features
X_train_selected = X_train[selected_features]

# Fit a fresh scaler on those columns; the same scaler object is applied to the
# unseen data when predictions are made
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)

# Random Forest with fixed hyperparameters
final_rf_classifier = RandomForestClassifier(
    n_estimators=10,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)

# Fit the Model
final_rf_classifier.fit(X_train_scaled, y_train)

In [None]:
test_df = pd.read_csv('test.csv')

# Work on a copy: PassengerId is dropped from the features below but is still
# needed from test_df when the predictions frame is assembled.
df = test_df.copy()

# Extract surnames from the "Name" column
df['Surname'] = df['Name'].apply(lambda x: x.split(',')[0].strip())

# Create a new column for family size
df['Family_Size'] = df.groupby('Surname')['Surname'].transform('count')

# NOTE: the count already includes the passenger; the +1 is kept so the
# feature is built exactly as it is for the training data.
df['Family_Size'] = df['Family_Size'] + 1


# Encode the Cabin Type as a Feature and One-hot encode Sex and Embarked columns
df['CabinType'] = df['Cabin'].str[0]
df = pd.get_dummies(df, columns=['Sex', 'Embarked', 'CabinType'], dtype=int)

# Also include a column representing whether a passenger has a cabin
df['HasCabin'] = np.where(df['Cabin'].isna(), 0, 1)


# Age column has several NaN values. Fill these with the median age instead of
# removing rows. The result is assigned back because fillna(inplace=True) on a
# column selection does not reliably update the parent frame.
median_age = df['Age'].median()
df['Age'] = df['Age'].fillna(median_age)

# Fare column may contain NaN values. Fill these with the median fare.
median_fare = df['Fare'].median()
df['Fare'] = df['Fare'].fillna(median_fare)


# Extract titles from the "Name" column (the word directly before a period)
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# One-hot encode a fixed list of titles into features
titles_to_encode = ['Mr', 'Miss', 'Mrs', 'Master', 'Dr', 'Rev']

for title in titles_to_encode:
    df['Title_' + title] = (df['Title'] == title).astype(int)


# Drop irrelevant columns
df = df.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId', 'Title', 'Surname'])

# Select features and scale unseen data.
# get_dummies only creates columns for categories present in this file, so a
# selected feature that exists only in the training data would be missing here.
# reindex keeps the training column order and fills any such column with 0
# instead of raising a KeyError.
X_test = df.copy()
X_test_selected = X_test.reindex(columns=selected_features, fill_value=0)
X_test_scaled = scaler.transform(X_test_selected)

# Predict survival for the unseen passengers with the final model
preds = final_rf_classifier.predict(X_test_scaled)

# Create Data Frame with Passenger ID and Prediction
final_predictions = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': preds})

final_predictions.head()

In [None]:
# Write the predictions to disk as CSV, leaving out the row index
final_predictions.to_csv(path_or_buf='predictions.csv', index=False)
