# Exercise 2b: Feature engineering

In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
X_train = pd.read_csv("ex2_train.csv")
y_train = pd.read_csv("ex2_class_train.csv")
X_test = pd.read_csv("ex2_test.csv")
y_test = pd.read_csv("ex2_class_test.csv")

In [7]:
# define a utility function to print out the prediction performance
def evaluate_result(y_test, y_pred, clf):
    print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')
    print(f'Precision: {precision_score(y_test, y_pred):.4f}')
    print(f'Recall: {recall_score(y_test, y_pred):.4f}')
    print(f'F1-score: {f1_score(y_test, y_pred):.4f}')
    print(f'AUC-ROC: {roc_auc_score(y_test, clf.predict_proba(X_test_processed)[:, 1]):.4f}')

## Prototyping (without feature engineering)

In [8]:
def preprocess(data_in):
    data = data_in.drop(columns=['Name']).copy()
    
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())

    data = pd.get_dummies(data, columns=['Sex', 'Embarked'], drop_first=True)

    return data

In [9]:
X_train_processed = preprocess(X_train)
X_test_processed = preprocess(X_test)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_processed, y_train.values.ravel())
y_pred = clf.predict(X_test_processed)

print('Random Forest Model without Feature Engineering')
evaluate_result(y_test, y_pred, clf)

Random Forest Model without Feature Engineering
Accuracy: 0.8101
Precision: 0.7778
Recall: 0.7568
F1-score: 0.7671
AUC-ROC: 0.8732


## Feature engineering

The classification using simple preprocessed data gives only mediocre performance.

**TODO: You should make use of the insights from your EDA (ex2a) to complete the following feature engineering function below.** Later the function will replace the simple preprocessing.

You will pass the exercise if your feature engineering can improve the performance (i.e., winning in three or more metrics).

In [16]:
def feature_engineering(data_in):
    data = data_in.copy()

    data = preprocess(data)

    # Create Family Size feature
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    
    # Create IsAlone feature
    data['IsAlone'] = 0
    data.loc[data['FamilySize'] == 1, 'IsAlone'] = 1
    
    # Convert Sex to numeric
    data['Sex'] = data['Sex'].map({'male': 0, 'female':1}).astype(int)
    
    # Fill missing Embarked with mode (assign back to the column)
    data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
    
    # Convert Embarked to numeric
    data['Embarked'] = data['Embarked'].map({'S':0, 'C':1, 'Q':2}).astype(int)
    
    return data

In [15]:
X_train_processed = feature_engineering(X_train)
X_test_processed = feature_engineering(X_test)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_processed, y_train.values.ravel())
y_pred = clf.predict(X_test_processed)

print('Random Forest Model with Feature Engineering')
evaluate_result(y_test, y_pred, clf)

KeyError: "['Name'] not found in axis"