# Simulated Annealing for Feature Selection
## 01 - Data Preparation
- Download and pre-process Titanic dataset for downstream feature selection
- Adapted from: https://www.kaggle.com/code/abhishekmamidi/titanic-data-preprocessing-and-visualization/notebook
___

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Base data directory (relative path, with trailing slash); the raw input
# CSVs are read from below it and the processed CSVs are written below it.
PATH = '../data/'

In [3]:
# Read the raw train and test CSV files from the data directory
train_data, test_data = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('raw/train.csv', 'raw/test.csv')
)

In [4]:
# Preview the first rows of the raw training data
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Derive 'FamilySize' as SibSp + Parch + 1
# (the extra 1 presumably counts the passenger themself)
train_data['FamilySize'] = train_data['SibSp'].add(train_data['Parch']).add(1)

In [6]:
# Discard the columns that are not used as features
train_data = train_data.drop(['Ticket', 'PassengerId', 'Cabin'], axis=1)

In [7]:
# Encode the categorical 'Sex' and 'Embarked' columns as integer codes.
# Values without an entry in a mapping (including missing values) become NaN.
train_data = train_data.assign(
    Sex=train_data['Sex'].map({'male': 0, 'female': 1}),
    Embarked=train_data['Embarked'].map({'C': 0, 'Q': 1, 'S': 2}),
)

In [8]:
# Extract each passenger's title from 'Name': the run of letters that follows
# a space and ends at a period (e.g. 'Mr' in "Braund, Mr. Owen Harris").
# The pattern is a raw string so that '\.' reaches the regex engine untouched;
# in a regular string literal '\.' is an invalid escape sequence.
train_data['Title'] = train_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# 'Name' is no longer needed once the title has been extracted
train_data = train_data.drop(columns='Name')

In [9]:
# Consolidate titles: rare ones are pooled into 'Others', and the variants
# 'Ms'/'Mlle' and 'Mme' are folded into 'Miss' and 'Mrs' respectively.
train_data['Title'] = (
    train_data['Title']
    .replace(
        ['Dr', 'Rev', 'Col', 'Major', 'Countess', 'Sir', 'Jonkheer', 'Lady', 'Capt', 'Don'],
        'Others',
    )
    .replace({'Ms': 'Miss', 'Mme': 'Mrs', 'Mlle': 'Miss'})
)

In [10]:
# Encode 'Title' as an integer code given by each title's position in the
# list below; any title that is not listed becomes NaN.
train_data['Title'] = train_data['Title'].map(
    {title: code for code, title in enumerate(['Master', 'Miss', 'Mr', 'Mrs', 'Others'])}
)

In [11]:
# Replace missing 'Embarked' values with 2 (the code assigned to 'S'),
# treated here as the majority class
train_data = train_data.fillna({'Embarked': 2})

In [12]:
# Impute missing ages. For each row with a missing 'Age', use the median age
# of the rows that share its 'Pclass', 'SibSp' and 'Parch'. If that group has
# no known ages, fall back to the median age of the whole dataset.
#
# Rows are read and written with label-based .loc in a single indexing step.
# The chained form train_data['Age'].iloc[i] = value assigns into an
# intermediate Series and is not guaranteed to update train_data (it has no
# effect under pandas Copy-on-Write). Using .loc also keeps the index labels
# collected below consistent with how the rows are looked up.
NaN_indexes = train_data.index[train_data['Age'].isnull()]

for i in NaN_indexes:
    row = train_data.loc[i]
    same_group = (
        (train_data['SibSp'] == row['SibSp'])
        & (train_data['Parch'] == row['Parch'])
        & (train_data['Pclass'] == row['Pclass'])
    )
    # median() skips NaN and returns NaN only when the group has no known age
    pred_age = train_data.loc[same_group, 'Age'].median()
    if np.isnan(pred_age):
        pred_age = train_data['Age'].median()
    train_data.loc[i, 'Age'] = pred_age

In [13]:
# Inspect the first 10 rows after feature engineering and imputation
train_data.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,Title
0,0,3,0,22.0,1,0,7.25,2.0,2,2
1,1,1,1,38.0,1,0,71.2833,0.0,2,3
2,1,3,1,26.0,0,0,7.925,2.0,1,1
3,1,1,1,35.0,1,0,53.1,2.0,2,3
4,0,3,0,35.0,0,0,8.05,2.0,1,2
5,0,3,0,26.0,0,0,8.4583,1.0,1,2
6,0,1,0,54.0,0,0,51.8625,2.0,1,2
7,0,3,0,2.0,3,1,21.075,2.0,5,0
8,1,3,1,27.0,0,2,11.1333,2.0,3,3
9,1,2,1,14.0,1,0,30.0708,0.0,2,3


#### Split features and target

In [14]:
# Target: a single-column DataFrame named 'Survived' with a fresh default index
y_train = pd.DataFrame({'Survived': train_data['Survived'].to_numpy()})
# Features: every remaining column
X_train = train_data.drop('Survived', axis=1)

In [15]:
# (number of rows, number of feature columns)
X_train.shape

(891, 9)

In [16]:
# Preview the first rows of the feature matrix
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,Title
0,3,0,22.0,1,0,7.25,2.0,2,2
1,1,1,38.0,1,0,71.2833,0.0,2,3
2,3,1,26.0,0,0,7.925,2.0,1,1
3,1,1,35.0,1,0,53.1,2.0,2,3
4,3,0,35.0,0,0,8.05,2.0,1,2


In [18]:
# Preview the first rows of the target
y_train.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


In [19]:
# Save the processed features and target as CSV files.
# PATH already ends with '/', so the sub-path must not start with one
# (this avoids '../data//processed/...' and matches how the raw files are read).
# index=False keeps the DataFrame index out of the written files.
X_train.to_csv(PATH + 'processed/X_train.csv', index=False)
y_train.to_csv(PATH + 'processed/y_train.csv', index=False)