# Create the Feature-Engineered Test DataFrame for the Titanic Logistic Regression Model

In [58]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Global seaborn styling for any plots drawn later: white-grid background,
# a fully saturated HUSL palette, and the larger 'talk' scaling context.
sns.set(style='whitegrid', palette=sns.husl_palette(s=1), context='talk')

## Load data

In [59]:
# Read the test data this notebook prepares. The path is relative, so the
# notebook must be run from the folder that contains 'test.csv'.
df = pd.read_csv('test.csv')

In [60]:
# Preview the first 10 rows to sanity-check the columns that were read.
df.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [61]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(418, 11)

## Feature Engineering

One-Hot Encoding of the 'Embarked' column

In [62]:
# One-hot encode 'Embarked'.
# - dtype=int keeps the indicators as 0/1 on every pandas version (pandas >= 2.0
#   would otherwise produce True/False, which also ends up in the exported CSV).
# - reindex pins the columns to 'S', 'C', 'Q' so all three always exist (all-zero
#   if a port is absent from this file); the model-data cell selects them by name.
onehot = (
    pd.get_dummies(df['Embarked'], dtype=int)
    .reindex(columns=['S', 'C', 'Q'], fill_value=0)
)
# Column-wise concat; rows are aligned on the shared index of df and onehot.
df2 = pd.concat([df, onehot], axis=1)

Target Encoding of the 'Sex' column (Alternative to be checked: One-Hot encoding)

In [65]:
# Target-encode 'Sex' with the per-sex mean of 'Survived'. The rates were computed
# on the training set (groupby('Sex')['Survived'].mean()) and are hard-coded here
# because this file has no 'Survived' column to compute them from.
# .map (rather than .replace) always yields a float column; a value other than
# 'female'/'male' becomes NaN instead of being passed through as a string.
df2['sex_target_enc'] = df2['Sex'].map({'female': 0.7420382165605095, 'male': 0.18890814558058924})

Use Quantile Binning for the 'Age' column

In [66]:
# Quartile-bin 'Age' and one-hot encode the bin membership.
age_bin_labels = ['age_qbin1', 'age_qbin2', 'age_qbin3', 'age_qbin4']
# q=4 cuts at the quartiles of the non-missing ages, giving four buckets of
# roughly equal size. A missing Age gets no bucket, i.e. an all-zero row below.
# NOTE(review): the edges are taken from this file's own Age distribution -
# confirm they match the bin edges used when the model was trained.
age_quartile = pd.qcut(df2['Age'], q=4, labels=age_bin_labels)
qbins = pd.get_dummies(age_quartile, dtype=int)  # dtype=int: 0/1 on every pandas version
# Drop bin columns left over from a previous run of this cell so that
# re-running it does not append duplicate 'age_qbin*' columns to df2.
df2 = pd.concat([df2.drop(columns=age_bin_labels, errors='ignore'), qbins], axis=1)
qbins.head()

Unnamed: 0,age_qbin1,age_qbin2,age_qbin3,age_qbin4
0,0,0,1,0
1,0,0,0,1
2,0,0,0,1
3,0,1,0,0
4,0,1,0,0


Make column 'SibSp' binary

In [67]:
# Binary flag: 1 where 'SibSp' >= 1, otherwise 0. Read from df2 (not the raw df)
# so every engineered column is derived from the frame it is written to.
df2['SibSp_binary'] = (df2['SibSp'] >= 1).astype(int)

## Define Model Data

In [72]:
# Columns handed to the model, in the order they are written to the CSV.
feature_columns = [
    'sex_target_enc',
    'Pclass',
    'age_qbin1', 'age_qbin2', 'age_qbin3', 'age_qbin4',
    'S', 'C', 'Q',
    'SibSp_binary',
]
X_testdataset = df2[feature_columns]

## Write Feature Engineered Set of Test Data to CSV

In [75]:
# index=True is the pandas default, spelled out here: the row index is written
# as the first, unnamed column of the CSV.
X_testdataset.to_csv('test_feateng_LogReg01.csv', index=True)