# Model 01 for the Titanic Dataset

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Global plot styling: white-grid background, HUSL palette with s=1, 'talk' context.
husl_colors = sns.husl_palette(s=1)
sns.set(style='whitegrid', palette=husl_colors, context='talk')

In [2]:
# Load the raw data from 'train.csv' (path relative to the working directory).
# df is kept unmodified; the engineered features are added to a separate frame later on.
df = pd.read_csv('train.csv')

In [3]:
# Preview the first ten rows of the raw data.
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
# (number of rows, number of columns) of the raw data.
df.shape

(891, 12)

## Feature Engineering
### One-Hot Encoding of the 'Embarked' column

In [5]:
# One-hot encode the port of embarkation into the columns 'C', 'Q' and 'S'.
# dtype=int keeps the dummies as 0/1 integers on every pandas version
# (pandas >= 2.0 returns booleans by default).
# Rows with a missing 'Embarked' value get 0 in all three columns.
onehot = pd.get_dummies(df['Embarked'], dtype=int)

# df2 is rebuilt from the untouched df, so re-running this cell starts from a clean frame.
df2 = pd.concat([df, onehot], axis=1)  # axis=1 glues the two frames together horizontally

### Target Encoding of the 'Sex' column
Replace a category by its mean target (y)

**Alternative** to One-Hot encoding that results in only one new feature.

In [6]:
# Target-encode 'Sex': every category is replaced by the mean of 'Survived' within it.
# NOTE: the means are taken over every row of df2, including rows that are later
# held out for testing, so this column carries target information from those rows.
means = df2.groupby('Sex')['Survived'].mean()

# Series.map looks each value up in `means` and yields a float column directly.
# replace() with a dict on a string column instead relies on pandas re-inferring
# the dtype after the replacement.
df2['sex_target_enc'] = df2['Sex'].map(means)

### Target encoding of the 'Pclass' column

In [7]:
# Target-encode 'Pclass': every class is replaced by the mean of 'Survived' within it.
# NOTE: the means are taken over every row of df2, including rows that are later
# held out for testing.
means_class = df2.groupby('Pclass')['Survived'].mean()

# Series.map is the direct lookup for "category -> value" and always yields floats here.
df2['class_target_enc'] = df2['Pclass'].map(means_class)

### Binning the 'Age' column
Take a numerical column and convert it to multiple (one-hot encoded) categories

In [9]:
# Binary flag: 1 when the passenger is younger than 14, otherwise 0.
# A missing age compares as False and therefore also ends up as 0.
df2['child'] = df['Age'].lt(14).astype(int)

In [10]:
# Cut 'Age' into 4 equally wide buckets and one-hot encode the bucket membership.
age_bucket = pd.cut(df2['Age'], bins=4, labels=['age_bin1', 'age_bin2', 'age_bin3', 'age_bin4'])

# dtype=int keeps the dummies as 0/1 integers on every pandas version.
# Rows with a missing 'Age' belong to no bucket and get 0 in all four columns.
bins = pd.get_dummies(age_bucket, dtype=int)

# Drop bucket columns left over from an earlier run of this cell before appending,
# so that re-running the cell does not create duplicate column names in df2.
df2 = pd.concat([df2.drop(columns=list(bins.columns), errors='ignore'), bins], axis=1)
bins.head()

Unnamed: 0,age_bin1,age_bin2,age_bin3,age_bin4
0,0,1,0,0
1,0,1,0,0
2,0,1,0,0
3,0,1,0,0
4,0,1,0,0


In [11]:
# Quantile bins: 4 buckets that each hold about the same number of passengers.
age_quartile = pd.qcut(df2['Age'], q=4, labels=['age_qbin1', 'age_qbin2', 'age_qbin3', 'age_qbin4'])

# dtype=int keeps the dummies as 0/1 integers on every pandas version.
# Rows with a missing 'Age' belong to no bucket and get 0 in all four columns.
qbins = pd.get_dummies(age_quartile, dtype=int)

# Drop bucket columns left over from an earlier run of this cell before appending,
# so that re-running the cell does not create duplicate column names in df2.
df2 = pd.concat([df2.drop(columns=list(qbins.columns), errors='ignore'), qbins], axis=1)
qbins.head()

Unnamed: 0,age_qbin1,age_qbin2,age_qbin3,age_qbin4
0,0,1,0,0
1,0,0,1,0
2,0,1,0,0
3,0,0,1,0
4,0,0,1,0


### Count how many people per ticket and calculate fare per passenger

In [12]:
# Number of passengers (within this data set) travelling on each ticket.
ticket_counts = df.groupby('Ticket')['PassengerId'].count()
ppticket = ticket_counts.to_dict()  # same counts as a plain dict: ticket -> passengers

# Series.map looks each ticket up in the counts and returns a numeric column.
# replace() with a dict holding one key per distinct ticket string is slow and
# leaves the dtype of the result to inference.
df2['pass_ticket'] = df['Ticket'].map(ticket_counts)

# 'Fare' divided by the number of passengers sharing the ticket.
df2['fare_per_pass'] = df2['Fare'] / df2['pass_ticket']

# Min-max scale the per-passenger fare to the range [0, 1].
fare_min = df2['fare_per_pass'].min()
fare_max = df2['fare_per_pass'].max()
df2['fare_per_pass_scaled'] = (df2['fare_per_pass'] - fare_min) / (fare_max - fare_min)

### Scaling the 'Age' column

In [13]:
# Min-max scaling maps the observed ages onto [0, 1]:
# subtract the minimum, then divide by the range (max - min).
# Missing ages stay NaN through the scaling and are then replaced by the
# mean of the scaled values.
df2['Age_scaled'] = (
    df['Age']
    .sub(df['Age'].min())
    .div(df['Age'].max() - df['Age'].min())
    .pipe(lambda scaled: scaled.fillna(scaled.mean()))
)

In [14]:
# Sanity check: number of missing values left in 'Age_scaled' after the mean fill.
df2['Age_scaled'].isna().sum()

0

**Hint:** scikit-learn can do this scaling for you: see `MinMaxScaler` in `sklearn.preprocessing`.

### Interaction term for 'Pclass' and the one-hot column 'C' (Embarked = C) as one example

In [15]:
# Interaction feature: passenger class multiplied by the one-hot 'C' column,
# i.e. the class value where 'C' is 1 and 0 elsewhere.
df2['Pclass*C'] = df2['Pclass'].mul(df2['C'])

# Per column: does it still contain any missing value?
df2.isnull().any()

PassengerId             False
Survived                False
Pclass                  False
Name                    False
Sex                     False
Age                      True
SibSp                   False
Parch                   False
Ticket                  False
Fare                    False
Cabin                    True
Embarked                 True
C                       False
Q                       False
S                       False
sex_target_enc          False
class_target_enc        False
child                   False
age_bin1                False
age_bin2                False
age_bin3                False
age_bin4                False
age_qbin1               False
age_qbin2               False
age_qbin3               False
age_qbin4               False
pass_ticket             False
fare_per_pass           False
fare_per_pass_scaled    False
Age_scaled              False
Pclass*C                False
dtype: bool

In [16]:
# Preview the first rows of the feature frame with all engineered columns.
df2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,age_bin4,age_qbin1,age_qbin2,age_qbin3,age_qbin4,pass_ticket,fare_per_pass,fare_per_pass_scaled,Age_scaled,Pclass*C
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,0,0,1,0,0,1,7.25,0.03269,0.271174,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,0,0,0,1,0,1,71.2833,0.321416,0.472229,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,0,0,1,0,0,1,7.925,0.035734,0.321438,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,0,0,0,1,0,2,26.55,0.119714,0.434531,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,0,0,0,1,0,1,8.05,0.036297,0.434531,0


## Define Model Data

In [27]:
from sklearn.model_selection import train_test_split

# Split the data into features X and target y.
feature_cols = ['sex_target_enc', 'Pclass', 'age_qbin1', 'age_qbin2', 'age_qbin3', 'age_qbin4', 'S', 'C', 'Q']
y = df['Survived']
X = df2[feature_cols]

# Hold out 20 % of the rows as test data; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

# 'sex_target_enc' in df2 was derived from the survival rates of ALL rows, which
# includes the rows that are now in the test split (target leakage). Re-fit the
# encoding on the training rows only and apply it to both splits.
sex_train = df2.loc[X_train.index, 'Sex']
sex_test = df2.loc[X_test.index, 'Sex']
train_sex_means = y_train.groupby(sex_train).mean()
train_survival_rate = y_train.mean()  # fallback for a category missing from the training rows
X_train = X_train.assign(sex_target_enc=sex_train.map(train_sex_means).fillna(train_survival_rate))
X_test = X_test.assign(sex_target_enc=sex_test.map(train_sex_means).fillna(train_survival_rate))

X_train.shape, X_test.shape, y_train.shape, y_test.shape


((712, 9), (179, 9), (712,), (179,))

## Create a Baseline Model to Compare to

In [28]:
from sklearn.dummy import DummyClassifier
# Baseline that uses the 'most_frequent' strategy; a real model should beat its accuracy.
dummy_clf = DummyClassifier(strategy='most_frequent')

In [29]:
# Fit the baseline on the training split.
dummy_clf.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=None, strategy='most_frequent')

In [30]:
# Accuracy of the baseline classifier, measured on the training rows it was fitted on.
dummy_clf.score(X_train, y_train)

0.6137640449438202

## Create a Logistic Regression Model

In [31]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the library's default settings.
lr_clf = LogisticRegression()

In [32]:
# Fit the logistic regression on the training split.
lr_clf.fit(X_train, y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
# Score of the fitted model on the training rows (not an estimate for unseen data).
lr_clf.score(X_train, y_train)

0.8019662921348315

## Cross validation

In [34]:
from sklearn.model_selection import cross_val_score

# 4-fold cross-validation on the training split; one accuracy value per fold.
scores = cross_val_score(
    estimator=lr_clf,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=4,
)
scores

array([0.75842697, 0.81460674, 0.79213483, 0.83707865])

In [35]:
# Mean and standard deviation of the fold accuracies, rounded to three decimals.
tuple(stat.round(3) for stat in (scores.mean(), scores.std()))

(0.801, 0.029)

## Test against test data 

In [36]:
# Compare the score on the training split with the score on the held-out test split.
# The built-in round() works whether score() returns a NumPy scalar or a plain
# Python float; a plain float has no .round() method.
print('training score: ', round(lr_clf.score(X_train, y_train), 3))
print('test score: ', round(lr_clf.score(X_test, y_test), 3))

training score:  0.802
test score:  0.793
