# Model 01 for the Titanic Dataset

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
# Apply one global seaborn theme to every plot in this notebook: white grid
# background, a fully saturated (s=1) HUSL colour palette and the 'talk' context.
sns.set(style='whitegrid', palette=sns.husl_palette(s=1), context='talk')

In [2]:
# Load the training data; 'train.csv' is resolved relative to the current working directory.
train = pd.read_csv('train.csv')

In [3]:
# Preview the first ten rows to check the available columns and typical values.
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
# (number of rows, number of columns) of the raw training frame
train.shape

(891, 12)

## Feature Engineering
### One-Hot Encoding of the 'Embarked' column

In [5]:
# One indicator column per distinct value of 'Embarked', named after that value.
onehot = pd.get_dummies(train['Embarked'])
# Place the indicator columns to the right of the original columns (rows align on the index).
train2 = pd.concat((train, onehot), axis='columns')
train2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,C,Q,S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,0,1


### Target Encoding of the 'Sex' column
Replace each category with the mean of the target (y) over all rows in that category.

**Alternative** to One-Hot encoding that results in only one new feature.

In [6]:
# Target encoding: every passenger receives the mean survival rate of their 'Sex' group.
# NOTE: the group means are computed from the same rows that receive the encoding.
means = train2.groupby('Sex')['Survived'].mean()
# Series.map is a plain keyed lookup and returns a float column directly.
# .replace() with a dict is meant for value substitution and leaves the
# object -> float conversion to pandas' version-dependent dtype inference.
train2['sex_target_enc'] = train2['Sex'].map(means)
train2

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,C,Q,S,sex_target_enc
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0,0,1,0.188908
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0,0.742038
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0,0,1,0.742038
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0,0,1,0.742038
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0,0,1,0.188908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,0,0,1,0.188908
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,0,0,1,0.742038
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,0,0,1,0.742038
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,1,0,0,0.188908


### Target encoding of the 'Pclass' column

In [7]:
# Target encoding: every passenger receives the mean survival rate of their 'Pclass' group.
# NOTE: the group means are computed from the same rows that receive the encoding.
means_class = train2.groupby('Pclass')['Survived'].mean()
# Keyed lookup via Series.map (index of means_class -> mean), consistent with the
# encoding of 'Sex'; every class present in the column is a key of means_class.
train2['class_target_enc'] = train2['Pclass'].map(means_class)
train2

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,C,Q,S,sex_target_enc,class_target_enc
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0,0,1,0.188908,0.242363
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0,0.742038,0.629630
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0,0,1,0.742038,0.242363
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0,0,1,0.742038,0.629630
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0,0,1,0.188908,0.242363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,0,0,1,0.188908,0.472826
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,0,0,1,0.742038,0.629630
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,0,0,1,0.742038,0.242363
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,1,0,0,0.188908,0.629630


### Use Binning for 'Age' column
Take a numerical column and convert it to multiple (one-hot encoded) categories

In [8]:
# Binary flag: 1 for passengers younger than 14, otherwise 0.
# A missing age compares as False, so passengers without a recorded age get 0.
train2['child'] = train['Age'].lt(14).astype(int)
train2

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,C,Q,S,sex_target_enc,class_target_enc,child
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0,0,1,0.188908,0.242363,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0,0.742038,0.629630,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0,0,1,0.742038,0.242363,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0,0,1,0.742038,0.629630,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0,0,1,0.188908,0.242363,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,0,0,1,0.188908,0.472826,0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,0,0,1,0.742038,0.629630,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,0,0,1,0.742038,0.242363,0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,1,0,0,0.188908,0.629630,0


In [9]:
# Cut 'Age' into 4 equally wide intervals spanning the observed age range.
age_labels = ['age_bin1', 'age_bin2', 'age_bin3', 'age_bin4']
age_intervals = pd.cut(train2['Age'], bins=4, labels=age_labels)
# One indicator column per interval; rows with a missing age are 0 in all four columns.
bins = pd.get_dummies(age_intervals)
# Remove columns left over from an earlier run of this cell before appending, so that
# re-running the cell never leaves duplicate 'age_bin*' columns in train2.
train2 = pd.concat([train2.drop(columns=age_labels, errors='ignore'), bins], axis=1)
bins.head()

Unnamed: 0,age_bin1,age_bin2,age_bin3,age_bin4
0,0,1,0,0
1,0,1,0,0
2,0,1,0,0
3,0,1,0,0
4,0,1,0,0


In [25]:
# Quantile bins: 4 buckets that each hold (roughly) the same number of passengers.
age_q_labels = ['age_qbin1', 'age_qbin2', 'age_qbin3', 'age_qbin4']
age_quartiles = pd.qcut(train2['Age'], q=4, labels=age_q_labels)
# One indicator column per quartile; rows with a missing age are 0 in all four columns.
qbins = pd.get_dummies(age_quartiles)
# Remove columns left over from an earlier run of this cell before appending, so that
# re-running the cell never leaves duplicate 'age_qbin*' columns in train2.
train2 = pd.concat([train2.drop(columns=age_q_labels, errors='ignore'), qbins], axis=1)
qbins.head()

Unnamed: 0,age_qbin1,age_qbin2,age_qbin3,age_qbin4
0,0,1,0,0
1,0,0,1,0
2,0,1,0,0
3,0,0,1,0
4,0,0,1,0


### Count how many people per ticket and calculate fare per passenger

In [11]:
# Number of passengers sharing each ticket, as a {ticket: count} dict.
ppticket = train.groupby('Ticket')['PassengerId'].count().to_dict()
# Look up each passenger's ticket in that dict. Series.map does one keyed lookup per
# row and yields a numeric column, whereas .replace() with one key per distinct ticket
# is slow and relies on pandas inferring a numeric dtype from an object column.
train2['pass_ticket'] = train['Ticket'].map(ppticket)
# Split the fare evenly across the passengers travelling on the same ticket.
train2['fare_per_pass'] = train2['Fare'] / train2['pass_ticket']
# Min-max scaling of the per-passenger fare: subtract the minimum, divide by the range.
fare_pp = train2['fare_per_pass']
train2['fare_per_pass_scaled'] = (fare_pp - fare_pp.min()) / (fare_pp.max() - fare_pp.min())

### Scaling the 'Age' column

In [12]:
# Min-max scaling maps the observed ages onto [0, 1]:
# subtract the minimum, then divide by the range (max - min).
age = train['Age']
age_scaled = (age - age.min()) / (age.max() - age.min())
# Missing ages stay missing after scaling; replace them with the mean of the scaled values.
train2['Age_scaled'] = age_scaled.fillna(age_scaled.mean())

In [13]:
# Sanity check: count the missing values left in 'Age_scaled'
# (expected to be 0, because the missing entries were filled with the mean above).
train2['Age_scaled'].isna().sum()

0

**Hint:** scikit-learn provides ready-made tools for this, e.g. `MinMaxScaler` for the scaling and `SimpleImputer` for filling the missing values.

### Interaction term for 'Pclass' and the one-hot column 'C' (Embarked = C) as one example

In [14]:
# Interaction feature: passenger class multiplied by the 'C' embarkation indicator,
# i.e. the class value where C is 1 and 0 everywhere else.
train2['Pclass*C'] = train2['Pclass'].mul(train2['C'])
# Per column: does it still contain at least one missing value?
train2.isna().any()

PassengerId             False
Survived                False
Pclass                  False
Name                    False
Sex                     False
Age                      True
SibSp                   False
Parch                   False
Ticket                  False
Fare                    False
Cabin                    True
Embarked                 True
C                       False
Q                       False
S                       False
sex_target_enc          False
class_target_enc        False
child                   False
age_bin1                False
age_bin2                False
age_bin3                False
age_bin4                False
pass_ticket             False
fare_per_pass           False
fare_per_pass_scaled    False
Age_scaled              False
Pclass*C                False
dtype: bool

In [15]:
# Preview the first five rows of the frame including the engineered feature columns.
train2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,child,age_bin1,age_bin2,age_bin3,age_bin4,pass_ticket,fare_per_pass,fare_per_pass_scaled,Age_scaled,Pclass*C
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,0,0,1,0,0,1,7.25,0.03269,0.271174,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,0,0,1,0,0,1,71.2833,0.321416,0.472229,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,0,0,1,0,0,1,7.925,0.035734,0.321438,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,0,0,1,0,0,2,26.55,0.119714,0.434531,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,0,0,1,0,0,1,8.05,0.036297,0.434531,0


## Define Model Data

In [26]:
# Target vector: the survival label taken from the raw frame.
y = train['Survived']
# Feature frame: every raw and engineered column of train2 except the label itself.
X = train2.drop('Survived', axis=1)

## Create a Baseline Model to Compare to

In [17]:
# Baseline model: always predicts the class that was most frequent in y during fit,
# ignoring the features entirely.
dummy_clf = DummyClassifier(strategy='most_frequent')

In [18]:
# Fit the baseline; with strategy='most_frequent' it only learns the majority class of y.
dummy_clf.fit(X, y)

DummyClassifier(constant=None, random_state=None, strategy='most_frequent')

In [19]:
# Accuracy of the baseline on the same X, y it was fitted on,
# i.e. the share of the most frequent class in y.
dummy_clf.score(X, y)

0.6161616161616161

## Create a Logistic Regression Model

In [20]:
# Logistic regression classifier with scikit-learn's default hyperparameters.
lr_clf = LogisticRegression()

In [31]:
# Columns used by the model: target-encoded sex, passenger class,
# the quantile age-bin indicators and the embarkation indicators.
model_features = [
    'sex_target_enc',
    'Pclass',
    'age_qbin1', 'age_qbin2', 'age_qbin3', 'age_qbin4',
    'S', 'C', 'Q',
]
lr_clf.fit(X[model_features], y)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [32]:
# Accuracy on the same rows and columns the model was fitted on. This is a training
# accuracy, not a held-out estimate; compare it with the dummy baseline's score above.
lr_clf.score(X[['sex_target_enc', 'Pclass', 'age_qbin1', 'age_qbin2', 'age_qbin3', 'age_qbin4', 'S', 'C', 'Q']], y)

0.8035914702581369