## MS2: Titanic Machine Learning Workflow

### 1) Get data

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
plt.style.use("ggplot")

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Load the labelled CSV; the feature columns and the `Survived` target are selected from it below.
full_data = pd.read_csv("../data/train.csv")

In [3]:
# Load the second CSV (test.csv); the preview further down shows it has no `Survived` column.
test_for_kaggle = pd.read_csv("../data/test.csv")

### 2) Train-Test Split

#### Train-Validation split

In [4]:
# Feature matrix: the seven raw passenger attributes taken from the labelled data.
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = full_data.loc[:, feature_columns]
# Target vector: the survival label.
y = full_data.loc[:, 'Survived']

In [5]:
# First, exploratory split: 25% of the rows are held out for validation.
# random_state is fixed so the split (and every output derived from it) is the same on each run.
# NOTE: this split is replaced further down by a seeded, stratified 80/20 split.
Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=.25, random_state=420)

In [6]:
# Sanity-check the row/column counts of the four pieces produced by the split.
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((668, 7), (223, 7), (668,), (223,))

In [7]:
# Join the training features and the training target column-wise into one frame for inspection.
# NOTE(review): both the split and this frame are rebuilt further down in the notebook.
df_train = pd.concat([Xtrain,ytrain], axis=1)

In [30]:
# Show the combined training frame (features plus the `Survived` column).
df_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
814,3,male,30.5,0,0,8.0500,S,0
722,2,male,34.0,0,0,13.0000,S,0
247,2,female,24.0,0,2,14.5000,S,1
683,3,male,14.0,5,2,46.9000,S,0
657,3,female,32.0,1,1,15.5000,Q,0
...,...,...,...,...,...,...,...,...
153,3,male,40.5,0,2,14.5000,S,0
155,1,male,51.0,0,1,61.3792,C,0
129,3,male,45.0,0,0,6.9750,S,0
713,3,male,29.0,0,0,9.4833,S,0


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler

In [9]:
# Peek at the first rows of the full feature matrix.
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


#### Perform a train test split

In [10]:
# Hold out 20% of the rows for validation. The seed is fixed for reproducibility and the
# split is stratified on the label.
Xtrain, Xval, ytrain, yval = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=420,
    stratify=y,
)

### 3) EDA

#### See also in MS1 data

In [11]:
# Re-attach the target to the training features so both can be inspected together.
df_train = pd.concat(objs=[Xtrain, ytrain], axis="columns")

In [12]:
# Count missing values per column of the training frame
# (in the recorded output only Age and Embarked have gaps).
df_train.isna().sum()

Pclass        0
Sex           0
Age         150
SibSp         0
Parch         0
Fare          0
Embarked      2
Survived      0
dtype: int64

### 4) Feature Engineering

In [13]:
# Display the raw (untransformed) training features.
Xtrain

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
814,3,male,30.5,0,0,8.0500,S
722,2,male,34.0,0,0,13.0000,S
247,2,female,24.0,0,2,14.5000,S
683,3,male,14.0,5,2,46.9000,S
657,3,female,32.0,1,1,15.5000,Q
...,...,...,...,...,...,...,...
153,3,male,40.5,0,2,14.5000,S
155,1,male,51.0,0,1,61.3792,C
129,3,male,45.0,0,0,6.9750,S
713,3,male,29.0,0,0,9.4833,S


In [14]:
# Display the training frame including the `Survived` column.
df_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
814,3,male,30.5,0,0,8.0500,S,0
722,2,male,34.0,0,0,13.0000,S,0
247,2,female,24.0,0,2,14.5000,S,1
683,3,male,14.0,5,2,46.9000,S,0
657,3,female,32.0,1,1,15.5000,Q,0
...,...,...,...,...,...,...,...,...
153,3,male,40.5,0,2,14.5000,S,0
155,1,male,51.0,0,1,61.3792,C,0
129,3,male,45.0,0,0,6.9750,S,0
713,3,male,29.0,0,0,9.4833,S,0


In [36]:
# Display the frame loaded from ../data/test.csv.
test_for_kaggle

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [15]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns in the training frame.
df_train.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived
count,712.0,562.0,712.0,712.0,712.0,712.0
mean,2.282303,30.059324,0.519663,0.375,32.098501,0.383427
std,0.845487,14.562479,1.141054,0.767551,47.539967,0.486563
min,1.0,0.42,0.0,0.0,0.0,0.0
25%,1.0,21.0,0.0,0.0,7.8958,0.0
50%,3.0,29.0,0.0,0.0,14.4542,0.0
75%,3.0,39.0,1.0,0.0,31.3875,1.0
max,3.0,80.0,8.0,5.0,512.3292,1.0


#### What has to be changed?

+ Pclass = categorical (ohe),
+ Sex = categorical (ohe),
+ Age = numerical, continuous (1st step: imputation with SimpleImputer, median or mean -> 2nd step: scaling)
+ Embarked = categorical (1st step: imputation with SimpleImputer, mode = most frequent -> 2nd step: ohe)
+ Fare = numerical, continuous (binning into intervals, bins are one-hot encoded)

In [16]:
# Age preprocessing: fill missing ages with the median, then rescale with MinMaxScaler.
# NOTE(review): despite the "ohe" in the name, no one-hot encoding happens in this pipeline;
# the name is kept because the ColumnTransformer below refers to it.
impute_and_then_ohe_age = make_pipeline(
    SimpleImputer(strategy="median"),
    MinMaxScaler(),
)
impute_and_then_ohe_age

In [17]:
# Embarked preprocessing: fill missing values with the most frequent one, then one-hot
# encode into a dense array.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` — confirm
# against the installed version.
impute_and_then_ohe_embarked = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(sparse=False),
)
impute_and_then_ohe_embarked

In [None]:
# Fare preprocessing: fill missing fares with the median, then discretise the fares into
# bins that are one-hot encoded (dense output).
# One-hot encoding the raw, continuous fares would create one column per distinct value and
# raise on fares that were not seen during fit, so the values are binned first.
# NOTE(review): this pipeline is not referenced by the ColumnTransformer below.
impute_and_then_ohe_fare = make_pipeline(
    SimpleImputer(strategy="median"),
    KBinsDiscretizer(encode="onehot-dense"),
)
impute_and_then_ohe_fare

In [18]:
# Column-wise preprocessing; each entry is (name, transformer, columns):
#   - Sex, Pclass: one-hot encoded
#   - Age:         median imputation, then min-max scaling
#   - Embarked:    most-frequent imputation, then one-hot encoding
#   - Fare:        median imputation, then binning with one-hot encoded bins
# Fare is imputed before binning because KBinsDiscretizer does not accept NaN input: the
# training data has no missing fares, but the Kaggle test data has one, so transforming
# it would otherwise fail.
# SibSp and Parch are not listed and are therefore not part of the transformed output.
fe = ColumnTransformer([
    ("ohe_Sex_Pclass", OneHotEncoder(sparse=False), ["Sex", "Pclass"]),
    ("impute_and_then_ohe_age", impute_and_then_ohe_age,['Age']),
    ("impute_and_then_ohe_embarked", impute_and_then_ohe_embarked,['Embarked']),
    ("binning", make_pipeline(SimpleImputer(strategy="median"), KBinsDiscretizer(encode="onehot-dense")), ["Fare"])
])
fe

In [19]:
# Fit the preprocessing on the training features only; the validation features are
# only transformed (not fitted on) further down.
fe.fit(Xtrain)



In [20]:
# Apply the fitted transformer to the training features ...
Xtrain_tran = fe.transform(Xtrain)
# ... and show the result as a DataFrame labelled with the generated feature names.
pd.DataFrame(Xtrain_tran, columns=fe.get_feature_names_out())

Unnamed: 0,ohe_Sex_Pclass__Sex_female,ohe_Sex_Pclass__Sex_male,ohe_Sex_Pclass__Pclass_1,ohe_Sex_Pclass__Pclass_2,ohe_Sex_Pclass__Pclass_3,impute_and_then_ohe_age__Age,impute_and_then_ohe_embarked__Embarked_C,impute_and_then_ohe_embarked__Embarked_Q,impute_and_then_ohe_embarked__Embarked_S,binning__Fare_0.0,binning__Fare_1.0,binning__Fare_2.0,binning__Fare_3.0,binning__Fare_4.0
0,0.0,1.0,0.0,0.0,1.0,0.377984,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.421965,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.296306,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.170646,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,1.0,0.396833,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,0.0,1.0,0.0,0.0,1.0,0.503644,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
708,0.0,1.0,1.0,0.0,0.0,0.635587,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
709,0.0,1.0,0.0,0.0,1.0,0.560191,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
710,0.0,1.0,0.0,0.0,1.0,0.359135,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [21]:
# Transform the validation features with the transformer that was fitted on the training data.
Xval_tran = fe.transform(Xval)
# Show the result as a DataFrame labelled with the generated feature names.
pd.DataFrame(Xval_tran, columns=fe.get_feature_names_out())

Unnamed: 0,ohe_Sex_Pclass__Sex_female,ohe_Sex_Pclass__Sex_male,ohe_Sex_Pclass__Pclass_1,ohe_Sex_Pclass__Pclass_2,ohe_Sex_Pclass__Pclass_3,impute_and_then_ohe_age__Age,impute_and_then_ohe_embarked__Embarked_C,impute_and_then_ohe_embarked__Embarked_Q,impute_and_then_ohe_embarked__Embarked_S,binning__Fare_0.0,binning__Fare_1.0,binning__Fare_2.0,binning__Fare_3.0,binning__Fare_4.0
0,1.0,0.0,1.0,0.0,0.0,0.371701,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.334004,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,1.0,0.359135,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,1.0,0.258608,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.880623,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,1.0,0.0,0.0,1.0,0.0,0.346569,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
175,1.0,0.0,0.0,0.0,1.0,0.334004,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
176,0.0,1.0,0.0,0.0,1.0,0.321438,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
177,0.0,1.0,0.0,0.0,1.0,0.585323,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


### 5) Train models

#### Fit the model on the (transformed) training data

In [22]:
# Logistic regression with scikit-learn's default settings.
# NOTE(review): the name `m_lgr` is re-bound to a single-feature model in the last
# section of this notebook.
m_lgr = LogisticRegression()

In [23]:
# Fit the model on the transformed training features.
m_lgr.fit(Xtrain_tran,ytrain)

In [24]:
# Score on the same data the model was fitted on (an optimistic figure); the validation
# score is computed in section 6.
m_lgr.score(Xtrain_tran,ytrain)

0.8061797752808989

In [58]:
# Select from the Kaggle test frame the five columns that the ColumnTransformer `fe` reads.
x_test_kaggle= test_for_kaggle[['Pclass','Sex','Age','Fare','Embarked']]

In [60]:
# Count missing values per selected column
# (the recorded output shows 86 missing Age values and 1 missing Fare in this frame).
x_test_kaggle.isna().sum()

Pclass       0
Sex          0
Age         86
Fare         1
Embarked     0
dtype: int64

In [61]:
# x_test_kaggle_tran= fe.transform(x_test_kaggle)

In [62]:
# X_kaggle = fe.transform(test_for_kaggle)
# pd.DataFrame(test_for_kaggle, columns=fe.get_feature_names_out())

In [32]:
# Display the training labels.
ytrain

814    0
722    0
247    1
683    0
657    0
      ..
153    0
155    0
129    0
713    0
235    0
Name: Survived, Length: 712, dtype: int64

In [63]:
# m_lgr.fit(test_for_kaggle,ytrain)

In [64]:
# m_lgr.fit(Xtrain_tran, test_for_kaggle)

In [65]:
# test_for_kaggle.isnull()

### 6) Evaluate the model on the (transformed) validation data

In [None]:
# Score the fitted logistic regression on the transformed, held-out validation split.
m_lgr.score(Xval_tran,yval)

## Applying Feature Engineering and Modeling in one go

In [None]:
# Show the ColumnTransformer again before it is wrapped in a pipeline.
fe

In [None]:
# End-to-end model: the ColumnTransformer `fe` followed by a logistic regression, so raw
# feature frames can be passed to fit/score directly.
one_go_mlr = make_pipeline(
    fe,
    LogisticRegression(max_iter=10000),
)
one_go_mlr

In [None]:
# Fit preprocessing and classifier together on the raw training features.
one_go_mlr.fit(Xtrain,ytrain)

In [None]:
# Score on the raw validation features; the pipeline applies the preprocessing itself.
one_go_mlr.score(Xval,yval)

### Random Forest

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [41]:
# Small random forest (10 shallow trees, depth 2) on the transformed training features.
# random_state is fixed so the fitted trees, the score and the feature importances are the
# same on every run.
rf = RandomForestClassifier(n_estimators=10, max_depth=2, random_state=420)
rf.fit(Xtrain_tran, ytrain)

In [49]:
# Score of the random forest on the training data, rounded to two decimals.
# The built-in round() is used because it works for both NumPy scalars and plain Python
# floats, whereas the .round() method exists only on NumPy scalars.
round(rf.score(Xtrain_tran, ytrain), 2)

0.8

In [42]:
# Per-feature importances of the fitted forest; the order follows the columns of
# Xtrain_tran (see fe.get_feature_names_out() for the matching names).
rf.feature_importances_

array([0.42856688, 0.23402615, 0.13007069, 0.06926896, 0.01434228,
       0.04005362, 0.        , 0.        , 0.00231423, 0.        ,
       0.        , 0.        , 0.        , 0.0813572 ])

### MS 2: Build a Logistic Regression model

In [None]:
# Single-feature setup: predict `Survived` from `Pclass` only, using all rows of the
# labelled data (no train/validation split in this section).
X_lgr = full_data[['Pclass']]
y_lgr = full_data['Survived']

In [None]:
# Check that the feature frame and the target have matching row counts.
X_lgr.shape, y_lgr.shape

In [None]:
# Show the target used for the single-feature model of this section (previously this cell
# displayed `y` from the earlier workflow instead of this section's own `y_lgr`).
y_lgr

In [None]:
# Fresh, unfitted logistic regression for the single-feature model.
# NOTE(review): this re-binds `m_lgr`, replacing the model that was fitted on the engineered
# features earlier; earlier cells that use `m_lgr` would see this model if re-run afterwards.
m_lgr = LogisticRegression()

In [None]:
# Fit on `Pclass` only, using all labelled rows.
m_lgr.fit(X_lgr,y_lgr)

In [None]:
# Keep the fitted parameters: w_1 holds the coefficient array, w_0 the intercept array.
w_1, w_0 = m_lgr.coef_, m_lgr.intercept_

In [None]:
# Print the learned coefficient and intercept stored in w_1 / w_0.
print(f'Model feature coefficient :{w_1}\nModel intercept/bias: {w_0}')

In [None]:
# Class labels known to the fitted model; they are used as column names for the
# probability table built below.
m_lgr.classes_

In [None]:
# Estimated class probabilities for every row of X_lgr.
estim_prob = m_lgr.predict_proba(X_lgr)
# Rounded to 3 decimals for display only; `estim_prob` itself stays unrounded.
estim_prob.round(3)

In [None]:
# Wrap the probabilities in a DataFrame whose columns are the class labels, so the
# column for class 1 can be selected by label in the cells below.
estim_prob_df = pd.DataFrame(data=estim_prob, columns=m_lgr.classes_)
estim_prob_df

In [None]:
# Custom decision rule: predict 1 only when the estimated probability of class 1 is at
# least 0.9; otherwise predict 0.
# NOTE(review): despite the `X_` prefix this holds predicted labels, and the name is
# re-bound by the m_lgr.predict(...) cell that follows.
X_lgr_pred = (estim_prob_df[1]>=0.9).astype('int')
# Show the first 50 predictions.
X_lgr_pred.head(50)

In [None]:
# Predictions from the model's own predict() method.
# NOTE(review): this re-binds `X_lgr_pred`, discarding the 0.9-threshold predictions
# computed in the previous cell.
X_lgr_pred = m_lgr.predict(X_lgr)
X_lgr_pred

In [None]:
# Same decision rule with a 0.1 threshold: predict 1 whenever the estimated probability
# of class 1 is at least 0.1.
X_lgr_pred_10 =(estim_prob_df[1]>=0.1).astype('int')
# Show the first 50 predictions.
X_lgr_pred_10.head(50)