In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
titanic = pd.read_csv('files/titanic.csv')
data = titanic.copy()
data[['Age', 'Fare']] = data[['Age', 'Fare']].fillna(data[['Age', 'Fare']].mean())

from sklearn.model_selection import StratifiedKFold as SK
from sklearn.model_selection import train_test_split

y = data['Survived']
X = data.drop(columns=['Survived'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
7,8,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
765,766,1,"Hogeboom, Mrs. John C (Anna Andrews)",female,51.0,1,0,13502,77.9583,D11,S
339,340,1,"Blackwell, Mr. Stephen Weart",male,45.0,0,0,113784,35.5,T,S
374,375,3,"Palsson, Miss. Stina Viola",female,3.0,3,1,349909,21.075,,S
183,184,2,"Becker, Master. Richard F",male,1.0,2,1,230136,39.0,F4,S


In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

categorical = ['Pclass', 'Sex', 'SibSp', 'Parch']
numeric_features = ['Age', 'Fare',]
column_transformer = ColumnTransformer([
    ('Ordin', OrdinalEncoder(), categorical),
    ('scaling', StandardScaler(), numeric_features)
])

pipeline = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('regression', LogisticRegression(C=1, penalty='l1', solver='liblinear', random_state=42))
])

from sklearn.metrics import roc_auc_score
model = pipeline.fit(X_train, y_train)
b = model.predict_proba(X_test)[:, 1]
print('{:6.3f}'.format(roc_auc_score(y_test, b)))

 0.864


In [10]:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y_test, model.predict(X_test)).ravel()
tn, fp, fn, tp
P = tp/(tp+fp)
R = tp/(tp+fn)
'Precision = {:6.3f}, Recall = {:6.3f}'.format(P, R)

'Precision =  0.762, Recall =  0.681'

In [13]:
confusion_matrix(y_train, model.predict(X_train))

array([[342,  33],
       [ 73, 175]], dtype=int64)

In [5]:
coef = pipeline.named_steps['regression'].coef_[0,:]
categorical_processed = pipeline.named_steps['preprocessor'].transformers_[0][1].get_feature_names_out(categorical)
numerical_processed = pipeline.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out(numeric_features)

In [6]:
all_names = np.hstack((categorical_processed, numerical_processed))
for i in range(all_names.shape[0]):
    print('{:11s}: {:8.3f}'.format(all_names[i], coef[i]))

Pclass_1   :    1.015
Pclass_2   :    0.000
Pclass_3   :   -0.830
Sex_female :    0.000
Sex_male   :   -2.596
SibSp_0    :    1.289
SibSp_1    :    1.142
SibSp_2    :    0.925
SibSp_3    :   -1.041
SibSp_4    :   -1.146
SibSp_5    :    0.000
SibSp_8    :    0.000
Parch_0    :    0.142
Parch_1    :    0.588
Parch_2    :    0.000
Parch_3    :    0.000
Parch_4    :    0.000
Parch_5    :   -0.206
Parch_6    :    0.000
Age        :   -0.328
Fare       :    0.045


In [7]:
from sklearn.model_selection import GridSearchCV
param = np.logspace(-1, 1, 20)
searcher = GridSearchCV(pipeline, [{'regression__C': param}], cv=10)
searcher.fit(X_train, y_train)
# pipeline.get_params().keys()
searcher.best_params_['regression__C']

2.9763514416313175

In [14]:
from sklearn.svm import SVC
pipeline = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('regression', SVC(gamma='scale', C=1, probability=True, random_state=42))
])
model = pipeline.fit(X_train, y_train)
b = model.predict_proba(X_test)[:, 1]
print('{:6.3f}'.format(roc_auc_score(y_test, b)))

 0.843


In [15]:
confusion_matrix(y_train, model.predict(X_train))

array([[342,  33],
       [ 73, 175]], dtype=int64)