In [55]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import StratifiedKFold as SK
from sklearn.model_selection import train_test_split

In [56]:
titanic = pd.read_csv('files/titanic.csv')
data = titanic.copy() # 1111111111
data[['Age', 'Fare']] = data[['Age', 'Fare']].fillna(data[['Age', 'Fare']].mean())



y = data['Survived']
X = data.drop(columns=['Survived'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
57,58,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,,C
717,718,2,"Troutt, Miss. Edwina Celia ""Winnie""",female,27.0,0,0,34218,10.5,E101,S
431,432,3,"Thorneycroft, Mrs. Percival (Florence Kate White)",female,29.699118,1,0,376564,16.1,,S
633,634,1,"Parr, Mr. William Henry Marsh",male,29.699118,0,0,112052,0.0,,S
163,164,3,"Calic, Mr. Jovo",male,17.0,0,0,315093,8.6625,,S


In [61]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

categorical = ['Sex']
numeric_features = ['Pclass', 'Age', 'Fare', 'SibSp', 'Parch']
column_transformer = ColumnTransformer([
    ('ohe', OrdinalEncoder(), categorical),
    ('scaling', StandardScaler(), numeric_features) # how to not scaling?
])

pipeline = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('regression', GradientBoostingClassifier(n_estimators=100, random_state=42, learning_rate=1e-1))
])
model = pipeline.fit(X_train, y_train)

def calcPR(model):
    tn, fp, fn, tp = confusion_matrix(y_test, model.predict(X_test)).ravel()
    P = tp/(tp+fp)
    R = tp/(tp+fn)
    print('Precision = {:5.2f}, Recall = {:5.2f}'.format(P, R))
    b = model.predict_proba(X_test)[:, 1]
    print('AUC-ROC: {:6.3f}'.format(roc_auc_score(y_test, b)))
calcPR(model)

Precision =  0.78, Recall =  0.79
AUC-ROC:  0.880


In [82]:
import xgboost
import catboost
import lightgbm

pipeline = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('regression', xgboost.XGBClassifier(max_depth=8, eval_metric='logloss', n_estimators=100, random_state=42, learning_rate=1e-2, use_label_encoder=False))
])
model = pipeline.fit(X_train, y_train)
calcPR(model)

Precision =  0.78, Recall =  0.74
AUC-ROC:  0.919


In [89]:
pipeline = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('regression', catboost.CatBoostClassifier(max_depth=8, n_estimators=100, random_state=42, learning_rate=1e-2))
])
model = pipeline.fit(X_train, y_train)
calcPR(model)

0:	learn: 0.6895056	total: 5.37ms	remaining: 531ms
1:	learn: 0.6856974	total: 6.71ms	remaining: 329ms
2:	learn: 0.6821935	total: 7.69ms	remaining: 249ms
3:	learn: 0.6788248	total: 8.96ms	remaining: 215ms
4:	learn: 0.6757061	total: 11.8ms	remaining: 225ms
5:	learn: 0.6723289	total: 12.4ms	remaining: 194ms
6:	learn: 0.6694186	total: 15.2ms	remaining: 202ms
7:	learn: 0.6663830	total: 18.2ms	remaining: 209ms
8:	learn: 0.6634050	total: 19.1ms	remaining: 193ms
9:	learn: 0.6601264	total: 21.8ms	remaining: 196ms
10:	learn: 0.6572423	total: 22.4ms	remaining: 181ms
11:	learn: 0.6543041	total: 25.1ms	remaining: 184ms
12:	learn: 0.6514298	total: 28ms	remaining: 187ms
13:	learn: 0.6486847	total: 28.6ms	remaining: 176ms
14:	learn: 0.6456552	total: 30.5ms	remaining: 173ms
15:	learn: 0.6428188	total: 31.2ms	remaining: 164ms
16:	learn: 0.6404236	total: 34ms	remaining: 166ms
17:	learn: 0.6381566	total: 34.5ms	remaining: 157ms
18:	learn: 0.6357141	total: 35ms	remaining: 149ms
19:	learn: 0.6330155	total: 

In [90]:
calcPR(model)

Precision =  0.76, Recall =  0.66
AUC-ROC:  0.893
