-
Notifications
You must be signed in to change notification settings - Fork 0
/
titanic.py
80 lines (65 loc) · 2.35 KB
/
titanic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import pandas as pd
from ibreakdown import IClassificationExplainer
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
def read_dataset(columns, seed=None):
    """Download the Titanic CSV and split it into train and test parts.

    Args:
        columns: Column labels selected from the downloaded frame to form
            the feature matrix.
        seed: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` where the test part
        is produced with ``test_size=0.2``. The target is the
        ``'Survived'`` column.
    """
    # The address is split across two literals only to keep lines short.
    source = (
        'https://web.stanford.edu/class/archive/'
        'cs/cs109/cs109.1166/stuff/titanic.csv'
    )
    frame = pd.read_csv(source)

    target = frame['Survived']
    features = frame[columns]

    parts = train_test_split(
        features, target, test_size=0.2, random_state=seed
    )
    # train_test_split yields train/test pairs per input array, in order:
    # features first, then target.
    return tuple(parts)
def build_model(num_features, cat_features, seed=None):
    """Assemble the unfitted preprocessing + random-forest pipeline.

    Args:
        num_features: Column selector for the features that go through
            ``SimpleImputer`` followed by ``StandardScaler``.
        cat_features: Column selector for the features that go through
            ``OneHotEncoder``.
        seed: Passed to ``RandomForestClassifier`` as ``random_state``.

    Returns:
        A pipeline whose first step is the column transformer and whose
        second step is the classifier.
    """
    # Numeric branch: impute first, then scale.
    numeric_branch = make_pipeline(SimpleImputer(), StandardScaler())
    # Categorical branch: one-hot encoding with categories left on 'auto'.
    categorical_branch = OneHotEncoder(categories='auto')

    preprocess = make_column_transformer(
        (numeric_branch, num_features),
        (categorical_branch, cat_features),
    )
    forest = RandomForestClassifier(random_state=seed)
    return make_pipeline(preprocess, forest)
def train_model(X_train, y_train, seed=None):
    """Fit the pipeline through a grid search and return the best estimator.

    Args:
        X_train: Training features. Columns at positions 0-3 are handed to
            ``build_model`` as numeric and positions 4-5 as categorical.
        y_train: Training target.
        seed: Forwarded to ``build_model``.

    Returns:
        ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    numeric_positions = [0, 1, 2, 3]
    categorical_positions = [4, 5]
    pipeline = build_model(
        numeric_positions, categorical_positions, seed=seed
    )

    # Every entry lists a single candidate, so the grid holds exactly one
    # parameter combination. Keys address nested steps with the
    # double-underscore path of the pipeline built by build_model.
    search_space = {
        'columntransformer__pipeline__simpleimputer__strategy': ['mean'],
        'randomforestclassifier__min_samples_leaf': [5],
        'randomforestclassifier__min_samples_split': [12],
        'randomforestclassifier__n_estimators': [100],
    }

    search = GridSearchCV(
        estimator=pipeline,
        param_grid=search_space,
        scoring='roc_auc',
        cv=3,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return search.best_estimator_
def main():
    """Train the Titanic model and print explanations for ten test rows."""
    seed = 42

    # Order matters: train_model addresses these columns by position
    # (0-3 numeric, 4-5 categorical).
    numeric_columns = [
        'Age',
        'Fare',
        'Siblings/Spouses Aboard',
        'Parents/Children Aboard',
    ]
    categorical_columns = [
        'Pclass',
        'Sex',
    ]
    columns = numeric_columns + categorical_columns

    X_train, X_test, y_train, y_test = read_dataset(columns, seed)
    model = train_model(X_train, y_train, seed=seed)

    # Translate each entry of model.classes_ into a readable label by
    # using it as an index into class_map.
    class_map = ['Deceased', 'Survived']
    classes = [class_map[label] for label in model.classes_]

    explainer = IClassificationExplainer(model.predict_proba)
    explainer.fit(X_train, columns, classes)

    # Explain the first ten test rows, each passed as a one-row slice.
    for start in range(10):
        stop = start + 1
        observation = X_test[start:stop]
        explainer.explain(observation).print()
# Run the demo only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()