In [145]:
from sklearn import tree
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# 1. Data processing

In [146]:
titanic_train_data = pd.read_csv('~/Documents/Stepik + ODS/data/train.csv')
titanic_test_data = pd.read_csv('~/Documents/Stepik + ODS/data/test.csv')

In [147]:
titanic_test_data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [148]:
X_train = titanic_train_data.drop(['Survived', 'Ticket','Embarked'], axis=1)
y_train = titanic_train_data['Survived']
X_test = titanic_test_data.drop(['Ticket', 'Embarked'], axis=1)

In [149]:
X_test.groupby(['Sex', 'Pclass'])['Age'].mean()

Sex     Pclass
female  1         41.333333
        2         24.376552
        3         23.073400
male    1         40.520000
        2         30.940678
        3         24.525104
Name: Age, dtype: float64

In [150]:
X_train['Age'] = X_train.groupby(['Sex', 'Pclass'])['Age'].apply(lambda x: x.fillna(x.mean()))
X_test['Age'] = X_test.groupby(['Sex', 'Pclass'])['Age'].apply(lambda x: x.fillna(x.mean()))

In [151]:
X_train['Title'] = X_train['Name'].apply(lambda name: name.split(',')[1].split('.')[0].strip())
X_test['Title'] = X_test['Name'].apply(lambda name: name.split(',')[1].split('.')[0].strip())

In [152]:
normalized_titles = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Royalty",
    "Don":        "Royalty",
    "Sir" :       "Royalty",
    "Dr":         "Officer",
    "Rev":        "Officer",
    "the Countess":"Royalty",
    "Dona":       "Royalty",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Royalty"
}

In [153]:
X_train['Title'] = X_train['Title'].map(normalized_titles)
X_test['Title'] = X_test['Title'].map(normalized_titles)

In [154]:
X_train['Cabin'] = X_train[X_train['Cabin'].isna() == False]['Cabin'].apply(lambda x: 1)
X_train['Cabin'] = X_train['Cabin'].fillna(0)
X_test['Cabin'] = X_test[X_test['Cabin'].isna() == False]['Cabin'].apply(lambda x: 1)
X_test['Cabin'] = X_test['Cabin'].fillna(0)

In [155]:
X_train['Family'] = X_train['SibSp'] + X_train['Parch'] + 1
X_test['Family'] = X_test['SibSp'] + X_test['Parch'] + 1

In [156]:
X_train['Fare'] = X_train['Fare'].replace(0, X_train['Fare'].mean())
X_test['Fare'] = X_test['Fare'].replace(0, X_test['Fare'].mean()).fillna(X_test['Fare'].median())

In [157]:
X_train = X_train.drop(['Name'], axis=1).set_index('PassengerId')
X_test = X_test.drop(['Name'], axis=1).set_index('PassengerId')

In [158]:
from sklearn.preprocessing import OneHotEncoder

In [159]:
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [160]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [161]:
X_train[['Age', 'Fare']] = scaler.fit_transform(X_train[['Age', 'Fare']])
X_test[['Age', 'Fare']] = scaler.fit_transform(X_test[['Age', 'Fare']])

In [162]:
from sklearn.model_selection import train_test_split

In [163]:
X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.333)

# 2. Predictions
## 2.1 Decision Tree

In [164]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [165]:
%%time
tree_clf = DecisionTreeClassifier(max_depth=3)
tree_clf.fit(X_train, y_train)
cross_val_score(tree_clf, X_train, y_train, cv=10)

CPU times: user 75.8 ms, sys: 0 ns, total: 75.8 ms
Wall time: 77.3 ms


array([0.88333333, 0.75      , 0.85      , 0.68333333, 0.71186441,
       0.84745763, 0.91525424, 0.77966102, 0.86440678, 0.79661017])

In [166]:
tree_clf_pred = tree_clf.predict(X_holdout)
accuracy_score(y_holdout, tree_clf_pred)

0.8282828282828283

## 2.2 Decision Tree with GridSearchCV

In [167]:
from sklearn.model_selection import GridSearchCV

In [177]:
parameters = {'max_depth' : range(1, 5), 'max_features' : range(1, 6)}

In [178]:
tree_clf = DecisionTreeClassifier()
tree_clf_grid = GridSearchCV(tree_clf, parameters, cv=10, n_jobs=-1)

In [179]:
%%time
tree_clf_grid.fit(X_train, y_train)

CPU times: user 229 ms, sys: 9.53 ms, total: 238 ms
Wall time: 585 ms




GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': range(1, 5), 'max_features': range(1, 6)},
             pre_dispatch='2*n_jobs', refit=True, return

In [180]:
tree_clf_grid.best_params_, tree_clf_grid.score(X_train, y_train)

({'max_depth': 3, 'max_features': 4}, 0.8198653198653199)

In [181]:
tree_clf_grid_pred = tree_clf_grid.predict(X_holdout)
accuracy_score(y_holdout, tree_clf_grid_pred)

0.8114478114478114

## 2.3 KNN

In [182]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [183]:
knn_pipe = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier(n_jobs=-1))])

In [184]:
knn_params = {'knn__n_neighbors': range(1, 50)}

In [185]:
knn_grid = GridSearchCV(knn_pipe, knn_params, cv=10, n_jobs=-1, verbose=True)

In [186]:
%%time
knn_grid.fit(X_train, y_train)

Fitting 10 folds for each of 49 candidates, totalling 490 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   12.3s


CPU times: user 1.74 s, sys: 69.2 ms, total: 1.81 s
Wall time: 16.2 s


[Parallel(n_jobs=-1)]: Done 490 out of 490 | elapsed:   16.2s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('knn',
                                        KNeighborsClassifier(algorithm='auto',
                                                             leaf_size=30,
                                                             metric='minkowski',
                                                             metric_params=None,
                                                             n_jobs=-1,
                                                             n_neighbors=5, p=2,
                                                             weights='uniform'))],
                                verbose=Fals

In [187]:
knn_grid.best_params_, knn_grid.best_score_

({'knn__n_neighbors': 12}, 0.8215488215488216)

In [188]:
knn_grid_pred = knn_grid.predict(X_holdout)
accuracy_score(y_holdout, knn_grid_pred)

0.8114478114478114

## 2.4 Random Forest

In [241]:
from sklearn.ensemble import RandomForestClassifier

In [242]:
clf_rf = RandomForestClassifier()

In [243]:
parameters = {'max_depth' : range(1, 5), 'n_estimators' : range(100, 200, 10), 'max_features' : range(3, 10)}

In [244]:
grid_rf = GridSearchCV(clf_rf, parameters, cv=10, n_jobs=-1)

In [245]:
%%time
grid_rf.fit(X_train, y_train)



CPU times: user 17.4 s, sys: 473 ms, total: 17.8 s
Wall time: 5min 6s


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             ii

In [262]:
grid_rf.best_params_, grid_rf.best_score_

({'max_depth': 3, 'max_features': 5, 'n_estimators': 130}, 0.835016835016835)

In [263]:
grid_rf_pred = grid_rf.predict(X_holdout)
accuracy_score(y_holdout, grid_rf_pred)

0.8249158249158249

In [264]:
best_clf = grid_rf.best_estimator_

In [265]:
pd.DataFrame({'features':list(X_train.columns), 'feature_importances':  best_clf.feature_importances_})\
                        .sort_values(by='feature_importances')

Unnamed: 0,features,feature_importances
14,Title_Royalty,0.000239
13,Title_Officer,0.004956
10,Title_Miss,0.007109
3,Parch,0.009392
9,Title_Master,0.017508
12,Title_Mrs,0.021186
2,SibSp,0.022019
1,Age,0.039784
6,Family,0.058793
4,Fare,0.063307


In [266]:
y_predicted = best_clf.predict(X_test)

In [267]:
submission = pd.DataFrame(y_predicted, index=X_test.index,columns=['Survived']).sort_values(by='PassengerId')

In [268]:
submission.to_csv('titanic_sub.csv')

## 2.5 Logit

In [253]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [254]:
c_values = np.logspace(-2, 2, 10)

In [255]:
logit = LogisticRegression()
logit_grid = GridSearchCV(estimator=logit, param_grid={'C': c_values}, n_jobs=-1, cv=10, verbose=1)

In [256]:
%%time
logit_grid.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


CPU times: user 180 ms, sys: 4.5 ms, total: 185 ms
Wall time: 622 ms


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.6s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': array([1.00000000e-02, 2.78255940e-02, 7.74263683e-02, 2.15443469e-01,
       5.99484250e-01, 1.66810054e+00, 4.64158883e+00, 1.29154967e+01,
       3.59381366e+01, 1.00000000e+02])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [257]:
logit_grid.best_score_, logit_grid.best_params_

(0.82996632996633, {'C': 35.93813663804626})

In [258]:
logit_grid_pred = logit_grid.predict(X_holdout)
accuracy_score(y_holdout, logit_grid_pred)

0.8215488215488216

In [259]:
best_clf = logit_grid.best_estimator_

In [260]:
y_predicted = best_clf.predict(X_test)

In [261]:
submission = pd.DataFrame(y_predicted, index=X_test.index,columns=['Survived']).sort_values(by='PassengerId')
submission.to_csv('titanic_sub.csv')