## Titanic: Machine Learning from Disaster

In [140]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

SEED = 1234
np.random.seed = SEED

In [141]:
train, test = pd.read_csv(
    'data/titanic/train.csv'
    # путь к вашему файлу train
), pd.read_csv( 
    'data/titanic/test.csv'
    # путь к вашему файлу test
)

In [142]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [143]:
#Удалим уж совсем бессмысленные столбцы
test = test.drop(['Name', 'Ticket', 'Cabin'], axis=1)
train = train.drop(['Name', 'Ticket', 'Cabin'], axis=1)

In [144]:
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

In [145]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [146]:
#train.Age.value_counts().sort_index()

In [147]:
train['SimpleAge'] = pd.cut(train.Age, bins=[0, 12, 23, 30, 45, 120], 
                   labels=['child', 'teen', 'young', 'adult', 'aged'])

test['SimpleAge'] = pd.cut(test.Age, bins=[0, 12, 23, 30, 45, 120], 
                   labels=['child', 'teen', 'young', 'adult', 'aged'])

test = test.drop(['Age'], axis=1)
train = train.drop(['Age'], axis=1)

In [148]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,SibSp,Parch,Fare,Embarked,SimpleAge
0,892,3,male,0,0,7.8292,Q,adult
1,893,3,female,1,0,7.0,S,aged
2,894,2,male,0,0,9.6875,Q,aged
3,895,3,male,0,0,8.6625,S,young
4,896,3,female,1,1,12.2875,S,teen


In [149]:
test.SimpleAge.value_counts().sort_index()

adult    80
aged     52
child    25
teen     89
young    86
Name: SimpleAge, dtype: int64

In [150]:
#train.Fare.value_counts().sort_index()

In [151]:
train['SimpleFare'] = pd.cut(train.Fare, bins=[0, 8, 16, 32, 64, 128, 256, 514], 
                   labels=['one', 'two', 'three', 'four', 'five', 'six', 'seven'])

test['SimpleFare'] = pd.cut(test.Fare, bins=[0, 8, 16, 32, 64, 128, 256, 514], 
                   labels=['one', 'two', 'three', 'four', 'five', 'six', 'seven'])

test = test.drop(['Fare'], axis=1)
train = train.drop(['Fare'], axis=1)

In [152]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Embarked,SimpleAge,SimpleFare
0,1,0,3,male,1,0,S,teen,one
1,2,1,1,female,1,0,C,adult,five
2,3,1,3,female,0,0,S,young,one
3,4,1,1,female,1,0,S,adult,four
4,5,0,3,male,0,0,S,adult,two


In [153]:
train.SimpleFare.value_counts().sort_index()

five      80
four      93
one      226
seven      9
six       29
three    195
two      244
Name: SimpleFare, dtype: int64

In [154]:
train.dtypes

PassengerId       int64
Survived          int64
Pclass            int64
Sex              object
SibSp             int64
Parch             int64
Embarked         object
SimpleAge      category
SimpleFare     category
dtype: object

In [155]:
#Закодировали Sex цифрами вместо названий в test & train
encSex = preprocessing.LabelEncoder()
encSex.fit(pd.concat((test['Sex'], train['Sex'])))
test['Sex'] = encSex.transform(test['Sex'])
train['Sex'] = encSex.transform(train['Sex'])

In [156]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Embarked,SimpleAge,SimpleFare
0,1,0,3,1,1,0,S,teen,one
1,2,1,1,0,1,0,C,adult,five
2,3,1,3,0,0,0,S,young,one
3,4,1,1,0,1,0,S,adult,four
4,5,0,3,1,0,0,S,adult,two


In [157]:
train.loc[train.Embarked.isnull(), 'Embarked'] = 'Unknown'
test.loc[test.Embarked.isnull(), 'Embarked'] = 'Unknown'

In [158]:
test.Embarked.value_counts().sort_index()

C    102
Q     46
S    270
Name: Embarked, dtype: int64

In [159]:
#Закодировали Embarked цифрами вместо названий в test & train
encEmbarked = preprocessing.LabelEncoder()
encEmbarked.fit(pd.concat((test['Embarked'], train['Embarked'])))
test['Embarked'] = encEmbarked.transform(test['Embarked'])
train['Embarked'] = encEmbarked.transform(train['Embarked'])

In [160]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,SibSp,Parch,Embarked,SimpleAge,SimpleFare
0,892,3,1,0,0,1,adult,one
1,893,3,0,1,0,2,aged,one
2,894,2,1,0,0,1,aged,two
3,895,3,1,0,0,2,young,two
4,896,3,0,1,1,2,teen,two


In [161]:
# Перевод категорий в числа

test.SimpleAge = test.SimpleAge.astype('category').cat.codes
test.SimpleFare = test.SimpleFare.astype('category').cat.codes

train.SimpleAge = train.SimpleAge.astype('category').cat.codes
train.SimpleFare = train.SimpleFare.astype('category').cat.codes

#test.SimpleAge = pd.Categorical(test.SimpleAge)

In [162]:
#train.loc[train.Age.isnull(), 'Age'] = 15
#test.loc[test.Age.isnull(), 'Age'] = 15

#train.loc[train.Fare.isnull(), 'Fare'] = 15
#test.loc[test.Fare.isnull(), 'Fare'] = 15

#train.Age = train.Age.astype(np.int64)
#train.Fare = train.Fare.astype(np.int64)
test.dtypes

PassengerId    int64
Pclass         int64
Sex            int64
SibSp          int64
Parch          int64
Embarked       int64
SimpleAge       int8
SimpleFare      int8
dtype: object

In [163]:
#Убираем тип category с помощью get_dummies, т.е. one hot encode
#cols_to_transform = [ 'SimpleAge', 'SimpleFare' ]
#train = pd.get_dummies(train, columns = cols_to_transform )
#test = pd.get_dummies(test, columns = cols_to_transform )

In [164]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Embarked,SimpleAge,SimpleFare
0,1,0,3,1,1,0,2,3,2
1,2,1,1,0,1,0,0,0,0
2,3,1,3,0,0,0,2,4,2
3,4,1,1,0,1,0,2,0,1
4,5,0,3,1,0,0,2,0,6


In [313]:
#Подготовили данные так, что X_tr - таблица без id и Survived, а в y_tr сохранены Survived
X_tr, y_tr = train.drop(['PassengerId', 'Survived'], axis=1), train['Survived']

In [314]:
#Сделаем скейлинг данных
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

scaler.fit(X_tr)
#Преобразуем тренировочные данные
X_tr_scaled = scaler.transform(X_tr)
#Преобразуем тестовые данные
X_te = test.drop(['PassengerId'], axis=1)
X_te_scaled = scaler.transform(X_te)

In [315]:
from sklearn.preprocessing import PolynomialFeatures #Препроцессинг 
from lightgbm import LGBMClassifier

from sklearn.pipeline import make_pipeline

pipe = make_pipeline(PolynomialFeatures(), LGBMClassifier())
pipe

Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('lgbmclassifier', LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, learning_rate=0.1,
        max_bin=255, max_depth=-1, min_child_samples=10,
        min_child_weight=5, min_split_gain=0, n_estimators=10, nthread=-1,
        num_leaves=31, objective='binary', reg_alpha=0, reg_lambda=0,
        seed=0, silent=True, subsample=1, subsample_for_bin=50000,
        subsample_freq=1))])

In [316]:
param_grid = {
    #'polynomialfeatures__degree': [1, 2, 3],
    #'lgbmclassifier__learning_rate': [0.1, 0.3, 0.5, 0.7, 0.9],
    #'lgbmclassifier__max_bin': [200, 250, 300, 400, 500]
    #'lgbmclassifier__n_estimators': [110, 120, 130, 140, 150, 160],
    #'lgbmclassifier__num_leaves': [110, 120, 130, 140]
    'polynomialfeatures__degree': [3],
    'lgbmclassifier__learning_rate': [0.4],
    'lgbmclassifier__max_bin': [150],
    #'lgbmclassifier__n_estimators': [10],
    #'lgbmclassifier__num_leaves': [45]
}

In [317]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)

clf.fit(X_tr_scaled, y_tr)
y_te = clf.predict(X_te_scaled)
print("Наилучшие параметры: {}".format(clf.best_params_))

Наилучшие параметры: {'lgbmclassifier__learning_rate': 0.4, 'lgbmclassifier__max_bin': 150, 'polynomialfeatures__degree': 3}


In [318]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=1234)

scores = cross_val_score(clf, X_tr_scaled, y_tr, cv=kf, n_jobs=-1)
print("Среднее значение кросс-валидации: {}".format(np.mean(scores)))

Среднее значение кросс-валидации: 0.817048521750047


SEED = 1234
np.random.seed = SEED

#from catboost import Pool, CatBoostClassifier
#clf = CatBoostClassifier(random_seed=SEED,max_hyperopt_evals=50)

from sklearn.svm import SVC
clf5 = SVC(probability=True)

from sklearn.ensemble import VotingClassifier

from lightgbm import LGBMClassifier
clf1 = LGBMClassifier(max_bin=400, learning_rate=0.134, n_estimators=151, num_leaves=131)

from sklearn.ensemble import ExtraTreesClassifier
clf2 = ExtraTreesClassifier(random_state=SEED)

from sklearn.linear_model import LogisticRegression
clf3 = LogisticRegression(random_state=SEED)

from sklearn.neighbors import KNeighborsClassifier
clf4 = KNeighborsClassifier()

clf = VotingClassifier(estimators=[
    ('lg', clf1), ('et', clf2), ('lr', clf3), ('knn', clf4), ('svm', clf5)],
                         voting='hard', weights=[1,1,4,1,2])

In [319]:
ans = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': y_te})
ans.to_csv('ans_titanic.csv', index=False)