## Titanic: Machine Learning from Disaster

In [580]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

SEED = 1234
np.random.seed = SEED

In [581]:
train, test = pd.read_csv(
    'data/titanic/train.csv'
    # путь к вашему файлу train
), pd.read_csv( 
    'data/titanic/test.csv'
    # путь к вашему файлу test
)

In [582]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [541]:
#Удалим уж совсем бессмысленные столбцы
test = test.drop(['Name', 'Ticket', 'Cabin'], axis=1)
train = train.drop(['Name', 'Ticket', 'Cabin'], axis=1)

In [542]:
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

In [543]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [544]:
#train.Age.value_counts().sort_index()

In [545]:
train['SimpleAge'] = pd.cut(train.Age, bins=[0, 18, 30, 60, 120], 
                   labels=['teen', 'young', 'adult', 'aged'])

test['SimpleAge'] = pd.cut(test.Age, bins=[0, 18, 30, 60, 120], 
                   labels=['teen', 'young', 'adult', 'aged'])

#test = test.drop(['Age'], axis=1)
#train = train.drop(['Age'], axis=1)

In [546]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SimpleAge
0,892,3,male,34.5,0,0,7.8292,Q,adult
1,893,3,female,47.0,1,0,7.0,S,adult
2,894,2,male,62.0,0,0,9.6875,Q,aged
3,895,3,male,27.0,0,0,8.6625,S,young
4,896,3,female,22.0,1,1,12.2875,S,young


In [547]:
test.SimpleAge.value_counts().sort_index()

teen      54
young    146
adult    121
aged      11
Name: SimpleAge, dtype: int64

In [548]:
#train.Fare.value_counts().sort_index()

In [549]:
train['SimpleFare'] = pd.cut(train.Fare, bins=[0, 8, 16, 32, 64, 128, 256, 514], 
                   labels=['one', 'two', 'three', 'four', 'five', 'six', 'seven'])

test['SimpleFare'] = pd.cut(test.Fare, bins=[0, 8, 16, 32, 64, 128, 256, 514], 
                   labels=['one', 'two', 'three', 'four', 'five', 'six', 'seven'])

#test = test.drop(['Fare'], axis=1)
#train = train.drop(['Fare'], axis=1)

In [550]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SimpleAge,SimpleFare
0,1,0,3,male,22.0,1,0,7.25,S,young,one
1,2,1,1,female,38.0,1,0,71.2833,C,adult,five
2,3,1,3,female,26.0,0,0,7.925,S,young,one
3,4,1,1,female,35.0,1,0,53.1,S,adult,four
4,5,0,3,male,35.0,0,0,8.05,S,adult,two


In [551]:
train.SimpleFare.value_counts().sort_index()

one      226
two      244
three    195
four      93
five      80
six       29
seven      9
Name: SimpleFare, dtype: int64

In [552]:
train.dtypes

PassengerId       int64
Survived          int64
Pclass            int64
Sex              object
Age             float64
SibSp             int64
Parch             int64
Fare            float64
Embarked         object
SimpleAge      category
SimpleFare     category
dtype: object

In [553]:
#Закодировали Sex цифрами вместо названий в test & train
encSex = preprocessing.LabelEncoder()
encSex.fit(pd.concat((test['Sex'], train['Sex'])))
test['Sex'] = encSex.transform(test['Sex'])
train['Sex'] = encSex.transform(train['Sex'])

In [554]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SimpleAge,SimpleFare
0,1,0,3,1,22.0,1,0,7.25,S,young,one
1,2,1,1,0,38.0,1,0,71.2833,C,adult,five
2,3,1,3,0,26.0,0,0,7.925,S,young,one
3,4,1,1,0,35.0,1,0,53.1,S,adult,four
4,5,0,3,1,35.0,0,0,8.05,S,adult,two


In [555]:
train.loc[train.Embarked.isnull(), 'Embarked'] = 'Unknown'
test.loc[test.Embarked.isnull(), 'Embarked'] = 'Unknown'

In [556]:
test.Embarked.value_counts().sort_index()

C    102
Q     46
S    270
Name: Embarked, dtype: int64

In [557]:
#Закодировали Embarked цифрами вместо названий в test & train
encEmbarked = preprocessing.LabelEncoder()
encEmbarked.fit(pd.concat((test['Embarked'], train['Embarked'])))
test['Embarked'] = encEmbarked.transform(test['Embarked'])
train['Embarked'] = encEmbarked.transform(train['Embarked'])

In [558]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SimpleAge,SimpleFare
0,1,0,3,1,22.0,1,0,7.25,2,young,one
1,2,1,1,0,38.0,1,0,71.2833,0,adult,five
2,3,1,3,0,26.0,0,0,7.925,2,young,one
3,4,1,1,0,35.0,1,0,53.1,2,adult,four
4,5,0,3,1,35.0,0,0,8.05,2,adult,two


In [559]:
test.SimpleAge = pd.Categorical(test.SimpleAge)

In [560]:
train.loc[train.Age.isnull(), 'Age'] = 15
test.loc[test.Age.isnull(), 'Age'] = 15

train.loc[train.Fare.isnull(), 'Fare'] = 15
test.loc[test.Fare.isnull(), 'Fare'] = 15

train.Age = train.Age.astype(np.int64)
train.Fare = train.Fare.astype(np.int64)
train.dtypes

PassengerId       int64
Survived          int64
Pclass            int64
Sex               int64
Age               int64
SibSp             int64
Parch             int64
Fare              int64
Embarked          int64
SimpleAge      category
SimpleFare     category
dtype: object

In [561]:
#Убираем тип category с помощью get_dummies, т.е. one hot encode
cols_to_transform = [ 'SimpleAge', 'SimpleFare' ]
train = pd.get_dummies(train, columns = cols_to_transform )
test = pd.get_dummies(test, columns = cols_to_transform )

In [562]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SimpleAge_teen,SimpleAge_young,SimpleAge_adult,SimpleAge_aged,SimpleFare_one,SimpleFare_two,SimpleFare_three,SimpleFare_four,SimpleFare_five,SimpleFare_six,SimpleFare_seven
0,1,0,3,1,22,1,0,7,2,0,1,0,0,1,0,0,0,0,0,0
1,2,1,1,0,38,1,0,71,0,0,0,1,0,0,0,0,0,1,0,0
2,3,1,3,0,26,0,0,7,2,0,1,0,0,1,0,0,0,0,0,0
3,4,1,1,0,35,1,0,53,2,0,0,1,0,0,0,0,1,0,0,0
4,5,0,3,1,35,0,0,8,2,0,0,1,0,0,1,0,0,0,0,0


In [563]:
from sklearn.model_selection import cross_val_score

In [497]:
#Подготовили данные так, что X_tr - таблица без id и Survived, а в y_tr сохранены Survived
#X_tr, y_tr = train.drop(['PassengerId', 'Survived'], axis=1), train['Survived']

In [499]:
SEED = 1234
np.random.seed = SEED

#from catboost import Pool, CatBoostClassifier
#clf = CatBoostClassifier(random_seed=SEED,max_hyperopt_evals=50)

from sklearn.svm import SVC
clf5 = SVC(probability=True)

from sklearn.ensemble import VotingClassifier

from lightgbm import LGBMClassifier
clf1 = LGBMClassifier(max_bin=400, learning_rate=0.134, n_estimators=151, num_leaves=131)

from sklearn.ensemble import ExtraTreesClassifier
clf2 = ExtraTreesClassifier(random_state=SEED)

from sklearn.linear_model import LogisticRegression
clf3 = LogisticRegression(random_state=SEED)

from sklearn.neighbors import KNeighborsClassifier
clf4 = KNeighborsClassifier()

clf = VotingClassifier(estimators=[
    ('lg', clf1), ('et', clf2), ('lr', clf3), ('knn', clf4), ('svm', clf5)],
                         voting='hard', weights=[1,1,4,1,2])

In [500]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X_tr, y_tr)
np.mean(scores)

0.80246913580246915

In [501]:
clf.fit(X_tr, y_tr)

VotingClassifier(estimators=[('lg', LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, learning_rate=0.134,
        max_bin=400, max_depth=-1, min_child_samples=10,
        min_child_weight=5, min_split_gain=0, n_estimators=151, nthread=-1,
        num_leaves=131, objective='binary', reg_alpha=0, reg_lambda=0,...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))],
         n_jobs=1, voting='hard', weights=[1, 1, 4, 1, 2])

In [502]:
X_te = test.drop(['PassengerId'], axis=1)
y_te = clf.predict(X_te)

#for catboost
y_te = y_te.astype(np.int64)

ans = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': y_te})
ans.to_csv('ans_titanic.csv', index=False)