In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.0.4-cp37-none-manylinux1_x86_64.whl (76.1 MB)
[K     |████████████████████████████████| 76.1 MB 42 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.4


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Titanic: Predict Survival

In [None]:
import pandas as pd
import numpy as np

from pandas.core.common import random_state
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from catboost import Pool, CatBoostClassifier, cv

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score

## Data exploration and preprocessing

### Train data

In [None]:
df_train = pd.read_csv('/content/drive/MyDrive/projects/predict_titanic_survival/train.csv')

In [None]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


**Data Dictionary**

| Variable | Definition	| Key |
|:--------:|:----------:|:---:| 
| survival | Survival	|0 = No, 1 = Yes |
| pclass | Ticket class |	1 = 1st, 2 = 2nd, 3 = 3rd |
| sex | Sex	| |
| Age	| Age in years | |	
| sibsp	| # of siblings / spouses aboard the Titanic | |	
| parch	| # of parents / children aboard the Titanic | |
| ticket | Ticket number | |
| fare | Passenger fare | |
| cabin | Cabin number | |
| embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton|

In [None]:
df_train.columns = df_train.columns.str.lower()

In [None]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   survived     891 non-null    int64  
 2   pclass       891 non-null    int64  
 3   name         891 non-null    object 
 4   sex          891 non-null    object 
 5   age          714 non-null    float64
 6   sibsp        891 non-null    int64  
 7   parch        891 non-null    int64  
 8   ticket       891 non-null    object 
 9   fare         891 non-null    float64
 10  cabin        204 non-null    object 
 11  embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
df_train['sibsp_parch'] = df_train['parch'] + df_train['sibsp']

In [None]:
def family_size(row):
  """Encode a relatives-aboard count as a family-size category.

  Args:
    row: a single numeric value (the notebook applies this to the
      'sibsp_parch' column element by element).

  Returns:
    0 when the value equals 0, 1 when it is 3 or more, 2 otherwise.
  """
  if row >= 3:
    return 1
  return 0 if row == 0 else 2

In [None]:
df_train['family_size'] = df_train['sibsp_parch'].apply(family_size)

In [None]:
df_train['level'] = df_train['cabin'].astype(str).str[0]

In [None]:
df_train['level'].value_counts()

n    687
C     59
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: level, dtype: int64

In [None]:
df_train['level'] = df_train['level'].replace(
    {'n': 0, 'C': 1, 'B': 2, 'D': 3, 'E': 4, 'A': 5, 'F': 6, 'G': 7, 'T': 8}
     )

In [None]:
df_train['sex'] = df_train['sex'].replace({'male': 0, 'female': 1})

In [None]:
df_train['sex'].value_counts()

0    577
1    314
Name: sex, dtype: int64

In [None]:
df_train['sex'] = df_train['sex'].astype('int')

In [None]:
df_train['age'] = df_train['age'].fillna(df_train['age'].mean())

In [None]:
df_train['age'] = df_train['age'].astype('int')

In [None]:
df_train['embarked'].value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [None]:
df_train['embarked'] = df_train['embarked'].replace({'S': 0, 'C': 1, 'Q': 2})

In [None]:
df_train['embarked'] = df_train['embarked'].fillna(2)

In [None]:
df_train['embarked'] = df_train['embarked'].astype('int')

In [None]:
df_train = df_train[['survived', 'pclass','sex', 'age', 'family_size', 'embarked', 'level']]

In [None]:
df_train.head()

Unnamed: 0,survived,pclass,sex,age,family_size,embarked,level
0,0,3,0,22,2,0,0
1,1,1,1,38,2,1,1
2,1,3,1,26,0,0,0
3,1,1,1,35,2,0,1
4,0,3,0,35,0,0,0


### Test data

In [None]:
df_test = pd.read_csv('/content/drive/MyDrive/projects/predict_titanic_survival/test.csv')

In [None]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [None]:
df_test.columns = df_test.columns.str.lower()

In [None]:
df_test_passengerid = df_test['passengerid']

In [None]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  418 non-null    int64  
 1   pclass       418 non-null    int64  
 2   name         418 non-null    object 
 3   sex          418 non-null    object 
 4   age          332 non-null    float64
 5   sibsp        418 non-null    int64  
 6   parch        418 non-null    int64  
 7   ticket       418 non-null    object 
 8   fare         417 non-null    float64
 9   cabin        91 non-null     object 
 10  embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [None]:
df_test['sibsp_parch'] = df_test['parch'] + df_test['sibsp']

In [None]:
def family_size(row):
  """Map a relatives-aboard count to a family-size code.

  NOTE(review): this re-defines (and shadows) the identical family_size
  declared earlier in the notebook for the train data.

  Args:
    row: a single numeric value taken from the 'sibsp_parch' column.

  Returns:
    0 for a value equal to 0, 1 for a value of 3 or more, 2 for anything else.
  """
  code = 2
  if row == 0:
    code = 0
  elif row >= 3:
    code = 1
  return code

In [None]:
df_test['family_size'] = df_test['sibsp_parch'].apply(family_size)

In [None]:
df_test['level'] = df_test['cabin'].astype(str).str[0]

In [None]:
df_test['level'] = df_test['level'].replace(
    {'n': 0, 'C': 1, 'B': 2, 'D': 3, 'E': 4, 'A': 5, 'F': 6, 'G': 7, 'T': 8}
     )

In [None]:
df_test['sex'] = df_test['sex'].replace({'male': 0, 'female': 1})

In [None]:
df_test['sex'].value_counts()

0    266
1    152
Name: sex, dtype: int64

In [None]:
df_test['sex'] = df_test['sex'].astype('int')

In [None]:
df_test['age'] = df_test['age'].fillna(df_test['age'].mean())

In [None]:
df_test['age'] = df_test['age'].astype('int')

In [None]:
df_test['embarked'] = df_test['embarked'].replace({'S': 0, 'C': 1, 'Q': 2})

In [None]:
df_test['embarked'] = df_test['embarked'].fillna(2)

In [None]:
df_test['embarked'] = df_test['embarked'].astype('int')

In [None]:
df_test['fare'] = df_test['fare'].fillna(df_test['fare'].mean())

In [None]:
df_test = df_test[['pclass','sex', 'age', 'family_size', 'embarked', 'level']]

In [None]:
df_test.head()

Unnamed: 0,pclass,sex,age,family_size,embarked,level
0,3,0,34,0,2,0
1,3,1,47,2,0,0
2,2,0,62,0,2,0
3,3,0,27,0,0,0
4,3,1,22,2,0,0


## Model research

In [None]:
df_train['survived'].mean()

0.3838383838383838

In [None]:
features_train = df_train.drop('survived', axis=1)
target_train = df_train['survived']

In [None]:
# Split features and target in ONE call so both are guaranteed to use the same
# row partition, and always start from df_train so that re-running this cell
# does not split an already-split training set a second time.
features_train, features_valid, target_train, target_valid = train_test_split(
    df_train.drop('survived', axis=1),
    df_train['survived'],
    test_size=0.25,
    random_state=12345,
)

### Random forest classifier

In [None]:
%%time


# Base estimator for the search; the seed keeps the forests reproducible.
rf = RandomForestClassifier(random_state=123)

# Hyper-parameter grid explored exhaustively by GridSearchCV below.
params = {'n_estimators': [350, 400],
          'max_depth': [3, 7, 10],
          'criterion': ['gini'],
          'min_samples_leaf': [1, 3, 5],
          # 'auto' was an alias of 'sqrt' for classifiers; it was deprecated in
          # scikit-learn 1.1 and removed in 1.3, so spell out 'sqrt'.
          'max_features': ['sqrt'],
          'min_samples_split': [5, 10],
          'max_leaf_nodes': [3, 5],
          }

# 10-fold cross-validated search using all available cores.
cv_rfc_model = GridSearchCV(estimator=rf, param_grid=params, n_jobs=-1, cv=10)

cv_rfc_model.fit(features_train, target_train)

CPU times: user 7.33 s, sys: 431 ms, total: 7.76 s
Wall time: 6min 44s


In [None]:
# A cell only displays its LAST expression, so return both values as one tuple
# instead of silently discarding best_params_.
cv_rfc_model.best_params_, cv_rfc_model.best_score_

0.8187697874265039

In [None]:
# Build a forest with fixed hyper-parameters and fit it on the training split.
# NOTE(review): the values are typed in by hand, presumably copied from
# cv_rfc_model.best_params_ — confirm they still match the search result.
rfc_best_model = RandomForestClassifier(
    n_estimators=350,
    max_depth=7,
    max_leaf_nodes=5,
    min_samples_leaf=1,
    min_samples_split=5,
    random_state=123,
)
rfc_best_model.fit(features_train, target_train)

RandomForestClassifier(max_depth=7, max_leaf_nodes=5, min_samples_split=5,
                       n_estimators=350, random_state=123)

In [None]:
rfc_best_model.score(features_valid, target_valid)

0.7668161434977578

In [None]:
%%time 

# Predict using exactly the columns the model was trained on. The line below
# adds 'survived' to df_test, so passing the whole frame would hand the model
# an extra column whenever this cell is run again.
rfc = rfc_best_model.predict(df_test[features_train.columns])
df_test['survived'] = list(rfc)

CPU times: user 70.4 ms, sys: 0 ns, total: 70.4 ms
Wall time: 71.5 ms


### Cat Boost classifier

In [None]:
model = CatBoostClassifier(eval_metric='Accuracy', use_best_model=True, random_seed=42)

In [None]:
# Monitor accuracy on the held-out split: the model is created with
# use_best_model=True, which keeps the iteration that scores best on eval_set,
# and scoring on the training data itself cannot reveal overfitting.
# NOTE(review): the same split is used afterwards to report accuracy, so that
# figure will be optimistic — consider a separate early-stopping split.
model.fit(features_train, target_train, eval_set=(features_valid, target_valid))

In [None]:
print('the test accuracy is :{:.6f}'.format(accuracy_score(target_valid, model.predict(features_valid))))

the test accuracy is :0.762332


In [None]:
# 'verbose' only controls how often progress is logged and has no effect on the
# fitted model, so it is fixed on the estimator instead of being searched over
# (searching it tripled the number of fits for identical candidates).
CBC = CatBoostClassifier(verbose=5)

# Hyper-parameters actually worth tuning.
parameters = {'depth': [4, 5, 6, 7, 8],
              'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05],
              'iterations': [50, 55, 65, 75, 85, 95, 105],
              'eval_metric': ['Accuracy']}

# 10-fold cross-validated exhaustive search using all available cores.
Grid_CBC = GridSearchCV(estimator=CBC, param_grid=parameters, cv=10, n_jobs=-1)
Grid_CBC.fit(features_train, target_train)

0:	learn: 0.8233533	total: 446us	remaining: 33ms
5:	learn: 0.8323353	total: 2.75ms	remaining: 31.6ms
10:	learn: 0.8413174	total: 4.29ms	remaining: 24.9ms
15:	learn: 0.8428144	total: 5.71ms	remaining: 21.1ms
20:	learn: 0.8458084	total: 7.25ms	remaining: 18.6ms
25:	learn: 0.8443114	total: 8.77ms	remaining: 16.5ms
30:	learn: 0.8428144	total: 10.4ms	remaining: 14.7ms
35:	learn: 0.8443114	total: 12ms	remaining: 13ms
40:	learn: 0.8443114	total: 13.5ms	remaining: 11.2ms
45:	learn: 0.8458084	total: 14.9ms	remaining: 9.39ms
50:	learn: 0.8473054	total: 16.4ms	remaining: 7.71ms
55:	learn: 0.8502994	total: 18ms	remaining: 6.12ms
60:	learn: 0.8517964	total: 19.5ms	remaining: 4.49ms
65:	learn: 0.8532934	total: 21ms	remaining: 2.87ms
70:	learn: 0.8532934	total: 22.7ms	remaining: 1.28ms
74:	learn: 0.8532934	total: 24ms	remaining: 0us


GridSearchCV(cv=10,
             estimator=<catboost.core.CatBoostClassifier object at 0x7ffada273310>,
             n_jobs=-1,
             param_grid={'depth': [4, 5, 6, 7, 8], 'eval_metric': ['Accuracy'],
                         'iterations': [50, 55, 65, 75, 85, 95, 105],
                         'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05],
                         'verbose': [5, 10, 15]})

In [None]:
# Summarise the grid search: refitted best estimator, its mean CV score, and
# the winning parameter combination.
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n", Grid_CBC.best_estimator_)
print("\n The best score across ALL searched params:\n", Grid_CBC.best_score_)
print("\n The best parameters across ALL searched params:\n", Grid_CBC.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 <catboost.core.CatBoostClassifier object at 0x7ffad8057650>

 The best score across ALL searched params:
 0.8486431478968791

 The best parameters across ALL searched params:
 {'depth': 4, 'eval_metric': 'Accuracy', 'iterations': 75, 'learning_rate': 0.05, 'verbose': 5}


In [None]:
print('the test accuracy is :{:.6f}'.format(accuracy_score(target_valid, Grid_CBC.predict(features_valid))))

the test accuracy is :0.784753


In [None]:
%%time 

# Keep the fitted search object intact: store the predictions under their own
# name instead of rebinding Grid_CBC, so cells that call Grid_CBC.predict(...)
# can be re-run. Only the training feature columns are passed because df_test
# also holds the 'survived' column written by the random-forest cell.
cbc_pred = Grid_CBC.predict(df_test[features_train.columns])
df_test['survived'] = list(cbc_pred)

### Example CBC

In [None]:
train_df = pd.read_csv('/content/drive/MyDrive/projects/predict_titanic_survival/train.csv')

In [None]:
test_df = pd.read_csv('/content/drive/MyDrive/projects/predict_titanic_survival/test.csv')

In [None]:
# Replace every missing value with the sentinel -999 in both raw frames.
for frame in (train_df, test_df):
    frame.fillna(-999, inplace=True)

# Target is the 'Survived' column; everything else is used as a feature.
y = train_df['Survived']
x = train_df.drop(columns='Survived')

# Positions of all non-float columns; these are passed to CatBoost as categorical.
cate_features_index = np.flatnonzero(x.dtypes != float)

# 85 % of the rows for training, the remainder held out.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, train_size=.85, random_state=1234)


In [None]:
model = CatBoostClassifier(eval_metric='Accuracy',use_best_model=True,random_seed=42)

In [None]:
model.fit(xtrain,ytrain,cat_features=cate_features_index,eval_set=(xtest,ytest))

In [None]:
print('the test accuracy is :{:.6f}'.format(accuracy_score(ytest,model.predict(xtest))))

the test accuracy is :0.858209


In [None]:
# Predict on the raw test frame and build the submission table.
pred = model.predict(test_df)
# np.int was only an alias of the builtin int and was removed in NumPy 1.24.
pred = pred.astype(int)
submission = pd.DataFrame({'PassengerId':test_df['PassengerId'],'Survived':pred})

In [None]:
# Exhaustive 10-fold search over depth / learning rate / iteration count.
CBC = CatBoostClassifier()

parameters = {
    'depth': [4, 5, 6, 7, 8],
    'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05],
    'iterations': [75, 85, 95, 100, 105, 110],
    'eval_metric': ['Accuracy'],
}

Grid_CBC = GridSearchCV(CBC, parameters, cv=10, n_jobs=-1)
# Extra keyword arguments of GridSearchCV.fit are forwarded to the estimator's fit.
Grid_CBC.fit(
    xtrain,
    ytrain,
    cat_features=cate_features_index,
    eval_set=(xtest, ytest),
)

In [None]:
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n", Grid_CBC.best_estimator_)
print("\n The best score across ALL searched params:\n", Grid_CBC.best_score_)
print("\n The best parameters across ALL searched params:\n", Grid_CBC.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 <catboost.core.CatBoostClassifier object at 0x7ffada416b10>

 The best score across ALL searched params:
 0.8138245614035087

 The best parameters across ALL searched params:
 {'depth': 6, 'eval_metric': 'Accuracy', 'iterations': 110, 'learning_rate': 0.05}


In [None]:
print('the test accuracy is :{:.6f}'.format(accuracy_score(ytest, Grid_CBC.predict(xtest))))

the test accuracy is :0.843284


In [None]:
# NOTE(review): this predicts with `model` (the untuned CatBoost fit), not with
# the grid-searched Grid_CBC fitted just above — confirm that is intended.
pred = model.predict(test_df)
# np.int was only an alias of the builtin int and was removed in NumPy 1.24.
pred = pred.astype(int)
submission = pd.DataFrame({'PassengerId':test_df['PassengerId'],'Survived':pred})

In [None]:
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [None]:
submission.to_csv('catboost.csv',index=False)

## Test of the best models

In [None]:
df_test['passengerid'] = df_test_passengerid
df_submission = df_test[['passengerid', 'survived']]
df_submission.tail()

Unnamed: 0,passengerid,survived
413,1305,0
414,1306,1
415,1307,0
416,1308,0
417,1309,0


In [None]:
df_submission.to_csv('/content/drive/MyDrive/df_submission1.csv', index=False)

In [None]:
# Scores noted after each attempt (presumably submission scores — TODO confirm).
## 0.67942 - attempt 1
## 0.77272 - attempt 2
## 0.77990 - attempt 3
# The bare literal below is simply echoed as the cell output; presumably the
# most recent score — TODO confirm.
0.77511