In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score,roc_curve,roc_auc_score,auc, f1_score,recall_score, precision_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from catboost import Pool, CatBoostClassifier, cv

# Load the two input tables from CSV files located in the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [2]:
def report(true, pred, score=None):
    """Print a binary-classification summary for one set of predictions.

    Prints, in order: the scikit-learn classification report, the confusion
    matrix, accuracy, precision, recall, F1 and ROC AUC.

    Parameters
    ----------
    true : array-like
        Ground-truth class labels.
    pred : array-like
        Predicted hard class labels.
    score : array-like, optional
        Continuous scores for the positive class (for example the second
        column of ``predict_proba``). When supplied, ROC AUC is computed from
        these scores. When omitted, ROC AUC falls back to ``pred`` exactly as
        before, which evaluates a single operating point rather than the
        ranking quality of the model.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print(classification_report(true, pred))
    print(confusion_matrix(true, pred))

    # Label strings are padded so the printed values line up in a column.
    label_metrics = (
        ("Accuracy  ", accuracy_score),
        ("Precision ", precision_score),
        ("Recall    ", recall_score),
        ("F1 score  ", f1_score),
    )
    for label, metric in label_metrics:
        print(label, metric(true, pred))

    # roc_auc_score expects scores/probabilities; hard labels are only used
    # as a backward-compatible fallback when no scores were passed.
    auc_input = pred if score is None else score
    print("AUC ROC   ", roc_auc_score(true, auc_input))

In [3]:
# Show the training frame as read from train.csv (no cleaning applied yet).
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Show the test frame as read from test.csv (no cleaning applied yet).
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [5]:
# Keep the test passenger ids aside: the column is dropped from `test` in the
# next cell, and the ids are needed again to build the submission frame.
test_ids = test['PassengerId']

In [6]:
# Remove the identifier column from both frames (the test ids were saved in
# `test_ids` beforehand).
# Rebinding the names instead of using inplace=True, together with
# errors='ignore', keeps this cell re-runnable: executing it a second time is
# a no-op instead of raising KeyError on the already-missing column.
train = train.drop(columns='PassengerId', errors='ignore')
test = test.drop(columns='PassengerId', errors='ignore')

In [7]:
def fill_missing(frame, sentinel=-999):
    """Return a copy of ``frame`` with missing values replaced by a sentinel.

    A blanket ``fillna(-999)`` writes an integer into text columns, leaving
    object columns that mix ``str`` and ``int`` values. Here each column gets
    a fill value matching its kind instead.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to fill; it is not modified.
    sentinel : int, default -999
        Marker used for missing entries. Object-dtype columns receive
        ``str(sentinel)``; every other column receives ``sentinel`` itself.

    Returns
    -------
    pandas.DataFrame
        New frame without missing values.
    """
    fill_values = {
        column: str(sentinel) if dtype == object else sentinel
        for column, dtype in frame.dtypes.items()
    }
    return frame.fillna(value=fill_values)


# Rebind rather than mutate in place so the cell can be re-run safely.
# NOTE(review): text columns now hold the string "-999" rather than the
# integer -999; confirm nothing downstream compares against the integer.
train = fill_missing(train)
test = fill_missing(test)

In [8]:
# Sanity check: count remaining missing values per column after the fill.
train.isnull().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [9]:
# Same sanity check for the test frame.
test.isnull().sum()

Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [10]:
# Split the training table into predictors (X) and the target column (y).
X = train.drop(columns=['Survived'])
y = train['Survived']

In [11]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [12]:
# Hold out 15% of the labelled rows for validation. The split is stratified
# on the target and seeded so it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=1, stratify=y)

In [13]:
# Inspect the training split; missing values show up as the -999 sentinel.
X_train

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
560,3,"Morrow, Mr. Thomas Rowan",male,-999.0,0,0,372622,7.7500,-999,Q
815,1,"Fry, Mr. Richard",male,-999.0,0,0,112058,0.0000,B102,S
21,2,"Beesley, Mr. Lawrence",male,34.0,0,0,248698,13.0000,D56,S
354,3,"Yousif, Mr. Wazli",male,-999.0,0,0,2647,7.2250,-999,C
665,2,"Hickman, Mr. Lewis",male,32.0,2,0,S.O.C. 14879,73.5000,-999,S
...,...,...,...,...,...,...,...,...,...,...
193,2,"Navratil, Master. Michel M",male,3.0,1,1,230080,26.0000,F2,S
265,2,"Reeves, Mr. David",male,36.0,0,0,C.A. 17248,10.5000,-999,S
309,1,"Francatelli, Miss. Laura Mabel",female,30.0,0,0,PC 17485,56.9292,E36,C
413,2,"Cunningham, Mr. Alfred Fleming",male,-999.0,0,0,239853,0.0000,-999,S


In [14]:
#train, validate, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])

In [15]:
# Column dtypes and non-null counts of the cleaned training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       891 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     891 non-null    object 
 10  Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


In [16]:
# Every column whose dtype is not float is handed to CatBoost as categorical;
# this covers the integer columns as well as the text ones.
not_float_mask = X.dtypes != float
cat_features_index = np.where(not_float_mask)[0]  # positional column indices
cat_features_index

array([0, 1, 2, 4, 5, 6, 8, 9], dtype=int64)

In [17]:
# Training parameters passed to CatBoost's cv(): Logloss is the optimised
# objective, Accuracy is the metric reported during evaluation.
params = {
    'verbose': 10,
    'random_seed':1, 
    'thread_count': 4,
    'eval_metric' : 'Accuracy',
    'loss_function': 'Logloss',
    'iterations' : 200
}

# 5-fold stratified cross-validation over all labelled rows (X, y), with the
# non-float columns declared as categorical features.
# NOTE(review): thread_count is 4 in `params` but -1 on the Pool; confirm the
# difference is intentional.
scores = cv(Pool(X,y,cat_features=cat_features_index,thread_count=-1),
            params,fold_count=5,stratified=True,plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]
0:	learn: 0.7963483	test: 0.7765363	best: 0.7765363 (0)	total: 173ms	remaining: 34.4s
10:	learn: 0.8160112	test: 0.7988827	best: 0.7988827 (4)	total: 329ms	remaining: 5.65s
20:	learn: 0.8117978	test: 0.7988827	best: 0.7988827 (4)	total: 498ms	remaining: 4.24s
30:	learn: 0.8244382	test: 0.7988827	best: 0.7988827 (4)	total: 661ms	remaining: 3.6s
40:	learn: 0.8356742	test: 0.7988827	best: 0.7988827 (4)	total: 851ms	remaining: 3.3s
50:	learn: 0.8426966	test: 0.7932961	best: 0.8044693 (49)	total: 1.05s	remaining: 3.07s
60:	learn: 0.8426966	test: 0.7821229	best: 0.8044693 (49)	total: 1.26s	remaining: 2.88s
70:	learn: 0.8441011	test: 0.7821229	best: 0.8044693 (49)	total: 1.48s	remaining: 2.68s
80:	learn: 0.8511236	test: 0.7765363	best: 0.8044693 (49)	total: 1.66s	remaining: 2.44s
90:	learn: 0.8595506	test: 0.7821229	best: 0.8044693 (49)	total: 1.87s	remaining: 2.24s
100:	learn: 0.8637640	test: 0.7877095	best: 0.8044693 (49)	total: 2.03s	remaining: 1.99s
110:	learn: 0.86

70:	learn: 0.8459384	test: 0.8192090	best: 0.8192090 (70)	total: 1.28s	remaining: 2.32s
80:	learn: 0.8473389	test: 0.8305085	best: 0.8305085 (74)	total: 1.45s	remaining: 2.13s
90:	learn: 0.8585434	test: 0.8305085	best: 0.8418079 (88)	total: 1.62s	remaining: 1.94s
100:	learn: 0.8613445	test: 0.8418079	best: 0.8418079 (88)	total: 1.78s	remaining: 1.74s
110:	learn: 0.8683473	test: 0.8418079	best: 0.8418079 (88)	total: 2s	remaining: 1.6s
120:	learn: 0.8669468	test: 0.8531073	best: 0.8531073 (111)	total: 2.14s	remaining: 1.4s
130:	learn: 0.8669468	test: 0.8531073	best: 0.8587571 (126)	total: 2.29s	remaining: 1.2s
140:	learn: 0.8683473	test: 0.8531073	best: 0.8587571 (126)	total: 2.42s	remaining: 1.01s
150:	learn: 0.8725490	test: 0.8531073	best: 0.8587571 (126)	total: 2.55s	remaining: 828ms
160:	learn: 0.8753501	test: 0.8474576	best: 0.8587571 (126)	total: 2.71s	remaining: 655ms
170:	learn: 0.8767507	test: 0.8474576	best: 0.8587571 (126)	total: 2.82s	remaining: 478ms
180:	learn: 0.8809524	te

In [18]:
# Per-iteration cross-validation results returned by cv().
scores

Unnamed: 0,iterations,test-Accuracy-mean,test-Accuracy-std,train-Accuracy-mean,train-Accuracy-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.801334,0.029137,0.799375,0.009561,0.676965,0.001688,0.676446,0.001367
1,1,0.793457,0.020402,0.810881,0.011744,0.660812,0.002861,0.659451,0.001446
2,2,0.797926,0.027640,0.813970,0.007868,0.645819,0.004247,0.644276,0.001731
3,3,0.796809,0.025658,0.815373,0.008765,0.632433,0.004465,0.629496,0.003107
4,4,0.795679,0.029501,0.817056,0.007035,0.619859,0.005288,0.616407,0.003622
...,...,...,...,...,...,...,...,...,...
195,195,0.827178,0.035879,0.884399,0.007605,0.410119,0.052944,0.322240,0.017509
196,196,0.826054,0.036454,0.884399,0.006712,0.410147,0.053033,0.321828,0.017509
197,197,0.826054,0.036454,0.884399,0.006007,0.410143,0.053100,0.321643,0.017400
198,198,0.826054,0.036454,0.884960,0.006361,0.410162,0.053111,0.321085,0.017490


In [19]:
# Classifier used for the final fit. Logloss is the training objective and
# Accuracy is the metric tracked on the evaluation set; progress is logged
# every 200 iterations.
cb = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric='Accuracy',
    random_state=1,
    thread_count=-1,
    verbose=200,
    # learning_rate=0.01,  # left unset so CatBoost picks the rate itself
)

In [20]:
# Fit on the training split while monitoring the validation split.
# use_best_model=True keeps only the iterations up to the best eval_metric
# value seen on eval_set.
# NOTE(review): the validation split therefore takes part in model selection,
# so metrics later reported on (X_val, y_val) are optimistic.
cb.fit(X_train, y_train, plot=True, use_best_model=True,eval_set=(X_val, y_val),
       cat_features=cat_features_index)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.029583
0:	learn: 0.8084544	test: 0.7761194	best: 0.7761194 (0)	total: 24.2ms	remaining: 24.2s
200:	learn: 0.8877147	test: 0.7910448	best: 0.7985075 (7)	total: 3.51s	remaining: 13.9s
400:	learn: 0.9167768	test: 0.7835821	best: 0.7985075 (7)	total: 7.32s	remaining: 10.9s
600:	learn: 0.9299868	test: 0.7910448	best: 0.7985075 (7)	total: 11.1s	remaining: 7.38s
800:	learn: 0.9458388	test: 0.7835821	best: 0.7985075 (7)	total: 14.8s	remaining: 3.69s
999:	learn: 0.9603699	test: 0.7910448	best: 0.7985075 (7)	total: 18.8s	remaining: 0us

bestTest = 0.7985074627
bestIteration = 7

Shrink model to first 8 iterations.


<catboost.core.CatBoostClassifier at 0x2395b487f70>

In [21]:
# Hard class predictions for the training split, the validation split and the
# test frame, produced by the fitted classifier.
train_pred, val_pred, y_pred = (
    cb.predict(features) for features in (X_train, X_val, test)
)

In [22]:
# Metrics on the validation split.
# NOTE(review): this split was also the eval_set used with
# use_best_model=True, so it is not a fully independent estimate.
report(y_val,val_pred)

              precision    recall  f1-score   support

           0       0.80      0.90      0.85        83
           1       0.80      0.63      0.70        51

    accuracy                           0.80       134
   macro avg       0.80      0.77      0.78       134
weighted avg       0.80      0.80      0.79       134

[[75  8]
 [19 32]]
Accuracy   0.7985074626865671
Precision  0.8
Recall     0.6274509803921569
F1 score   0.7032967032967032
AUC ROC    0.7655327191117411


In [23]:
# Metrics on the training split, for comparison with the validation numbers.
report(y_train,train_pred)

              precision    recall  f1-score   support

           0       0.80      0.92      0.86       466
           1       0.83      0.64      0.72       291

    accuracy                           0.81       757
   macro avg       0.82      0.78      0.79       757
weighted avg       0.81      0.81      0.81       757

[[428  38]
 [104 187]]
Accuracy   0.8124174372523117
Precision  0.8311111111111111
Recall     0.6426116838487973
F1 score   0.7248062015503876
AUC ROC    0.7805333097355573


In [24]:
# Raw importance values from the fitted model, one per input column.
# Presumably in the same order as X_train.columns -- the next cell pairs
# them that way; verify against the CatBoost docs.
cb.get_feature_importance()

array([31.38087314,  0.        , 56.59150509,  0.        ,  3.78813603,
        1.2131822 ,  0.        ,  2.53276468,  1.05252002,  3.44101884])

In [25]:
# Pair each training column with its importance value, then rank the columns
# from most to least important.
feat_importance = list(zip(X_train.columns, cb.get_feature_importance()))
feat_importance_df = (
    pd.DataFrame(feat_importance, columns=['Feature', 'Feature Importance'])
    .sort_values('Feature Importance', ascending=False)
)
feat_importance_df

Unnamed: 0,Feature,Feature Importance
2,Sex,56.591505
0,Pclass,31.380873
4,SibSp,3.788136
9,Embarked,3.441019
7,Fare,2.532765
5,Parch,1.213182
8,Cabin,1.05252
1,Name,0.0
3,Age,0.0
6,Ticket,0.0


In [26]:
# Submission table: the saved test passenger ids next to the predicted label.
cb_baseline = pd.DataFrame({'PassengerId': test_ids})
cb_baseline['Survived'] = y_pred  # array assignment, so rows pair by position
cb_baseline

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [27]:
#cb_baseline.to_csv("cb_baseline_new.csv",index=False)