In [2]:
import os
import math
import datetime
from mlsettings.settings import load_app_config, get_datafolder_path
from mltools.mlcommon import (load_data, print_dataset_info, split_dataset, 
                              auto_scatter_simple,load_dataset,detect_outliers,
                              one_hot_dataframe)

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline 
from numpy import set_printoptions
set_printoptions(precision=4)

pd.set_option('display.width', 140)
pd.set_option('precision', 4)

sns.set_style("whitegrid")

sns.set_palette(sns.color_palette("muted", 15))

In [3]:
load_app_config()
DIRECTORY="kaggle_titanic"
TRAIN_FILE ='train.csv'
TEST_FILE = 'test.csv'
RESPONSE = 'Survived'
input_path = get_datafolder_path()

train_file = os.path.join(input_path, DIRECTORY, TRAIN_FILE)
test_file = os.path.join(input_path, DIRECTORY, TEST_FILE)
print(train_file)
print(test_file)

train_dataset,feature_columns,response_column,continuous_vars,categorical_vars = load_dataset(train_file,RESPONSE)
test_dataset,tfeature_columns,tresponse_column,tcontinuous_vars,tcategorical_vars  = load_dataset(test_file,RESPONSE,colseparator=',')

train_X = train_dataset[feature_columns]
train_y = train_dataset[response_column]
test_X  = test_dataset[tfeature_columns]

D:\DataSource\kaggle_titanic\train.csv
D:\DataSource\kaggle_titanic\test.csv
 input file is :D:\DataSource\kaggle_titanic\train.csv loaded.
Continous Variables
['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Categorical Variables
['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
 input file is :D:\DataSource\kaggle_titanic\test.csv loaded.
Continous Variables
['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Categorical Variables
['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [4]:
def get_df_description(df):
    df_description = df.describe(include='all').T
    df_description['null_count'] = df.isnull().sum()
    return df_description

In [5]:
full_dataset =pd.concat(objs=[train_dataset, test_dataset], axis=0).reset_index(drop=True)  

In [6]:
from catboost import Pool, CatBoostClassifier, cv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [7]:
get_df_description(full_dataset)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max,null_count
Age,1046,,,,29.88,14.41,0.17,21.0,28.0,39.0,80.0,263
Cabin,295,186.0,C23 C25 C27,6.0,,,,,,,,1014
Embarked,1307,3.0,S,914.0,,,,,,,,2
Fare,1308,,,,33.3,51.76,0.0,7.896,14.45,31.27,512.3,1
Name,1309,1307.0,"Kelly, Mr. James",2.0,,,,,,,,0
Parch,1309,,,,0.385,0.8656,0.0,0.0,0.0,0.0,9.0,0
PassengerId,1309,,,,655.0,378.0,1.0,328.0,655.0,982.0,1309.0,0
Pclass,1309,,,,2.295,0.8378,1.0,2.0,3.0,3.0,3.0,0
Sex,1309,2.0,male,843.0,,,,,,,,0
SibSp,1309,,,,0.4989,1.042,0.0,0.0,0.0,1.0,8.0,0


In [8]:
train_dataset =full_dataset[full_dataset['Survived'].notnull()]
test_dataset =full_dataset[full_dataset['Survived'].isnull()]
#train_dataset["Survived"] = train_dataset["Survived"].astype("int")
train_dataset.drop(labels=["Survived"],axis = 1,inplace=True)
test_dataset.drop(labels=["Survived"],axis = 1,inplace=True)

In [9]:
train_dataset.fillna(-999,inplace=True)
test_dataset.fillna(-999,inplace=True)

In [10]:
category_names =  list(train_dataset.select_dtypes(include=['object']).columns)

In [11]:
category_names

['Cabin', 'Embarked', 'Name', 'Sex', 'Ticket']

In [12]:
train_dataset.dtypes

Age            float64
Cabin           object
Embarked        object
Fare           float64
Name            object
Parch            int64
PassengerId      int64
Pclass           int64
Sex             object
SibSp            int64
Ticket          object
dtype: object

In [13]:
cate_features_index = np.where(train_dataset.dtypes == object)[0]
cate_features_index

array([ 1,  2,  4,  8, 10], dtype=int64)

In [None]:
CheckNames

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train_dataset,train_y,test_size=0.20,random_state=42)

In [121]:
model = CatBoostClassifier( iterations =100 ,loss_function='Logloss',eval_metric='Accuracy',
                           use_best_model=True,random_seed=42,class_weights=[0.72,0.28],
                           colsample_bylevel=0.72,
                           bagging_temperature=.6,
                           max_depth=4,
                           )

In [122]:
model.fit(X_train,y_train,cat_features=cate_features_index,eval_set=(X_val,y_val))

Learning rate set to 0.201726
0:	learn: 0.8894406	test: 0.8596346	best: 0.8596346 (0)	total: 37.7ms	remaining: 3.73s
1:	learn: 0.8894406	test: 0.8596346	best: 0.8596346 (0)	total: 55.2ms	remaining: 2.7s
2:	learn: 0.8894406	test: 0.8596346	best: 0.8596346 (0)	total: 80.9ms	remaining: 2.61s
3:	learn: 0.8894406	test: 0.8596346	best: 0.8596346 (0)	total: 105ms	remaining: 2.52s
4:	learn: 0.8894406	test: 0.8596346	best: 0.8596346 (0)	total: 129ms	remaining: 2.46s
5:	learn: 0.8894406	test: 0.8596346	best: 0.8596346 (0)	total: 154ms	remaining: 2.41s
6:	learn: 0.8894406	test: 0.8596346	best: 0.8596346 (0)	total: 179ms	remaining: 2.38s
7:	learn: 0.8894406	test: 0.8596346	best: 0.8596346 (0)	total: 210ms	remaining: 2.41s
8:	learn: 0.8894406	test: 0.8596346	best: 0.8596346 (0)	total: 237ms	remaining: 2.4s
9:	learn: 0.8894406	test: 0.8596346	best: 0.8596346 (0)	total: 263ms	remaining: 2.37s
10:	learn: 0.8894406	test: 0.8596346	best: 0.8596346 (0)	total: 287ms	remaining: 2.32s
11:	learn: 0.8908593	t

97:	learn: 0.9093028	test: 0.8621262	best: 0.8741694 (65)	total: 2.81s	remaining: 57.4ms
98:	learn: 0.9085934	test: 0.8621262	best: 0.8741694 (65)	total: 2.84s	remaining: 28.7ms
99:	learn: 0.9093028	test: 0.8592193	best: 0.8741694 (65)	total: 2.86s	remaining: 0us

bestTest = 0.8741694377
bestIteration = 65

Shrink model to first 66 iterations.


<catboost.core.CatBoostClassifier at 0x1cd8819cfd0>

In [123]:
from sklearn.metrics.classification import accuracy_score, log_loss
from sklearn.metrics import confusion_matrix,roc_auc_score,mean_squared_error,f1_score,recall_score,precision_score

def model_evalution(model,x_train,y_train,x_test,y_test):
    print("####################### model Evalution started #######################")
    train_pre = model.predict(x_train)
    test_pre = model.predict(x_test)
    train_pro = model.predict_proba(x_train)
    test_pro = model.predict_proba(x_test)

    print("Train Accuracy: {0} \t Test Accuracy: {1}".format(accuracy_score(y_train, train_pre),accuracy_score(y_test,test_pre)))
    print("Train Loss: {0} \t Test Loss: {1}".format(mean_squared_error(y_train, train_pre),mean_squared_error(y_test,test_pre)))
    print("Train AUC: {0} \t Test AUC: {1}".format(roc_auc_score(y_train, train_pro[:,1]),roc_auc_score(y_test,test_pro[:,1])))
    print("Train F1: {0} \t Test F1: {1}".format(f1_score(y_train, train_pre),f1_score(y_test,test_pre)))
    print("Train recall: {0} \t Test recall: {1}".format(recall_score(y_train, train_pre),recall_score(y_test,test_pre)))
    print("Train precision: {0} \t Test Precision: {1}".format(precision_score(y_train, train_pre),precision_score(y_test,test_pre)))
    print("Train Confusion Matrix: \n{0} \n Test Confusion Matrix: \n{1}".format(confusion_matrix(y_train, train_pre),confusion_matrix(y_test,test_pre)))
  #f1_score

In [124]:
model_evalution(model,X_train, y_train,X_val,y_val)

####################### model Evalution started #######################
Train Accuracy: 0.8932584269662921 	 Test Accuracy: 0.7932960893854749
Train Loss: 0.10674157303370786 	 Test Loss: 0.20670391061452514
Train AUC: 0.9817676818609654 	 Test AUC: 0.8942728442728441
Train F1: 0.8382978723404255 	 Test F1: 0.6890756302521008
Train recall: 0.7350746268656716 	 Test recall: 0.5540540540540541
Train precision: 0.9752475247524752 	 Test Precision: 0.9111111111111111
Train Confusion Matrix: 
[[439   5]
 [ 71 197]] 
 Test Confusion Matrix: 
[[101   4]
 [ 33  41]]


In [104]:
print('the test accuracy is :{:.6f}'.format(accuracy_score(y_val,model.predict(X_val))))

the test accuracy is :0.804469


In [98]:
test_y =model.predict(test_dataset)

In [99]:
submission =pd.DataFrame({'PassengerId':test_X["PassengerId"].values ,'Survived':test_y}) 
import datetime
FORMAT = '%Y%m%d%H%M%S'
timestamp=datetime.datetime.now().strftime(FORMAT)
filename ="Titanic_CatBoost_Pred_"+timestamp+"_out.csv"
submission.to_csv(filename,index=False)