In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlsettings.settings import load_app_config, get_datafolder_path
from mltools.modelbuilder.supervised import SupervisedDataLoader
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline 
np.set_printoptions(precision=4)

pd.set_option('display.width', 200)
pd.set_option('precision', 4)
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
sns.set_style("whitegrid")
pd.options.display.float_format = '{:,.4f}'.format
sns.set()
import logging
logger = logging.getLogger()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')
logger.setLevel(logging.ERROR)

06-Oct-19 13:07:56 - DEBUG - Loaded backend module://ipykernel.pylab.backend_inline version unknown.


In [3]:
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.model_selection import validation_curve

def measure_performance(X, y, clf, show_accuracy=True,show_classification_report=True,
                        show_confusion_matrix=True, show_r2_score=False):
    y_pred = clf.predict(X) 
    if show_accuracy:
        print ("Accuracy:{0:.3f}".format( metrics.accuracy_score(y, y_pred)) )
    if show_classification_report:
        print ("Classification report")
        print (metrics.classification_report(y, y_pred))
    if show_confusion_matrix:
        print("Confusion matrix") 
        print(metrics.confusion_matrix(y, y_pred),)
    if show_r2_score:
        print ("Coefficient of determination:{0:.3f}"
               .format( metrics.r2_score(y, y_pred)))
    return y_pred


def plot_learning_curve(train_sizes, train_scores, validation_scores):
    plt.figure()
    plt.title("Learning Curve")
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    validation_scores_mean = np.mean(validation_scores, axis=1)
    validation_scores_std = np.std(validation_scores, axis=1)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, validation_scores_mean - validation_scores_std,
                     validation_scores_mean + validation_scores_std, alpha=0.1, color="g")
    
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",label="Training score")
    plt.plot(train_sizes, validation_scores_mean, 'o-', color="g" ,label="Cross-validation score")
    plt.ylim(max(-3, validation_scores_mean.min() - .1), train_scores_mean.max() + .1)
    plt.legend(loc="best")
    plt.show()


def plot_validation_curve(parameter_values, train_scores, validation_scores):
    plt.figure()
    plt.title("Validation Curve")
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    validation_scores_mean = np.mean(validation_scores, axis=1)
    validation_scores_std = np.std(validation_scores, axis=1)

    plt.fill_between(parameter_values, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(parameter_values, validation_scores_mean - validation_scores_std,
                     validation_scores_mean + validation_scores_std, alpha=0.1, color="g")
    plt.plot(parameter_values, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(parameter_values, validation_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.ylim(validation_scores_mean.min() - .1, train_scores_mean.max() + .1)
    plt.legend(loc="best")
    plt.show()

def plot_roc(X_val,y_val,model):
    fpr, tpr, thresholds = roc_curve(y_val, model.predict_proba(X_val)[:, 1])
    plt.figure()
    plt.plot(fpr, tpr, label="ROC Curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR (recall)")
    plt.title("roc_curve");
    # find threshold closest to zero:
    close_zero = np.argmin(np.abs(thresholds))
    plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
         label="threshold zero", fillstyle="none", c='k', mew=2)
    plt.legend(loc=4)
    plt.show()

def test_classifier(model,X,y,cv):
    clf = model.fit(X,y.values.ravel())
    result = clf.score(X,y.values)
    print ("Accuracy: {0:.3f}".format(result*100.0))

    train_sizes, train_scores, validation_scores = learning_curve(clf, X,y.values.ravel(),cv=cv)
    plot_learning_curve(train_sizes, train_scores, validation_scores)
    return clf,result

In [4]:
from sklearn.metrics.classification import accuracy_score, log_loss
from sklearn.metrics import confusion_matrix,roc_auc_score,mean_squared_error,f1_score,recall_score,precision_score

def model_evalution(model,x_train,y_train,x_test,y_test):
    print("####################### model Evalution started #######################")
    train_pre = model.predict(x_train)
    test_pre = model.predict(x_test)
    train_pro = model.predict_proba(x_train)
    test_pro = model.predict_proba(x_test)

    print("Train Accuracy: {0} \t Test Accuracy: {1}".format(accuracy_score(y_train, train_pre),accuracy_score(y_test,test_pre)))
    print("Train Loss: {0} \t Test Loss: {1}".format(mean_squared_error(y_train, train_pre),mean_squared_error(y_test,test_pre)))
    print("Train AUC: {0} \t Test AUC: {1}".format(roc_auc_score(y_train, train_pro[:,1]),roc_auc_score(y_test,test_pro[:,1])))
    print("Train F1: {0} \t Test F1: {1}".format(f1_score(y_train, train_pre),f1_score(y_test,test_pre)))
    print("Train recall: {0} \t Test recall: {1}".format(recall_score(y_train, train_pre),recall_score(y_test,test_pre)))
    print("Train precision: {0} \t Test Precision: {1}".format(precision_score(y_train, train_pre),precision_score(y_test,test_pre)))
    print("Train Confusion Matrix: \n{0} \n Test Confusion Matrix: \n{1}".format(confusion_matrix(y_train, train_pre),confusion_matrix(y_test,test_pre)))
  #f1_score

In [5]:
load_app_config()
DATA_DIRECTORY='HRAnalytics'
TRAIN_FILE  = "train.csv"
TEST_FILE  = "test.csv"
RESPONSE = "is_promoted"
input_path = get_datafolder_path()

In [6]:
filepath  = pathlib.Path(input_path).joinpath(DATA_DIRECTORY)
train_filepath = filepath.joinpath(TRAIN_FILE)
test_filepath = filepath.joinpath(TEST_FILE)

In [7]:
supervisedloader = SupervisedDataLoader(train_file=train_filepath,test_file=test_filepath,response =RESPONSE) 
train_dataset,test_dataset = supervisedloader.load()
train_y = train_dataset[RESPONSE]

In [8]:
def get_nullcounts(dataset):
    nullcount_frame = pd.DataFrame({'Feature':dataset.columns.values,
                                    'Missing_Values': dataset.shape[0] - dataset.count().values, })
    nullcount_frame = nullcount_frame[nullcount_frame['Missing_Values']>0].reset_index(drop=True)
    nullcount_frame['Missing_Values%'] = (nullcount_frame['Missing_Values']/dataset.shape[0])*100
    return nullcount_frame

In [9]:
train_dataset.describe()

Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
count,54808.0,54808.0,54808.0,50684.0,54808.0,54808.0,54808.0,54808.0,54808.0
mean,39195.8306,1.253,34.8039,3.3293,5.8655,0.352,0.0232,63.3868,0.0852
std,22586.5814,0.6093,7.6602,1.26,4.2651,0.4776,0.1505,13.3716,0.2791
min,1.0,1.0,20.0,1.0,1.0,0.0,0.0,39.0,0.0
25%,19669.75,1.0,29.0,3.0,3.0,0.0,0.0,51.0,0.0
50%,39225.5,1.0,33.0,3.0,5.0,0.0,0.0,60.0,0.0
75%,58730.5,1.0,39.0,4.0,7.0,1.0,0.0,76.0,0.0
max,78298.0,10.0,60.0,5.0,37.0,1.0,1.0,99.0,1.0


In [10]:
display(train_dataset.head())
display(test_dataset.head())

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


### EDA 
##### Highly imbalanced dataset

In [11]:
print("% of target variable")
train_dataset['is_promoted'].value_counts()/train_dataset.shape[0]

% of target variable


0   0.9148
1   0.0852
Name: is_promoted, dtype: float64

### Imputing missing values

In [12]:
display(get_nullcounts(train_dataset))
display(get_nullcounts(test_dataset))

Unnamed: 0,Feature,Missing_Values,Missing_Values%
0,education,2409,4.3953
1,previous_year_rating,4124,7.5244


Unnamed: 0,Feature,Missing_Values,Missing_Values%
0,education,1034,4.4019
1,previous_year_rating,1812,7.7139


In [13]:
train_rating_by_region_department =train_dataset .groupby(['region','department']) ['previous_year_rating'].median()

In [14]:
train_rating_by_region_department.head()

region    department
region_1  Analytics    4.0000
          Finance      3.0000
          HR           4.5000
          Legal        3.0000
          Operations   4.0000
Name: previous_year_rating, dtype: float64

In [15]:
test_rating_by_region_department = test_dataset.groupby(['region','department'])['previous_year_rating'].median()

In [16]:
def impute_rating(row):
    return train_rating_by_region_department[row['region'],row['department']]

In [17]:
train_dataset['previous_year_rating'].fillna(train_dataset[train_dataset['previous_year_rating'].isnull()].apply(impute_rating, axis=1), inplace=True)

In [18]:
test_dataset['previous_year_rating'].fillna(test_dataset[test_dataset['previous_year_rating'].isnull()].apply(impute_rating, axis=1), inplace=True)

In [19]:
train_education_mode = train_dataset.groupby(['department']).agg({'education':lambda x: x.value_counts(dropna=False).index[0]}).reset_index()

In [20]:
train_education_mode

Unnamed: 0,department,education
0,Analytics,Bachelor's
1,Finance,Bachelor's
2,HR,Bachelor's
3,Legal,Bachelor's
4,Operations,Bachelor's
5,Procurement,Bachelor's
6,R&D,Bachelor's
7,Sales & Marketing,Bachelor's
8,Technology,Bachelor's


In [21]:
train_dataset['education'].fillna("Bachelor's",inplace=True)
test_dataset['education'].fillna("Bachelor's",inplace=True)

In [22]:
display(get_nullcounts(train_dataset))
display(get_nullcounts(test_dataset))

Unnamed: 0,Feature,Missing_Values,Missing_Values%


Unnamed: 0,Feature,Missing_Values,Missing_Values%


In [23]:
full_dataset = pd.concat(objs=[train_dataset,test_dataset],axis=0,sort =True).reset_index(drop=True) 

In [24]:
full_dataset['department'].value_counts() 
full_dataset['education'].value_counts() 

Bachelor's          55690
Master's & above    21429
Below Secondary      1179
Name: education, dtype: int64

In [25]:
department_encoding = {'R&D':1,'Legal':1,'HR':2,'Finance':2,
                       'Analytics':3,'Technology':4,'Procurement':4,
                       'Operations':5,'Sales & Marketing':5}

education_encoding = {"Bachelor's":3,"Master's & above":2,'Below Secondary':1}

gender_encoding = {'f':0,'m':1}
recruitment_channel_encoding = {'other':3,'sourcing':2, 'referred':1}


In [26]:
full_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78298 entries, 0 to 78297
Data columns (total 14 columns):
KPIs_met >80%           78298 non-null int64
age                     78298 non-null int64
avg_training_score      78298 non-null int64
awards_won?             78298 non-null int64
department              78298 non-null object
education               78298 non-null object
employee_id             78298 non-null int64
gender                  78298 non-null object
is_promoted             54808 non-null float64
length_of_service       78298 non-null int64
no_of_trainings         78298 non-null int64
previous_year_rating    78298 non-null float64
recruitment_channel     78298 non-null object
region                  78298 non-null object
dtypes: float64(2), int64(7), object(5)
memory usage: 8.4+ MB


In [27]:
#full_dataset['department'] =full_dataset['department'].map(department_encoding)
#full_dataset['education'] =full_dataset['education'].map(education_encoding)
full_dataset['gender'] =full_dataset['gender'].map(gender_encoding)
full_dataset['recruitment_channel'] =full_dataset['recruitment_channel'].map(recruitment_channel_encoding)
full_dataset['previous_year_rating'] =full_dataset['previous_year_rating'].astype(int)

In [28]:
full_dataset = pd.get_dummies(full_dataset, columns = ["region","department","education"],prefix="d_",drop_first = True)
train_dataset['previous_year_rating'] =train_dataset['previous_year_rating'].astype(int)
full_dataset.rename(columns={'KPIs_met >80%': 'KPIs_met_GT_80', 'awards_won?': 'awards_won'}, inplace=True)

In [29]:
full_dataset.head()

Unnamed: 0,KPIs_met_GT_80,age,avg_training_score,awards_won,employee_id,gender,is_promoted,length_of_service,no_of_trainings,previous_year_rating,recruitment_channel,d__region_10,d__region_11,d__region_12,d__region_13,d__region_14,d__region_15,d__region_16,d__region_17,d__region_18,d__region_19,d__region_2,d__region_20,d__region_21,d__region_22,d__region_23,d__region_24,d__region_25,d__region_26,d__region_27,d__region_28,d__region_29,d__region_3,d__region_30,d__region_31,d__region_32,d__region_33,d__region_34,d__region_4,d__region_5,d__region_6,d__region_7,d__region_8,d__region_9,d__Finance,d__HR,d__Legal,d__Operations,d__Procurement,d__R&D,d__Sales & Marketing,d__Technology,d__Below Secondary,d__Master's & above
0,1,35,49,0,65438,0,0.0,8,1,5,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1
1,0,30,60,0,65141,1,0.0,4,1,5,3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,34,50,0,7513,1,0.0,7,1,3,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0,39,50,0,2542,1,0.0,10,2,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,45,73,0,48945,1,0.0,2,1,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [30]:
train_dataset =full_dataset[full_dataset['is_promoted'].notnull()]
test_dataset  = full_dataset[full_dataset['is_promoted'].isnull()]
test_dataset_emp =test_dataset['employee_id']
train_y=full_dataset[full_dataset['is_promoted'].notnull()]['is_promoted']

full_dataset.drop(['is_promoted','employee_id','recruitment_channel'],axis =1,inplace=True)
train_dataset.drop(['is_promoted','employee_id','recruitment_channel'],axis =1,inplace=True)
test_dataset.drop(['is_promoted','employee_id','recruitment_channel'],axis =1,inplace=True)


In [31]:
kfold = StratifiedKFold(n_splits=10)
from sklearn.model_selection  import  train_test_split
from sklearn.linear_model  import LogisticRegression
from sklearn.preprocessing import StandardScaler
test_size = 0.25
seed = 2

X_trainmodel, X_val, y_trainmodel, y_val = train_test_split(train_dataset, train_y, test_size=test_size,random_state=seed)

In [32]:
X_trainmodel.head()


Unnamed: 0,KPIs_met_GT_80,age,avg_training_score,awards_won,gender,length_of_service,no_of_trainings,previous_year_rating,d__region_10,d__region_11,d__region_12,d__region_13,d__region_14,d__region_15,d__region_16,d__region_17,d__region_18,d__region_19,d__region_2,d__region_20,d__region_21,d__region_22,d__region_23,d__region_24,d__region_25,d__region_26,d__region_27,d__region_28,d__region_29,d__region_3,d__region_30,d__region_31,d__region_32,d__region_33,d__region_34,d__region_4,d__region_5,d__region_6,d__region_7,d__region_8,d__region_9,d__Finance,d__HR,d__Legal,d__Operations,d__Procurement,d__R&D,d__Sales & Marketing,d__Technology,d__Below Secondary,d__Master's & above
26984,1,43,68,0,1,5,1,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
668,0,39,68,0,0,5,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1
44692,1,39,85,0,1,7,1,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5678,1,31,62,0,1,4,1,4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
43038,0,44,61,0,0,17,1,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [33]:
from catboost import CatBoostClassifier, Pool, cv

In [34]:
catboost_model = CatBoostClassifier(iterations=400,
                                    verbose=True,eval_metric="F1",
                                    learning_rate=0.2,
                                    class_weights=[1,3],
                                    depth=3,l2_leaf_reg=100,bagging_temperature=0.5)

In [35]:
catboost_model.fit(X_trainmodel, y_trainmodel,eval_set=(X_val, y_val),use_best_model=False)

0:	learn: 0.3394313	test: 0.3514862	best: 0.3514862 (0)	total: 180ms	remaining: 1m 11s
1:	learn: 0.2421881	test: 0.2428256	best: 0.3514862 (0)	total: 219ms	remaining: 43.7s
2:	learn: 0.3153305	test: 0.3090951	best: 0.3514862 (0)	total: 257ms	remaining: 34s
3:	learn: 0.3173077	test: 0.3232699	best: 0.3514862 (0)	total: 295ms	remaining: 29.2s
4:	learn: 0.3067230	test: 0.3081272	best: 0.3514862 (0)	total: 338ms	remaining: 26.7s
5:	learn: 0.2872724	test: 0.2982332	best: 0.3514862 (0)	total: 398ms	remaining: 26.1s
6:	learn: 0.3197594	test: 0.3235093	best: 0.3514862 (0)	total: 446ms	remaining: 25.1s
7:	learn: 0.3363891	test: 0.3417807	best: 0.3514862 (0)	total: 490ms	remaining: 24s
8:	learn: 0.3257940	test: 0.3433243	best: 0.3514862 (0)	total: 536ms	remaining: 23.3s
9:	learn: 0.3251415	test: 0.3476686	best: 0.3514862 (0)	total: 580ms	remaining: 22.6s
10:	learn: 0.3579156	test: 0.3700559	best: 0.3700559 (10)	total: 630ms	remaining: 22.3s
11:	learn: 0.3556099	test: 0.3660013	best: 0.3700559 (1

98:	learn: 0.5581577	test: 0.5478816	best: 0.5479876 (97)	total: 4.3s	remaining: 13.1s
99:	learn: 0.5588945	test: 0.5518704	best: 0.5518704 (99)	total: 4.33s	remaining: 13s
100:	learn: 0.5590305	test: 0.5517640	best: 0.5518704 (99)	total: 4.37s	remaining: 12.9s
101:	learn: 0.5599740	test: 0.5496625	best: 0.5518704 (99)	total: 4.41s	remaining: 12.9s
102:	learn: 0.5588120	test: 0.5505012	best: 0.5518704 (99)	total: 4.45s	remaining: 12.8s
103:	learn: 0.5591664	test: 0.5506073	best: 0.5518704 (99)	total: 4.5s	remaining: 12.8s
104:	learn: 0.5591930	test: 0.5498746	best: 0.5518704 (99)	total: 4.53s	remaining: 12.7s
105:	learn: 0.5600833	test: 0.5506073	best: 0.5518704 (99)	total: 4.57s	remaining: 12.7s
106:	learn: 0.5633290	test: 0.5495652	best: 0.5518704 (99)	total: 4.61s	remaining: 12.6s
107:	learn: 0.5635962	test: 0.5501931	best: 0.5518704 (99)	total: 4.65s	remaining: 12.6s
108:	learn: 0.5630361	test: 0.5512449	best: 0.5518704 (99)	total: 4.7s	remaining: 12.5s
109:	learn: 0.5635833	test: 

193:	learn: 0.5848937	test: 0.5694418	best: 0.5694418 (193)	total: 8.46s	remaining: 8.98s
194:	learn: 0.5854306	test: 0.5695489	best: 0.5695489 (194)	total: 8.51s	remaining: 8.95s
195:	learn: 0.5848567	test: 0.5693348	best: 0.5695489 (194)	total: 8.57s	remaining: 8.93s
196:	learn: 0.5858560	test: 0.5693348	best: 0.5695489 (194)	total: 8.64s	remaining: 8.9s
197:	learn: 0.5858931	test: 0.5700338	best: 0.5700338 (197)	total: 8.69s	remaining: 8.87s
198:	learn: 0.5857450	test: 0.5699268	best: 0.5700338 (197)	total: 8.74s	remaining: 8.83s
199:	learn: 0.5855600	test: 0.5690141	best: 0.5700338 (197)	total: 8.78s	remaining: 8.78s
200:	learn: 0.5860130	test: 0.5697129	best: 0.5700338 (197)	total: 8.82s	remaining: 8.73s
201:	learn: 0.5864462	test: 0.5712144	best: 0.5712144 (201)	total: 8.86s	remaining: 8.69s
202:	learn: 0.5869072	test: 0.5696060	best: 0.5712144 (201)	total: 8.9s	remaining: 8.64s
203:	learn: 0.5873967	test: 0.5689072	best: 0.5712144 (201)	total: 8.95s	remaining: 8.6s
204:	learn: 0

286:	learn: 0.6021133	test: 0.5779006	best: 0.5805022 (265)	total: 12.4s	remaining: 4.87s
287:	learn: 0.6031472	test: 0.5793636	best: 0.5805022 (265)	total: 12.4s	remaining: 4.83s
288:	learn: 0.6026282	test: 0.5784729	best: 0.5805022 (265)	total: 12.5s	remaining: 4.78s
289:	learn: 0.6021479	test: 0.5784729	best: 0.5805022 (265)	total: 12.5s	remaining: 4.75s
290:	learn: 0.6025182	test: 0.5780070	best: 0.5805022 (265)	total: 12.6s	remaining: 4.71s
291:	learn: 0.6029983	test: 0.5781135	best: 0.5805022 (265)	total: 12.6s	remaining: 4.66s
292:	learn: 0.6034780	test: 0.5788989	best: 0.5805022 (265)	total: 12.6s	remaining: 4.61s
293:	learn: 0.6039201	test: 0.5788989	best: 0.5805022 (265)	total: 12.7s	remaining: 4.57s
294:	learn: 0.6041038	test: 0.5787923	best: 0.5805022 (265)	total: 12.7s	remaining: 4.53s
295:	learn: 0.6041756	test: 0.5793636	best: 0.5805022 (265)	total: 12.8s	remaining: 4.48s
296:	learn: 0.6042501	test: 0.5791506	best: 0.5805022 (265)	total: 12.8s	remaining: 4.44s
297:	learn

382:	learn: 0.6186055	test: 0.5841962	best: 0.5844085 (381)	total: 16.6s	remaining: 737ms
383:	learn: 0.6191083	test: 0.5849673	best: 0.5849673 (383)	total: 16.6s	remaining: 694ms
384:	learn: 0.6187443	test: 0.5854189	best: 0.5854189 (384)	total: 16.7s	remaining: 650ms
385:	learn: 0.6185680	test: 0.5854189	best: 0.5854189 (384)	total: 16.7s	remaining: 606ms
386:	learn: 0.6190332	test: 0.5860819	best: 0.5860819 (386)	total: 16.8s	remaining: 563ms
387:	learn: 0.6196107	test: 0.5867439	best: 0.5867439 (387)	total: 16.8s	remaining: 520ms
388:	learn: 0.6195731	test: 0.5871925	best: 0.5871925 (388)	total: 16.8s	remaining: 476ms
389:	learn: 0.6194347	test: 0.5879588	best: 0.5879588 (389)	total: 16.9s	remaining: 433ms
390:	learn: 0.6199249	test: 0.5857634	best: 0.5879588 (389)	total: 16.9s	remaining: 389ms
391:	learn: 0.6199624	test: 0.5857634	best: 0.5879588 (389)	total: 17s	remaining: 346ms
392:	learn: 0.6197490	test: 0.5856574	best: 0.5879588 (389)	total: 17s	remaining: 303ms
393:	learn: 0.

<catboost.core.CatBoostClassifier at 0x2ba0e0ee2b0>

In [36]:
model_evalution(catboost_model,X_trainmodel, y_trainmodel,X_val,y_val)

####################### model Evalution started #######################
Train Accuracy: 0.934194521481049 	 Test Accuracy: 0.9267260253977522
Train Loss: 0.06580547851895101 	 Test Loss: 0.07327397460224784
Train AUC: 0.9285960148152773 	 Test AUC: 0.9085243869095475
Train F1: 0.5585115064468745 	 Test F1: 0.5191570881226053
Train recall: 0.4905389908256881 	 Test recall: 0.45932203389830506
Train precision: 0.6483516483516484 	 Test Precision: 0.5969162995594713
Train Confusion Matrix: 
[[36690   928]
 [ 1777  1711]] 
 Test Confusion Matrix: 
[[12156   366]
 [  638   542]]


In [37]:
test_y =catboost_model.predict(test_dataset)

In [64]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
# Number of trees 
iterations = [int(x) for x in np.linspace(start = 100, stop = 500, num = 4)]
# Maximum number of levels in tree
depth = [int(x) for x in np.linspace(1, 10, num = 11)]
 
# Minimum number of samples required to split a node
l2_leaf_reg = [1,2,3,5,8]
# Minimum number of samples required at each leaf node
loss_function = ['Logloss','CrossEntropy']
# Method of selecting samples for training each tree
learning_rate =[0.005,0.03,0.1]
border_count =[32,]
ctr_border_count=[50,]
thread_count=4
eval_metric =["F1"]
params = {'depth':depth,
          'iterations':iterations,
          'learning_rate':learning_rate, 
          'l2_leaf_reg':l2_leaf_reg,
          #'border_count':border_count,
          #'ctr_border_count':ctr_border_count,
          #'thread_count':thread_count,
          'eval_metric':eval_metric,}
pprint(params)

{'depth': [1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 'eval_metric': ['F1'],
 'iterations': [100, 233, 366, 500],
 'l2_leaf_reg': [1, 2, 3, 5, 8],
 'learning_rate': [0.005, 0.03, 0.1]}


In [65]:
from sklearn.model_selection import RandomizedSearchCV

In [66]:
cb_model = CatBoostClassifier()

randm = RandomizedSearchCV(estimator=cb_model, param_distributions = params, 
                               cv = 3, n_iter = 10, n_jobs=-1)
randm.fit(X_trainmodel, y_trainmodel)

In [68]:
print("\n========================================================")
print(" Results from Random Search " )
print("========================================================")    

print("\n The best estimator across ALL searched params:\n",
      randm.best_estimator_)

print("\n The best score across ALL searched params:\n",
      randm.best_score_)

print("\n The best parameters across ALL searched params:\n",
      randm.best_params_)

print("\n ========================================================")


 Results from Random Search 

 The best estimator across ALL searched params:
 <catboost.core.CatBoostClassifier object at 0x000002BA0F221EB8>

 The best score across ALL searched params:
 0.0

 The best parameters across ALL searched params:
 {'learning_rate': 0.1, 'l2_leaf_reg': 3, 'iterations': 500, 'eval_metric': 'F1', 'depth': 4}



In [69]:
# catboost_model = CatBoostClassifier(iterations=400,
#                                     verbose=True,eval_metric="F1",
#                                     learning_rate=0.1,
#                                     class_weights=[1,3],
#                                     depth=3,l2_leaf_reg=3,bagging_temperature=0.5)

In [125]:
catboost_model = CatBoostClassifier(iterations=400,
                                    verbose=True,eval_metric="F1",
                                    learning_rate=0.1,
                                    class_weights=[1,3],
                                    depth=3,l2_leaf_reg=3,bagging_temperature=0.5)

In [126]:
catboost_model.fit(X_trainmodel, y_trainmodel,eval_set=(X_val, y_val),use_best_model=False)

0:	learn: 0.3394313	test: 0.3514862	best: 0.3514862 (0)	total: 38.9ms	remaining: 15.5s
1:	learn: 0.2829921	test: 0.3005540	best: 0.3514862 (0)	total: 75.1ms	remaining: 14.9s
2:	learn: 0.3314243	test: 0.3369735	best: 0.3514862 (0)	total: 112ms	remaining: 14.8s
3:	learn: 0.3202273	test: 0.3296405	best: 0.3514862 (0)	total: 146ms	remaining: 14.5s
4:	learn: 0.3038330	test: 0.3067004	best: 0.3514862 (0)	total: 182ms	remaining: 14.3s
5:	learn: 0.3204980	test: 0.3342355	best: 0.3514862 (0)	total: 222ms	remaining: 14.6s
6:	learn: 0.2911932	test: 0.2929222	best: 0.3514862 (0)	total: 272ms	remaining: 15.3s
7:	learn: 0.3108614	test: 0.3189139	best: 0.3514862 (0)	total: 313ms	remaining: 15.3s
8:	learn: 0.3101429	test: 0.3189139	best: 0.3514862 (0)	total: 351ms	remaining: 15.3s
9:	learn: 0.3094233	test: 0.3178259	best: 0.3514862 (0)	total: 402ms	remaining: 15.7s
10:	learn: 0.3112500	test: 0.3202945	best: 0.3514862 (0)	total: 442ms	remaining: 15.6s
11:	learn: 0.3139181	test: 0.3246217	best: 0.351486

99:	learn: 0.5287262	test: 0.5243779	best: 0.5244840 (95)	total: 4.36s	remaining: 13.1s
100:	learn: 0.5284614	test: 0.5233758	best: 0.5244840 (95)	total: 4.42s	remaining: 13.1s
101:	learn: 0.5281733	test: 0.5242718	best: 0.5244840 (95)	total: 4.49s	remaining: 13.1s
102:	learn: 0.5296996	test: 0.5254237	best: 0.5254237 (102)	total: 4.55s	remaining: 13.1s
103:	learn: 0.5305623	test: 0.5254237	best: 0.5254237 (102)	total: 4.61s	remaining: 13.1s
104:	learn: 0.5302382	test: 0.5252118	best: 0.5254237 (102)	total: 4.65s	remaining: 13.1s
105:	learn: 0.5306205	test: 0.5262097	best: 0.5262097 (105)	total: 4.69s	remaining: 13s
106:	learn: 0.5310275	test: 0.5269944	best: 0.5269944 (106)	total: 4.74s	remaining: 13s
107:	learn: 0.5328185	test: 0.5268882	best: 0.5269944 (106)	total: 4.78s	remaining: 12.9s
108:	learn: 0.5322941	test: 0.5267821	best: 0.5269944 (106)	total: 4.82s	remaining: 12.9s
109:	learn: 0.5331888	test: 0.5257857	best: 0.5269944 (106)	total: 4.87s	remaining: 12.8s
110:	learn: 0.5326

195:	learn: 0.5649681	test: 0.5573008	best: 0.5573008 (195)	total: 8.69s	remaining: 9.05s
196:	learn: 0.5641861	test: 0.5582477	best: 0.5582477 (196)	total: 8.73s	remaining: 8.99s
197:	learn: 0.5649173	test: 0.5575170	best: 0.5582477 (196)	total: 8.77s	remaining: 8.94s
198:	learn: 0.5644300	test: 0.5576251	best: 0.5582477 (196)	total: 8.8s	remaining: 8.89s
199:	learn: 0.5647473	test: 0.5588691	best: 0.5588691 (199)	total: 8.84s	remaining: 8.84s
200:	learn: 0.5643564	test: 0.5587609	best: 0.5588691 (199)	total: 8.88s	remaining: 8.79s
201:	learn: 0.5640023	test: 0.5586527	best: 0.5588691 (199)	total: 8.92s	remaining: 8.75s
202:	learn: 0.5642462	test: 0.5586527	best: 0.5588691 (199)	total: 8.99s	remaining: 8.72s
203:	learn: 0.5651608	test: 0.5588691	best: 0.5588691 (199)	total: 9.06s	remaining: 8.71s
204:	learn: 0.5652344	test: 0.5587609	best: 0.5588691 (199)	total: 9.12s	remaining: 8.68s
205:	learn: 0.5647702	test: 0.5587609	best: 0.5588691 (199)	total: 9.17s	remaining: 8.63s
206:	learn:

291:	learn: 0.5805215	test: 0.5662100	best: 0.5662100 (291)	total: 12.6s	remaining: 4.67s
292:	learn: 0.5805957	test: 0.5651760	best: 0.5662100 (291)	total: 12.7s	remaining: 4.64s
293:	learn: 0.5809049	test: 0.5668124	best: 0.5668124 (293)	total: 12.8s	remaining: 4.61s
294:	learn: 0.5808678	test: 0.5659947	best: 0.5668124 (293)	total: 12.8s	remaining: 4.57s
295:	learn: 0.5801995	test: 0.5679529	best: 0.5679529 (295)	total: 12.9s	remaining: 4.52s
296:	learn: 0.5811398	test: 0.5684450	best: 0.5684450 (296)	total: 12.9s	remaining: 4.47s
297:	learn: 0.5814978	test: 0.5676292	best: 0.5684450 (296)	total: 12.9s	remaining: 4.43s
298:	learn: 0.5815349	test: 0.5668124	best: 0.5684450 (296)	total: 13s	remaining: 4.38s
299:	learn: 0.5818066	test: 0.5675214	best: 0.5684450 (296)	total: 13s	remaining: 4.34s
300:	learn: 0.5824239	test: 0.5667047	best: 0.5684450 (296)	total: 13.1s	remaining: 4.29s
301:	learn: 0.5827324	test: 0.5667047	best: 0.5684450 (296)	total: 13.1s	remaining: 4.25s
302:	learn: 0.

383:	learn: 0.5951631	test: 0.5756665	best: 0.5759458 (355)	total: 16.6s	remaining: 693ms
384:	learn: 0.5947103	test: 0.5753425	best: 0.5759458 (355)	total: 16.7s	remaining: 650ms
385:	learn: 0.5944074	test: 0.5737366	best: 0.5759458 (355)	total: 16.7s	remaining: 606ms
386:	learn: 0.5943699	test: 0.5738444	best: 0.5759458 (355)	total: 16.7s	remaining: 562ms
387:	learn: 0.5953190	test: 0.5738444	best: 0.5759458 (355)	total: 16.8s	remaining: 519ms
388:	learn: 0.5958865	test: 0.5730400	best: 0.5759458 (355)	total: 16.8s	remaining: 476ms
389:	learn: 0.5960390	test: 0.5734134	best: 0.5759458 (355)	total: 16.9s	remaining: 432ms
390:	learn: 0.5963787	test: 0.5735211	best: 0.5759458 (355)	total: 16.9s	remaining: 389ms
391:	learn: 0.5970974	test: 0.5734134	best: 0.5759458 (355)	total: 16.9s	remaining: 345ms
392:	learn: 0.5971743	test: 0.5731982	best: 0.5759458 (355)	total: 17s	remaining: 302ms
393:	learn: 0.5971743	test: 0.5731982	best: 0.5759458 (355)	total: 17s	remaining: 259ms
394:	learn: 0.

<catboost.core.CatBoostClassifier at 0x2ba0feab9b0>

In [127]:
model_evalution(catboost_model,X_trainmodel, y_trainmodel,X_val,y_val)

####################### model Evalution started #######################
Train Accuracy: 0.9367488931056294 	 Test Accuracy: 0.9321266968325792
Train Loss: 0.06325110689437065 	 Test Loss: 0.06787330316742081
Train AUC: 0.9252278708867657 	 Test AUC: 0.9089116375518072
Train F1: 0.5498614958448754 	 Test F1: 0.5240532241555783
Train recall: 0.4552752293577982 	 Test recall: 0.43389830508474575
Train precision: 0.6940559440559441 	 Test Precision: 0.661498708010336
Train Confusion Matrix: 
[[36918   700]
 [ 1900  1588]] 
 Test Confusion Matrix: 
[[12260   262]
 [  668   512]]


In [None]:
Checking

In [128]:
test_y =catboost_model.predict(test_dataset)