In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlsettings.settings import load_app_config, get_datafolder_path
from mltools.modelbuilder.supervised import SupervisedDataLoader
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline 
np.set_printoptions(precision=4)

pd.set_option('display.width', 200)
pd.set_option('precision', 4)
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
sns.set_style("whitegrid")
pd.options.display.float_format = '{:,.4f}'.format
sns.set()
import logging
logger = logging.getLogger()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')
logger.setLevel(logging.ERROR)

05-Oct-19 21:46:49 - DEBUG - Loaded backend module://ipykernel.pylab.backend_inline version unknown.


In [3]:
from sklearn import metrics
from sklearn.metrics import roc_curve
def measure_performance(X, y, clf, show_accuracy=True,show_classification_report=True,
                        show_confusion_matrix=True, show_r2_score=False):
    y_pred = clf.predict(X) 
    if show_accuracy:
        print ("Accuracy:{0:.3f}".format( metrics.accuracy_score(y, y_pred)) )
    if show_classification_report:
        print ("Classification report")
        print (metrics.classification_report(y, y_pred))
    if show_confusion_matrix:
        print("Confusion matrix") 
        print(metrics.confusion_matrix(y, y_pred),)
    if show_r2_score:
        print ("Coefficient of determination:{0:.3f}"
               .format( metrics.r2_score(y, y_pred)))
    return y_pred

from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
def plot_learning_curve(train_sizes, train_scores, validation_scores):
    plt.figure()
    plt.title("Learning Curve")
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    validation_scores_mean = np.mean(validation_scores, axis=1)
    validation_scores_std = np.std(validation_scores, axis=1)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, validation_scores_mean - validation_scores_std,
                     validation_scores_mean + validation_scores_std, alpha=0.1, color="g")
    
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",label="Training score")
    plt.plot(train_sizes, validation_scores_mean, 'o-', color="g" ,label="Cross-validation score")
    plt.ylim(max(-3, validation_scores_mean.min() - .1), train_scores_mean.max() + .1)
    plt.legend(loc="best")
    plt.show()

from sklearn.model_selection import validation_curve
def plot_validation_curve(parameter_values, train_scores, validation_scores):
    plt.figure()
    plt.title("Validation Curve")
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    validation_scores_mean = np.mean(validation_scores, axis=1)
    validation_scores_std = np.std(validation_scores, axis=1)

    plt.fill_between(parameter_values, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(parameter_values, validation_scores_mean - validation_scores_std,
                     validation_scores_mean + validation_scores_std, alpha=0.1, color="g")
    plt.plot(parameter_values, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(parameter_values, validation_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.ylim(validation_scores_mean.min() - .1, train_scores_mean.max() + .1)
    plt.legend(loc="best")
    plt.show()

def plot_roc(X_val,y_val,model):
    fpr, tpr, thresholds = roc_curve(y_val, model.predict_proba(X_val)[:, 1])
    plt.figure()
    plt.plot(fpr, tpr, label="ROC Curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR (recall)")
    plt.title("roc_curve");
    # find threshold closest to zero:
    close_zero = np.argmin(np.abs(thresholds))
    plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
         label="threshold zero", fillstyle="none", c='k', mew=2)
    plt.legend(loc=4)
    plt.show()

In [4]:
load_app_config()
DATA_DIRECTORY='HRAnalytics'
TRAIN_FILE  = "train.csv"
TEST_FILE  = "test.csv"
RESPONSE = "is_promoted"
input_path = get_datafolder_path()

In [5]:
filepath  = pathlib.Path(input_path).joinpath(DATA_DIRECTORY)
train_filepath = filepath.joinpath(TRAIN_FILE)
test_filepath = filepath.joinpath(TEST_FILE)

In [6]:
supervisedloader = SupervisedDataLoader(train_file=train_filepath,test_file=test_filepath,response =RESPONSE) 
train_dataset,test_dataset = supervisedloader.load()
train_y = train_dataset[RESPONSE]

In [7]:
def get_nullcounts(dataset):
    nullcount_frame = pd.DataFrame({'Feature':dataset.columns.values,
                                    'Missing_Values': dataset.shape[0] - dataset.count().values, })
    nullcount_frame = nullcount_frame[nullcount_frame['Missing_Values']>0].reset_index(drop=True)
    nullcount_frame['Missing_Values%'] = (nullcount_frame['Missing_Values']/dataset.shape[0])*100
    return nullcount_frame

In [8]:
train_dataset.describe()

Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
count,54808.0,54808.0,54808.0,50684.0,54808.0,54808.0,54808.0,54808.0,54808.0
mean,39195.8306,1.253,34.8039,3.3293,5.8655,0.352,0.0232,63.3868,0.0852
std,22586.5814,0.6093,7.6602,1.26,4.2651,0.4776,0.1505,13.3716,0.2791
min,1.0,1.0,20.0,1.0,1.0,0.0,0.0,39.0,0.0
25%,19669.75,1.0,29.0,3.0,3.0,0.0,0.0,51.0,0.0
50%,39225.5,1.0,33.0,3.0,5.0,0.0,0.0,60.0,0.0
75%,58730.5,1.0,39.0,4.0,7.0,1.0,0.0,76.0,0.0
max,78298.0,10.0,60.0,5.0,37.0,1.0,1.0,99.0,1.0


In [9]:
display(train_dataset.head())
display(test_dataset.head())

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


### EDA 
##### Highly imbalanced dataset

In [10]:
print("% of target variable")
train_dataset['is_promoted'].value_counts()/train_dataset.shape[0]

% of target variable


0   0.9148
1   0.0852
Name: is_promoted, dtype: float64

### Imputing missing values

In [11]:
display(get_nullcounts(train_dataset))
display(get_nullcounts(test_dataset))

Unnamed: 0,Feature,Missing_Values,Missing_Values%
0,education,2409,4.3953
1,previous_year_rating,4124,7.5244


Unnamed: 0,Feature,Missing_Values,Missing_Values%
0,education,1034,4.4019
1,previous_year_rating,1812,7.7139


In [12]:
train_rating_by_region_department =train_dataset .groupby(['region','department']) ['previous_year_rating'].median()

In [13]:
train_rating_by_region_department.head()

region    department
region_1  Analytics    4.0000
          Finance      3.0000
          HR           4.5000
          Legal        3.0000
          Operations   4.0000
Name: previous_year_rating, dtype: float64

In [14]:
test_rating_by_region_department = test_dataset.groupby(['region','department'])['previous_year_rating'].median()

In [15]:
def impute_rating(row):
    return train_rating_by_region_department[row['region'],row['department']]

In [16]:
train_dataset['previous_year_rating'].fillna(train_dataset[train_dataset['previous_year_rating'].isnull()].apply(impute_rating, axis=1), inplace=True)

In [17]:
test_dataset['previous_year_rating'].fillna(test_dataset[test_dataset['previous_year_rating'].isnull()].apply(impute_rating, axis=1), inplace=True)

In [18]:
train_education_mode = train_dataset.groupby(['department']).agg({'education':lambda x: x.value_counts(dropna=False).index[0]}).reset_index()

In [19]:
train_education_mode

Unnamed: 0,department,education
0,Analytics,Bachelor's
1,Finance,Bachelor's
2,HR,Bachelor's
3,Legal,Bachelor's
4,Operations,Bachelor's
5,Procurement,Bachelor's
6,R&D,Bachelor's
7,Sales & Marketing,Bachelor's
8,Technology,Bachelor's


In [20]:
train_dataset['education'].fillna("Bachelor's",inplace=True)
test_dataset['education'].fillna("Bachelor's",inplace=True)

In [21]:
display(get_nullcounts(train_dataset))
display(get_nullcounts(test_dataset))

Unnamed: 0,Feature,Missing_Values,Missing_Values%


Unnamed: 0,Feature,Missing_Values,Missing_Values%


In [22]:
full_dataset = pd.concat(objs=[train_dataset,test_dataset],axis=0,sort =True).reset_index(drop=True) 

In [23]:
full_dataset['department'].value_counts() 
full_dataset['education'].value_counts() 

Bachelor's          55690
Master's & above    21429
Below Secondary      1179
Name: education, dtype: int64

In [24]:
department_encoding = {'R&D':1,'Legal':1,'HR':2,'Finance':2,
                       'Analytics':3,'Technology':4,'Procurement':4,
                       'Operations':5,'Sales & Marketing':5}

education_encoding = {"Bachelor's":3,"Master's & above":2,'Below Secondary':1}

gender_encoding = {'f':0,'m':1}
recruitment_channel_encoding = {'other':3,'sourcing':2, 'referred':1}


In [25]:
full_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78298 entries, 0 to 78297
Data columns (total 14 columns):
KPIs_met >80%           78298 non-null int64
age                     78298 non-null int64
avg_training_score      78298 non-null int64
awards_won?             78298 non-null int64
department              78298 non-null object
education               78298 non-null object
employee_id             78298 non-null int64
gender                  78298 non-null object
is_promoted             54808 non-null float64
length_of_service       78298 non-null int64
no_of_trainings         78298 non-null int64
previous_year_rating    78298 non-null float64
recruitment_channel     78298 non-null object
region                  78298 non-null object
dtypes: float64(2), int64(7), object(5)
memory usage: 8.4+ MB


In [26]:
#full_dataset['department'] =full_dataset['department'].map(department_encoding)
#full_dataset['education'] =full_dataset['education'].map(education_encoding)
full_dataset['gender'] =full_dataset['gender'].map(gender_encoding)
full_dataset['recruitment_channel'] =full_dataset['recruitment_channel'].map(recruitment_channel_encoding)
full_dataset['previous_year_rating'] =full_dataset['previous_year_rating'].astype(int)

In [27]:
full_dataset = pd.get_dummies(full_dataset, columns = ["region","department","education"],prefix="d_",drop_first = True)
train_dataset['previous_year_rating'] =train_dataset['previous_year_rating'].astype(int)
full_dataset.rename(columns={'KPIs_met >80%': 'KPIs_met_GT_80', 'awards_won?': 'awards_won'}, inplace=True)

In [28]:
full_dataset.head()

Unnamed: 0,KPIs_met_GT_80,age,avg_training_score,awards_won,employee_id,gender,is_promoted,length_of_service,no_of_trainings,previous_year_rating,recruitment_channel,d__region_10,d__region_11,d__region_12,d__region_13,d__region_14,d__region_15,d__region_16,d__region_17,d__region_18,d__region_19,d__region_2,d__region_20,d__region_21,d__region_22,d__region_23,d__region_24,d__region_25,d__region_26,d__region_27,d__region_28,d__region_29,d__region_3,d__region_30,d__region_31,d__region_32,d__region_33,d__region_34,d__region_4,d__region_5,d__region_6,d__region_7,d__region_8,d__region_9,d__Finance,d__HR,d__Legal,d__Operations,d__Procurement,d__R&D,d__Sales & Marketing,d__Technology,d__Below Secondary,d__Master's & above
0,1,35,49,0,65438,0,0.0,8,1,5,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1
1,0,30,60,0,65141,1,0.0,4,1,5,3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,34,50,0,7513,1,0.0,7,1,3,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0,39,50,0,2542,1,0.0,10,2,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,45,73,0,48945,1,0.0,2,1,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [29]:
train_dataset =full_dataset[full_dataset['is_promoted'].notnull()]
test_dataset  = full_dataset[full_dataset['is_promoted'].isnull()]
test_dataset_emp =test_dataset['employee_id']
train_y=full_dataset[full_dataset['is_promoted'].notnull()]['is_promoted']

full_dataset.drop(['is_promoted','employee_id','recruitment_channel'],axis =1,inplace=True)
train_dataset.drop(['is_promoted','employee_id','recruitment_channel'],axis =1,inplace=True)
test_dataset.drop(['is_promoted','employee_id','recruitment_channel'],axis =1,inplace=True)


In [30]:
kfold = StratifiedKFold(n_splits=10)
from sklearn.model_selection  import  train_test_split
from sklearn.linear_model  import LogisticRegression
from sklearn.preprocessing import StandardScaler
test_size = 0.25
seed = 2

X_trainmodel, X_val, y_trainmodel, y_val = train_test_split(train_dataset, train_y, test_size=test_size,random_state=seed)

In [31]:
X_trainmodel.head()


Unnamed: 0,KPIs_met_GT_80,age,avg_training_score,awards_won,gender,length_of_service,no_of_trainings,previous_year_rating,d__region_10,d__region_11,d__region_12,d__region_13,d__region_14,d__region_15,d__region_16,d__region_17,d__region_18,d__region_19,d__region_2,d__region_20,d__region_21,d__region_22,d__region_23,d__region_24,d__region_25,d__region_26,d__region_27,d__region_28,d__region_29,d__region_3,d__region_30,d__region_31,d__region_32,d__region_33,d__region_34,d__region_4,d__region_5,d__region_6,d__region_7,d__region_8,d__region_9,d__Finance,d__HR,d__Legal,d__Operations,d__Procurement,d__R&D,d__Sales & Marketing,d__Technology,d__Below Secondary,d__Master's & above
26984,1,43,68,0,1,5,1,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
668,0,39,68,0,0,5,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1
44692,1,39,85,0,1,7,1,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5678,1,31,62,0,1,4,1,4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
43038,0,44,61,0,0,17,1,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [48]:
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.model_selection import validation_curve

def measure_performance(X, y, clf, show_accuracy=True,show_classification_report=True,
                        show_confusion_matrix=True, show_r2_score=False):
    y_pred = clf.predict(X) 
    if show_accuracy:
        print ("Accuracy:{0:.3f}".format( metrics.accuracy_score(y, y_pred)) )
    if show_classification_report:
        print ("Classification report")
        print (metrics.classification_report(y, y_pred))
    if show_confusion_matrix:
        print("Confusion matrix") 
        print(metrics.confusion_matrix(y, y_pred),)
    if show_r2_score:
        print ("Coefficient of determination:{0:.3f}"
               .format( metrics.r2_score(y, y_pred)))
    return y_pred


def plot_learning_curve(train_sizes, train_scores, validation_scores):
    plt.figure()
    plt.title("Learning Curve")
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    validation_scores_mean = np.mean(validation_scores, axis=1)
    validation_scores_std = np.std(validation_scores, axis=1)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, validation_scores_mean - validation_scores_std,
                     validation_scores_mean + validation_scores_std, alpha=0.1, color="g")
    
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",label="Training score")
    plt.plot(train_sizes, validation_scores_mean, 'o-', color="g" ,label="Cross-validation score")
    plt.ylim(max(-3, validation_scores_mean.min() - .1), train_scores_mean.max() + .1)
    plt.legend(loc="best")
    plt.show()


def plot_validation_curve(parameter_values, train_scores, validation_scores):
    plt.figure()
    plt.title("Validation Curve")
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    validation_scores_mean = np.mean(validation_scores, axis=1)
    validation_scores_std = np.std(validation_scores, axis=1)

    plt.fill_between(parameter_values, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(parameter_values, validation_scores_mean - validation_scores_std,
                     validation_scores_mean + validation_scores_std, alpha=0.1, color="g")
    plt.plot(parameter_values, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(parameter_values, validation_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.ylim(validation_scores_mean.min() - .1, train_scores_mean.max() + .1)
    plt.legend(loc="best")
    plt.show()

def plot_roc(X_val,y_val,model):
    fpr, tpr, thresholds = roc_curve(y_val, model.predict_proba(X_val)[:, 1])
    plt.figure()
    plt.plot(fpr, tpr, label="ROC Curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR (recall)")
    plt.title("roc_curve");
    # find threshold closest to zero:
    close_zero = np.argmin(np.abs(thresholds))
    plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
         label="threshold zero", fillstyle="none", c='k', mew=2)
    plt.legend(loc=4)
    plt.show()

def test_classifier(model,X,y,cv):
    clf = model.fit(X,y.values.ravel())
    result = clf.score(X,y.values)
    print ("Accuracy: {0:.3f}".format(result*100.0))

    train_sizes, train_scores, validation_scores = learning_curve(clf, X,y.values.ravel(),cv=cv)
    plot_learning_curve(train_sizes, train_scores, validation_scores)
    return clf,result

In [33]:
Checking

NameError: name 'Checking' is not defined

In [None]:
#test_y  =gbm.predict(xgb.DMatrix(test_dataset))
#X_trainmodel, X_val, y_trainmodel, y_val

In [34]:
from catboost import CatBoostClassifier, Pool, cv

In [35]:
catboost_model = CatBoostClassifier(iterations=400,
                                    verbose=True,eval_metric="F1",
                                    learning_rate=0.2,
                                    class_weights=[1,3],
                                    depth=3,l2_leaf_reg=100,bagging_temperature=0.5)

In [37]:
catboost_model.fit(X_trainmodel, y_trainmodel,eval_set=(X_val, y_val),use_best_model=False)

0:	learn: 0.3394313	test: 0.3514862	best: 0.3514862 (0)	total: 111ms	remaining: 44.3s
1:	learn: 0.2421881	test: 0.2428256	best: 0.3514862 (0)	total: 149ms	remaining: 29.6s
2:	learn: 0.3153305	test: 0.3090951	best: 0.3514862 (0)	total: 184ms	remaining: 24.4s
3:	learn: 0.3173077	test: 0.3232699	best: 0.3514862 (0)	total: 221ms	remaining: 21.9s
4:	learn: 0.3067230	test: 0.3081272	best: 0.3514862 (0)	total: 261ms	remaining: 20.6s
5:	learn: 0.2872724	test: 0.2982332	best: 0.3514862 (0)	total: 308ms	remaining: 20.2s
6:	learn: 0.3197594	test: 0.3235093	best: 0.3514862 (0)	total: 363ms	remaining: 20.4s
7:	learn: 0.3363891	test: 0.3417807	best: 0.3514862 (0)	total: 410ms	remaining: 20.1s
8:	learn: 0.3257940	test: 0.3433243	best: 0.3514862 (0)	total: 457ms	remaining: 19.9s
9:	learn: 0.3251415	test: 0.3476686	best: 0.3514862 (0)	total: 496ms	remaining: 19.3s
10:	learn: 0.3579156	test: 0.3700559	best: 0.3700559 (10)	total: 548ms	remaining: 19.4s
11:	learn: 0.3556099	test: 0.3660013	best: 0.3700559

96:	learn: 0.5574199	test: 0.5470383	best: 0.5470383 (96)	total: 4.35s	remaining: 13.6s
97:	learn: 0.5591131	test: 0.5479876	best: 0.5479876 (97)	total: 4.38s	remaining: 13.5s
98:	learn: 0.5581577	test: 0.5478816	best: 0.5479876 (97)	total: 4.42s	remaining: 13.4s
99:	learn: 0.5588945	test: 0.5518704	best: 0.5518704 (99)	total: 4.46s	remaining: 13.4s
100:	learn: 0.5590305	test: 0.5517640	best: 0.5518704 (99)	total: 4.52s	remaining: 13.4s
101:	learn: 0.5599740	test: 0.5496625	best: 0.5518704 (99)	total: 4.58s	remaining: 13.4s
102:	learn: 0.5588120	test: 0.5505012	best: 0.5518704 (99)	total: 4.64s	remaining: 13.4s
103:	learn: 0.5591664	test: 0.5506073	best: 0.5518704 (99)	total: 4.71s	remaining: 13.4s
104:	learn: 0.5591930	test: 0.5498746	best: 0.5518704 (99)	total: 4.76s	remaining: 13.4s
105:	learn: 0.5600833	test: 0.5506073	best: 0.5518704 (99)	total: 4.8s	remaining: 13.3s
106:	learn: 0.5633290	test: 0.5495652	best: 0.5518704 (99)	total: 4.83s	remaining: 13.2s
107:	learn: 0.5635962	test

190:	learn: 0.5837448	test: 0.5676133	best: 0.5681476 (188)	total: 8.52s	remaining: 9.33s
191:	learn: 0.5832911	test: 0.5683142	best: 0.5683142 (191)	total: 8.57s	remaining: 9.28s
192:	learn: 0.5836812	test: 0.5682074	best: 0.5683142 (191)	total: 8.61s	remaining: 9.24s
193:	learn: 0.5848937	test: 0.5694418	best: 0.5694418 (193)	total: 8.65s	remaining: 9.19s
194:	learn: 0.5854306	test: 0.5695489	best: 0.5695489 (194)	total: 8.69s	remaining: 9.14s
195:	learn: 0.5848567	test: 0.5693348	best: 0.5695489 (194)	total: 8.73s	remaining: 9.09s
196:	learn: 0.5858560	test: 0.5693348	best: 0.5695489 (194)	total: 8.79s	remaining: 9.06s
197:	learn: 0.5858931	test: 0.5700338	best: 0.5700338 (197)	total: 8.84s	remaining: 9.01s
198:	learn: 0.5857450	test: 0.5699268	best: 0.5700338 (197)	total: 8.87s	remaining: 8.96s
199:	learn: 0.5855600	test: 0.5690141	best: 0.5700338 (197)	total: 8.91s	remaining: 8.91s
200:	learn: 0.5860130	test: 0.5697129	best: 0.5700338 (197)	total: 8.95s	remaining: 8.87s
201:	learn

285:	learn: 0.6026686	test: 0.5748569	best: 0.5805022 (265)	total: 12.8s	remaining: 5.11s
286:	learn: 0.6021133	test: 0.5779006	best: 0.5805022 (265)	total: 12.9s	remaining: 5.06s
287:	learn: 0.6031472	test: 0.5793636	best: 0.5805022 (265)	total: 12.9s	remaining: 5.02s
288:	learn: 0.6026282	test: 0.5784729	best: 0.5805022 (265)	total: 13s	remaining: 4.97s
289:	learn: 0.6021479	test: 0.5784729	best: 0.5805022 (265)	total: 13s	remaining: 4.93s
290:	learn: 0.6025182	test: 0.5780070	best: 0.5805022 (265)	total: 13.1s	remaining: 4.89s
291:	learn: 0.6029983	test: 0.5781135	best: 0.5805022 (265)	total: 13.1s	remaining: 4.84s
292:	learn: 0.6034780	test: 0.5788989	best: 0.5805022 (265)	total: 13.1s	remaining: 4.8s
293:	learn: 0.6039201	test: 0.5788989	best: 0.5805022 (265)	total: 13.2s	remaining: 4.75s
294:	learn: 0.6041038	test: 0.5787923	best: 0.5805022 (265)	total: 13.2s	remaining: 4.71s
295:	learn: 0.6041756	test: 0.5793636	best: 0.5805022 (265)	total: 13.3s	remaining: 4.66s
296:	learn: 0.6

377:	learn: 0.6181023	test: 0.5841962	best: 0.5841962 (373)	total: 17.1s	remaining: 995ms
378:	learn: 0.6187818	test: 0.5835303	best: 0.5841962 (373)	total: 17.1s	remaining: 950ms
379:	learn: 0.6187443	test: 0.5835303	best: 0.5841962 (373)	total: 17.2s	remaining: 904ms
380:	learn: 0.6186317	test: 0.5834242	best: 0.5841962 (373)	total: 17.2s	remaining: 859ms
381:	learn: 0.6190707	test: 0.5844085	best: 0.5844085 (381)	total: 17.3s	remaining: 813ms
382:	learn: 0.6186055	test: 0.5841962	best: 0.5844085 (381)	total: 17.3s	remaining: 768ms
383:	learn: 0.6191083	test: 0.5849673	best: 0.5849673 (383)	total: 17.4s	remaining: 723ms
384:	learn: 0.6187443	test: 0.5854189	best: 0.5854189 (384)	total: 17.4s	remaining: 677ms
385:	learn: 0.6185680	test: 0.5854189	best: 0.5854189 (384)	total: 17.4s	remaining: 632ms
386:	learn: 0.6190332	test: 0.5860819	best: 0.5860819 (386)	total: 17.5s	remaining: 587ms
387:	learn: 0.6196107	test: 0.5867439	best: 0.5867439 (387)	total: 17.5s	remaining: 542ms
388:	learn

<catboost.core.CatBoostClassifier at 0x219f1e55048>

In [41]:
from sklearn.metrics.classification import accuracy_score, log_loss
from sklearn.metrics import confusion_matrix,roc_auc_score,mean_squared_error,f1_score,recall_score,precision_score

In [42]:
def model_evalution(model,x_train,y_train,x_test,y_test):
    print("####################### model Evalution started #######################")
    train_pre = model.predict(x_train)
    test_pre = model.predict(x_test)
    train_pro = model.predict_proba(x_train)
    test_pro = model.predict_proba(x_test)

    print("Train Accuracy: {0} \t Test Accuracy: {1}".format(accuracy_score(y_train, train_pre),accuracy_score(y_test,test_pre)))
    print("Train Loss: {0} \t Test Loss: {1}".format(mean_squared_error(y_train, train_pre),mean_squared_error(y_test,test_pre)))
    print("Train AUC: {0} \t Test AUC: {1}".format(roc_auc_score(y_train, train_pro[:,1]),roc_auc_score(y_test,test_pro[:,1])))
    print("Train F1: {0} \t Test F1: {1}".format(f1_score(y_train, train_pre),f1_score(y_test,test_pre)))
    print("Train recall: {0} \t Test recall: {1}".format(recall_score(y_train, train_pre),recall_score(y_test,test_pre)))
    print("Train precision: {0} \t Test Precision: {1}".format(precision_score(y_train, train_pre),precision_score(y_test,test_pre)))
    print("Train Confusion Matrix: \n{0} \n Test Confusion Matrix: \n{1}".format(confusion_matrix(y_train, train_pre),confusion_matrix(y_test,test_pre)))
  #f1_score

In [43]:
model_evalution(catboost_model,X_trainmodel, y_trainmodel,X_val,y_val)

####################### model Evalution started #######################
Train Accuracy: 0.934194521481049 	 Test Accuracy: 0.9267260253977522
Train Loss: 0.06580547851895101 	 Test Loss: 0.07327397460224784
Train AUC: 0.9285960148152773 	 Test AUC: 0.9085243869095475
Train F1: 0.5585115064468745 	 Test F1: 0.5191570881226053
Train recall: 0.4905389908256881 	 Test recall: 0.45932203389830506
Train precision: 0.6483516483516484 	 Test Precision: 0.5969162995594713
Train Confusion Matrix: 
[[36690   928]
 [ 1777  1711]] 
 Test Confusion Matrix: 
[[12156   366]
 [  638   542]]


In [44]:
test_y =catboost_model.predict(test_dataset)

In [46]:
submission =pd.DataFrame({'employee_id':test_dataset_emp.values ,'is_promoted':test_y}) 
#submission['Loan_Status'].replace(0, 'N',inplace=True)
#submission['Loan_Status'].replace(1, 'Y',inplace=True)
import datetime
FORMAT = '%Y%m%d%H%M%S'
timestamp=datetime.datetime.now().strftime(FORMAT)
filename ="HR_Analytics_CatBoost"+timestamp+"_out.csv"
#submission.to_csv(filename,index=False)

In [47]:
submission.to_csv(filename,index=False)

In [None]:
# from yellowbrick.features import RFECV
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import StratifiedKFold
# cv = StratifiedKFold(3)
# oz = RFECV(RandomForestClassifier(), cv=cv, scoring='f1_weighted')

# oz.fit(X_trainmodel, y_trainmodel)
# oz.poof()

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 500, num = 3)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 20, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
learning_rate =[0.005,0.05,0.08,0.01]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)