In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.formula.api as smf
import scipy.stats as stats
import pandas_profiling   #need to install using anaconda prompt (pip install pandas_profiling)

%matplotlib inline
plt.rcParams['figure.figsize'] = 10, 7.5
plt.rcParams['axes.grid'] = True
plt.gray()

from matplotlib.backends.backend_pdf import PdfPages

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor

<Figure size 720x540 with 0 Axes>

In [29]:
hr=pd.read_csv('HR_comma_sep.csv')

In [30]:
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [5]:
profile_report = pandas_profiling.ProfileReport(hr)
profile_report.to_file('profileReport.html')

In [31]:
hr = hr.drop_duplicates(keep=False)

In [4]:
hr.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'department', 'salary'],
      dtype='object')

In [32]:
hr.shape

(9653, 10)

In [33]:
hr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9653 entries, 783 to 10571
Data columns (total 10 columns):
satisfaction_level       9653 non-null float64
last_evaluation          9653 non-null float64
number_project           9653 non-null int64
average_montly_hours     9653 non-null int64
time_spend_company       9653 non-null int64
Work_accident            9653 non-null int64
left                     9653 non-null int64
promotion_last_5years    9653 non-null int64
department               9653 non-null object
salary                   9653 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 829.6+ KB


In [34]:
def sal_class(x):
    if x == "low":
        return 1
    elif x == "medium":
        return 2
    elif x == "high":
        return 3

In [35]:
hr['sal_class'] = hr['salary'].apply(sal_class)

In [36]:
hr=hr.drop(['salary'],axis=1)

In [37]:
num_vars=['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years','sal_class']
cat_vars=['department']

In [38]:
data_num=hr[num_vars]
data_cat=hr[cat_vars]

In [39]:
#Missing vlaue treatment
def missings_treat(x):
    x = x.fillna(x.median())
    return x

#Handling Outliers - Method2
def outlier_capping(x):
    x = x.clip_upper(x.quantile(0.99))
    x = x.clip_lower(x.quantile(0.01))
    return x

data_num=data_num.apply(lambda x: missings_treat(x))
data_num=data_num.apply(lambda x: outlier_capping(x))

  
  if __name__ == '__main__':


In [40]:
hr = pd.concat([data_num, data_cat], axis=1)

In [41]:
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sal_class,department
783,0.36,0.46,2,132,3,0,1,0,1,sales
784,0.44,0.57,2,131,3,0,1,0,1,sales
785,0.85,0.99,5,248,5,0,1,0,1,sales
786,0.78,0.93,5,225,5,0,1,0,1,sales
787,0.39,0.46,2,156,3,0,1,0,1,sales


In [42]:
hr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9653 entries, 783 to 10571
Data columns (total 10 columns):
satisfaction_level       9653 non-null float64
last_evaluation          9653 non-null float64
number_project           9653 non-null int64
average_montly_hours     9653 non-null int64
time_spend_company       9653 non-null int64
Work_accident            9653 non-null int64
left                     9653 non-null int64
promotion_last_5years    9653 non-null int64
sal_class                9653 non-null int64
department               9653 non-null object
dtypes: float64(2), int64(7), object(1)
memory usage: 829.6+ KB


In [43]:
hr = pd.get_dummies(hr, ['department'], drop_first=True)

In [44]:
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sal_class,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical
783,0.36,0.46,2,132,3,0,1,0,1,0,0,0,0,0,0,1,0,0
784,0.44,0.57,2,131,3,0,1,0,1,0,0,0,0,0,0,1,0,0
785,0.85,0.99,5,248,5,0,1,0,1,0,0,0,0,0,0,1,0,0
786,0.78,0.93,5,225,5,0,1,0,1,0,0,0,0,0,0,1,0,0
787,0.39,0.46,2,156,3,0,1,0,1,0,0,0,0,0,0,1,0,0


In [45]:
#Information value calculation
def calculate_woe_iv(dataset, feature, target):
    lst = []
    for i in range(dataset[feature].nunique()):
        val = list(dataset[feature].unique())[i]
        lst.append({
            'Value': val,
            'All': dataset[dataset[feature] == val].count()[feature],
            'Good': dataset[(dataset[feature] == val) & (dataset[target] == 0)].count()[feature],
            'Bad': dataset[(dataset[feature] == val) & (dataset[target] == 1)].count()[feature]
        })
        
    dset = pd.DataFrame(lst)
    dset['Distr_Good'] = dset['Good'] / dset['Good'].sum()
    dset['Distr_Bad'] = dset['Bad'] / dset['Bad'].sum()
    dset['WoE'] = np.log(dset['Distr_Good'] / dset['Distr_Bad'])
    dset = dset.replace({'WoE': {np.inf: 0, -np.inf: 0}})
    dset['IV'] = (dset['Distr_Good'] - dset['Distr_Bad']) * dset['WoE']
    iv = dset['IV'].sum()
    
    dset = dset.sort_values(by='WoE')
    
    return dset, iv

In [46]:
for col in hr.columns:
    if col == 'left': continue
    else:
        print('WoE and IV for column: {}'.format(col))
        df, iv = calculate_woe_iv(hr, col, 'left')
        #print(df)
        print('IV score: {:.2f}'.format(iv))
        print('\n')

WoE and IV for column: satisfaction_level
IV score: 1.69


WoE and IV for column: last_evaluation
IV score: 0.87


WoE and IV for column: number_project
IV score: 2.25


WoE and IV for column: average_montly_hours
IV score: 0.63


WoE and IV for column: time_spend_company
IV score: 1.14


WoE and IV for column: Work_accident
IV score: 0.12


WoE and IV for column: promotion_last_5years
IV score: 0.00


WoE and IV for column: sal_class
IV score: 0.09


WoE and IV for column: department_RandD
IV score: 0.00


WoE and IV for column: department_accounting
IV score: 0.00


WoE and IV for column: department_hr
IV score: 0.00


WoE and IV for column: department_management
IV score: 0.00


WoE and IV for column: department_marketing
IV score: 0.00


WoE and IV for column: department_product_mng
IV score: 0.00


WoE and IV for column: department_sales
IV score: 0.00


WoE and IV for column: department_support
IV score: 0.00


WoE and IV for column: department_technical
IV score: 0.00




In [None]:
#important variables, satisfaction_level, last_evaluation, number_project, average_montly_hours, time_spend_company

In [None]:
#univariate regression

In [47]:
gini_df = pd.DataFrame()
for col in hr.columns.difference(['left']):
    model = smf.logit('left~'+str(col), data=hr).fit()
    gini = 2*metrics.roc_auc_score(hr.left, model.predict(hr))-1
    temp = pd.DataFrame([col, gini]).T
    gini_df = pd.concat([gini_df, temp], axis=0)

Optimization terminated successfully.
         Current function value: 0.345243
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.348599
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.350624
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.350643
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.350646
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.350647
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.350647
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.350637
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.350644
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.350647
  

LinAlgError: Singular matrix

In [48]:
gini_df

Unnamed: 0,0,1
0,Work_accident,0.109992
0,average_montly_hours,0.0933196
0,department_RandD,0.00519127
0,department_accounting,0.00197586
0,department_hr,0.000925933
0,department_management,0.000112126
0,department_marketing,9.93921e-05
0,department_product_mng,0.00324098
0,department_sales,0.00369639
0,department_support,0.00139732


In [49]:
gini_df.columns= ['Feature', 'SomerceD']
gini_df.sort_values(by='SomerceD', ascending=False, inplace=True)

In [50]:
gini_df

Unnamed: 0,Feature,SomerceD
0,Work_accident,0.109992
0,average_montly_hours,0.0933196
0,last_evaluation,0.0304829
0,number_project,0.00759513
0,department_RandD,0.00519127
0,department_sales,0.00369639
0,department_product_mng,0.00324098
0,department_accounting,0.00197586
0,department_support,0.00139732
0,department_technical,0.00139462


In [51]:
#RFE
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
X = hr[hr.columns.difference(['left'])]
y = hr[['left']]

rfe = RFE(RandomForestClassifier(), 5)
rfe = rfe.fit(X, y)

X.columns[rfe.get_support()]

  y = column_or_1d(y, warn=True)


Index(['average_montly_hours', 'last_evaluation', 'number_project',
       'satisfaction_level', 'time_spend_company'],
      dtype='object')

In [52]:
#SelectKbest
from sklearn.feature_selection import SelectKBest, f_classif
SKB = SelectKBest(f_classif, k=5).fit(X, y )

X.columns[SKB.get_support()]

  y = column_or_1d(y, warn=True)
  f = msb / msw


Index(['Work_accident', 'average_montly_hours', 'sal_class',
       'satisfaction_level', 'time_spend_company'],
      dtype='object')

In [68]:
final_list=[#'average_montly_hours',
            #'last_evaluation',
            'number_project',
            'satisfaction_level',
            #'time_spend_company'
]

In [69]:
X_new = X[final_list]

In [70]:
vif = pd.DataFrame()
vif["VIF_Factor"] = [variance_inflation_factor(X_new.values, i) for i in range(X_new.shape[1])]
vif["features"] = X_new.columns

In [71]:
vif.sort_values(by='VIF_Factor',ascending=False)

Unnamed: 0,VIF_Factor,features
0,4.934503,number_project
1,4.934503,satisfaction_level


In [72]:
data_final = pd.concat([X_new, y], axis=1)

In [73]:
train, test = train_test_split(data_final, test_size=0.3, random_state=123)

In [75]:
model = smf.logit('left~number_project+satisfaction_level', train).fit()

Optimization terminated successfully.
         Current function value: 0.305153
         Iterations 7


In [76]:
print(model.summary())

                           Logit Regression Results                           
Dep. Variable:                   left   No. Observations:                 6757
Model:                          Logit   Df Residuals:                     6754
Method:                           MLE   Df Model:                            2
Date:                Tue, 18 Feb 2020   Pseudo R-squ.:                  0.1323
Time:                        00:43:37   Log-Likelihood:                -2061.9
converged:                       True   LL-Null:                       -2376.2
Covariance Type:            nonrobust   LLR p-value:                3.325e-137
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept              0.7201      0.176      4.084      0.000       0.374       1.066
number_project        -0.1241      0.033     -3.723      0.000      -0.189      -0.059
satisfaction_level  

In [77]:
2*metrics.roc_auc_score(train.left, model.predict())-1

0.4845835198300903

In [78]:
2*metrics.roc_auc_score(test.left, model.predict(test))-1

0.45070562259928004

In [79]:
print(metrics.roc_auc_score(train.left, model.predict()))

#somerceD = 2*AUC-1 - Test
print(metrics.roc_auc_score(test.left, model.predict(test)))

0.7422917599150451
0.72535281129964


In [81]:
np.mean(train.left)

0.11247595086576884

In [82]:
np.linspace(0,1)

array([0.        , 0.02040816, 0.04081633, 0.06122449, 0.08163265,
       0.10204082, 0.12244898, 0.14285714, 0.16326531, 0.18367347,
       0.20408163, 0.2244898 , 0.24489796, 0.26530612, 0.28571429,
       0.30612245, 0.32653061, 0.34693878, 0.36734694, 0.3877551 ,
       0.40816327, 0.42857143, 0.44897959, 0.46938776, 0.48979592,
       0.51020408, 0.53061224, 0.55102041, 0.57142857, 0.59183673,
       0.6122449 , 0.63265306, 0.65306122, 0.67346939, 0.69387755,
       0.71428571, 0.73469388, 0.75510204, 0.7755102 , 0.79591837,
       0.81632653, 0.83673469, 0.85714286, 0.87755102, 0.89795918,
       0.91836735, 0.93877551, 0.95918367, 0.97959184, 1.        ])

In [83]:
train['prob' ]= model.predict(train)
train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,number_project,satisfaction_level,left,prob
2900,5,0.88,0,0.027432
9272,3,0.95,0,0.02629
6525,4,0.75,0,0.052039
6474,4,0.6,0,0.093039
8974,5,0.15,0,0.371583


In [84]:
#Method-2: Find the best cut-off based on highest sensitivity + Specicity
temp = train
roc_df = pd.DataFrame()
for cut_off in np.linspace(0,1):
    temp['y_pred'] = np.where(train.prob>cut_off, 1, 0)
    temp['TP'] = np.where(((train.left ==1) & (train.y_pred==1)), 1,0)
    temp['TN'] = np.where(((train.left ==0) & (train.y_pred==0)), 1,0)
    temp['FP'] = np.where(((train.left ==0) & (train.y_pred==1)), 1,0)
    temp['FN'] = np.where(((train.left ==1) & (train.y_pred==0)), 1,0)
    sensitivity = temp.TP.sum()/(temp.TP.sum()+temp.FN.sum())
    specificity = temp.TN.sum()/(temp.TN.sum()+temp.FP.sum())
    accuracy = (temp.TN.sum()+temp.TP.sum())/(temp.y_pred.count())
    temp_df = pd.DataFrame([cut_off, sensitivity, specificity, accuracy]).T
    temp_df.columns = ['cutoff', 'sensitivity', 'specificity', 'accuracy']
    roc_df = pd.concat([roc_df, temp_df], axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/panda

In [86]:
roc_df['total'] = roc_df.sensitivity + roc_df.specificity

In [87]:
roc_df.sort_values(by = 'total', ascending=False).head(1)

Unnamed: 0,cutoff,sensitivity,specificity,accuracy,total
0,0.183673,0.702632,0.867267,0.848749,1.569899


In [88]:
roc_df[roc_df.total == roc_df.total.max()]

Unnamed: 0,cutoff,sensitivity,specificity,accuracy,total
0,0.183673,0.702632,0.867267,0.848749,1.569899


In [94]:
train['y_pred' ]= np.where(model.predict(train)>0.2,1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [95]:
print(metrics.classification_report(train.left, train.y_pred))

              precision    recall  f1-score   support

           0       0.95      0.88      0.91      5997
           1       0.40      0.64      0.50       760

    accuracy                           0.85      6757
   macro avg       0.68      0.76      0.71      6757
weighted avg       0.89      0.85      0.87      6757



In [98]:
test['y_pred' ]= np.where(model.predict(test)> 0.2,1,0)

test['prob' ]= model.predict(test)

print(metrics.classification_report(test.left, test.y_pred))

              precision    recall  f1-score   support

           0       0.95      0.88      0.91      2575
           1       0.40      0.63      0.49       321

    accuracy                           0.85      2896
   macro avg       0.67      0.75      0.70      2896
weighted avg       0.89      0.85      0.87      2896



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [99]:
train['Deciles']=pd.qcut(train['prob'],10, labels=False)
test['Deciles']=pd.qcut(test['prob'],10, labels=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [100]:

train.head(10)

Unnamed: 0,number_project,satisfaction_level,left,prob,y_pred,TP,TN,FP,FN,Deciles
2900,5,0.88,0,0.027432,0,0,1,0,0,1
9272,3,0.95,0,0.02629,0,0,1,0,0,1
6525,4,0.75,0,0.052039,0,0,1,0,0,3
6474,4,0.6,0,0.093039,0,0,1,0,0,5
8974,5,0.15,0,0.371583,1,0,1,0,0,9
7369,4,0.22,0,0.333331,1,0,1,0,0,9
3450,4,0.89,0,0.029717,0,0,1,0,0,1
5231,4,0.6,0,0.093039,0,0,1,0,0,5
2508,5,0.83,0,0.033575,0,0,1,0,0,2
6964,3,0.89,0,0.03351,0,0,1,0,0,1


In [101]:
train['goods'] = 1-train.left
test['goods'] = 1-test.left

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [102]:

decile_results_train = train.groupby(['Deciles']).agg(min_prob = ('prob', 'min'),
                              max_prob = ('prob', 'max'),
                              No_bads = ('left', 'sum'),
                              No_goods = ('goods', 'sum'), 
                              total = ('left', 'count'))

decile_results_test = test.groupby(['Deciles']).agg(min_prob = ('prob', 'min'),
                              max_prob = ('prob', 'max'),
                              No_bads = ('left', 'sum'),
                              No_goods = ('goods', 'sum'), 
                              total = ('left', 'count'),)

In [103]:
decile_results_train

Unnamed: 0_level_0,min_prob,max_prob,No_bads,No_goods,total
Deciles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.015508,0.026265,12,667,679
1,0.02629,0.033542,69,639,708
2,0.033575,0.042707,55,598,653
3,0.042748,0.056308,59,639,698
4,0.056361,0.071235,8,635,643
5,0.071301,0.093039,8,675,683
6,0.093123,0.120754,9,669,678
7,0.12086,0.172574,6,658,664
8,0.172717,0.288524,321,363,684
9,0.288728,0.503363,213,454,667


In [104]:
decile_results_train.to_csv('decile_results_train.csv')
decile_results_test.to_csv('decile_results_test.csv')