In [83]:

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
import scipy
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import expit
sns.set()
# pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Import X and Y variables for Test and Train Datasets

In [84]:
inputs_train = pd.read_csv('inputs_train.csv', index_col=0)
targets_train = pd.read_csv('targets_train.csv',index_col=0)
inputs_test = pd.read_csv('inputs_test.csv', index_col=0)
targets_test = pd.read_csv('targets_test.csv', index_col=0)

In [85]:
targets_test['EmployeeNumber'] = inputs_test['EmployeeNumber']

In [86]:
ref_categories = [ 'AGE 31-60'
                    ,'Freq_Bus_Travel'
                    ,'Commute Distance >=10mi'
                    ,'EnvironmentSatisfaction Survey Response 1'
                    ,'Job Satisfaction Survey Response 1'
                    ,'Single'
                    ,'NumCompaniesWorkedAt >=3'
                    # ,'Years At Company >=9'
                    , 'Total Working Years 8-11'
                    # ,'Years In Current Role >=13'
                    ,'Monthly Income 0-4000']

targets_train2 = targets_train.copy()
targets_train2['EmployeeNumber'] = inputs_train['EmployeeNumber']

# Remove Reference Categories

In [87]:
filtercol = ['AGE 18-30'
            ,'AGE 31-60'
            ,'Total Working Years 0-7'
            ,'Total Working Years 8-11'
            ,'Total Working Years >=12'
            ,'Freq_Bus_Travel'
            ,'Rare_Bus_Travel'
            ,'Non-Travel'
            ,'Commute Distance 0-3mi'
            ,'Commute Distance 3-9mi'
            ,'Commute Distance >=10mi'
            ,'EnvironmentSatisfaction Survey Response 1'
            ,'EnvironmentSatisfaction Survey Response 2'
            ,'EnvironmentSatisfaction Survey Response 3'
            ,'EnvironmentSatisfaction Survey Response 4'
            ,'Job Satisfaction Survey Response 1'
            ,'Job Satisfaction Survey Response 2'
            ,'Job Satisfaction Survey Response 3'
            ,'Job Satisfaction Survey Response 4'
            ,'Single'
            ,'Married'
            ,'Divorced'
            ,'Is Male'
            ,'NumCompaniesWorkedAt 0-2'
            ,'NumCompaniesWorkedAt >=3'
            ,'Overtime'
            # ,'Years At Company 0-4'
            # ,'Years At Company 5-8'
            # ,'Years At Company >=9'
            # ,'Years In Current Role 0-6'
            # ,'Years In Current Role 7-12'
            # ,'Years In Current Role >=13'
            ,'Monthly Income 0-4000'
            ,'Monthly Income 4001-6000'
            ,'Monthly Income 6001-8000'
            ,'Monthly Income >=8001']

inputs_train_w_ref_cat = inputs_train.loc[:, filtercol]

inputs_train = inputs_train_w_ref_cat.drop(ref_categories, axis=1)


# Run Regression

In [88]:
reg = LogisticRegression()
reg.fit(inputs_train, targets_train)
feature_name = inputs_train.columns.values
summary_table = pd.DataFrame(columns=['feature_name'], data=feature_name)
summary_table['Coefficients'] = np.transpose(reg.coef_)
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
print(summary_table)

                                 feature_name  Coefficients
0                                   Intercept     -0.622500
1                                   AGE 18-30     -0.628665
2                     Total Working Years 0-7     -0.422160
3                    Total Working Years >=12      0.369489
4                             Rare_Bus_Travel      0.686157
5                                  Non-Travel      1.200156
6                      Commute Distance 0-3mi      0.481802
7                      Commute Distance 3-9mi      0.384624
8   EnvironmentSatisfaction Survey Response 2      0.584597
9   EnvironmentSatisfaction Survey Response 3      0.792122
10  EnvironmentSatisfaction Survey Response 4      0.869154
11         Job Satisfaction Survey Response 2      0.540854
12         Job Satisfaction Survey Response 3      0.435474
13         Job Satisfaction Survey Response 4      1.019780
14                                    Married      0.982478
15                                   Div

  y = column_or_1d(y, warn=True)


# Build Logistic Regression

In [89]:
class LogisticRegression_with_p_values:
    def __init__(self, *args, **kwargs):
        self.model = linear_model.LogisticRegression(*args, **kwargs, max_iter=100000)

    def fit(self, X, y):
        self.model.fit(X, y)
        denom = (2.0 * (1.0 + np.cosh(self.model.decision_function(X))))
        denom = np.tile(denom, (X.shape[1], 1)).T
        f_ij = np.dot((X / denom).T, X)
        Cramer_Rao = np.linalg.inv(f_ij)
        sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
        z_scores = self.model.coef_[0] / sigma_estimates
        p_values = [scipy.stats.norm.sf(abs(x)) * 2 for x in z_scores]
        self.p_values = p_values


reg2 = LogisticRegression_with_p_values()
reg2.fit(inputs_train, targets_train)
feature_name = inputs_train.columns.values
summary_table = pd.DataFrame(columns=['feature_name'], data=feature_name)
summary_table['Coefficients'] = np.transpose(reg.coef_)
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
p_values = reg2.p_values
# print(p_values)
p_values = np.append(np.nan, np.array(p_values))
summary_table['p_values'] = p_values
print(summary_table)
summary_table.to_excel('ModelSummary.xlsx')

                                 feature_name  Coefficients      p_values
0                                   Intercept     -0.622500           NaN
1                                   AGE 18-30     -0.628665  3.932559e-03
2                     Total Working Years 0-7     -0.422160  7.994376e-02
3                    Total Working Years >=12      0.369489  1.939061e-01
4                             Rare_Bus_Travel      0.686157  4.602279e-04
5                                  Non-Travel      1.200156  2.350791e-03
6                      Commute Distance 0-3mi      0.481802  2.013668e-02
7                      Commute Distance 3-9mi      0.384624  7.317656e-02
8   EnvironmentSatisfaction Survey Response 2      0.584597  2.377254e-02
9   EnvironmentSatisfaction Survey Response 3      0.792122  7.590805e-04
10  EnvironmentSatisfaction Survey Response 4      0.869154  2.871410e-04
11         Job Satisfaction Survey Response 2      0.540854  4.193058e-02
12         Job Satisfaction Survey Res

  y = column_or_1d(y, warn=True)


# Validate the Model

In [90]:
inputs_test_w_ref_cat = inputs_test.loc[:, filtercol]
inputs_test = inputs_test_w_ref_cat.drop(ref_categories, axis=1)
inputs_train = inputs_test_w_ref_cat.drop(ref_categories, axis=1)

'''TEST DATASET PROBABILITY OF Attrition'''
y_hat_test = reg2.model.predict(inputs_test)
# print(y_hat_test)
y_hat_test_proba = reg2.model.predict_proba(inputs_test)
# print(y_hat_test_proba)
y_hat_test_proba = y_hat_test_proba[:][:, 1]  # Change 0 back to a 1 to get the probability of No Attrition
targets_test_temp = targets_test
targets_test_temp.reset_index(drop=True, inplace=True)
actual_predicted_probs = pd.concat([targets_test_temp, pd.DataFrame(y_hat_test_proba)], axis=1)
actual_predicted_probs.columns = ['Attrition', 'EmployeeNumber', 'y_hat_test_proba']
# print(actual_predicted_probs)

'''ACCURACY OF TEST PROBABILITY'''
tr = 0.80
actual_predicted_probs['y_hat_test'] = np.where(actual_predicted_probs['y_hat_test_proba'] > tr, 1, 0)
print(pd.crosstab(actual_predicted_probs['Attrition'], actual_predicted_probs['y_hat_test'], rownames=['Actual'],
                  colnames=['Predicted']))
print((pd.crosstab(actual_predicted_probs['Attrition'], actual_predicted_probs['y_hat_test'], rownames=['Actual'],
                   colnames=['Predicted']) / actual_predicted_probs.shape[0]).iloc[0, 0]
      + (pd.crosstab(actual_predicted_probs['Attrition'], actual_predicted_probs['y_hat_test'], rownames=['Actual'],
                     colnames=['Predicted']) / actual_predicted_probs.shape[0]).iloc[1, 1])

Predicted   0    1
Actual            
0          24   15
1          55  200
0.7619047619047619
