In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
train = pd.read_csv('/content/drive/My Drive/train.csv')
test = pd.read_csv('/content/drive/My Drive/test.csv')

In [3]:
train.drop(['Id', 'EmployeeNumber'], axis=1, inplace=True)
train.drop('Behaviour', axis=1, inplace=True)

In [4]:
test.drop(['Id', 'EmployeeNumber'], axis=1, inplace=True)
test.drop('Behaviour', axis=1, inplace=True)

In [5]:
train.drop_duplicates(inplace=True)
test.drop_duplicates(inplace=True)

In [6]:
(train.shape, test.shape)

((1000, 26), (470, 25))

In [8]:
from sklearn.preprocessing import LabelEncoder
for column in train.columns:
              if train[column].dtype == np.number:
                              continue
              train[column] = LabelEncoder().fit_transform(train[column])
for column in test.columns:
              if test[column].dtype == np.number:
                              continue
              test[column] = LabelEncoder().fit_transform(test[column])

In [9]:
count=0
columns_list=[]
for columns in train.columns:
      cm = train['Attrition'].corr(train[columns])
      if(cm>0.1 or cm<-0.1):
        print("%s --> %.4f" %(columns,cm))
        count+=1
        columns_list.append(columns)
print("Total no. of correlated columns with Attrition columns are : %d" %count)
print(columns_list)

Age --> -0.1518
Attrition --> 1.0000
JobInvolvement --> -0.1217
JobSatisfaction --> -0.1088
MaritalStatus --> 0.1476
MonthlyIncome --> -0.1972
OverTime --> 0.2524
StockOptionLevel --> -0.1299
TotalWorkingYears --> -0.1845
YearsAtCompany --> -0.1603
YearsInCurrentRole --> -0.1847
YearsWithCurrManager --> -0.1575
Total no. of correlated columns with Attrition columns are : 12
['Age', 'Attrition', 'JobInvolvement', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'OverTime', 'StockOptionLevel', 'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsWithCurrManager']


In [10]:
for columns in train.columns:
  if columns not in columns_list:
    train.drop(columns, axis=1, inplace=True)

In [11]:
train.shape

(1000, 12)

In [12]:
for columns in test.columns:
  if columns not in columns_list:
    test.drop(columns, axis=1, inplace=True)

In [None]:
test.shape

(470, 11)

In [13]:
X = train.drop('Attrition', axis=1)
y = train.Attrition

In [14]:
xg = xgb.XGBClassifier(subsample=0.6,colsample_bytree=1.0,min_child_weight=7,max_depth=5,
                       learning_rate=0.03,gamma=0.07,
                       scale_pos_weight=5,random_state=3)

rnd = RandomForestClassifier(n_estimators=500,min_samples_split=80,min_samples_leaf=2,
                             max_features='log2',max_depth=8,random_state=3)

lgbm = LGBMClassifier(random_state=42)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33)

In [16]:
xg_model = xg.fit(X, y)

In [17]:
xg_predict_test = xg_model.predict(X_test)
xg_predict_train = xg_model.predict(X_train)

In [18]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [19]:
print("XG Model : ")
print(accuracy_score(y_test, xg_predict_test))
print(roc_auc_score(y_test, xg_predict_test))
print(accuracy_score(y_train, xg_predict_train))
print(roc_auc_score(y_train, xg_predict_train))

XG Model : 
0.8818181818181818
0.8740491886409736
0.8895522388059701
0.8892780696633585


In [20]:
rnd_model = rnd.fit(X, y)

rnd_predict_test = rnd_model.predict(X_test)
rnd_predict_train = rnd_model.predict(X_train)

print("Random Forest model : ")
print(accuracy_score(y_test, rnd_predict_test))
print(roc_auc_score(y_test, rnd_predict_test))
print(accuracy_score(y_train, rnd_predict_train))
print(roc_auc_score(y_train, rnd_predict_train))

Random Forest model : 
0.8272727272727273
0.5086206896551724
0.8626865671641791
0.5353535353535354


In [21]:
lgbm = LGBMClassifier(random_state=42)

lgbm_model = lgbm.fit(X, y)

lgbm_predict_test = lgbm_model.predict(X_test)
lgbm_predict_train = lgbm_model.predict(X_train)

print("LGBM : ")
print(accuracy_score(y_test, lgbm_predict_test))
print(roc_auc_score(y_test, lgbm_predict_test))
print(accuracy_score(y_train, lgbm_predict_train))
print(roc_auc_score(y_train, lgbm_predict_train))

LGBM : 
1.0
1.0
0.9955223880597015
0.9890233331564329


In [30]:
import joblib

filename = 'model.pkl'
joblib.dump(lgbm_model, filename)

['model.pkl']

In [31]:
filename = 'model.pkl'
pipe = joblib.load(filename)

In [32]:
pipe.predict(test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,