# Employ Churn Prediction Model

In [None]:
!pip install imbalanced-learn
!pip install xgboost

In [18]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

## Data

In [19]:
df = pd.read_csv('emp_data.csv')

In [20]:
df.columns  = df.columns.str.lower()
df = df[['number_project', 'work_accident', 'time_spend_company', 'department', 'last_evaluation', 'average_montly_hours',
             'salary', 'promotion_last_5years', 'satisfaction_level', 'left']]

In [21]:
df.head()

Unnamed: 0,number_project,work_accident,time_spend_company,department,last_evaluation,average_montly_hours,salary,promotion_last_5years,satisfaction_level,left
0,2,0,3,sales,0.53,157,low,0,0.38,1
1,5,0,6,sales,0.86,262,medium,0,0.8,1
2,7,0,4,sales,0.88,272,medium,0,0.11,1
3,5,0,5,sales,0.87,223,low,0,0.72,1
4,2,0,3,sales,0.52,159,low,0,0.37,1


## Feature Engineering

In [22]:
df.groupby('left').mean()

Unnamed: 0_level_0,number_project,work_accident,time_spend_company,last_evaluation,average_montly_hours,promotion_last_5years,satisfaction_level
left,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,3.786664,0.175009,3.380032,0.715473,199.060203,0.026251,0.66681
1,3.855503,0.047326,3.876505,0.718113,207.41921,0.005321,0.440098


In [23]:
sub_df = df[['satisfaction_level','average_montly_hours','promotion_last_5years','salary','left']]
sub_df.head()

Unnamed: 0,satisfaction_level,average_montly_hours,promotion_last_5years,salary,left
0,0.38,157,0,low,1
1,0.8,262,0,medium,1
2,0.11,272,0,medium,1
3,0.72,223,0,low,1
4,0.37,159,0,low,1


In [24]:
onehot_df = pd.get_dummies(sub_df, prefix="salary")

In [25]:
onehot_df = onehot_df[['satisfaction_level', 'average_montly_hours', 'promotion_last_5years', 
                      'salary_low', 'salary_medium', 'salary_high', 'left']]

In [26]:
X = onehot_df.drop('left',axis='columns')


In [27]:
y = onehot_df[['left']]

### Balancing Dataset with SMOTE

In [28]:
sm = SMOTE(random_state=42)

In [29]:
X,y = sm.fit_resample(X,y)

In [30]:
y.value_counts()

left
0       11428
1       11428
dtype: int64

### Split Dataset

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

## Build Model

In [33]:
model = XGBClassifier(random_state=0)

In [34]:
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [35]:
y_pred = model.predict(X_test)

In [36]:
model.score(X_test,y_test)

0.929706868893102

## Evaluate Model Performance

In [37]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93      3462
           1       0.93      0.93      0.93      3395

    accuracy                           0.93      6857
   macro avg       0.93      0.93      0.93      6857
weighted avg       0.93      0.93      0.93      6857



In [38]:
cm = confusion_matrix(y_test,y_pred)

In [39]:
print(cm)

[[3207  255]
 [ 227 3168]]


In [40]:
### Create a Pickle file using serialization 
import pickle
pickle_path = open("emp-model.pkl","wb")
pickle.dump(model, pickle_path)
pickle_path.close()