In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

## Data Loading

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   employee_id           54808 non-null  int64  
 1   department            54808 non-null  object 
 2   region                54808 non-null  object 
 3   education             52399 non-null  object 
 4   gender                54808 non-null  object 
 5   recruitment_channel   54808 non-null  object 
 6   no_of_trainings       54808 non-null  int64  
 7   age                   54808 non-null  int64  
 8   previous_year_rating  50684 non-null  float64
 9   length_of_service     54808 non-null  int64  
 10  KPIs_met >80%         54808 non-null  int64  
 11  awards_won?           54808 non-null  int64  
 12  avg_training_score    54808 non-null  int64  
 13  is_promoted           54808 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 5.9+ MB


## Preprocessing

In [4]:
import seaborn as sns
#sns.pairplot(data)

In [5]:
# plt.figure(figsize=(12,7))
# for i, col in enumerate(data.select_dtypes(exclude='object')):
#     plt.subplot(2,5, i+1)
#     data[col].plot(kind='box')
# plt.tight_layout()

In [6]:
def preprocessing_pipeline(data, train=True):
    if train:
        data = data.dropna()
        y = data[['is_promoted']].copy()
        data = data.drop(columns=['is_promoted'])
        
    X = data.copy()
    
    # If null values are present, it is considered as 0. i.e value in new-col1 and new-col2 will be 0 
    column_for_dummies = ['education', 'gender', 'recruitment_channel']
    for i in column_for_dummies:
        new_col = pd.get_dummies(data[i], drop_first=True)
        X = X.join(new_col)
        #print(X.columns)
        
    #print(X.columns)
    column_for_label_encode = ['department', 'region']
    for i in column_for_label_encode:
        new_col = data[i].astype('category').cat.codes
        X[i + '_cat'] = new_col
     
    col_to_drop = column_for_dummies + column_for_label_encode + ['employee_id']
    X.drop(columns=col_to_drop, inplace=True)
    #print(X.columns)
    if train:
        return X, y
    else:
        return X
        

In [7]:
X,y=preprocessing_pipeline(data)

In [8]:
X.shape, y.shape

((48660, 14), (48660, 1))

Total distribution of data labels

In [9]:
np.bincount(y.values[:,0])

array([44428,  4232])

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, train_size=0.8, random_state=500)

In [11]:
X_train.shape, X_test.shape

((38928, 14), (9732, 14))

In [12]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)

## Model

In [13]:
reg = LogisticRegression()
model = reg.fit(X_train, y_train.reshape(-1))

## Evaluation with validation data
Metrics to consider:
- Accuracy(when class is balanced)
- Precision
- Recall
- F1 score

- False Positive Rate : $ FPR = \dfrac{FP}{FP + TN}$ 
    - Out of actual negative classes, how much is incorrectly  predicted as positive class
- False Negative Rate : $ FNR = \dfrac{FN}{FN + TP}$
    - Out of actual positive classes, how much is incorrectly predicted as negative class

In [14]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X_test = sc.transform(X_test)
pred = model.predict(X_test)

In [85]:
pred_new = (model.predict_proba(X_test)[:,1] >= 0.0999).astype(bool)
np.bincount(pred_new)

array([7024, 2708])

In [86]:
tn, fp, fn, tp = confusion_matrix(pred_new, y_test).ravel()

In [87]:
## Overall precision
precision = tp/(tp+fp)
recall = tp/(tp+fn)
print(f"Precision: {precision}")
## Overall recall
print(f"Recall: {recall}")
## Overall F1 score
print(f"F1 Score: {(2 * precision * recall)/ (precision + recall) }")

Precision: 0.6759142496847415
Recall: 0.19793205317577547
F1 Score: 0.30619822907740646


In [88]:
accuracy_score(pred_new, y_test)

0.7504110152075627

In [57]:
np.bincount(pred)

array([23271,   219])

#### When threshold=0.5
- Precision is very low for  class 1. i.e Out of total positive prediction(class 1 being positive), only 8 % is actually positive (i.e promoted).
- Recall = 0.64 mean - Out of actual positive class, only 64% is correctly predicted as positive class(i.e promotions)
- Combined F1 score for class 1 is very low (Closer to 1 is better). So threshold has to be changed.

The reason being class imbalance. Class 1 has less groundtruths than class 0

In [17]:
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       1.00      0.92      0.96      9638
           1       0.08      0.64      0.14        94

    accuracy                           0.92      9732
   macro avg       0.54      0.78      0.55      9732
weighted avg       0.99      0.92      0.95      9732



In [18]:
confusion_matrix(pred, y_test)

array([[8905,  733],
       [  34,   60]])

In [19]:
accuracy_score(pred, y_test)

0.9211878339498561

In [20]:
tn, fp, fn, tp = confusion_matrix(pred, y_test).ravel()

In [21]:
tn,fp, fn,tp

(8905, 733, 34, 60)

In [22]:
## Overall precision
precision = tp/(tp+fp)
recall = tp/(tp+fn)
print(f"Precision: {precision}")
## Overall recall
print(f"Recall: {recall}")
## Overall F1 score
print(f"F1 Score: {(2 * precision * recall)/ (precision + recall) }")

Precision: 0.07566204287515763
Recall: 0.6382978723404256
F1 Score: 0.13528748590755355


F1 score is very low. **Very poor model with threshold=0.5**

### Precision-recall & ROC Curve

In [23]:
from sklearn.metrics import roc_curve, precision_recall_curve, f1_score
from plotnine import *
import plotnine

## Test data

In [24]:
data_test = pd.read_csv('test.csv')
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23490 entries, 0 to 23489
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   employee_id           23490 non-null  int64  
 1   department            23490 non-null  object 
 2   region                23490 non-null  object 
 3   education             22456 non-null  object 
 4   gender                23490 non-null  object 
 5   recruitment_channel   23490 non-null  object 
 6   no_of_trainings       23490 non-null  int64  
 7   age                   23490 non-null  int64  
 8   previous_year_rating  21678 non-null  float64
 9   length_of_service     23490 non-null  int64  
 10  KPIs_met >80%         23490 non-null  int64  
 11  awards_won?           23490 non-null  int64  
 12  avg_training_score    23490 non-null  int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 2.3+ MB


In [25]:
data_test = preprocessing_pipeline(data_test, False) ## Null values in education is eliminated because of get dummies

In [26]:
data_test['previous_year_rating'].fillna(data_test['previous_year_rating'].mode()[0], inplace=True)
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23490 entries, 0 to 23489
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   no_of_trainings       23490 non-null  int64  
 1   age                   23490 non-null  int64  
 2   previous_year_rating  23490 non-null  float64
 3   length_of_service     23490 non-null  int64  
 4   KPIs_met >80%         23490 non-null  int64  
 5   awards_won?           23490 non-null  int64  
 6   avg_training_score    23490 non-null  int64  
 7   Below Secondary       23490 non-null  uint8  
 8   Master's & above      23490 non-null  uint8  
 9   m                     23490 non-null  uint8  
 10  referred              23490 non-null  uint8  
 11  sourcing              23490 non-null  uint8  
 12  department_cat        23490 non-null  int8   
 13  region_cat            23490 non-null  int8   
dtypes: float64(1), int64(6), int8(2), uint8(5)
memory usage: 1.4 MB


In [27]:
data_test = sc.transform(data_test)



In [28]:
data_test.shape

(23490, 14)

In [29]:
pred = model.predict(data_test)

In [30]:
np.bincount(pred)

array([23271,   219])