In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mutual_info_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
    
from model_wrapper import NBWrapper

# Chapter 1: Data understanding and processing (EDA)

## 1.1 Get to know the data

In [None]:
# Read the HR dataset from disk, then summarise columns, dtypes and non-null counts.
path = "./HR_comma_sep.csv"
df = pd.read_csv(filepath_or_buffer=path)
df.info()

In [None]:
# Preview the first ten rows of the raw data.
df.head(n=10)

The dataset contains information about employees who worked at a company. This information includes:
- Satisfactory Level
- Number of Project
- Average Monthly Hours
- Time Spend Company
- Promotion Last 5 Years
- Department
- Salary

The information was collected mainly to predict employee retention - whether an employee chooses to stay with or leave the company - as recorded in the "left" column.

In [None]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

## 1.2 Data cleaning and preprocessing
Includes handling of:
- Inconsistent column names
- Missing values
- Duplicate values
- Identify target variable and features
- Features importance and engineering
- Outliers
- Validation framework
- Label encoding with One-hot

### inconsistent column names

In [None]:
# Normalize column names: lower-case every column label.
df.columns = df.columns.str.lower()
print(df.columns.tolist())

### missing values

In [None]:
# Count missing values per column.
df.isna().sum()

$\to$ Luckily, no missing values were found

### duplicate values

In [None]:
# Share of rows that are exact duplicates of an earlier row.
dups = df.duplicated().sum()
print(dups / len(df))

$\to$ 20% of the data were found to be duplicates. In this particular case, we will remove all the duplicates to prevent them from skewing the results, as each instance in the data represents an employee, and each of them should be treated independently for the best prediction result.

In [None]:
# Remove duplicated rows, keeping the first occurrence of each.
print(f"Before: {df.shape}")
is_repeat = df.duplicated(keep='first')
df = df[~is_repeat]
print(f"After: {df.shape}")

### obtain categorical and numerical features

In [None]:
# Categorical features: every object-typed column, plus two flag columns
# that are stored as numbers but are treated as categories here.
categorical = [
    *df.dtypes[df.dtypes == 'object'].index,
    "work_accident",
    "promotion_last_5years",
]

# Numerical features: whatever remains once the categorical columns and the
# target column ("left") are taken out.
numerical = df.drop(columns=categorical).columns.tolist()
numerical.remove("left")

print(categorical, numerical)

- Target is: 'left'
- Categorical features include: 'department', 'salary', 'work_accident', 'promotion_last_5years'
- Numerical features include: 'satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company'

### imbalanced data

In [None]:
# Count employees per class of the target column.
# value_counts() orders by frequency, not by label, so sort by the label
# itself to guarantee each count sits under the matching tick label
# (first bar = smaller label, second bar = larger label).
left = list(df.left.value_counts().sort_index())

plt.bar(["False", "True"], left)
plt.xlabel("left")
plt.ylabel("Number of employees")
plt.title("Class balance of the target variable")
plt.show()

In [None]:
# Fraction of rows in each target class (most frequent class first).
[count / df.shape[0] for count in df.left.value_counts()]

### feature importance and engineering

In [None]:
# For each categorical feature, show its value distribution and its
# mutual information with the target column.
for col in categorical:
    level_counts = df[col].value_counts()
    print(level_counts, "\n")
    print(f"Mutual info between retention and {col}: {mutual_info_score(df.left, df[col])}", "\n")

$\to$ Based on the mutual information scores, salary and work_accident appear to be more promising features for predicting employee retention than the rest. Hence, we will include these in the models.

In [None]:
# Correlation of every numerical feature with the target column.
numeric_frame = df[numerical]
numeric_frame.corrwith(df["left"])

$\to$ The correlations with the target tell us that only features like satisfaction_level, time_spend_company, and average_montly_hours (spelled as in the dataset) are likely to affect employee retention. Hence, we will also include them in the models, and save the others for further investigation.

In [None]:
# Final feature selection used for modelling.
target = "left"
categorical = ["salary", "work_accident"]
numerical = ["satisfaction_level", "time_spend_company", "average_montly_hours"]

# All columns carried into the train/test split: features first, target last.
variables = [*categorical, *numerical, target]

### outliers

In [None]:
# Detect outliers in the numerical features with one boxplot per feature.
# The grid is sized from the number of features instead of a fixed 5x2 layout,
# so no space is wasted and longer feature lists still fit.
n_cols = 2
n_rows = max(1, (len(numerical) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 3 * n_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, numerical):
    # Pass the frame by keyword so the call does not depend on seaborn's
    # positional-argument order.
    sns.boxplot(data=df, x=col, ax=ax)
    ax.set_title(col)

# Hide any grid cells left over when the feature count is odd.
for ax in flat_axes[len(numerical):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

$\to$ From the boxplots, we can observe that the time_spend_company variable contains multiple outliers that need to be addressed. But before we can decide whether or not to remove outliers, it is important to first investigate the reason behind the existence of these values. And since some models are more sensitive to outliers than others, it also depends on the type of models we choose.

### validation framework
Setting up a validation process includes:
- Full train data (80%), used for cross-validation
- Test data (20%)

In [None]:
# Split the selected columns into a full-train part and a 20% hold-out test part.
# The fixed random_state keeps the split reproducible.
df_full_train, df_test = train_test_split(
    df[variables],
    test_size=0.2,
    random_state=1,
)
df_full_train.shape[0], df_test.shape[0]

In [None]:
# Preview the first ten rows of the full-train split.
df_full_train.head(n=10)

# Chapter 2: Model implementation

## 2.1 Obtain features and target values

In [None]:
def getXy(data):
    """Separate a frame into its features and its target values.

    Parameters
    ----------
    data : DataFrame that contains the column named by the notebook-level
        ``target`` variable.

    Returns
    -------
    (X, y) : ``X`` is ``data`` without the target column; ``y`` is the target
        column's underlying values (``.values``).
    """
    features = data.drop(columns=[target])
    labels = data[target].values
    return features, labels


# Feature/target pairs for the full-train and the hold-out test splits.
X_train, y_train = getXy(df_full_train)
X_test, y_test = getXy(df_test)

## 2.2 Train NB variants with cross-validation

In [None]:
# Build one NBWrapper per Naive Bayes variant, each configured with the same
# target and feature lists.
nb_variants = {
    name: NBWrapper(model=estimator, target=target,
                    num=numerical, cat=categorical)
    for name, estimator in (
        ('gaussian', GaussianNB()),
        ('multinomial', MultinomialNB()),
        ('bernoulli', BernoulliNB()),
    )
}

# Cross-validate every wrapped model on the full-train data and report its score.
for name in nb_variants:
    score = nb_variants[name].cross_validate(X_train, y_train)
    print(f"{name.capitalize()}: roc_auc_score = {score}")

# Chapter 3: Fine-tuning models

## 3.1 Tune with predefined hyperparameters

In [None]:
# Hyperparameter grids, keyed by model name. The 'classifier__' prefix on each
# key is passed through unchanged to NBWrapper.grid_search.
params = {
    'gaussian': {
        'classifier__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6],
    },
    'multinomial': {
        'classifier__alpha': [0.1, 0.5, 1.0, 1.5, 2.0],
        'classifier__fit_prior': [True, False],
    },
    'bernoulli': {
        'classifier__alpha': [0.1, 0.5, 1.0, 1.5, 2.0],
        'classifier__binarize': [0.0, 0.5, 1.0],
        'classifier__fit_prior': [True, False],
    },
}

# Run the grid search for each model and report what it returns.
for name, grid in params.items():
    best_params = nb_variants[name].grid_search(X=X_train, y=y_train, param_grid=grid)
    print(f"Best Parameters for {name.capitalize()}: {best_params}")

## 3.2 Retrain with new parameters

In [None]:
# Re-create the wrappers with the tuned hyperparameter values hard-coded
# into each estimator.
nb_variants = {
    name: NBWrapper(model=estimator, target=target,
                    num=numerical, cat=categorical)
    for name, estimator in (
        ('gaussian', GaussianNB(var_smoothing=1e-06)),
        ('multinomial', MultinomialNB(alpha=0.1, fit_prior=True)),
        ('bernoulli', BernoulliNB(alpha=0.1, binarize=0.5, fit_prior=True)),
    )
}

# Cross-validate the tuned models and keep each score by model name.
train_score = {}
for name in nb_variants:
    train_score[name] = nb_variants[name].cross_validate(X_train, y_train)
    print(f"{name.capitalize()}: roc_auc_score = {train_score[name]}")

## 3.3 Test with test dataset

In [None]:
# Fit each tuned model on the full-train data and evaluate it on the test set.
# NOTE: scores printed here differ from values computed from hard labels;
# refresh any hard-coded copies of them elsewhere in the notebook.
cm = {}
test_score = {}
for name, wrapper in nb_variants.items():
    model = wrapper.model
    model.fit(X_train, y_train)

    # Hard class labels are what the confusion matrix needs.
    y_pred = model.predict(X_test)
    # ROC AUC ranks samples by a continuous score. Feeding it 0/1 predictions
    # collapses the curve to a single threshold, so use the predicted
    # probability of the positive class (second column of predict_proba).
    y_score = model.predict_proba(X_test)[:, 1]

    cm[name] = confusion_matrix(y_test, y_pred)
    test_score[name] = roc_auc_score(y_test, y_score)
    print(f"{name.capitalize()}: roc_auc_score = {test_score[name]}")

# Chapter 4: Evaluation

In [None]:
# Compare cross-validated (train) and hold-out (test) ROC-AUC per model.
models = ['gaussian', 'multinomial', 'bernoulli']

# Read the scores computed in the earlier cells instead of hard-coding them,
# so the chart always reflects the current run.
# np.mean makes this work whether cross_validate returned a single number or
# an array of per-fold scores (its return type is not visible here - TODO confirm).
scores = {
    'train': tuple(float(np.mean(train_score[m])) for m in models),
    'test': tuple(float(np.mean(test_score[m])) for m in models),
}

# Create a long-format dataframe to suit seaborn's barplot.
data = []
for model_name, train, test in zip(models, scores['train'], scores['test']):
    data.append({'Model': model_name, 'Dataset': 'Train', 'ROC-AUC score': train})
    data.append({'Model': model_name, 'Dataset': 'Test', 'ROC-AUC score': test})

# Use a dedicated name: reusing `df` would overwrite the HR dataset and break
# any re-run of the earlier cells.
scores_df = pd.DataFrame(data)

# Plot the data, labelling every bar with its score to two decimals.
ax = sns.barplot(data=scores_df, x='Model', y='ROC-AUC score', hue='Dataset', errorbar=None)
for container in ax.containers:
    ax.bar_label(container, fmt='%.2f', fontsize=10)
plt.title('Model Performance on Train and Test Datasets')
plt.show()

In [None]:
# Confusion matrices, one heatmap per model.
# The panel count follows the number of stored matrices instead of a fixed 3.
n_panels = max(1, len(cm))
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 5), squeeze=False)
for ax, (name, matrix) in zip(axes.ravel(), cm.items()):
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_title(f"{name.capitalize()}")
    # sklearn's confusion_matrix puts true labels on rows and predicted labels
    # on columns; label the axes so the cells can be read unambiguously.
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")

plt.tight_layout()
plt.show()