In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import metrics


# import dataset
# NOTE(review): both paths are absolute and machine-specific; the notebook only
# runs as-is on a machine that has this exact directory layout.
df_train = pd.read_csv(
    r"C:\Users\BernadettKepenyes\Documents\GitHub\employee-burnout\train.csv"
)
df_test = pd.read_csv(
    r"C:\Users\BernadettKepenyes\Documents\GitHub\employee-burnout\test.csv"
)

# Preview the first rows of the test frame.
df_test.head()

In [None]:
# learning about dataset
# Shapes of both frames, then summary statistics and per-column cardinality
# of the training frame.
print(
    'Test shape: ', df_test.shape,
    '\nTrain shape: ', df_train.shape,
)
print(
    '\nDataset description: \n',
    df_train.describe(),
)
print(
    '\nUnique values: \n',
    df_train.nunique(),
)

In [None]:
# Column dtypes and non-null counts of the training frame as loaded.
df_train.info()

In [None]:
# missing values
# Share of empty cells per column in the train data (null count / row count).
df_train.isna().sum() / len(df_train)

In [None]:
# Share of empty cells per column in the test data (null count / row count).
df_test.isna().sum() / len(df_test)

In [None]:
# drop the rows with missing burn rate
# "Burn Rate" is the column the models below are fitted on, so rows without it
# are removed from the training frame (in place).
df_train.dropna(axis=0, subset=["Burn Rate"], inplace=True)

# Remaining (rows, columns) after the drop.
df_train.shape

In [None]:
# Re-check dtypes and non-null counts after the rows without "Burn Rate" were dropped.
df_train.info()

In [None]:
# encoding "Date of Joining"
# Parse the joining date in both frames into pandas datetimes.
df_test['Date of Joining'] = pd.to_datetime(df_test['Date of Joining'])
df_train['Date of Joining'] = pd.to_datetime(df_train['Date of Joining'])

# Reference point for the tenure feature computed below.
# NOTE(review): 'today' is evaluated at run time, so the derived day counts
# change from one day to the next.
current_date = pd.to_datetime('today')

def create_days_count(data, reference_date=None):
    """Return the time elapsed between a reference date and 'Date of Joining'.

    Parameters
    ----------
    data : mapping-like (row Series, DataFrame, dict)
        Must expose a 'Date of Joining' entry holding datetime value(s).
    reference_date : datetime-like, optional
        Date to measure from. Defaults to the notebook-level ``current_date``,
        which keeps the original one-argument call (``df.apply(..., axis=1)``)
        working unchanged. Pass an explicit date for reproducible results.

    Returns
    -------
    A Timedelta for a single row, or a timedelta Series when ``data`` is a
    DataFrame.
    """
    if reference_date is None:
        # Fall back to the notebook-level reference date.
        reference_date = current_date
    return reference_date - data['Date of Joining']

# Tenure in whole days for both frames.
# create_days_count only indexes 'Date of Joining', so passing the whole frame
# subtracts the full column in one vectorised step instead of invoking Python
# once per row via apply(axis=1); it also behaves correctly on an empty frame.
df_train['days_count'] = create_days_count(df_train).dt.days
df_test['days_count'] = create_days_count(df_test).dt.days

# Only the last expression of a cell is rendered, so just df_test's preview shows.
df_train.head()
df_test.head()

In [None]:
# identifying columns with categorical data
# Columns still stored with the object dtype are the ones that need encoding.
df_train.select_dtypes(include='object')

In [None]:
# encoding categorical variables
# Each binary column maps one reference label to 0 and every other value to 1
# (the same rule the per-row lambdas applied before).
_ZERO_LABELS = {
    'Gender': 'Male',
    'Company Type': 'Service',
    'WFH Setup Available': 'No',
}

for _frame in (df_train, df_test):
    for _column, _zero_label in _ZERO_LABELS.items():
        # Skip columns that are already numeric: re-running this cell would
        # otherwise turn every value into 1, because an encoded 0 != 'Male'.
        if pd.api.types.is_numeric_dtype(_frame[_column]):
            continue
        # Vectorised comparison; explicit int64 matches what the lambda-based
        # encoding produced, independent of platform.
        _frame[_column] = (_frame[_column] != _zero_label).astype('int64')

# Only the last expression of a cell is rendered, so just df_test's preview shows.
df_train.head()
df_test.head()

In [None]:
# drop "Employee ID" and "Date of Joining" columns
# Both frames lose the same two columns, in place.
df_test.drop(columns=['Employee ID', 'Date of Joining'], inplace=True)
df_train.drop(columns=['Employee ID', 'Date of Joining'], inplace=True)

df_train.head()

In [None]:
# missing values in "Resource Allocation" and "Mental Fatigue Score"
# Per the original author's note, median imputation gave incorrect-looking data
# for "Mental Fatigue Score" (at 6), so rows with NaN in either column are
# dropped instead of imputed.
df_train.dropna(
    subset=["Resource Allocation", "Mental Fatigue Score"],
    inplace=True,
)
df_train.head()

In [None]:
# correlation table
# Pairwise correlations between the columns of the training frame,
# drawn as an annotated heatmap on a fixed [-1, 1] colour scale.
train_corr = df_train.corr()

plt.figure(figsize=(8, 5))
sns.heatmap(
    data=train_corr,
    annot=True,
    fmt=".2f",
    vmin=-1,
    vmax=1,
    cmap="summer_r",
)
plt.yticks(rotation=0)  # keep the row labels horizontal
plt.show()

In [None]:
# burn rate grouped by home office setup
# Mean "Burn Rate" for each value of "WFH Setup Available".
burn_rate_WFH_Setup = (
    df_train
    .groupby(['WFH Setup Available'])['Burn Rate']
    .mean()
)
print(burn_rate_WFH_Setup)

In [None]:
# burn rate grouped by resource allocation
# Mean "Burn Rate" for each "Resource Allocation" level.
burn_rate_resource = (
    df_train
    .groupby(['Resource Allocation'])['Burn Rate']
    .mean()
)
print(burn_rate_resource)

In [None]:
# burn rate grouped by company type
# Mean "Burn Rate" for each "Company Type" value.
burn_rate_company = (
    df_train
    .groupby(['Company Type'])['Burn Rate']
    .mean()
)
print(burn_rate_company)

In [None]:
# burn rate grouped by gender
# Mean "Burn Rate" for each "Gender" value.
burn_rate_gender = (
    df_train
    .groupby(['Gender'])['Burn Rate']
    .mean()
)
print(burn_rate_gender)

In [None]:
# employees grouped by resource allocation and home office setup
# Row count for every (WFH Setup Available, Resource Allocation) combination.
# NOTE(review): the variable name mentions gender, but the grouping uses the
# WFH setup column; the name is kept so later references keep working.
resource_allocation_gender = (
    df_train
    .groupby(['WFH Setup Available', 'Resource Allocation'])['Resource Allocation']
    .count()
)
print(resource_allocation_gender)

In [None]:
# highest correlation with Mental Fatigue Score
# Scatter of "Burn Rate" against "Mental Fatigue Score", with a red reference
# line drawn from (0, 0) to (10, 1).
plt.figure(figsize=(9, 5))
plt.scatter(
    x="Mental Fatigue Score",
    y="Burn Rate",
    data=df_train,
    s=0.3,
    c="blue",
)
plt.plot([0, 10], [0, 1], linewidth=3, c="red")
plt.xticks(range(0, 11))
plt.xlabel("Mental Fatigue Score")
plt.ylabel("Burn Rate")
plt.show()

In [None]:
# correlation with "Resource Allocation"
# Scatter of "Burn Rate" against "Resource Allocation", with a red reference
# line drawn from (0, 0) to (10, 1).
plt.figure(figsize=(9, 5))
plt.scatter(
    x="Resource Allocation",
    y="Burn Rate",
    data=df_train,
    s=0.3,
    c="brown",
)
plt.plot([0, 10], [0, 1], linewidth=3, c="red")
plt.xticks(range(0, 11))
plt.xlabel("Resource Allocation")
plt.ylabel("Burn Rate")
plt.show()

In [None]:
# Preview the test frame after the encoding and column drops above.
df_test.head()

In [None]:
# saving the cleaned data, we will need it later for scenario analysis
# One path is used for both the write and the read-back. Previously the CSV was
# written relative to the current working directory but read from this absolute
# path, so the reload could pick up a stale or missing file whenever the
# notebook was started from a different directory.
train_file_path = r"C:\Users\BernadettKepenyes\Documents\GitHub\employee-burnout\clean_df_train.csv"

clean_df_train = df_train.copy()
clean_df_train.to_csv(train_file_path, index=False)

# Reload from disk; index=False above means the original row index is not kept.
new_df_train = pd.read_csv(train_file_path)

new_df_train.head()

In [None]:
# Linear Regression
# Features and target for the burn-rate models.
feature_columns = [
    'Gender',
    'Company Type',
    'WFH Setup Available',
    'Designation',
    'Resource Allocation',
    'Mental Fatigue Score',
    'days_count',
]
X = df_train[feature_columns]
y = df_train['Burn Rate']

# 80/20 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises a TypeError. For plain least squares it should not change
# the fitted predictions, so it is simply dropped.
lm_model = LinearRegression()
lm_model.fit(X_train, y_train)

# R^2 on the training split and on the held-out split.
y_train_preds = lm_model.predict(X_train)
y_test_preds = lm_model.predict(X_test)
r2_train = r2_score(y_train, y_train_preds)
r2_test = r2_score(y_test, y_test_preds)

print(r2_train, '\n', r2_test)
print(lm_model.intercept_)
print(lm_model.coef_)
print(y_train_preds)

In [None]:
# Random Forest
# Fitted on the same train/test split as the linear model.
# random_state is pinned (same seed as the split) so the reported scores are
# reproducible; without it every run grows a different forest.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

y_train_preds_rf = rf_model.predict(X_train)
y_test_preds_rf = rf_model.predict(X_test)

# R^2 on the training split and on the held-out split.
r2_train_rf = r2_score(y_train, y_train_preds_rf)
r2_test_rf = r2_score(y_test, y_test_preds_rf)
print(r2_train_rf, '\n', r2_test_rf)

In [None]:
# checking the prediction accuracy
# Put the linear model's training predictions in a frame that carries X_train's
# index, so they can be aligned with the rows of df_train.
y_train_preds_df = pd.DataFrame(
    {'Prediction': y_train_preds},
    index=X_train.index.copy(),
)

# Left join on the index: rows that were not in the training split get NaN.
df_out = df_train.merge(
    y_train_preds_df,
    how='left',
    left_index=True,
    right_index=True,
)
df_out.head()

In [None]:
# drop NaN
# Keep only the rows that actually received a prediction.
df_out.dropna(subset=["Prediction"], inplace=True)
df_out.info()

In [None]:
# original vs predicted comparison
# w: x-axis values, v: observed burn rate, z: predicted burn rate.
w = df_out['Mental Fatigue Score']
v = df_out['Burn Rate']
z = df_out['Prediction']

# Both series are drawn against the same x values so they can be compared.
plt.scatter(w, v, s=0.3, color="blue", label="Original")
plt.scatter(w, z, s=0.3, color="lightgreen", label="Predicted")
plt.xticks(range(0, 11))
plt.xlabel("Mental Fatigue Score")
plt.ylabel("Burn Rate / original or prediction")
plt.legend()
plt.show()

In [None]:
# Error statistics
# v holds the observed burn rate and z the matching predictions.
mae = metrics.mean_absolute_error(v, z)
mse = metrics.mean_squared_error(v, z)

print('MAE: ', mae)
print('MSE: ', mse)
print('RMSE: ', np.sqrt(mse))  # root of the MSE computed above