#### Scenario
A healthcare provider initiated an attrition analysis. They wish to know their current attrition rate and, if the rate is greater than ten percent, what the contributing factors and potential solutions are.

#### Problem Statement
How can we increase employee retention while minimizing costs?

In [1]:
from pathlib import Path
from math import ceil

import pandas as pd
from pandasql import sqldf

import utils

def P_SQL(q):
    """Run the SQL query string ``q`` through ``sqldf`` against this module's globals."""
    return sqldf(q, globals())

In [2]:
# Import file
filename = 'data/healthcare_attrition.csv'
df = pd.read_csv(Path(utils.root(), filename), sep=',')

# Constants
# Split the data set by the value of the Attrition column.
# (The 'No' literal previously lacked its closing quote, which made the
# query expression unparseable.)
EMPLOYED_SAMPLE_DF = df.query("Attrition == 'No'")
ATTRITION_SAMPLE_DF = df.query("Attrition == 'Yes'")

# Number of rows in each subset.
EMPLOYED_SAMPLE_SIZE = EMPLOYED_SAMPLE_DF.shape[0]
ATTRITION_SAMPLE_SIZE = ATTRITION_SAMPLE_DF.shape[0]

In [3]:
# ATTRITION RATE
attrition_sr = df['Attrition'] # isolating Attrition column

# Count the 'Yes' rows directly: value_counts()['Yes'] raises KeyError when
# the column contains no 'Yes' at all, whereas this yields 0.
attrition_count = attrition_sr.eq('Yes').sum()
num_of_employees = attrition_sr.count()  # non-null Attrition entries

# Scale to a percentage BEFORE rounding. Rounding the fraction to 2 decimals
# first keeps only whole percentage points and can leave float artifacts
# after the multiplication.
attrition_rate_percentage = round(attrition_count / num_of_employees * 100, 2)
print(f'The attrition rate is approximately {attrition_rate_percentage}%')

Attrition Rate: 13.47%


In [4]:
# ATTRITION BY AGE
attrition_df = df.query("Attrition == 'Yes'")
attrition_population_size = ATTRITION_SAMPLE_DF['EmployeeID'].count()

# Ages of the employees in the attrition sample.
attrition_age_sr = ATTRITION_SAMPLE_DF['Age']

# NOTE: The empirical rule says about 68% of data lie within 1 standard
# deviation of the mean. Below we count how many ages fall inside the
# interval returned under 'empirical_rule_68' and divide by the attrition
# sample size to get the actual percentage for this data.
empirical_rule_68 = utils.stats(attrition_age_sr)['empirical_rule_68']
lower_bound, upper_bound = empirical_rule_68[0], empirical_rule_68[1]

# Number of ages inside [lower_bound, upper_bound], both ends inclusive.
one_stddev_count = int(attrition_age_sr.between(lower_bound, upper_bound).sum())

# Share of the attrition sample inside that interval, as a percentage.
one_stddev_percentage = round((one_stddev_count / ATTRITION_SAMPLE_SIZE) * 100, 2)

print(f'{one_stddev_percentage}% of attrition occurs between ages {empirical_rule_68[0]} and {empirical_rule_68[1]} (1 standard dev from mean).')

72.86% of attrition occurs between ages 22 and 40 (1 standard dev from mean).


In [8]:
# ATTRITION BY TRAVEL
# Null Hypothesis: 17.8% of employees who left the company frequently traveled.
# Alternate Hypothesis: > 17.8% of employees who left the company frequently traveled.
#testing if statistical significance exists
# independent var = travel
# dependent var = attrition
def travelfunc(sr: pd.Series, n: int) -> pd.DataFrame:
    """Tabulate the distinct values of ``sr`` with their counts and share of ``n``.

    Args:
        sr: Series whose distinct values are counted (e.g. a travel-type column).
        n: Denominator used for the share column.

    Returns:
        A DataFrame with one row per distinct value, in ``value_counts`` order,
        and the columns ``Travel_Type`` (the value), ``Count`` (occurrences)
        and ``Percentage_of_Observations`` (``Count / n`` as a float fraction,
        not scaled to 0-100).
    """
    counts_df = (
        sr.value_counts()
        # Name the index explicitly instead of renaming a column called
        # 'index' afterwards: reset_index() only produces 'index' when the
        # index is unnamed, so the old rename could silently do nothing.
        .rename_axis('Travel_Type')
        .reset_index(name='Count')
    )
    # Vectorised division; gives a float column rather than an object column
    # filled cell by cell.
    counts_df['Percentage_of_Observations'] = counts_df['Count'] / n
    return counts_df

# Travel-type breakdown for employees who are still employed.
# NOTE: despite the _sr suffix this is the full filtered DataFrame.
employed_travel_sr = df.query("Attrition == 'No'")
employed_population_size = employed_travel_sr['EmployeeID'].count()
employed_travel_df = travelfunc(
    employed_travel_sr['BusinessTravel'],
    employed_population_size,
)

# Same breakdown for employees who left.
travel_df = travelfunc(
    ATTRITION_SAMPLE_DF['BusinessTravel'],
    attrition_population_size,
)

# Employed table first, then the attrition table.
print(employed_travel_df, travel_df, sep='\n')

# TODO: Test null hypothesis

         Travel_Type  Count Percentage_of_Observations
0      Travel_Rarely   1058                   0.716317
1  Travel_Frequently    263                   0.178064
2         Non-Travel    156                   0.105619
         Travel_Type  Count Percentage_of_Observations
0      Travel_Rarely    126                   0.633166
1  Travel_Frequently     57                   0.286432
2         Non-Travel     16                   0.080402
