#### Scenario
A healthcare provider initiated an attrition analysis. They wish to know their current attrition rate and, if the rate is greater than ten percent, what are the contributing factors and potential solutions.

#### Problem Statement
How can we increase employee retention while minimizing costs?

In [1]:
from pathlib import Path
from math import ceil

import pandas as pd
from pandasql import sqldf

import utils

P_SQL = lambda q: sqldf(q, globals())

In [2]:
# IMPORT FILE
filename = 'data/healthcare_attrition.csv'
df = pd.read_csv(Path(utils.root(), filename), sep=',')

In [3]:
# ATTRITION RATE
attrition_sr = df['Attrition'] # isolating Attrition column

attrition_count = attrition_sr.value_counts()['Yes']
num_of_employees = attrition_sr.count()

attrition_rate_percentage = round(attrition_count / num_of_employees, 2) * 100
print(f'The attrition rate is approximately {attrition_rate_percentage}%')

The attrition rate is approximately 13.0%


In [4]:
# ATTRITION BY AGE

attrition_by_age_query = '''
SELECT *
FROM df
WHERE attrition = 'Yes'
'''

resultset = P_SQL(attrition_by_age_query)
attrition_age_sr = resultset['Age']

# The empirical rule states that 68% of data are within one stddev from the mean. We check here to identify the true spread within one stddev from mean.
empirical_rule_68 = utils.stats(attrition_age_sr)['empirical_rule_68']

population_size = resultset['EmployeeID'].count()
num_of_data_within_one_stddev = sum(i >= empirical_rule_68[0] and i <= empirical_rule_68[1] for i in attrition_age_sr)

percentage_of_data_within_one_stddev = num_of_data_within_one_stddev / population_size

print(f'''{percentage_of_data_within_one_stddev}% of attrition occurs between the ages of {empirical_rule_68[0]} and {empirical_rule_68[1]} (1 standard dev from mean).''')

# This sample will be the focus of analysis, so we refactor the dataframe to reflect that here
attrition_by_age_query = f'''
SELECT *
FROM df
WHERE attrition = 'Yes'
  AND age BETWEEN {empirical_rule_68[0]} AND {empirical_rule_68[1]}
ORDER BY age
'''

df = P_SQL(attrition_by_age_query)

0.7286432160804021% of attrition occurs between the ages of 22 and 40 (1 standard dev from mean).
