#### Scenario
A healthcare provider has initiated an attrition analysis. They want to know their current attrition rate and, if it exceeds ten percent, what the contributing factors and potential solutions are.

#### Problem Statement
How can we increase employee retention while minimizing costs?

In [31]:
from pathlib import Path
from math import asin, ceil, sqrt
from typing import List

import pandas as pd
from pandasql import sqldf
from statsmodels.stats.proportion import proportions_ztest

import utils

def P_SQL(q: str):
    """Run the SQL query string ``q`` through pandasql's ``sqldf``.

    The query is evaluated against this notebook's module-level globals, so
    any DataFrame bound to a top-level name can be referenced as a table.

    Defined with ``def`` rather than as a lambda bound to a name so that the
    helper carries a docstring and a meaningful name in tracebacks.

    Args:
        q: SQL text to execute.

    Returns:
        Whatever ``sqldf`` returns for the query.
    """
    return sqldf(q, globals())

In [32]:
# Load the dataset; the relative path is joined onto whatever utils.root() returns
filename = 'data/healthcare_attrition.csv'
df = pd.read_csv(Path(utils.root()) / filename)

# Constants: the sample split by the value of the Attrition column
EMPL_SAMPLE_DF = df[df['Attrition'] == 'No']   # rows flagged as still employed
ATTR_SAMPLE_DF = df[df['Attrition'] == 'Yes']  # rows flagged as attrition

# Row counts of each sub-sample, used later as denominators
EMPL_SAMPLE_SIZE = len(EMPL_SAMPLE_DF)
ATTR_SAMPLE_SIZE = len(ATTR_SAMPLE_DF)

## Attrition Rate

In [33]:
# ATTRITION RATE
attrition_sr = df['Attrition'] # isolating Attrition column

# Count leavers with a boolean mask: a sample containing no 'Yes' rows then
# yields 0 instead of the KeyError that value_counts()['Yes'] would raise.
attrition_count = int(attrition_sr.eq('Yes').sum())
num_of_employees = int(attrition_sr.count())  # non-null Attrition entries only

# Scale to a percentage BEFORE rounding. Rounding the fraction first and then
# multiplying by 100 can leak float noise into the printed figure
# (e.g. 0.07 * 100 == 7.000000000000001). Rounding to 0 decimals keeps the
# value a float, so the message still reads like "12.0%".
attrition_rate_percentage = round(attrition_count / num_of_employees * 100, 0)
print(f'The attrition rate is approximately {attrition_rate_percentage}%')

The attrition rate is approximately 12.0%


### <u>Attrition by Age</u>

In [34]:
# ATTRITION BY AGE
attrition_age_sr = ATTR_SAMPLE_DF['Age']  # ages within the attrition sub-sample

# NOTE: For roughly normal data the empirical rule predicts that about 68% of
# observations lie within 1 standard deviation of the mean. Here we measure the
# actual share: the number of ages inside that band divided by the size of the
# attrition sample.
# 'empirical_rule_68' is read as a (lower, upper) pair -- presumably
# mean -/+ 1 standard deviation; confirm against utils.stats.
empirical_rule_68 = utils.stats(attrition_age_sr)['empirical_rule_68']

# Number of ages inside the band. Series.between is inclusive on both ends by
# default, matching a `lower <= age <= upper` check.
one_stddev_count = int(
    attrition_age_sr.between(empirical_rule_68[0], empirical_rule_68[1]).sum()
)

# Share of the attrition sample inside the band, as a percentage (2 decimals)
one_stddev_percentage = round((one_stddev_count / ATTR_SAMPLE_SIZE) * 100, 2)

print(f'{one_stddev_percentage}% of attrition occurs between ages {empirical_rule_68[0]} and '
      f'{empirical_rule_68[1]} (1 standard dev from mean).')

72.86% of attrition occurs between ages 22 and 40 (1 standard dev from mean).


### <u>Attrition by Frequent Travel</u>
**TODO:** Describe how the conclusion was reached, why the result is statistically significant, and the effect size. Include the formulas used (the two-proportion z-test, the pooled proportion, etc.), explaining first how significance was tested and then how effect size was measured.

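Cohen's *h* = 0.26 indicates a small effect, where $h = \left|2\arcsin\sqrt{p_1} - 2\arcsin\sqrt{p_2}\right|$. Conventional benchmarks: *h* = 0.2 is a small effect, *h* = 0.5 a medium effect, and *h* = 0.8 a large effect.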

In [35]:
# ATTRITION BY TRAVEL


def freq_trav_cts(*dfs) -> List[int]:
    """Count the 'Travel_Frequently' rows in each DataFrame's BusinessTravel column.

    The count is taken with a direct boolean comparison instead of looking up a
    fixed row label in a ``value_counts()`` table. The positional lookup only
    worked when 'Travel_Frequently' happened to be the second most common
    category, and it also depended on ``reset_index()`` producing a column
    named ``index``, which pandas >= 2.0 no longer does.

    Args:
        *dfs: Any number of DataFrames, each with a 'BusinessTravel' column.

    Returns:
        One integer per input frame, in the order given. A frame with no
        'Travel_Frequently' rows contributes 0; calling with no frames
        returns an empty list.

    Raises:
        KeyError: If a frame has no 'BusinessTravel' column.
    """
    return [
        int(frame['BusinessTravel'].eq('Travel_Frequently').sum())
        for frame in dfs
    ]


# Frequent-travel head-counts, attrition group first and employed group second
attr_freq_trav_ct, empl_freq_trav_ct = freq_trav_cts(ATTR_SAMPLE_DF, EMPL_SAMPLE_DF)

# Successes and sample sizes per group, listed in the same group order
count = [attr_freq_trav_ct, empl_freq_trav_ct]
nobs = [ATTR_SAMPLE_SIZE, EMPL_SAMPLE_SIZE]

# z-test comparing the two groups' proportions (statsmodels default settings)
stat, pval = proportions_ztest(count, nobs)
print(f'z-score: {round(stat, 4)}\np-value: {round(pval, 4)}')

# Share of frequent travellers within each group; these feed cohen_h below
p1, p2 = (hits / total for hits, total in zip(count, nobs))

def cohen_h(p1: float, p2: float) -> float:
    """Return Cohen's h, the effect size for the gap between two proportions.

    Each proportion is mapped through phi = 2 * asin(sqrt(p)); h is the absolute
    difference of the two transformed values, rounded to two decimal places.

    Args:
        p1: First proportion.
        p2: Second proportion.

    Returns:
        The non-negative effect size, rounded to 2 decimals.

    Raises:
        ValueError: If either proportion falls outside the domain of
            ``sqrt``/``asin`` (i.e. outside [0, 1]).
    """
    phi_first = 2 * asin(sqrt(p1))
    phi_second = 2 * asin(sqrt(p2))
    return round(abs(phi_first - phi_second), 2)

# Effect size (Cohen's h) for the gap between the two frequent-travel proportions.
print(cohen_h(p1, p2))
# The p-value recorded for this run was 0.0003. A p-value below 0.001 is
# conventionally described as highly statistically significant.

z-score: 3.6513
p-value: 0.0003
0.26
