In [50]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [51]:
# Location of the survey CSV. Set the REMOTE_WORK_CSV environment variable to
# point at a local copy; when it is unset the original author's path is used,
# so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "REMOTE_WORK_CSV",
    "/Users/justinchan/Documents/HKU/Yr5/Sem1/STAT2604/Impact_of_Remote_Work_on_Mental_Health.csv",
)

# Load the raw survey data into the working frame used by the cells below.
df = pd.read_csv(DATA_PATH)

In [52]:
# Display the freshly loaded frame as the cell's rich output, to eyeball the
# column names and a sample of rows before any transformation.
df

Unnamed: 0,Employee_ID,Age,Gender,Job_Role,Industry,Years_of_Experience,Work_Location,Hours_Worked_Per_Week,Number_of_Virtual_Meetings,Work_Life_Balance_Rating,Stress_Level,Mental_Health_Condition,Access_to_Mental_Health_Resources,Productivity_Change,Social_Isolation_Rating,Satisfaction_with_Remote_Work,Company_Support_for_Remote_Work,Physical_Activity,Sleep_Quality,Region
0,EMP0001,32,Non-binary,HR,Healthcare,13,Hybrid,47,7,2,Medium,Depression,No,Decrease,1,Unsatisfied,1,Weekly,Good,Europe
1,EMP0002,40,Female,Data Scientist,IT,3,Remote,52,4,1,Medium,Anxiety,No,Increase,3,Satisfied,2,Weekly,Good,Asia
2,EMP0003,59,Non-binary,Software Engineer,Education,22,Hybrid,46,11,5,Medium,Anxiety,No,No Change,4,Unsatisfied,5,,Poor,North America
3,EMP0004,27,Male,Software Engineer,Finance,20,Onsite,32,8,4,High,Depression,Yes,Increase,3,Unsatisfied,3,,Poor,Europe
4,EMP0005,49,Male,Sales,Consulting,32,Onsite,35,12,2,High,,Yes,Decrease,3,Unsatisfied,3,Weekly,Average,North America
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,EMP4996,32,Male,Sales,Consulting,4,Onsite,24,2,5,High,Burnout,Yes,Decrease,4,Neutral,1,Weekly,Average,Asia
4996,EMP4997,39,Female,Sales,Healthcare,27,Onsite,48,15,1,Low,Depression,Yes,Decrease,1,Satisfied,1,,Average,Africa
4997,EMP4998,42,Female,Sales,Healthcare,21,Hybrid,34,1,4,High,Burnout,No,Increase,3,Satisfied,1,Daily,Poor,Oceania
4998,EMP4999,27,Female,Sales,Healthcare,26,Remote,58,0,5,Low,,Yes,Increase,3,Unsatisfied,4,Daily,Average,Asia


In [55]:
# Data Manipulation
#
# Adds three derived columns to `df`:
#   Mental_Health_Problem - "Yes" / "No" flag collapsed from Mental_Health_Condition
#   Age_Group             - decade bucket of Age (e.g. 32 -> 3)
#   Experience_Group      - 5-year bucket of Years_of_Experience, numbered from 1

# Combine different mental health conditions (depression, anxiety, etc.) into yes / no.
# "No condition" is stored in the CSV as the literal text "None"; depending on the
# pandas version, read_csv may parse that text as a missing value (NaN) instead of
# keeping the string. Accept both so the flag does not silently become "Yes" for
# every row when the value arrives as NaN.
condition = df["Mental_Health_Condition"]
has_no_condition = condition.isna() | condition.eq("None")
df["Mental_Health_Problem"] = np.where(has_no_condition, "No", "Yes")

# Group different ages together, with ages of 10 as one group.
# Floor division is the vectorized equivalent of math.floor(age / 10).
df["Age_Group"] = df["Age"] // 10

# Group different years of experience together, with one group every 5 years
# (0-4 years -> 1, 5-9 years -> 2, ...).
df["Experience_Group"] = df["Years_of_Experience"] // 5 + 1

df

Unnamed: 0,Employee_ID,Age,Gender,Job_Role,Industry,Years_of_Experience,Work_Location,Hours_Worked_Per_Week,Number_of_Virtual_Meetings,Work_Life_Balance_Rating,Stress_Level,Mental_Health_Condition,Access_to_Mental_Health_Resources,Productivity_Change,Social_Isolation_Rating,Satisfaction_with_Remote_Work,Company_Support_for_Remote_Work,Physical_Activity,Sleep_Quality,Region,Mental_Health_Problem,Age_Group,Experience_Group
0,EMP0001,32,Non-binary,HR,Healthcare,13,Hybrid,47,7,2,Medium,Depression,No,Decrease,1,Unsatisfied,1,Weekly,Good,Europe,Yes,3,3
1,EMP0002,40,Female,Data Scientist,IT,3,Remote,52,4,1,Medium,Anxiety,No,Increase,3,Satisfied,2,Weekly,Good,Asia,Yes,4,1
2,EMP0003,59,Non-binary,Software Engineer,Education,22,Hybrid,46,11,5,Medium,Anxiety,No,No Change,4,Unsatisfied,5,,Poor,North America,Yes,5,5
3,EMP0004,27,Male,Software Engineer,Finance,20,Onsite,32,8,4,High,Depression,Yes,Increase,3,Unsatisfied,3,,Poor,Europe,Yes,2,5
4,EMP0005,49,Male,Sales,Consulting,32,Onsite,35,12,2,High,,Yes,Decrease,3,Unsatisfied,3,Weekly,Average,North America,No,4,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,EMP4996,32,Male,Sales,Consulting,4,Onsite,24,2,5,High,Burnout,Yes,Decrease,4,Neutral,1,Weekly,Average,Asia,Yes,3,1
4996,EMP4997,39,Female,Sales,Healthcare,27,Onsite,48,15,1,Low,Depression,Yes,Decrease,1,Satisfied,1,,Average,Africa,Yes,3,6
4997,EMP4998,42,Female,Sales,Healthcare,21,Hybrid,34,1,4,High,Burnout,No,Increase,3,Satisfied,1,Daily,Poor,Oceania,Yes,4,5
4998,EMP4999,27,Female,Sales,Healthcare,26,Remote,58,0,5,Low,,Yes,Increase,3,Unsatisfied,4,Daily,Average,Asia,No,2,6


In [57]:
# Perform chi-square test by different group to test for independence
#
# For every mental-health indicator and every subgroup of every category, build
# the Work_Location x indicator contingency table restricted to that subgroup
# and print the table whenever the test's p-value falls below ALPHA.
#
# NOTE(review): one test is run per (indicator, subgroup) pair with no
# multiple-comparison correction, so some p < ALPHA results are expected by
# chance alone; treat the printed hits as leads, not confirmed effects.

# Significance threshold below which a subgroup's result is reported.
ALPHA = 0.05

# category
group_lst = ["Gender", "Job_Role", "Industry", "Region", "Age_Group", "Experience_Group"]

# mental health related variables, used to perform analysis against work_location
mental_health_lst = ["Work_Life_Balance_Rating", "Stress_Level", "Mental_Health_Condition", "Social_Isolation_Rating", "Sleep_Quality", "Mental_Health_Problem"]

for mental_health_indicator in mental_health_lst:
    for group in group_lst:
        # Drop missing labels: NaN never compares equal to itself, so a NaN
        # label would select zero rows and produce an empty table.
        by_group_lst = df[group].dropna().unique()
        for group_type in by_group_lst:
            by_group_df = df[df[group] == group_type]
            contingency_table = pd.crosstab(by_group_df['Work_Location'], by_group_df[mental_health_indicator])

            # chi2_contingency raises ValueError on an empty table (e.g. when the
            # indicator is missing for every row of this subgroup); skip it.
            if contingency_table.size == 0:
                continue

            chi2, p, dof, expected = chi2_contingency(contingency_table)
            if p < ALPHA:
                print(f"p-value of {group_type} in {group} by comparing work_location and {mental_health_indicator}: {p}")
                # The table that was tested is exactly what we want to show.
                print(contingency_table, "\n")

    

p-value of Non-binary in Gender by comparing work_location and Work_Life_Balance_Rating: 0.015351191034135275
Work_Life_Balance_Rating   1   2    3   4   5
Work_Location                                
Hybrid                    86  68   91  73  95
Onsite                    70  93  102  62  73
Remote                    88  78   73  87  75 

p-value of Marketing in Job_Role by comparing work_location and Work_Life_Balance_Rating: 0.03414425022331987
Work_Life_Balance_Rating   1   2   3   4   5
Work_Location                               
Hybrid                    42  41  46  50  46
Onsite                    40  42  58  49  41
Remote                    60  54  44  29  41 

p-value of 5 in Age_Group by comparing work_location and Work_Life_Balance_Rating: 0.0127124395232743
Work_Life_Balance_Rating   1   2    3    4   5
Work_Location                                 
Hybrid                    85  78   84   90  94
Onsite                    78  94  104   60  79
Remote                    86  8