# Mental Health in Tech Analysis - fairness metrics

In [41]:
import pandas as pd
df = pd.read_csv( "omsi_cleaned.csv", sep = ",")

In [14]:
df

Unnamed: 0,age,gender,race,mental_health,year
0,45,Male,White,Don't Know,2020
1,24,female,,Yes,2020
2,46,Male,,Yes,2020
3,25,Female,,Yes,2020
4,25,F,,Don't Know,2020
...,...,...,...,...,...
828,36,Male,White,Possibly,2023
829,44,male,White,Yes,2023
830,53,Female,White,Yes,2023
831,62,male,White,Yes,2023


In [4]:

# Filter the DataFrame to include only rows with 'No' mental_health
no_mental_health = df[df['mental_health'] == 'No']

# Group by year and count the occurrences
mental_health_count_no_by_year = no_mental_health.groupby('year').size().reset_index(name='No')

# Group by year and count the occurrences for 'Yes' mental_health
mental_health_count_yes_by_year = df[df['mental_health'] == 'Yes'].groupby('year').size().reset_index(name='Yes')

# Merge the two DataFrames on 'year' column
merged_df = pd.merge(mental_health_count_yes_by_year, mental_health_count_no_by_year, on='year', how='outer')

# Fill missing values with 0
merged_df.fillna(0, inplace=True)

print("DataFrame with counts of 'Yes' and 'No' mental_health grouped by year:")
print(merged_df)

DataFrame with counts of 'Yes' and 'No' mental_health grouped by year:
   year  Yes     No
0  2019  147  104.0
1  2020   51   58.0
2  2021   43   50.0
3  2022   46   74.0
4  2023    5    0.0


In [5]:
# Group by year and count the occurrences for 'Yes' and 'No' mental_health
mental_health_count_by_year = df.groupby(['year', 'mental_health']).size().unstack(fill_value=0)

# Calculate the total count of people for each year
total_count_by_year = df.groupby('year').size()

# Calculate the percentage of 'Yes' and 'No' mental_health for each year
percentage_by_year = mental_health_count_by_year.div(total_count_by_year, axis=0) * 100

print("Percentage of 'Yes' and 'No' mental_health in each year:")
print(percentage_by_year)


Percentage of 'Yes' and 'No' mental_health in each year:
mental_health  Don't Know         No   Possibly        Yes
year                                                      
2019             7.386364  29.545455  21.306818  41.761364
2020            13.333333  32.222222  26.111111  28.333333
2021            10.687023  38.167939  18.320611  32.824427
2022            14.024390  45.121951  12.804878  28.048780
2023             0.000000   0.000000  16.666667  83.333333


In [26]:
# Filter the DataFrame to include only rows with 'No' mental_health
no_mental_health = df[df['mental_health'] == 'No']

# Group by year and count the occurrences of 'No' mental_health
mental_health_count_no_by_year = no_mental_health.groupby('year').size()

# Calculate the total count of people for each year
total_count_by_year = df.groupby('year').size()

# Calculate the percentage of people saying 'No' for each year
percentage_no_by_year = 100- (mental_health_count_no_by_year / total_count_by_year) * 100

print("Percentage of people saying not 'No' mental_health in each year:")
print(percentage_no_by_year)

Percentage of people saying not 'No' mental_health in each year:
year
2019    70.454545
2020    67.777778
2021    61.832061
2022    54.878049
2023          NaN
dtype: float64


SDP metrics

Clean gender

In [15]:
# Convert gender values to lowercase
df['gender'] = df['gender'].str.lower()
df.loc[df['gender'] == 'f', 'gender'] = 'female'
df.loc[df['gender'] == 'm', 'gender'] = 'male'

gender
no (female/male) - total (female/male)

In [31]:
filtered_df = df[df['mental_health'] == 'No']

# Calculate the total count of male and female respondents
no_total_male = filtered_df[filtered_df['gender'] == 'male'].shape[0]
no_total_female = filtered_df[filtered_df['gender'] == 'female'].shape[0]
# Calculate the total count of male and female respondents
total_male = (df['gender'] == 'male').sum()
total_female = (df['gender'] == 'female').sum()

# Calculate the ratio of male to female respondents
SDP_gender = no_total_female / no_total_male - total_female / total_male
print (SDP_gender)

-0.12761121856866536


age
no (young/old) - total (young/old)

In [39]:
young_df = df[df['age'] <= 35]
old_df = df[df['age'] >35]

# Calculate the total count of male and female respondents
no_young_df = young_df[young_df['mental_health'] == 'No'].shape[0]
no_old_df = old_df[old_df['mental_health'] == 'No'].shape[0]
# Calculate the total count of male and female respondents
total_young = young_df.shape[0]
total_old = old_df.shape[0]

# Calculate the ratio of male to female respondents
SDP_age = no_old_df / no_young_df - total_old / total_young
print (no_young_df)
print (total_young)
print (no_old_df)
print (total_old)
print (SDP_age)

147
455
139
378
0.11480900052328624


race 
no (nonwhite/white) - total (nonwhite/white)

In [37]:
non_white_df = df[(df['race'] != "White") & (df['race'].notna())]

white_df = df[df['race'] =="White" ]

# Calculate the total count of male and female respondents
no_non_white_df = non_white_df[non_white_df['mental_health'] == 'No'].shape[0]
no_white_df = white_df[white_df['mental_health'] == 'No'].shape[0]
# Calculate the total count of male and female respondents
total_non_white = non_white_df.shape[0]
total_white = white_df.shape[0]

# Calculate the ratio of male to female respondents
SDP_race = no_non_white_df / no_white_df - total_non_white / total_white
print (no_non_white_df)
print (total_non_white)
print (no_white_df)
print (total_white)
print (SDP_race)

30
73
67
310
0.21227732306210884
