In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import itertools

In [2]:
# Load the raw survey responses from the CSV export.
# NOTE(review): this is an absolute, machine-specific path.
survey_csv_path = "/Users/joevorbeck/Documents/Data-Analytics-MA-Thesis/data.csv"
df = pd.read_csv(survey_csv_path)

In [3]:
# Preview the first five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,Timestamp,Age,Gender,Country,"If you live in the United States, which state or territory do you live in?",Are you self-employed?,Do you have a family history of mental illness?,Have you sought treatment for a mental health condition?,"If you have a mental health condition, do you feel that it interferes with your work?",How many employees does your company or organization have?,...,How easy is it for you to take medical leave for a mental health condition?,Do you think that discussing a mental health issue with your employer would have negative consequences?,Do you think that discussing a physical health issue with your employer would have negative consequences?,Would you be willing to discuss a mental health issue with your coworkers?,Would you be willing to discuss a mental health issue with your direct supervisor(s)?,Would you bring up a mental health issue with a potential employer in an interview?,Would you bring up a physical health issue with a potential employer in an interview?,Do you feel that your employer takes mental health as seriously as physical health?,Have you heard of or observed negative consequences for coworkers with mental health conditions in your workplace?,Any additional notes or comments
0,8/27/2014 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,8/27/2014 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,8/27/2014 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,8/27/2014 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,8/27/2014 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,


In [4]:
# Count the distinct values in each column. The majority of the columns
# have few levels and can be one-hot encoded; a binary format is needed
# to check correlations.
df.nunique().to_frame()

Unnamed: 0,0
Timestamp,1249
Age,53
Gender,49
Country,48
"If you live in the United States, which state or territory do you live in?",45
Are you self-employed?,2
Do you have a family history of mental illness?,2
Have you sought treatment for a mental health condition?,2
"If you have a mental health condition, do you feel that it interferes with your work?",4
How many employees does your company or organization have?,6


In [5]:
# Give the long US state/territory question a short, workable column name.
# The rename is applied to df in place.
us_region_question = "If you live in the United States, which state or territory do you live in?"
df.rename(columns={us_region_question: "US_Region"}, inplace=True)

In [6]:
#The US makes up the overwhelming majority of responses, with the UK second
#Found a lot of literature on mental health in the workplace for the UK
#df['Country'].value_counts()

In [7]:
# Frame used for the categorical correlation work.
# The timestamp and the free-text notes/comments field are left out here
# (the comments would be interesting for text analysis later).
columns_to_exclude = ['Timestamp', 'Any additional notes or comments']
df_cat = df.drop(columns=columns_to_exclude)

In [8]:
# Pull out the three columns that are binned in the next step
country_list, region_list, gender_list = (
    df_cat[column_name] for column_name in ('Country', 'US_Region', 'Gender')
)

In [9]:
# Bin the high-cardinality columns into a few levels before one-hot encoding:
#   Country   -> 'United States' / 'United Kingdom' / 'Other'
#   US_Region -> 'NY' / 'CA' / 'Other' (anything else, including missing values, is 'Other')
#   Gender    -> 'Male' / 'Female' / 'Other'


def _bin_gender(raw_value):
    """Map a free-text gender answer to 'Male', 'Female' or 'Other'.

    The Gender column is free text (e.g. both 'Male' and 'M' appear in the
    data), so an exact match on 'Male'/'Female' would push abbreviated or
    differently-cased answers into 'Other'. Answers are compared after
    stripping whitespace and case-folding, and the single-letter forms
    'm' / 'f' are accepted. Non-string values (e.g. NaN) map to 'Other'.
    """
    if not isinstance(raw_value, str):
        return "Other"
    normalized = raw_value.strip().casefold()
    if normalized in ('male', 'm'):
        return 'Male'
    if normalized in ('female', 'f'):
        return 'Female'
    return "Other"


country_list_binned = [x if x in ['United States', "United Kingdom"] else "Other" for x in country_list]
region_list_binned = [x if x in ['NY', 'CA'] else "Other" for x in region_list]
gender_list_binned = [_bin_gender(x) for x in gender_list]

In [10]:
# Swap the raw Country / US_Region / Gender values for their binned versions.
# Country and US_Region are dropped and re-added (so they become the last
# two columns); Gender is overwritten where it already sits.
df_cat_onehot = (
    df_cat
    .drop(columns=['Country', 'US_Region'])
    .assign(
        Country=country_list_binned,
        US_Region=region_list_binned,
        Gender=gender_list_binned,
    )
)

In [11]:
# One-hot encode the frame with pandas' get_dummies
df_cat_onehot2 = pd.get_dummies(data=df_cat_onehot)

In [12]:
# Build a long-format table of pairwise correlations, one row per variable
# pair, sorted from most positive to most negative. Columns: level_0,
# level_1 (the two variable names) and "corr".
#
# The correlation matrix is symmetric, so each pair appears twice and the
# diagonal is always 1. Only the strict upper triangle is kept, so every
# unordered pair appears exactly once and self-correlations are excluded.
# (De-duplicating on the correlation *value* would also throw away
# different pairs that happen to share the same coefficient.)
corr_matrix = df_cat_onehot2.corr()
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
corr_df = (
    corr_matrix
    .where(upper_triangle)   # everything outside the upper triangle becomes NaN
    .unstack()
    .dropna()                # drop the masked cells (and undefined correlations)
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={0: "corr"})
)

In [13]:
# Keep only the stronger relationships: correlations >= .3 followed by
# correlations <= -.3 (positive block first, then the negative block).
corr_values = corr_df['corr']
strong_positive = corr_df.loc[corr_values >= .3]
strong_negative = corr_df.loc[corr_values <= -.3]
corr_df_filter = pd.concat([strong_positive, strong_negative])

In [14]:
# Save the filtered correlation table as a CSV
corr_csv_path = "/Users/joevorbeck/desktop/corr_df.csv"
corr_df_filter.to_csv(corr_csv_path)

In [15]:
                                ########## TECH VS. NON-TECH COMPANIES ############

In [16]:
# Tech vs. non-tech employers: contingency table of the "tech company = Yes"
# indicator (rows) against the "discussing a mental health issue with the
# employer would have negative consequences = Yes" indicator (columns).
tech_neg_consq = pd.crosstab(
    index=df_cat_onehot2['Is your employer primarily a tech company/organization?_Yes'],
    columns=df_cat_onehot2['Do you think that discussing a mental health issue with your employer would have negative consequences?_Yes'],
)
tech_neg_consq

Do you think that discussing a mental health issue with your employer would have negative consequences?_Yes,0,1
Is your employer primarily a tech company/organization?_Yes,Unnamed: 1_level_1,Unnamed: 2_level_1
0,161,68
1,806,225


In [17]:
# Chi-square test of independence on the tech_neg_consq contingency table;
# print the statistic and its p-value, one item per line.
chi2_stat, p_val, dof, ex = stats.chi2_contingency(tech_neg_consq)
print("Chi Square", chi2_stat, "P Value", p_val, sep="\n")

Chi Square
6.070945928863042
P Value
0.013742334501733563


In [18]:
# Tech vs. non-tech employers: contingency table of the "tech company = Yes"
# indicator (rows) against the "employer takes mental health as seriously as
# physical health = No" indicator (columns).
tech_ment_phys_health = pd.crosstab(
    index=df_cat_onehot2['Is your employer primarily a tech company/organization?_Yes'],
    columns=df_cat_onehot2['Do you feel that your employer takes mental health as seriously as physical health?_No'],
)
tech_ment_phys_health

Do you feel that your employer takes mental health as seriously as physical health?_No,0,1
Is your employer primarily a tech company/organization?_Yes,Unnamed: 1_level_1,Unnamed: 2_level_1
0,142,87
1,777,254


In [19]:
# Chi-square test of independence on the tech_ment_phys_health contingency
# table; print the statistic and its p-value, one item per line.
chi2_stat, p_val, dof, ex = stats.chi2_contingency(tech_ment_phys_health)
print("Chi Square", chi2_stat, "P Value", p_val, sep="\n")

Chi Square
16.261161952805054
P Value
5.518353631574255e-05


In [20]:
# Tech vs. non-tech employers: contingency table of the "tech company = Yes"
# indicator (rows) against the "heard of or observed negative consequences
# for coworkers with mental health conditions = Yes" indicator (columns).
tech_ment_health_conseq = pd.crosstab(
    index=df_cat_onehot2['Is your employer primarily a tech company/organization?_Yes'],
    columns=df_cat_onehot2['Have you heard of or observed negative consequences for coworkers with mental health conditions in your workplace?_Yes'],
)
tech_ment_health_conseq

Have you heard of or observed negative consequences for coworkers with mental health conditions in your workplace?_Yes,0,1
Is your employer primarily a tech company/organization?_Yes,Unnamed: 1_level_1,Unnamed: 2_level_1
0,184,45
1,891,140


In [21]:
# Chi-square test of independence on the tech_ment_health_conseq contingency
# table; print the statistic and its p-value, one item per line.
chi2_stat, p_val, dof, ex = stats.chi2_contingency(tech_ment_health_conseq)
print("Chi Square", chi2_stat, "P Value", p_val, sep="\n")

Chi Square
5.040275605443921
P Value
0.024764549751542176


In [22]:
                                ########## AUTOMATED CHI SQUARE ############

In [23]:
# Automated chi-square screen: cross-tabulate every pair of columns in
# df_cat_onehot2 and record the chi-square statistic and p-value.
#
# Each unordered pair is tested once. The test is symmetric in its two
# variables, so (a, b) and (b, a) give the same statistic, and testing a
# column against itself carries no information. Iterating over combinations
# instead of the full product skips both cases and roughly halves the number
# of crosstabs.
#
# result_df columns (positional): 0 = first variable, 1 = second variable,
# 2 = chi-square statistic, 3 = p-value.
# NOTE(review): the p-values are not adjusted for multiple comparisons.

list_1 = df_cat_onehot2.columns
list_2 = df_cat_onehot2.columns  # same columns as list_1; not used by the loop below
list_of_vals = list()

for i, j in itertools.combinations(list_1, 2):
    chi_df = pd.crosstab(df_cat_onehot2[i], df_cat_onehot2[j])
    chi2_stat, p_val, dof, ex = stats.chi2_contingency(chi_df)
    vals = (i, j, chi2_stat, p_val)
    list_of_vals.append(vals)


result_df = pd.DataFrame(list_of_vals)

In [24]:
# Save the pairwise chi-square results as a CSV
chi_sq_csv_path = "/Users/joevorbeck/Desktop/auto_chi_sq.csv"
result_df.to_csv(chi_sq_csv_path)

In [25]:
                        ########## FURTHER INVESTIGATION ON TECH VS.NON ############
                                ########## TRYING TO ISOLATE DV ############

In [26]:
# Narrow the one-hot frame to the tech-employer indicator plus the four
# outcome indicators compared between tech and non-tech employers below.
tech_vs_non_columns = [
    'Is your employer primarily a tech company/organization?_Yes',
    'Do you think that discussing a mental health issue with your employer would have negative consequences?_Yes',
    'Would you be willing to discuss a mental health issue with your coworkers?_No',
    'Would you be willing to discuss a mental health issue with your direct supervisor(s)?_No',
    'Have you heard of or observed negative consequences for coworkers with mental health conditions in your workplace?_Yes',
]
tech_vs_non = df_cat_onehot2[tech_vs_non_columns]

In [27]:
# Number of rows in each tech-indicator group (0 and 1), later used as the
# denominator for the per-group shares. The grouping column is dropped so
# only the counts remain, indexed 0..n-1.
tech_flag_col = 'Is your employer primarily a tech company/organization?_Yes'
counted_col = 'Do you think that discussing a mental health issue with your employer would have negative consequences?_Yes'
group_sizes = tech_vs_non.groupby(tech_flag_col)[counted_col].count()
total = group_sizes.reset_index().drop(columns=tech_flag_col)

In [28]:
# The count column still carries the question text as its name; call it 'total'
count_column_names = {
    'Do you think that discussing a mental health issue with your employer would have negative consequences?_Yes': 'total',
}
total.rename(columns=count_column_names, inplace=True)

In [29]:
# Per tech-indicator group: how many respondents expect negative consequences
# from discussing a mental health issue with their employer, and what share
# of the group that is (sum of the indicator / group size from `total`).
tech_flag_col = 'Is your employer primarily a tech company/organization?_Yes'
outcome_col = 'Do you think that discussing a mental health issue with your employer would have negative consequences?_Yes'
tech_non_neg_cons = tech_vs_non.groupby(tech_flag_col)[outcome_col].sum().reset_index()
tech_non_neg_cons['%_of_tot'] = tech_non_neg_cons[outcome_col].div(total['total'])
tech_non_neg_cons

Unnamed: 0,Is your employer primarily a tech company/organization?_Yes,Do you think that discussing a mental health issue with your employer would have negative consequences?_Yes,%_of_tot
0,0,68,0.296943
1,1,225,0.218235


In [30]:
# Per tech-indicator group: how many respondents would not discuss a mental
# health issue with coworkers, and what share of the group that is
# (sum of the indicator / group size from `total`).
tech_flag_col = 'Is your employer primarily a tech company/organization?_Yes'
outcome_col = 'Would you be willing to discuss a mental health issue with your coworkers?_No'
tech_non_disc_cowor = tech_vs_non.groupby(tech_flag_col)[outcome_col].sum().reset_index()
tech_non_disc_cowor['%_of_tot'] = tech_non_disc_cowor[outcome_col].div(total['total'])
tech_non_disc_cowor

Unnamed: 0,Is your employer primarily a tech company/organization?_Yes,Would you be willing to discuss a mental health issue with your coworkers?_No,%_of_tot
0,0,57,0.248908
1,1,203,0.196896


In [31]:
# Per tech-indicator group: how many respondents would not discuss a mental
# health issue with their direct supervisor(s), and what share of the group
# that is (sum of the indicator / group size from `total`).
tech_flag_col = 'Is your employer primarily a tech company/organization?_Yes'
outcome_col = 'Would you be willing to discuss a mental health issue with your direct supervisor(s)?_No'
tech_non_disc_supv = tech_vs_non.groupby(tech_flag_col)[outcome_col].sum().reset_index()
tech_non_disc_supv['%_of_tot'] = tech_non_disc_supv[outcome_col].div(total['total'])
tech_non_disc_supv


Unnamed: 0,Is your employer primarily a tech company/organization?_Yes,Would you be willing to discuss a mental health issue with your direct supervisor(s)?_No,%_of_tot
0,0,81.0,0.353712
1,1,313.0,0.303589


In [32]:
# Per tech-indicator group: how many respondents have heard of or observed
# negative consequences for coworkers with mental health conditions, and what
# share of the group that is (sum of the indicator / group size from `total`).
tech_flag_col = 'Is your employer primarily a tech company/organization?_Yes'
outcome_col = 'Have you heard of or observed negative consequences for coworkers with mental health conditions in your workplace?_Yes'
tech_non_neg_cons_cowork = tech_vs_non.groupby(tech_flag_col)[outcome_col].sum().reset_index()
tech_non_neg_cons_cowork['%_of_tot'] = tech_non_neg_cons_cowork[outcome_col].div(total['total'])
tech_non_neg_cons_cowork

Unnamed: 0,Is your employer primarily a tech company/organization?_Yes,Have you heard of or observed negative consequences for coworkers with mental health conditions in your workplace?_Yes,%_of_tot
0,0,45,0.196507
1,1,140,0.13579
