In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn import preprocessing
from sklearn.preprocessing import binarize, LabelEncoder, MinMaxScaler
from scipy.stats import ttest_1samp
from scipy.stats import chi2_contingency

In [2]:
# Load the cleaned survey responses from a CSV file (path is relative to the
# notebook's working directory), then show the column index so the available
# variables are visible before any testing starts.
survey = pd.read_csv('survey_clean.csv')
survey.columns

Index(['age', 'Gender', 'Country', 'self_employed', 'family_history',
       'treatment', 'work_interfere', 'no_employees', 'remote_work',
       'tech_company', 'benefits', 'care_options', 'wellness_program',
       'seek_help', 'anonymity', 'leave', 'mental_health_consequence',
       'phys_health_consequence', 'coworkers', 'supervisor',
       'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'age_range'],
      dtype='object')

In [3]:
# Sample mean of respondent age, shown for comparison with the hypothesized
# population mean tested in the next section.
survey['age'].mean()

32.07673860911271

**Hypothesis testing**

In [30]:
# One-sample, two-sided t-test on respondent age.
#
# Confidence level = 0.95, so alpha = 0.05.
#
# H0: the average age of a tech worker is 38 years old
#     (figure from the Visier study, "330,000 Anonymous Employee Records Revealed").
# H1: the average age is different from 38.
#
# Signature reminder: ttest_1samp(a, popmean, axis=0, nan_policy='propagate')
#   a          - the sample
#   popmean    - the value the sample mean is tested against
#   axis       - axis along which the test is computed
#   nan_policy - how NaNs are handled:
#                  'propagate' -> the result is nan if any NaN is present
#                  'raise'     -> an error is raised if any NaN is present
#                  'omit'      -> NaNs are ignored and the remaining values are used
statistic, p_value = ttest_1samp(a=survey['age'], popmean=38, nan_policy='omit')

print(
    'The test statistic is {0} \nThe p-value of the test is {1} '.format(statistic, p_value)
)

The test statistic is -28.74517719834334 
The p-value of the test is 6.566438722561636e-140 


In [None]:
#p_value < alpha:
# We can reject the null hypothesis (p-value << 5%) that the average age of a tech worker is 38; the sample mean is about 32.1 years.

**Chi squared test of independence between treatment and family_history**


In [14]:
# Keep only the two variables compared in this test and display them.
fam_columns = ['family_history', 'treatment']
fam_treatment = survey[fam_columns]
fam_treatment

Unnamed: 0,family_history,treatment
0,No,Yes
1,No,No
2,No,No
3,Yes,Yes
4,No,No
...,...,...
1246,No,Yes
1247,Yes,Yes
1248,Yes,Yes
1249,No,No


In [25]:
# Contingency table of observed counts: family history (rows) by treatment (columns).
family_history = survey["family_history"]
treatment = survey["treatment"]
table = pd.crosstab(index=family_history, columns=treatment)
table

treatment,No,Yes
family_history,Unnamed: 1_level_1,Unnamed: 2_level_1
No,492,270
Yes,127,362


In [33]:
# Give the contingency table self-describing labels.
# The labels are applied by position, so their order must match the category
# order of the crosstab (rows: No, Yes; columns: No, Yes).
row_labels = ["Family History No", "Family History Yes"]
column_labels = ["Treatment No", "Treatment Yes"]
table.index = row_labels
table.columns = column_labels
table

Unnamed: 0,Treatment No,Treatment Yes
Family History No,492,270
Family History Yes,127,362


In [34]:
# H0: seeking treatment is independent of having a family history of mental illness.
# Confidence level = 0.95, so alpha = 0.05.
# `ex` receives the expected frequencies under independence; it is not printed here.
# Note: for a 2x2 table (one degree of freedom) scipy applies Yates' continuity
# correction by default (correction=True).
chi2, p, dof, ex = chi2_contingency(table)
print(
    'The test statistic is {0} \nThe p-value of the test is {1} \nDegrees of freedom = {2} '.format(chi2, p, dof)
)

The test statistic is 175.95516961872426 
The p-value of the test is 3.703610823400622e-40 
Degrees of freedom = 1 


In [None]:
#p_value < alpha
# We can reject the null hypothesis (p-value << 5%): seeking treatment is not independent of having a family history of mental health problems.

**Chi squared test of independence between treatment and benefits**

In [43]:
# Keep only the two variables compared in this test and display them.
ben_columns = ['benefits', 'treatment']
ben_treatment = survey[ben_columns]
ben_treatment

Unnamed: 0,benefits,treatment
0,Yes,Yes
1,Don't know,No
2,No,No
3,No,Yes
4,Yes,No
...,...,...
1246,No,Yes
1247,Yes,Yes
1248,Yes,Yes
1249,No,No


In [44]:
# Contingency table of observed counts: employer benefits (rows) by treatment (columns).
benefits = survey["benefits"]
treatment = survey["treatment"]
table = pd.crosstab(index=benefits, columns=treatment)
table

treatment,No,Yes
benefits,Unnamed: 1_level_1,Unnamed: 2_level_1
Don't know,256,151
No,192,179
Yes,171,302


In [45]:
# Give the contingency table self-describing labels.
# The labels are applied by position, so their order must match the category
# order of the crosstab (rows: Don't know, No, Yes; columns: No, Yes).
row_labels = ["Benefits Don't know", "Benefits No", "Benefits Yes"]
column_labels = ["Treatment No", "Treatment Yes"]
table.index = row_labels
table.columns = column_labels
table

Unnamed: 0,Treatment No,Treatment Yes
Benefits Don't know,256,151
Benefits No,192,179
Benefits Yes,171,302


In [46]:
# H0: seeking treatment is independent of the mental health benefits provided by the employer.
# Confidence level = 0.95, so alpha = 0.05.
# `ex` receives the expected frequencies under independence; it is not printed here.
chi2, p, dof, ex = chi2_contingency(table)
print(
    'The test statistic is {0} \nThe p-value of the test is {1} \nDegrees of freedom = {2} '.format(chi2, p, dof)
)

The test statistic is 63.696948152795784 
The p-value of the test is 1.4736130252077323e-14 
Degrees of freedom = 2 


In [None]:
#p_value < alpha
# We can reject the null hypothesis (p-value << 5%): seeking treatment is not independent of whether the employer provides mental health benefits.


**Chi squared test of independence between treatment and gender**

In [49]:
# Keep only the two variables compared in this test and display them.
gen_columns = ['Gender', 'treatment']
gen_treatment = survey[gen_columns]
gen_treatment

Unnamed: 0,Gender,treatment
0,female,Yes
1,male,No
2,male,No
3,male,Yes
4,male,No
...,...,...
1246,male,Yes
1247,male,Yes
1248,male,Yes
1249,female,No


In [50]:
# Contingency table of observed counts: gender (rows) by treatment (columns).
gender = survey["Gender"]
treatment = survey["treatment"]
table = pd.crosstab(index=gender, columns=treatment)
table

treatment,No,Yes
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,77,170
male,538,448
non-binary,4,14


In [52]:
# Give the contingency table self-describing labels.
# The labels are applied by position, so their order must match the category
# order of the crosstab (rows: female, male, non-binary; columns: No, Yes).
row_labels = ["Female", "Male", "Non-binary"]
column_labels = ["Treatment No", "Treatment Yes"]
table.index = row_labels
table.columns = column_labels
table

Unnamed: 0,Treatment No,Treatment Yes
Female,77,170
Male,538,448
Non-binary,4,14


In [53]:
# H0: seeking treatment is independent of the employee's gender.
# Confidence level = 0.95, so alpha = 0.05.
# `ex` receives the expected frequencies under independence; it is not printed here.
chi2, p, dof, ex = chi2_contingency(table)
print(
    'The test statistic is {0} \nThe p-value of the test is {1} \nDegrees of freedom = {2} '.format(chi2, p, dof)
)

The test statistic is 48.65692242551921 
The p-value of the test is 2.7182133562776043e-11 
Degrees of freedom = 2 


In [None]:
#p_value < alpha
# We can reject the null hypothesis (p-value << 5%): seeking treatment is not independent of the employee's gender.