# Mental Health in Tech Survey Data Analysis
### Data Source: Kaggle (https://www.kaggle.com/osmi/mental-health-in-tech-survey/data)

In [1]:
# Import packages

import pandas as pd

In [13]:
# Load the mental health survey data set.
#
# The CSV is looked up relative to the notebook's working directory. Either
#   A) save survey.csv in the same folder as this notebook, or
#   B) change DATA_PATH to wherever you stored the file on your system.
DATA_PATH = "survey.csv"

# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
mental_set = pd.read_csv(DATA_PATH)

In [11]:
# Preview the first five survey responses
mental_set.head(5)

Unnamed: 0,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,...,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,37,Female,United States,,No,Yes,Often,6-25,No,Yes,...,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No
1,44,M,United States,,No,No,Rarely,More than 1000,No,No,...,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No
2,32,Male,Canada,,No,No,Rarely,6-25,No,Yes,...,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No
3,31,Male,United Kingdom,,Yes,Yes,Often,26-100,No,Yes,...,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes
4,31,Male,United States,,No,No,Never,100-500,Yes,Yes,...,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No


In [8]:
# How many rows and columns does the data set have?
print(mental_set.shape)
print()

# How are the numeric columns distributed?
print(mental_set.describe())
print()

# Which dtype and non-null count does each column have?
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
mental_set.info()

(1259, 27)

                Age
count  1.259000e+03
mean   7.942815e+07
std    2.818299e+09
min   -1.726000e+03
25%    2.700000e+01
50%    3.100000e+01
75%    3.600000e+01
max    1.000000e+11

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 27 columns):
Timestamp                    1259 non-null object
Age                          1259 non-null int64
Gender                       1259 non-null object
Country                      1259 non-null object
state                        744 non-null object
self_employed                1241 non-null object
family_history               1259 non-null object
treatment                    1259 non-null object
work_interfere               995 non-null object
no_employees                 1259 non-null object
remote_work                  1259 non-null object
tech_company                 1259 non-null object
benefits                     1259 non-null object
care_options                 1259 non-null object
wel

# Dealing with missing data

In [14]:
# "Timestamp", "comments" and "state" are not expected to help the prediction
# model, so all three are removed in a single drop call.
mental_set = mental_set.drop(columns=['comments', 'state', 'Timestamp'])

mental_set.head()


Unnamed: 0,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,...,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,37,Female,United States,,No,Yes,Often,6-25,No,Yes,...,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No
1,44,M,United States,,No,No,Rarely,More than 1000,No,No,...,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No
2,32,Male,Canada,,No,No,Rarely,6-25,No,Yes,...,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No
3,31,Male,United Kingdom,,Yes,Yes,Often,26-100,No,Yes,...,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes
4,31,Male,United States,,No,No,Never,100-500,Yes,Yes,...,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No


# Check for missing data & cleaning NaN values

In [15]:
# Largest number of missing values found in any single column
mental_set.isna().sum().max()

264

In [18]:
## Replace NaN values with a per-type placeholder

# Placeholder used for each kind of column
defaultInt = 0
defaultString = 'NaN'
defaultFloat = 0.0

# Columns grouped by the kind of data they hold
intFeatures = ['Age']
stringFeatures = [
    'Gender', 'Country', 'self_employed', 'family_history', 'treatment',
    'work_interfere', 'no_employees', 'remote_work', 'tech_company',
    'anonymity', 'leave', 'mental_health_consequence',
    'phys_health_consequence', 'coworkers', 'supervisor',
    'mental_health_interview', 'phys_health_interview',
    'mental_vs_physical', 'obs_consequence', 'benefits', 'care_options',
    'wellness_program', 'seek_help',
]
floatFeatures = []

# (column group, placeholder) pairs, checked in this order for every column
fill_rules = [
    (intFeatures, defaultInt),
    (stringFeatures, defaultString),
    (floatFeatures, defaultFloat),
]

for feature in mental_set:
    for group, placeholder in fill_rules:
        if feature in group:
            mental_set[feature] = mental_set[feature].fillna(placeholder)
            break
    else:
        # The column is in none of the groups above: report it and leave it as is
        print('Error: Feature %s not recognized.' % feature)

# To inspect the result, uncomment the line below
# mental_set.head(5)

# Removing Duplicate Gender Titles

In [20]:
# Collapse the free-text 'Gender' answers into three groups.
#
# Matching is done on the lower-cased answer. The whole column is mapped in
# one vectorised step instead of calling Series.replace(..., inplace=True)
# once per row: that pattern is quadratic and, being a chained in-place
# update on a column selection, does not write back to the frame when pandas
# Copy-on-Write is active.

# Unique raw gender titles, captured before normalisation (for inspection)
gender = mental_set['Gender'].unique()

# Accepted spellings for each group
male_str = ["male", "m", "male-ish", "maile", "mal", "male (cis)", "make", "male ", "man","msle", "mail", "malr","cis man", "Cis Male", "cis male"]
trans_str = ["trans-female", "something kinda male?", "queer/she/they", "non-binary","nah", "all", "enby", "fluid", "genderqueer", "androgyne", "agender", "male leaning androgynous", "guy (-ish) ^_^", "trans woman", "neuter", "female (trans)", "queer", "ostensibly male, unsure what that really means"]
female_str = ["cis female", "f", "female", "woman",  "femake", "female ","cis-female/femme", "female (cis)", "femail"]

# Lower-cased spelling -> group label
gender_lookup = {}
for label, spellings in (('male', male_str), ('female', female_str), ('trans', trans_str)):
    for spelling in spellings:
        gender_lookup[spelling.lower()] = label

# Answers that match no spelling map to NaN and are restored to their
# original text by fillna, so they stay untouched exactly as before.
gender_key = mental_set['Gender'].str.lower()
mental_set['Gender'] = gender_key.map(gender_lookup).fillna(mental_set['Gender'])

# Drop rows whose 'Gender' answer is not a gender at all.
# .copy() makes the filtered frame independent, so later column assignments
# do not operate on an ambiguous slice of the unfiltered frame.
stk_list = ['A little about you', 'p']
mental_set = mental_set[~mental_set['Gender'].isin(stk_list)].copy()

print(mental_set['Gender'].unique())

['female' 'male' 'trans']


# Dealing with missing 'Age' values

In [21]:
# Replace missing or implausible ages with the median age.
#
# Ages outside [AGE_MIN, AGE_MAX] are treated as invalid. The upper limit is
# set to 100 so that it agrees with the last bin edge below; with a limit of
# 120, ages 101-120 would be kept but receive a NaN 'age_range'.
AGE_MIN, AGE_MAX = 18, 100

age = mental_set['Age']
# between() is False for NaN, so missing ages are handled together with outliers
plausible = age.between(AGE_MIN, AGE_MAX)
# The median is taken over plausible ages only, so outliers cannot influence it
median_age = age[plausible].median()

# assign() returns a new frame rather than writing through an alias of a
# (possibly filtered) frame.
mental_set = mental_set.assign(Age=age.where(plausible, median_age))

# Create the ranges of 'Age'
mental_set['age_range'] = pd.cut(
    mental_set['Age'],
    [0, 20, 30, 65, 100],
    labels=["0-20", "21-30", "31-65", "66-100"],
    include_lowest=True,
)

# Dealing with NaN values for the feature 'self_employed' 

In [22]:
# Only about 1.4% of the raw responses left 'self_employed' blank, so those
# respondents are treated as NOT self-employed. Blanks were previously filled
# with the defaultString placeholder, which is what gets replaced here.
self_employed_filled = mental_set['self_employed'].replace({defaultString: 'No'})
mental_set['self_employed'] = self_employed_filled
print(mental_set['self_employed'].unique())

['No' 'Yes']


# Dealing with NaN values for the feature 'work_interfere'

In [23]:
# Roughly 21% of the raw responses left 'work_interfere' blank; those answers
# are recorded as "Don't know". Blanks were previously filled with the
# defaultString placeholder, which is what gets replaced here.
work_interfere_filled = mental_set['work_interfere'].replace({defaultString: "Don't know"})
mental_set['work_interfere'] = work_interfere_filled
print(mental_set['work_interfere'].unique())

['Often' 'Rarely' 'Never' 'Sometimes' "Don't know"]
