In [13]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [14]:
# Import dataset
df = pd.read_csv("raw-data/mentalhealth.csv", index_col=False)
# original size: 292363 x 17

In [15]:
# Drop rows containing NAs
df.dropna(how='any', inplace=True)
# new size: 287162 x 17

# Subset only countries containing United States
df=df[df['Country'] == 'United States']
# new size: 168056 x 17

# Drop Unnecessary columns: Timestamp, mental health interview, country
df.drop(['Timestamp', "mental_health_interview", "Country"], axis=1, inplace=True)
df

Unnamed: 0,Gender,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,care_options
3,Female,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Yes
4,Female,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Yes
7,Female,Corporate,No,No,No,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No
8,Female,Corporate,No,No,No,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Not sure
9,Female,Corporate,No,No,No,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292358,Male,Business,No,No,No,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,Not sure
292359,Male,Business,Yes,Yes,Yes,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,Not sure
292361,Male,Business,No,Yes,No,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,No
292362,Male,Business,No,Yes,Yes,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,Yes


In [6]:
# Check ratios of each class
df.describe()

Unnamed: 0,Gender_bin,self_employed_bin,family_history_bin,treatment_bin,Coping_struggles_bin,Occupation_Business,Occupation_Corporate,Occupation_Housewife,Occupation_Others,Occupation_Student,...,Mood_Swings_Medium,Work_Interest_Maybe,Work_Interest_No,Work_Interest_Yes,Social_Weakness_Maybe,Social_Weakness_No,Social_Weakness_Yes,care_options_No,care_options_Not sure,care_options_Yes
count,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0,...,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0
mean,0.204015,0.056219,0.441365,0.541189,0.473925,0.171842,0.20827,0.226776,0.181255,0.211858,...,0.345266,0.345736,0.361743,0.292522,0.353269,0.333329,0.313401,0.332711,0.278324,0.388966
std,0.402981,0.230346,0.496551,0.498302,0.499321,0.377244,0.406072,0.418748,0.385231,0.408626,...,0.475456,0.475609,0.480506,0.454922,0.477987,0.471405,0.463877,0.471185,0.448175,0.487517
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
# ORIGINAL

# Convert each column to numeric
# Boolean columns are converted 0/1 (i.e. Gender, self_employed, family_history, treatment, Growing_stress, coping struggles)

df['Gender_bin'] = (df['Gender']=='Female').astype(int)
df['self_employed_bin'] = (df['self_employed']=='Yes').astype(int)
df['family_history_bin'] = np.where(df['family_history'] == 'Yes', 1, 0)
df['treatment_bin'] = np.where(df['treatment'] == 'Yes', 1, 0)
df['Coping_struggles_bin'] = np.where(df['Coping_Struggles'] == 'Yes', 1, 0)

df['Growing_Stress'].replace(['Yes'], 2, inplace=True)
df['Growing_Stress'].replace(['Maybe'], 1, inplace=True)
df['Growing_Stress'].replace(['No'], 0, inplace=True)
# from sklearn.datasets import make_multilabel_classification

df.drop(['Gender',"self_employed", "family_history", "treatment", "Coping_Struggles"], axis=1, inplace=True)

# One hot encode the other columns
def onehot(inputdf, colname):
    hot_encoded_data = pd.get_dummies(inputdf, columns = [colname])
    return hot_encoded_data

col_names = ['Occupation', 'Days_Indoors', 'Changes_Habits', 'Mental_Health_History', 'Mood_Swings', 'Work_Interest', 'Social_Weakness', 'care_options']
for col in col_names: 
    df = onehot(df, col)

df
# 168056 rows x 34 col


Unnamed: 0,Growing_Stress,Gender_bin,self_employed_bin,family_history_bin,treatment_bin,Coping_struggles_bin,Occupation_Business,Occupation_Corporate,Occupation_Housewife,Occupation_Others,...,Mood_Swings_Medium,Work_Interest_Maybe,Work_Interest_No,Work_Interest_Yes,Social_Weakness_Maybe,Social_Weakness_No,Social_Weakness_Yes,care_options_No,care_options_Not sure,care_options_Yes
3,2,1,0,1,1,0,0,1,0,0,...,1,0,1,0,0,0,1,0,0,1
4,2,1,0,1,1,0,0,1,0,0,...,1,0,1,0,0,0,1,0,0,1
7,2,1,0,0,0,0,0,1,0,0,...,1,0,1,0,0,0,1,1,0,0
8,2,1,0,0,0,0,0,1,0,0,...,1,0,1,0,0,0,1,0,1,0
9,2,1,0,0,0,0,0,1,0,0,...,1,0,1,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292358,0,0,0,0,0,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
292359,0,0,1,1,1,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
292361,0,0,0,1,0,1,1,0,0,0,...,0,0,1,0,1,0,0,1,0,0
292362,0,0,0,1,1,1,1,0,0,0,...,0,0,1,0,1,0,0,0,0,1


In [5]:
# Final dataframe
df.describe()

Unnamed: 0,Gender_bin,self_employed_bin,family_history_bin,treatment_bin,Coping_struggles_bin,Occupation_Business,Occupation_Corporate,Occupation_Housewife,Occupation_Others,Occupation_Student,...,Mood_Swings_Medium,Work_Interest_Maybe,Work_Interest_No,Work_Interest_Yes,Social_Weakness_Maybe,Social_Weakness_No,Social_Weakness_Yes,care_options_No,care_options_Not sure,care_options_Yes
count,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0,...,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0,168056.0
mean,0.204015,0.056219,0.441365,0.541189,0.473925,0.171842,0.20827,0.226776,0.181255,0.211858,...,0.345266,0.345736,0.361743,0.292522,0.353269,0.333329,0.313401,0.332711,0.278324,0.388966
std,0.402981,0.230346,0.496551,0.498302,0.499321,0.377244,0.406072,0.418748,0.385231,0.408626,...,0.475456,0.475609,0.480506,0.454922,0.477987,0.471405,0.463877,0.471185,0.448175,0.487517
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
# save with date (don't accidentally rewrite files!!!)
df.to_csv('clean-data/multiclass.csv', index=False) 