In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.base import clone
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_digits, make_classification
from itertools import combinations
import ast

## Data from 2017

In [300]:
mental2017 = pd.read_csv("mental_health/survey_2017.csv")


In [301]:
mental2017.shape

(756, 123)

In [302]:
print(len(mental2017.columns))
mental2017.shape

123


(756, 123)

In [303]:
import csv
csv_file = '2017columns.csv'
my_list = mental2017.columns.to_list()
df = pd.DataFrame(my_list, columns=['Column_Name'])

# Export the DataFrame to a CSV file with the custom separator (e.g., semicolon)
df.to_csv('my_list.csv', sep=';', index=False)


In [304]:
column_names_2017 = ['Age', 'Gender', 'Country', 'family_history', 'treatment', 'no_employees', 'benefits', 'care_options', 'wellness_program',
 'anonymity', 'leave', 'phys_health_consequence', 'coworkers', 'supervisor', 'phys_health_interview', 'work_treatment_interfere', 'work_no_treatment_interfere']

In [305]:
dict_2017 = {'What is your age?': 'Age',
 'What is your gender?': 'Gender',
 'Do you have a family history of mental illness?': 'family_history',
 'How many employees does your company or organization have?': 'no_employees',
 'Does your employer provide mental health benefits\xa0as part of healthcare coverage?': 'benefits',
 'Do you know the options for mental health care available under your employer-provided health coverage?': 'care_options',
 'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?': 'wellness_program',
 'Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?': 'anonymity',
 'If a mental health issue prompted you to request a medical leave from work, how easy or difficult would it be to ask for that leave?': 'leave',
 'Would you feel comfortable discussing a mental health issue with your coworkers?': 'coworkers',
 'Would you have been willing to discuss your mental health with your direct supervisor(s)?': 'supervisor',
 'Would you be willing to bring up a physical health issue with a potential employer in an interview?': 'phys_health_interview',
 'If you have a mental health disorder, how often do you feel that it interferes with your work <strong>when being treated effectively?</strong>': 'work_treatment_interfere',
 'If you have a mental health disorder, how often do you feel that it interferes with your work <strong>when <em>NOT</em> being treated effectively (i.e., when you are experiencing symptoms)?</strong>': 'work_no_treatment_interfere',
 'What country do you <strong>live</strong> in?': 'Continent',
 'Have you ever sought treatment for a mental health disorder from a mental health professional?': 'treatment'}


In [306]:
dict_2017

{'What is your age?': 'Age',
 'What is your gender?': 'Gender',
 'Do you have a family history of mental illness?': 'family_history',
 'How many employees does your company or organization have?': 'no_employees',
 'Does your employer provide mental health benefits\xa0as part of healthcare coverage?': 'benefits',
 'Do you know the options for mental health care available under your employer-provided health coverage?': 'care_options',
 'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?': 'wellness_program',
 'Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?': 'anonymity',
 'If a mental health issue prompted you to request a medical leave from work, how easy or difficult would it be to ask for that leave?': 'leave',
 'Would you feel comfortable discussing a mental health issue with your coworkers?': 'coworkers',


In [307]:
mental2017 = mental2017[dict_2017.keys()].rename(columns = dict_2017)
# test.rename(columns = dict)

In [308]:
mental2017.isna().sum()

Age                              2
Gender                          14
family_history                   0
no_employees                   113
benefits                       113
care_options                   180
wellness_program               113
anonymity                      113
leave                          113
coworkers                      113
supervisor                      89
phys_health_interview            0
work_treatment_interfere         0
work_no_treatment_interfere      0
Continent                        2
treatment                        0
dtype: int64

In [309]:
#care_options fill NA
mental2017.care_options.fillna('Not sure', inplace=True)

In [310]:
# Gender and supervisor fill NA 
mental2017.supervisor.fillna("I don't know", inplace = True)
mental2017.Gender.fillna('other', inplace= True)
mental2017.isna().sum()

Age                              2
Gender                           0
family_history                   0
no_employees                   113
benefits                       113
care_options                     0
wellness_program               113
anonymity                      113
leave                          113
coworkers                      113
supervisor                       0
phys_health_interview            0
work_treatment_interfere         0
work_no_treatment_interfere      0
Continent                        2
treatment                        0
dtype: int64

In [311]:
# benefits drop NA
mental2017.dropna(subset=['benefits'], inplace= True)

In [312]:
mental2017.isna().sum()

Age                            0
Gender                         0
family_history                 0
no_employees                   0
benefits                       0
care_options                   0
wellness_program               0
anonymity                      0
leave                          0
coworkers                      0
supervisor                     0
phys_health_interview          0
work_treatment_interfere       0
work_no_treatment_interfere    0
Continent                      0
treatment                      0
dtype: int64

In [313]:
old_mental = pd.read_csv('14_16_pre_Encoder.csv')
old_mental = old_mental[mental2017.columns]

In [314]:
# for i in old_mental.columns:
#     print(i)
#     print(old_mental[i].unique())
#     print(20*'=')
#     print(mental2017[i].unique())
#     input()

In [315]:
# test = mental2017.copy()
mental2017 = test.copy()
# Gender
mental2017.Gender = mental2017.Gender.str.lower()
# other
trans_gender_list = ['Genderfluid', 'Nonbinary', 'other','uhhhhhhhhh fem genderqueer?', 'God King of the Valajar','Non-binary',
                     'Agender/genderfluid','sometimes', 'Contextual', 'contextual', 'Non binary', 'Genderqueer demigirl',
 'Genderqueer/non-binary', 'nonbinary', '\\-', 'Transfeminine', 'non binary']

for g in trans_gender_list:
    mental2017.loc[mental2017['Gender'].str.contains(g), 'Gender'] = 'other'
# female
female_gender_list = ['Female', 'F', 'female', 'f', 'Female ', 'Woman', 'femalw', 'femail', 'female (cis)', 'My sex is female.', 'woman', 
                       'female (cisgender)', 'Female (cis) ', 'cis-Female', 'cis female', 'F', 'cisgender']

for g in female_gender_list:
    mental2017.loc[mental2017['Gender'].str.contains(g), 'Gender'] = 'f'

#male
male_gender_list = ['male', 'Male', 'M', 'Man', 'cis-male', 'Mail', 'cis hetero male', 'm', 'Male (cis)', 'Cis male', 'man', 'cis male ',
                     'Male-ish', 'Male ', 'Cis-male', 'Cis Male', 'dude']

for g in male_gender_list:
    mental2017.loc[mental2017['Gender'].str.contains(g), 'Gender'] = 'm'

mental2017.Gender.unique()

  mental2017.loc[mental2017['Gender'].str.contains(g), 'Gender'] = 'f'
  mental2017.loc[mental2017['Gender'].str.contains(g), 'Gender'] = 'm'


array(['f', 'm', 'other'], dtype=object)

In [316]:
# benefits
mental2017.loc[mental2017['benefits'].str.contains("I don't know"), 'benefits'] = "Don't know"
mental2017.loc[mental2017['benefits'].str.contains('Not eligible for coverage / NA'), 'benefits'] = 'Not eligible for coverage / N/A'
# wellness_program
mental2017.loc[mental2017['wellness_program'].str.contains("I don't know"), 'wellness_program'] = "Don't know"

# anonymity
mental2017.loc[mental2017['anonymity'].str.contains("I don't know"), 'anonymity'] = "Don't know"

# leave
mental2017.loc[mental2017['leave'].str.contains("I don't know"), 'leave'] = "Don't know"
mental2017.loc[mental2017['leave'].str.contains("Difficult"), 'leave'] = "Very difficult"

# coworkers
mental2017.loc[mental2017['coworkers'].str.contains("I don't know"), 'coworkers'] = "Don't know"

# supervisor
d_sup ={'Yes, all of my previous supervisors' : 'Yes', 'No, none of my previous supervisors' : 'No', 'Some of my previous supervisors' : 'Some of them',
"I don't know" : 'Maybe'}
for i in d_sup.keys():
        mental2017.loc[mental2017['supervisor'].str.contains(i), 'supervisor'] = d_sup[i]


In [317]:
# Country
mental2017.Continent.replace({"United States of America":"United States"}, inplace= True)

In [318]:
continents = pd.read_csv('country_cont.csv', header=None)
continents.columns = ['Continent', 'Continent_real']

In [319]:
mental2017 = pd.merge(mental2017, continents, on='Continent', how='left')

In [320]:
for i in mental2017.Continent.unique().tolist():
    if i not in continents.Continent.tolist():
        print(i)

United States


In [321]:
# Fix the United States
mental2017.loc[mental2017.Continent == 'United States', 'Continent_real'] = 'North America'

In [322]:
mental2017.Continent = mental2017.Continent_real
mental2017.drop('Continent_real', axis=1, inplace= True)
mental2017

Unnamed: 0,Age,Gender,family_history,no_employees,benefits,care_options,wellness_program,anonymity,leave,coworkers,supervisor,phys_health_interview,work_treatment_interfere,work_no_treatment_interfere,Continent,treatment
0,27.0,f,No,100-500,No,Yes,No,Don't know,Don't know,Yes,Yes,Yes,Sometimes,Sometimes,Europe,1
1,31.0,m,No,100-500,Yes,Yes,No,Don't know,Don't know,Yes,No,Yes,Not applicable to me,Sometimes,Europe,0
2,36.0,m,Yes,6-25,Don't know,No,Don't know,Yes,Very difficult,Maybe,No,Maybe,Sometimes,Sometimes,North America,1
3,22.0,m,I don't know,More than 1000,Yes,Yes,Don't know,Yes,Very difficult,Yes,Maybe,No,Sometimes,Often,North America,1
4,30.0,m,Yes,100-500,Yes,No,No,Yes,Somewhat easy,Maybe,Some of them,Maybe,Rarely,Not applicable to me,North America,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
638,29.0,m,Yes,26-100,Yes,Yes,No,Yes,Somewhat easy,Maybe,Maybe,Yes,Rarely,Often,North America,1
639,38.0,m,No,500-1000,Don't know,No,No,Don't know,Very difficult,Maybe,No,Yes,Rarely,Often,North America,1
640,41.0,m,No,26-100,Don't know,No,No,Yes,Very easy,Yes,Some of them,Maybe,Not applicable to me,Not applicable to me,Europe,1
641,40.0,m,No,6-25,Yes,No,Don't know,Yes,Very easy,Yes,Yes,Yes,Sometimes,Often,North America,1


In [323]:
mental2017.to_csv('17_pre_Encoder.csv', index= False)