## In this file, we clean the data by removing every participant who didn't answer all of the questions we care about, and then we create dummy variables to replace the survey-encoded answers.  Finally, we strip the original survey columns from the dataframe and save the result as the dataset to work with.

In [14]:
import pandas as pd
import numpy as np

In [15]:
# Load the raw, unmodified survey extract into `df`.
df = pd.read_csv(filepath_or_buffer='MH_DATA_UNCHANGED.csv')

# I'm going to start by keeping only rows that have answers for the questions I care about (the columns I've kept)

In [16]:
# Keep only respondents who gave a usable answer to every question used in
# the cells below.  Each survey column is mapped to the answer codes that are
# kept; a row holding any other code (or a blank) in one of these columns is
# dropped.
valid_codes = {
    'AMDELT': [1, 2],
    'AUINPYR': [1, 2],
    'AMDEYR': [1, 2],
    'SNRLGSVC': [1, 2, 3, 4, 5, 6],
    'SNRLDCSN': [1, 2, 3, 4],
    'MOVSINPYR2': [0, 1, 2, 3],
    'SEXIDENT': [1, 2, 3],
    'IRWRKSTAT': [1, 2, 3, 4],
    'PRVHLTIN': [1, 2],
    'MHSUITHK': [0, 1],
    'POVERTY3': [1, 2, 3],
    'IRMARIT': [1, 2, 3, 4],
    'HEALTH': [0, 1, 2, 3, 4, 5],
    'IRSEX': [1, 2],
    'AUOPTYR': [1, 2],
}

# AGE2 is the one range filter rather than a list of codes.
# NOTE(review): if AGE2 code 7 means "18 years old" in the codebook, `> 7`
# excludes 18-year-olds even though the youngest dummy is described as
# 18-25 -- confirm against the survey codebook.
keep = df['AGE2'] > 7
for column, codes in valid_codes.items():
    keep &= df[column].isin(codes)

# Take an explicit copy so that adding the dummy columns below writes to an
# independent frame instead of a view of `df`.
df_second = df.loc[keep].copy()

## Ok, so now we've thrown out all participants who don't have answers for the questions I want to evaluate (including those who were assigned a 'skip' code for some reason).  
#TODO - find a simpler way to do all that coding, but survey data is hard. 

## Now we'll start to decode their answers, and create dummies for this information.  Creating dummies takes a while, feel free to keep scrolling.

In [19]:
# DEPEV ("Depressed Ever"): 1 when AMDELT is coded 1, otherwise 0.
# Per the notebook's description this flags respondents who have ever had a
# major depressive episode.
pd.set_option('mode.chained_assignment', None)
df_second['DEPEV'] = np.where(df_second['AMDELT'].eq(1), 1, 0)

In [20]:
# DEPLY: 1 when AMDEYR is coded 1, otherwise 0.
pd.set_option('mode.chained_assignment', None)
df_second['DEPLY'] = np.where(df_second['AMDEYR'].eq(1), 1, 0)

In [21]:
# TREATEDLY: 1 = treated, 0 = not treated.  A respondent counts as treated
# when either the inpatient (AUINPYR) or the outpatient (AUOPTYR) question is
# coded 1.
pd.set_option('mode.chained_assignment', None)
df_second['TREATEDLY'] = np.where(
    df_second['AUINPYR'].eq(1) | df_second['AUOPTYR'].eq(1), 1, 0
)

In [22]:
# Religious-service attendance: the baseline (no dummy) is 1-24 services in
# the past year.
# NORELIGSERV: 1 = never went (SNRLGSVC coded 1), 0 = everyone else.
pd.set_option('mode.chained_assignment', None)
df_second['NORELIGSERV'] = np.where(df_second['SNRLGSVC'].eq(1), 1, 0)

In [23]:
# ALOTRELIGSERV: 1 = went to 25 or more services (SNRLGSVC coded 5 or 6),
# 0 = everyone else.
pd.set_option('mode.chained_assignment', None)
df_second['ALOTRELIGSERV'] = np.where(df_second['SNRLGSVC'].isin([5, 6]), 1, 0)

In [24]:
# RELIGDEC: 1 = religiousness impacts decisions (SNRLDCSN coded 3 or 4),
# 0 = it does not (every other code).
pd.set_option('mode.chained_assignment', None)
df_second['RELIGDEC'] = np.where(df_second['SNRLDCSN'].isin([3, 4]), 1, 0)

In [25]:
# Sexual identity: the baseline (no dummy) is straight (SEXIDENT coded 1).
# BISEXUAL: 1 = bisexual (SEXIDENT coded 3), 0 = straight or gay.
#
# The test must be on SEXIDENT.  The previous version compared AUOPTYR (the
# outpatient-treatment answer, which only ever holds 1 or 2 after filtering)
# against 3, so this column was 0 for every respondent.
pd.options.mode.chained_assignment = None
df_second['BISEXUAL'] = np.where(df_second.SEXIDENT == 3, 1, 0)

In [26]:
# GAY: 1 = gay (SEXIDENT coded 2), 0 = straight or bisexual.
#
# The test must be on SEXIDENT.  The previous version compared AUOPTYR (the
# outpatient-treatment answer) against 2, which flagged every non-straight
# respondent without outpatient treatment as gay, bisexual respondents
# included.
pd.options.mode.chained_assignment = None
df_second['GAY'] = np.where(df_second.SEXIDENT == 2, 1, 0)

In [27]:
# NTWORKING: 1 = not working (IRWRKSTAT coded 3 or 4), 0 = working.
pd.set_option('mode.chained_assignment', None)
df_second['NTWORKING'] = np.where(df_second['IRWRKSTAT'].isin([3, 4]), 1, 0)

In [28]:
# PRVINSUR: 1 = has private insurance (PRVHLTIN coded 1), 0 = does not.
pd.set_option('mode.chained_assignment', None)
df_second['PRVINSUR'] = np.where(df_second['PRVHLTIN'].eq(1), 1, 0)

In [29]:
# Income: the baseline (no dummy) is up to 2x the federal poverty level.
# POVERTY: 1 = in poverty (POVERTY3 coded 1), 0 = everyone else.
pd.set_option('mode.chained_assignment', None)
df_second['POVERTY'] = np.where(df_second['POVERTY3'].eq(1), 1, 0)

In [30]:
# WEALTHY: 1 = income above 2x the federal poverty level (POVERTY3 coded 3),
# 0 = everyone else.
pd.set_option('mode.chained_assignment', None)
df_second['WEALTHY'] = np.where(df_second['POVERTY3'].eq(3), 1, 0)

In [31]:
# MARRIED: 1 = married (IRMARIT coded 1), 0 = everyone else.
pd.set_option('mode.chained_assignment', None)
df_second['MARRIED'] = np.where(df_second['IRMARIT'].eq(1), 1, 0)

In [32]:
# SEPARATED: 1 = separated (IRMARIT coded 3), 0 = everyone else.
pd.set_option('mode.chained_assignment', None)
df_second['SEPARATED'] = np.where(df_second['IRMARIT'].eq(3), 1, 0)

In [33]:
# WIDOWED: 1 = widowed (IRMARIT coded 2), 0 = everyone else.
pd.set_option('mode.chained_assignment', None)
df_second['WIDOWED'] = np.where(df_second['IRMARIT'].eq(2), 1, 0)

In [34]:
# POORHEALTH: 1 = in poor health (HEALTH coded 4 or 5), 0 = everyone else.
pd.set_option('mode.chained_assignment', None)
df_second['POORHEALTH'] = np.where(df_second['HEALTH'].isin([4, 5]), 1, 0)

In [35]:
# MOVEDONCE: 1 = moved exactly once (MOVSINPYR2 coded 1), 0 = everyone else.
pd.set_option('mode.chained_assignment', None)
df_second['MOVEDONCE'] = np.where(df_second['MOVSINPYR2'].eq(1), 1, 0)

In [36]:
# MOVEDMORE: 1 = moved more than once (MOVSINPYR2 coded 2 or 3),
# 0 = everyone else.
pd.set_option('mode.chained_assignment', None)
df_second['MOVEDMORE'] = np.where(df_second['MOVSINPYR2'].isin([2, 3]), 1, 0)

In [37]:
# Age dummies: ages 35-49 are the baseline (no dummy).
# YOUNGADULT: the 18-25 group, i.e. AGE2 coded 8 through 12 -> 1;
# every other code -> 0.
pd.set_option('mode.chained_assignment', None)
df_second['YOUNGADULT'] = np.where(
    df_second['AGE2'].isin([8, 9, 10, 11, 12]), 1, 0
)

In [38]:
# TWENTOTHIRT: the 26-34 group, i.e. AGE2 coded 13 or 14 -> 1;
# every other code -> 0.
pd.set_option('mode.chained_assignment', None)
df_second['TWENTOTHIRT'] = np.where(df_second['AGE2'].isin([13, 14]), 1, 0)

In [39]:
# FIFTOSIX: the 50-64 group, i.e. AGE2 coded 16 -> 1; every other code -> 0.
pd.set_option('mode.chained_assignment', None)
df_second['FIFTOSIX'] = np.where(df_second['AGE2'].eq(16), 1, 0)

In [40]:
# OVERSIXFIV: the 65-and-older group, i.e. AGE2 coded 17 -> 1;
# every other code -> 0.
pd.set_option('mode.chained_assignment', None)
df_second['OVERSIXFIV'] = np.where(df_second['AGE2'].eq(17), 1, 0)

In [41]:
# FEMALE: 1 = women (IRSEX coded 2), 0 = everyone else.
pd.set_option('mode.chained_assignment', None)
df_second['FEMALE'] = np.where(df_second['IRSEX'].eq(2), 1, 0)

In [42]:
# BLACK: 1 = Black / African American (NEWRACE2 coded 2), 0 = everyone else.
pd.set_option('mode.chained_assignment', None)
df_second['BLACK'] = np.where(df_second['NEWRACE2'].eq(2), 1, 0)

In [43]:
# NATIVE_AM_IS: 1 = Native American or Native Islander (NEWRACE2 coded 3 or
# 4), 0 = everyone else.
pd.set_option('mode.chained_assignment', None)
df_second['NATIVE_AM_IS'] = np.where(df_second['NEWRACE2'].isin([3, 4]), 1, 0)

In [44]:
# ASIAN: 1 = Asian (NEWRACE2 coded 5), 0 = everyone else.
pd.set_option('mode.chained_assignment', None)
df_second['ASIAN'] = np.where(df_second['NEWRACE2'].eq(5), 1, 0)

In [45]:
# MULTIRACIAL: 1 = multiracial (NEWRACE2 coded 6), 0 = everyone else.
pd.set_option('mode.chained_assignment', None)
df_second['MULTIRACIAL'] = np.where(df_second['NEWRACE2'].eq(6), 1, 0)

In [46]:
# HISPANIC: 1 = Hispanic (NEWRACE2 coded 7), 0 = everyone else.
pd.set_option('mode.chained_assignment', None)
df_second['HISPANIC'] = np.where(df_second['NEWRACE2'].eq(7), 1, 0)

## Now we've created dummies for all of the information we want, so we'll drop the original survey answer questions from our dataframe and save it!

In [47]:
# Drop the raw survey-coded columns now that the dummy columns replace them.
# 'IRMARIT' used to be listed twice; each column is now listed once.
# HLCNOTYR is dropped here although no earlier cell filters on or recodes it.
# MHSUITHK is not in the list, so it stays in the saved dataset.
df_second = df_second.drop(
    [
        'AMDELT', 'AUINPYR', 'AUOPTYR', 'AMDEYR',
        'SNRLGSVC', 'SNRLDCSN',
        'SEXIDENT', 'IRWRKSTAT', 'PRVHLTIN', 'POVERTY3',
        'IRMARIT', 'HEALTH', 'MOVSINPYR2', 'AGE2',
        'HLCNOTYR', 'IRSEX', 'NEWRACE2',
    ],
    axis=1,
)

In [56]:
# Save the dummy-coded frame: header row written, row index left out.
df_second.to_csv(path_or_buf='MH_DATA_WVARS.csv', header=True, index=False)