## Data Cleaning

In [1]:
import numpy as np
import pandas as pd

In [2]:
# read in labor force data
labor_force = pd.read_csv("Data/bls_labor_force_data.csv", header = 0, low_memory = False)

#change data types
labor_force["employed"] = labor_force["employed"].str.replace(",", "").str.replace("N.A.", "-1").astype("int64")
labor_force["labor_force"] = labor_force["labor_force"].str.replace(",", "").str.replace("N.A.", "-1").astype("int64")
labor_force["unemployed"] = labor_force["unemployed"].str.replace(",", "").str.replace("N.A.", "-1").astype("int64")
labor_force["unemployment_rate"] = labor_force["unemployment_rate"].str.replace("N.A.", "-1").astype("float")

In [3]:
#read in gdp data
gdp = pd.read_csv("Data/bea_gdp_data.csv", header = 0, encoding='latin-1')

#melt so that one year per row
ids = list(gdp.columns)[0:7]
values = list(gdp.columns)[7:]
gdp= gdp.melt(id_vars = ids,  value_vars = values, var_name = "year", value_name = "gdp_thousands")

#generate state and fips codes for merging
gdp["state_fips_code"] = gdp["GeoFIPS"] // 1000
gdp["county_fips_code"] = gdp["GeoFIPS"] % 1000

#remove irrelevant columns
del gdp["Description"]
del gdp["TableName"]
del gdp["LineCode"]
del gdp["Unit"]
del gdp["Region"]


gdp["year"] = gdp["year"].astype("int64")

In [4]:
#read in suicide data
cdc = pd.read_csv("Data/cdc_suicide_data.csv", header = 0, low_memory='False')

del cdc["Notes"]
del cdc["Year Code"]

cdc = cdc.rename(columns = {"Year":"year", "County Code":"GeoFIPS"})

In [5]:
#read in census Data
census = pd.read_csv("Data/census_demos.csv", header = 0, encoding='latin-1')

In [6]:
#clean census Data
census = census[census["YEAR"] == 11]
census = census[census["AGEGRP"] == 0]
census['GeoFIPS'] = census["STATE"] * 1000 + census["COUNTY"]
census['pct_white'] = 100 * (census["WA_MALE"] + census["WA_FEMALE"]) / census["TOT_POP"]
census['pct_nonwhite'] = 100 - census['pct_white']
census = census.iloc[:, np.r_[0:4,11:14]]

In [7]:
#Read in and clean mhhi data
mhhi = pd.read_csv("Data/mhhi.csv", header = 0, encoding='latin-1')
mhhi = mhhi.rename(columns = {"Median_Household_Income_2018":"mhhi_2018"})
mhhi["mhhi_2018"] = mhhi["mhhi_2018"].str.replace("$", "").str.replace(",","").astype(float)

In [8]:
poverty = pd.read_csv("Data/poverty.csv", header = 0, encoding='latin-1')
poverty = poverty.rename(columns = {"PCTPOVALL_2018":"poverty_2018"})
poverty["poverty_2018"] = poverty["poverty_2018"].astype(float) 

In [9]:
smha = pd.read_csv("Data/smha.csv", header = 0, encoding='latin-1')
smha["smha_expenditures"] = smha["smha_expenditures"].str.replace("$", "").astype(float)


In [2]:
# merge data frames
merged1 = pd.merge(labor_force, gdp, on = ["state_fips_code", "county_fips_code", "year"])
merged2 = pd.merge(merged1, census, on = ["GeoFIPS"])
merged3 = pd.merge(merged2, mhhi, on = ["GeoFIPS"])
merged4 = pd.merge(merged3, poverty, on = ["GeoFIPS"])
merged5 = pd.merge(merged4, smha, on = ["STNAME"])
merged = pd.merge(merged5, cdc, on = ["GeoFIPS", "year"])


geoFIPSnew = merged["GeoFIPS"].astype("str")
for i in range(len(geoFIPSnew)):
    if len(geoFIPSnew[i]) < 5:
        geoFIPSnew[i] = "0" + geoFIPSnew[i]

merged["GeoFIPSnew"] = geoFIPSnew
merged["gdp_per_capita"] = merged["gdp_thousands"].astype(int) * 1000 / merged["Population"]
# merged.head()

NameError: name 'pd' is not defined

In [11]:
merged.to_excel("Data/merged.xlsx", index = False, header = True)

In [12]:
# merged
# # Look into what didnt merge
# lf = labor_force[labor_force["year"] >= 2001]
# #only have bea data 2001 onwards

# print(len(gdp))
# print(len(lf))
# print(len(cdc))
# print(len(merged))

# merged["unique_id"] = merged["laus_code"] + merged["year"].astype("str")
# lf["unique_id"] = lf["laus_code"] + lf["year"].astype("str")
# gdp["unique_id0"] = gdp["GeoFIPS"].astype("str") + gdp["year"].astype("str")
# merged["unique_id0"] = merged["GeoFIPS"].astype("str") + merged["year"].astype("str")

# print(lf[~lf["unique_id"].isin(merged["unique_id"])]["county_name"].unique())
# # looks like we're missing all of puerto rico bc bea does not include pr data
# print(gdp[~gdp["unique_id0"].isin(merged["unique_id0"])]["GeoName"].unique())
# #on several occasions, multiple VA cities condensed into one line in bea which are left separate in bls... we should probably address this at some point
# #otherwise, only state totals excluded (good)

\begin{table}[]
\begin{tabular}{llllllll}
GeoFIPS & GeoName     & year & unemployment\_rate & pct\_white  & mhhi\_2018 & poverty\_2018 & smha\_expenditures \\
1001    & Autauga, AL & 2018 & 3.6                & 76.72523875 & 59338      & 13.8          & 76.27              \\
1001    & Autauga, AL & 2017 & 3.9                & 76.72523875 & 59338      & 13.8          & 76.27              \\
1001    & Autauga, AL & 2016 & 5.1                & 76.72523875 & 59338      & 13.8          & 76.27              \\
1001    & Autauga, AL & 2013 & 6.2                & 76.72523875 & 59338      & 13.8          & 76.27             
\end{tabular}
\end{table}

In [4]:
from IPython.display import Latex
Latex(\begin{table}[]
\begin{tabular}{llllllll}
GeoFIPS & GeoName     & year & unemployment\_rate & pct\_white  & mhhi\_2018 & poverty\_2018 & smha\_expenditures \\
1001    & Autauga, AL & 2018 & 3.6                & 76.72523875 & 59338      & 13.8          & 76.27              \\
1001    & Autauga, AL & 2017 & 3.9                & 76.72523875 & 59338      & 13.8          & 76.27              \\
1001    & Autauga, AL & 2016 & 5.1                & 76.72523875 & 59338      & 13.8          & 76.27              \\
1001    & Autauga, AL & 2013 & 6.2                & 76.72523875 & 59338      & 13.8          & 76.27             
\end{tabular}
\end{table})


SyntaxError: unexpected character after line continuation character (<ipython-input-4-98efd34c08a6>, line 2)