## Data Cleaning

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read in the labor force data.
labor_force = pd.read_csv("Data/bls_labor_force_data.csv", header=0, low_memory=False)

# Change data types: the numeric columns arrive as strings.
# Count columns carry thousands separators ("1,234"); missing values are
# spelled "N.A." and are mapped to the sentinel -1 so the column can be cast.
# regex=False is spelled out on purpose: before pandas 2.0 `str.replace`
# treated the pattern as a regular expression by default, in which case the
# dots in "N.A." would match any character. A literal match behaves the same
# on every pandas version.
count_columns = ["employed", "labor_force", "unemployed"]
for count_column in count_columns:
    labor_force[count_column] = (
        labor_force[count_column]
        .str.replace(",", "", regex=False)
        .str.replace("N.A.", "-1", regex=False)
        .astype("int64")
    )

# The rate column only needs the missing-value token replaced before casting.
labor_force["unemployment_rate"] = (
    labor_force["unemployment_rate"]
    .str.replace("N.A.", "-1", regex=False)
    .astype("float")
)

In [3]:
# Load the GDP table (the file is read with latin-1 encoding).
gdp = pd.read_csv("Data/bea_gdp_data.csv", header=0, encoding='latin-1')

# Reshape from wide to long so that each row holds a single year:
# the first seven columns identify the row, every remaining column is a year.
ids = gdp.columns[:7].tolist()
values = gdp.columns[7:].tolist()
gdp = pd.melt(
    gdp,
    id_vars=ids,
    value_vars=values,
    var_name="year",
    value_name="gdp_thousands",
)

# Split GeoFIPS into the state part (everything above the last three digits)
# and the county part (the last three digits) for merging.
gdp = gdp.assign(
    state_fips_code=gdp["GeoFIPS"].floordiv(1000),
    county_fips_code=gdp["GeoFIPS"].mod(1000),
)

# Remove columns that are not needed downstream.
gdp = gdp.drop(columns=["Description", "TableName", "LineCode", "Unit", "Region"])

# The melted year labels come from column names; cast them to integers.
gdp["year"] = gdp["year"].astype("int64")

In [4]:
# Read in the suicide data.
# low_memory must be the boolean False: the string 'False' is a non-empty
# (truthy) value and therefore does not switch the option off.
cdc = pd.read_csv("Data/cdc_suicide_data.csv", header=0, low_memory=False)

# Drop columns that are not used further on.
cdc = cdc.drop(columns=["Notes", "Year Code"])

# Rename the key columns so they line up with the merge keys used for the
# other tables.
cdc = cdc.rename(columns={"Year": "year", "County Code": "GeoFIPS"})

In [5]:
# Merge the data frames: labor force + GDP on state/county/year, then the
# suicide data on GeoFIPS/year. pd.merge performs an inner join by default, so
# only keys present on both sides survive each step.
merged1 = pd.merge(labor_force, gdp, on=["state_fips_code", "county_fips_code", "year"])
merged = pd.merge(merged1, cdc, on=["GeoFIPS", "year"])

# Build a string version of GeoFIPS left-padded with zeros to five characters.
# str.zfill is vectorized, does not depend on the frame having a 0..n-1 index,
# and pads codes of any length (the previous loop prepended a single "0" only).
geoFIPSnew = merged["GeoFIPS"].astype("str").str.zfill(5)

merged["GeoFIPSnew"] = geoFIPSnew

In [6]:
# Write the merged table to an Excel workbook: column names are kept as the
# header row, the row index is left out.
merged.to_excel(
    "Data/merged.xlsx",
    header=True,
    index=False,
)

In [7]:
# Look into what didn't merge.
# We only have BEA data from 2001 onwards, so restrict the labor force rows to
# the same period before comparing sizes.
lf = labor_force[labor_force["year"] >= 2001]

# Row counts of each input and of the merged result, in this order:
# gdp, labor force (2001+), cdc, merged.
print(len(gdp))
print(len(lf))
print(len(cdc))
print(len(merged))

# Diagnostics used to find the keys that were lost in the merge (kept for
# reference, not executed):
# merged["unique_id"] = merged["laus_code"] + merged["year"].astype("str")
# lf["unique_id"] = lf["laus_code"] + lf["year"].astype("str")
# gdp["unique_id0"] = gdp["GeoFIPS"].astype("str") + gdp["year"].astype("str")
# merged["unique_id0"] = merged["GeoFIPS"].astype("str") + merged["year"].astype("str")

# print(lf[~lf["unique_id"].isin(merged["unique_id"])]["county_name"].unique())
# # Looks like we're missing all of Puerto Rico because BEA does not include PR data.
# print(gdp[~gdp["unique_id0"].isin(merged["unique_id0"])]["GeoName"].unique())
# # On several occasions, multiple VA cities are condensed into one line in BEA
# # but left separate in BLS... TODO: we should probably address this at some point.
# # Otherwise, only state totals are excluded (good).

# Preview the merged frame. A notebook cell only renders its final expression,
# so this has to come last to be displayed.
merged.head()

57168
57924
15031
14815
