## Data Cleaning

In [1]:
import numpy as np
import pandas as pd

In [2]:
# read in labor force data
labor_force = pd.read_csv("Data/bls_labor_force_data.csv", header = 0, low_memory = False)

#change data types
# The count columns are text with "," thousands separators and the literal
# marker "N.A." for unavailable values. Both replacements are done with
# regex = False so that "N.A." is matched literally: the default for `regex`
# differs between pandas versions, and as a regular expression the dots
# would match any character.
# NOTE(review): "N.A." is mapped to the sentinel -1 so the columns can be cast
# to numeric dtypes; downstream analysis must treat -1 as missing, not as data.
for count_col in ["employed", "labor_force", "unemployed"]:
    labor_force[count_col] = (
        labor_force[count_col]
        .str.replace(",", "", regex = False)
        .str.replace("N.A.", "-1", regex = False)
        .astype("int64")
    )

# The rate column only has the "N.A." marker replaced before the cast to float.
labor_force["unemployment_rate"] = (
    labor_force["unemployment_rate"]
    .str.replace("N.A.", "-1", regex = False)
    .astype("float")
)

In [3]:
#read in gdp data
gdp = pd.read_csv("Data/bea_gdp_data.csv", header = 0, encoding='latin-1')

#melt so that one year per row
# the first seven columns identify the row; every remaining column is melted
# into a (year, gdp_thousands) pair
ids = gdp.columns[:7].tolist()
values = gdp.columns[7:].tolist()
gdp = pd.melt(gdp, id_vars = ids, value_vars = values, var_name = "year", value_name = "gdp_thousands")

#generate state and fips codes for merging
# GeoFIPS is split arithmetically: quotient by 1000 -> state code,
# remainder -> county code
gdp = gdp.assign(
    state_fips_code = gdp["GeoFIPS"].floordiv(1000),
    county_fips_code = gdp["GeoFIPS"].mod(1000),
)

#remove irrelevant columns
gdp = gdp.drop(columns = ["Description", "TableName", "LineCode", "Unit", "Region"])

# the melted column headers become the year values; cast them to integers
gdp = gdp.astype({"year": "int64"})

In [4]:
#read in suicide data
# low_memory must be the boolean False. The previous value, the string
# 'False', is a non-empty string and therefore truthy, so it did not
# disable low-memory (chunked) parsing as intended.
cdc = pd.read_csv("Data/cdc_suicide_data.csv", header = 0, low_memory = False)

# drop the columns that are not carried into the merged data set
cdc = cdc.drop(columns = ["Notes", "Year Code"])

# rename the year and county-code columns to the key names used for merging
cdc = cdc.rename(columns = {"Year":"year", "County Code":"GeoFIPS"})

Unnamed: 0,County,GeoFIPS,year,Deaths,Population,Crude Rate
0,"Autauga County, AL",1001,2001,11,44889,24.50489
1,"Baldwin County, AL",1003,2001,20,144875,13.805004
2,"Calhoun County, AL",1015,2001,18,111266,16.177449
3,"Etowah County, AL",1055,2001,21,102976,20.393101
4,"Jackson County, AL",1071,2001,10,53997,18.519547


In [5]:
# merge data frames
# Step 1: labor-force rows joined to GDP rows on state/county FIPS code and year.
# Step 2: the result joined to the CDC rows on GeoFIPS and year.
# Both are inner joins, so only keys present on both sides are kept.
merged1 = labor_force.merge(
    gdp,
    how = "inner",
    on = ["state_fips_code", "county_fips_code", "year"],
)
merged = merged1.merge(cdc, how = "inner", on = ["GeoFIPS", "year"])

merged.head()


# # Look into what didn't merge
# lf = labor_force[labor_force["year"] >= 2001]
#only have bea data 2001 onwards

# merged["unique_id"] = merged["laus_code"] + merged["year"].astype("str")
# lf["unique_id"] = lf["laus_code"] + lf["year"].astype("str")
# gdp["unique_id0"] = gdp["GeoFIPS"].astype("str") + gdp["year"].astype("str")
# merged["unique_id0"] = merged["GeoFIPS"].astype("str") + merged["year"].astype("str")

# print(lf[~lf["unique_id"].isin(merged["unique_id"])]["county_name"].unique())
# # looks like we're missing all of Puerto Rico because BEA does not include PR data
# print(gdp[~gdp["unique_id0"].isin(merged["unique_id0"])]["GeoName"].unique())
# # on several occasions, multiple VA cities are condensed into one line in BEA but left separate in BLS... we should probably address this at some point
# otherwise, only state totals are excluded (good)

Unnamed: 0,laus_code,state_fips_code,county_fips_code,county_name,year,labor_force,employed,unemployed,unemployment_rate,GeoFIPS,GeoName,gdp_thousands,County,Deaths,Population,Crude Rate
0,CN0100100000000,1,1,"Autauga County, AL",2018,25957,25015,942,3.6,1001,"Autauga, AL",1690937,"Autauga County, AL",11,55601,19.783817
1,CN0100300000000,1,3,"Baldwin County, AL",2018,93849,90456,3393,3.6,1003,"Baldwin, AL",6606080,"Baldwin County, AL",44,218022,20.18145
2,CN0100900000000,1,9,"Blount County, AL",2018,25006,24128,878,3.5,1009,"Blount, AL",942904,"Blount County, AL",10,57840,17.289073
3,CN0101500000000,1,15,"Calhoun County, AL",2018,45972,43833,2139,4.7,1015,"Calhoun, AL",4428092,"Calhoun County, AL",25,114277,21.876668
4,CN0103300000000,1,33,"Colbert County, AL",2018,23258,22184,1074,4.6,1033,"Colbert, AL",3066841,"Colbert County, AL",17,54762,31.043424


In [6]:
# write the merged table to disk, with a header row and without the index
output_path = "Data/merged.csv"
merged.to_csv(output_path, header = True, index = False)