-
Notifications
You must be signed in to change notification settings - Fork 45
/
get_data.py
65 lines (51 loc) · 3.5 KB
/
get_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import urllib
import urllib.request
import os
import zipfile
from urllib.request import Request, urlopen
from pathlib import Path
import numpy as np
import pandas as pd
import geopandas as gpd
def download_url(url, save_path):
with urllib.request.urlopen(url) as dl_file:
with open(save_path, 'wb') as out_file:
out_file.write(dl_file.read())
def create_votes_dataframes():
Path("data").mkdir(parents=True, exist_ok=True)
download_url("https://www2.census.gov/geo/tiger/GENZ2019/shp/cb_2019_us_county_20m.zip", "data/shapefile")
with zipfile.ZipFile("data/shapefile", 'r') as zip_ref:
zip_ref.extractall("data/counties")
shp = gpd.read_file("data/counties/cb_2019_us_county_20m.shp")
shp = shp[~shp["STATEFP"].isin(["02", "69", "66", "78", "60", "72", "15"])]
shp
df_election = pd.read_csv("https://raw.githubusercontent.com/tonmcg/US_County_Level_Election_Results_08-20/master/2020_US_County_Level_Presidential_Results.csv")
df_election.head(2)
population = pd.read_excel("https://www.ers.usda.gov/webdocs/DataFiles/48747/PopulationEstimates.xls?v=290.4", header = 2).rename(columns = {"FIPStxt": "FIPS"})
education = pd.read_excel("https://www.ers.usda.gov/webdocs/DataFiles/48747/Education.xls?v=290.4", header = 4).rename(columns = {"FIPS Code": "FIPS", "Area name": "Area_Name"})
unemployment = pd.read_excel("https://www.ers.usda.gov/webdocs/DataFiles/48747/Unemployment.xls?v=290.4", header = 4).rename(columns = {"fips_txt": "FIPS", "area_name": "Area_Name", "Stabr": "State"})
income = pd.read_excel("https://www.ers.usda.gov/webdocs/DataFiles/48747/PovertyEstimates.xls?v=290.4", header = 4).rename(columns = {"FIPStxt": "FIPS", "Stabr": "State", "Area_name": "Area_Name"})
dfs = [df.set_index(['FIPS', 'State']) for df in [population, education, unemployment, income]]
data_county = pd.concat(dfs, axis=1)
df_election = df_election.merge(data_county.reset_index(), left_on = "county_fips", right_on = "FIPS")
df_election['county_fips'] = df_election['county_fips'].astype(str).str.lstrip('0')
shp['FIPS'] = shp['GEOID'].astype(str).str.lstrip('0')
votes = shp.merge(df_election, left_on = "FIPS", right_on = "county_fips")
req = Request('https://dataverse.harvard.edu/api/access/datafile/3641280?gbrecs=false')
req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0')
content = urlopen(req)
df_historical = pd.read_csv(content, sep = "\t")
#df_historical = pd.read_csv('https://dataverse.harvard.edu/api/access/datafile/3641280?gbrecs=false', sep = "\t")
df_historical = df_historical.dropna(subset = ["FIPS"])
df_historical["FIPS"] = df_historical["FIPS"].astype(int)
df_historical['share'] = df_historical['candidatevotes']/df_historical['totalvotes']
df_historical = df_historical[["year", "FIPS", "party", "candidatevotes", "share"]]
df_historical['party'] = df_historical['party'].fillna("other")
df_historical_wide = df_historical.pivot_table(index = "FIPS", values=['candidatevotes',"share"], columns = ["year","party"])
df_historical_wide.columns = ["_".join(map(str, s)) for s in df_historical_wide.columns.values]
df_historical_wide = df_historical_wide.reset_index()
df_historical_wide['FIPS'] = df_historical_wide['FIPS'].astype(str).str.lstrip('0')
votes['FIPS'] = votes['GEOID'].astype(str).str.lstrip('0')
votes = votes.merge(df_historical_wide, on = "FIPS")
votes["winner"] = np.where(votes['votes_gop'] > votes['votes_dem'], 'republican', 'democrats')
return votes