In [24]:
# Import modules
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import os
import regex as re
import requests
import json
from bs4 import BeautifulSoup
import numpy as np

In [25]:
# load the three election workbooks from the working directory
# (later cells read the 'District #', 'filename' and 'year' columns of
# `elections` and the 'filename' and 'tracts' columns of `tracts`)
elections = pd.read_excel('elections.xlsx')
geocodes = pd.read_excel('elections_with_geocodes.xlsx')
tracts = pd.read_excel('elections_with_tracts.xlsx')

# table groups requested from the ACS 5-year profile endpoint for every tract
groups = ['DP02', 'DP03', 'DP05']

# the Census API key is a credential: read it from the environment instead of
# committing it to the notebook. A key that was previously stored in this file
# should be treated as leaked and replaced.
key = os.environ.get('CENSUS_API_KEY')
if not key:
    raise RuntimeError(
        "Set the CENSUS_API_KEY environment variable before running this notebook."
    )

In [26]:
# collect one (geocode, year) pair for every tract attached to an election
# that has a district number
tracts_dict = {'geocode': [], 'year': []}

for index, row in elections.iterrows():
    # elections without a district number contribute nothing
    if pd.isna(row['District #']):
        continue

    # rows of `tracts` that belong to the same source file as this election
    tract_row = tracts[tracts['filename'] == row['filename']]
    if tract_row.empty:
        # no tract record for this file; without this guard .iloc[0] below
        # raises IndexError and aborts the whole cell
        continue

    # only the first matching row is used; its 'tracts' cell is a
    # comma-separated list of geocodes, or NaN when there are none
    tract_list = tract_row['tracts'].iloc[0]
    if pd.isna(tract_list):
        continue

    for new_tract in tract_list.split(','):
        tracts_dict['geocode'].append(new_tract)
        tracts_dict['year'].append(row['year'])

# one row per distinct (geocode, year) pair, with a clean 0..n-1 index
# (the fetch cell compares this index against len(acs_data) to resume)
tracts_df = (
    pd.DataFrame(tracts_dict)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [27]:
# one dict per processed tracts_df row, in tracts_df order; the fetch cell skips
# every index below len(acs_data), so re-running THIS cell resets that progress
acs_data: list = []

In [28]:
# keep only variables whose name ends in a digit followed by "E" or "PE";
# names ending in "EA" / "PEA" (or anything else) do not match and are dropped
estimate_pattern = re.compile(r'\d(?:E|PE)$')

# how many vintages to try per tract, newest first
max_attempts = 3
# seconds; without a timeout a single stalled request blocks the run forever
request_timeout = 30


def _fetch_profile_groups(query_year, state, county, tract):
    """Fetch every group in ``groups`` for one tract from one ACS vintage.

    Returns a dict mapping each kept variable name to the value in the first
    data row of the response. Any request, JSON-decoding or indexing error
    propagates to the caller, so a vintage either yields all groups or nothing.
    """
    fetched = {}
    for group in groups:
        url = f"https://api.census.gov/data/{query_year}/acs/acs5/profile?get=group({group})&for=tract:{tract}&in=state:{state}+county:{county}&key={key}"

        response = requests.get(url, timeout=request_timeout)
        data = json.loads(response.text)
        # data[0] is the header row, data[1] the single row for this tract
        header, values = data[0], data[1]
        for name, value in zip(header, values):
            if estimate_pattern.search(name) is not None:
                fetched[name] = value
    return fetched


# for each row in tracts_df
for index, row in tracts_df.iterrows():
    # resume support: rows already present in acs_data are not fetched again
    if index < len(acs_data):
        continue

    if index % 10 == 0:
        print(f"{index} of {len(tracts_df)}")

    # geocode is "<state>_<county>_<tract>"; pad each part to the width used in the URL
    geocode = row['geocode'].split('_')
    state = geocode[0].zfill(2)
    county = geocode[1].zfill(3)
    tract = geocode[2].zfill(6)
    year = row['year']

    # first vintage to try: year + 2, except that years up to 2005 start at
    # 2009 and years from 2021 on start at 2022
    if year <= 2005:
        adjusted_year = 2009
    elif year >= 2021:
        adjusted_year = 2022
    else:
        adjusted_year = year + 2

    profile = None
    last_error = None
    for query_year in range(adjusted_year, adjusted_year - max_attempts, -1):
        try:
            profile = _fetch_profile_groups(query_year, state, county, tract)
            break
        except Exception as e:
            # remember the failure and fall back to the previous vintage
            last_error = e

    if profile is None:
        # every vintage failed. Raise instead of calling exit() so the rows
        # already in acs_data survive; re-running this cell resumes at this row.
        raise RuntimeError(
            f"No ACS data for {row['geocode']} (election year {year}) in vintages "
            f"{adjusted_year - max_attempts + 1}-{adjusted_year}"
        ) from last_error

    # all variables in a row come from the same vintage; nothing from a
    # half-failed vintage is kept
    data_dict = {'geocode': row['geocode'],
                 'year': year}
    data_dict.update(profile)
    acs_data.append(data_dict)

0 of 6502
10 of 6502
20 of 6502
30 of 6502
40 of 6502
50 of 6502
60 of 6502
70 of 6502
80 of 6502
90 of 6502
100 of 6502
110 of 6502
120 of 6502
130 of 6502
140 of 6502
150 of 6502
160 of 6502
170 of 6502
180 of 6502
190 of 6502
200 of 6502
210 of 6502
220 of 6502
230 of 6502
240 of 6502
250 of 6502
260 of 6502
270 of 6502
280 of 6502
290 of 6502
300 of 6502
310 of 6502
320 of 6502
330 of 6502
340 of 6502
350 of 6502
360 of 6502
370 of 6502
380 of 6502
390 of 6502
400 of 6502
410 of 6502
420 of 6502
430 of 6502
440 of 6502
450 of 6502
460 of 6502
470 of 6502
480 of 6502
490 of 6502
500 of 6502
510 of 6502
520 of 6502
530 of 6502
540 of 6502
550 of 6502
560 of 6502
570 of 6502
580 of 6502
590 of 6502
600 of 6502
610 of 6502
620 of 6502
630 of 6502
640 of 6502
650 of 6502
660 of 6502
670 of 6502
680 of 6502
690 of 6502
700 of 6502
710 of 6502
720 of 6502
730 of 6502
740 of 6502
750 of 6502
760 of 6502
770 of 6502
780 of 6502
790 of 6502
800 of 6502
810 of 6502
820 of 6502
830 of 6502
840

In [30]:
# every ACS variable seen in any row, sorted so the CSV layout is identical on
# every run (iterating a set of strings gives a different order per session)
value_columns = sorted(
    set().union(*(d.keys() for d in acs_data)) - {'geocode', 'year'}
)

# geocode and year first, then the variables
ordered_columns = ['geocode', 'year'] + value_columns

# rows that lack a variable get None for it; passing columns= keeps the frame
# well-formed (header only) even when acs_data is empty
filled_dicts = [{col: d.get(col) for col in ordered_columns} for d in acs_data]
df = pd.DataFrame(filled_dicts, columns=ordered_columns)

# save as csv
df.to_csv('acs_data.csv', index=False)