In [1]:
# Import modules
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
from us import states
import os
import regex as re
import requests
import json
from bs4 import BeautifulSoup

In [2]:
# Read the Excel workbook fairvote_with_acs_020224.xls into a DataFrame.
fairvote_path = 'fairvote_with_acs_020224.xls'
df = pd.read_excel(fairvote_path)

# Preview the first five rows.
df.head()

Unnamed: 0,year_election,filename,office,general_elec,partisan,eh_partisan,level,type,candidates,choices,...,per_asian,per_hispanic,per_ba,per_samehouse,per_forborn,lnpop,AK,AL,AM,local
0,2004,SanFrancisco_11022004_BoardofSupervisorsDistri...,City council,1,0,0,1,B,8,3,...,33.5,14.7,51.2,84.0,35.6,13.451013,1,0,0,1
1,2004,SanFrancisco_11022004_BoardofSupervisorsDistri...,City council,1,0,0,1,B,5,3,...,33.5,14.7,51.2,84.0,35.6,13.451013,1,0,0,1
2,2004,SanFrancisco_11022004_BoardofSupervisorsDistri...,City council,1,0,0,1,B,4,3,...,33.5,14.7,51.2,84.0,35.6,13.451013,1,0,0,1
3,2004,SanFrancisco_11022004_BoardofSupervisorsDistri...,City council,1,0,0,1,B,22,3,...,33.5,14.7,51.2,84.0,35.6,13.451013,1,0,0,1
4,2004,SanFrancisco_11022004_BoardofSupervisorsDistri...,City council,1,0,0,1,B,13,3,...,33.5,14.7,51.2,84.0,35.6,13.451013,1,0,0,1


In [3]:
def parse_location(filename, state_names):
    """Derive a readable location and a geography type from a results filename.

    The filename is expected to look like ``<Location>_<date>_<Election>.csv``,
    e.g. ``Oakland_11062012_SchoolDirectorDistrict7.csv`` becomes
    ``('Oakland District 7', 'city district')``.

    Parameters
    ----------
    filename : str
        File name with a four-character extension (such as ``.csv``).
    state_names : collection of str
        Names that should be classified as ``'state'`` when the spaced-out
        location matches one of them exactly.

    Returns
    -------
    tuple of (str, str)
        ``(location, geo_type)``. If a sub-geography keyword (Ward, Dist, ...)
        is present but no identifier can be extracted, a message is printed
        and the location is returned without the sub-geography.

    Raises
    ------
    ValueError
        If the filename has fewer than three ``_``-separated parts.
    """
    # Drop the four-character extension; maxsplit=2 keeps any further
    # underscores inside the election name.
    location, _date, election = filename[:-4].split('_', 2)

    # Put a space before every capital letter except the first one:
    # "SanFrancisco" -> "San Francisco".
    location = re.sub(r'(?<!^)(?=[A-Z])', ' ', location)
    election_lower = election.lower()

    geo_type = 'city'
    if location in state_names:
        geo_type = 'state'
    # A county office in the election name overrides the city/state guess.
    if 'county' in election_lower:
        geo_type = 'county'

    try:
        if 'BoroughPresident' in election:
            # The text after "BoroughPresident" is used as the borough name.
            borough = election.split('BoroughPresident')[1]
            location = f"{location} {borough} Borough"
            geo_type = 'borough'
        elif 'Ward' in election:
            # Digits immediately following "Ward".
            ward = re.findall(r'Ward(\d+)', election)
            location = f"{location} Ward {ward[0]}"
            geo_type = 'city ward'
        elif re.search(r'CD\d', election):
            # "CD" followed by digits; every match is listed.
            cd = re.findall(r'CD(\d+)', election)
            location = f"{location} District {', '.join(cd)}"
            geo_type = 'congressional district'
        elif re.search(r'D\d', election):
            # A bare "D" followed by digits; only the first match is used.
            d = re.findall(r'D(\d+)', election)
            location = f"{location} District {d[0]}"
            geo_type = geo_type + ' district'
        elif 'district' in election_lower:
            geo_type = geo_type + ' district'
            # All runs of digits/hyphens anywhere in the election name,
            # concatenated without a separator.
            district = re.findall(r'([-0-9]+)', election_lower)
            if district:
                if 'housedistrict' in election_lower:
                    geo_type = 'house district'
                location = f"{location} District {''.join(district)}"
            else:
                # No number: fall back to a compass direction if one is present.
                direction = re.findall(r'(Central|East|South|West|North)', election)
                if direction:
                    location = f"{location} {direction[0]} District"
                else:
                    # Otherwise use the single character right after "istrict".
                    district = re.findall(r'istrict(\w)', election)
                    location = f"{location} District {district[0]}"
                    geo_type = 'senate district'
        elif 'Dist' in election:
            # Digits immediately following "Dist".
            dist = re.findall(r'Dist(\d+)', election)
            location = f"{location} District {dist[0]}"
            geo_type = geo_type + ' district'
    except IndexError:
        # A keyword matched but no identifier followed it. Report the election
        # name and keep the location parsed so far instead of hiding the row.
        print(f"Could not parse a sub-geography from election name: {election}")

    return location, geo_type


state_names = [x.name for x in states.STATES]

# Parse every filename once, then assign both new columns in one step each.
parsed = [parse_location(name, state_names) for name in df['filename']]
df['location'] = [loc for loc, _ in parsed]
df['geo_type'] = [geo for _, geo in parsed]

# Make 'location' and 'geo_type' the third and fourth columns. Selecting them
# by name (not by position) keeps this cell correct when it is re-run.
new_cols = ['location', 'geo_type']
other_cols = [c for c in df.columns if c not in new_cols]
df = df[other_cols[:2] + new_cols + other_cols[2:]]


In [4]:
# Shuffle the rows with a fixed seed so the order (and the preview below) is
# reproducible after Restart Kernel -> Run All.
# Note: re-running only this cell shuffles the already shuffled frame again.
SHUFFLE_SEED = 42
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

df.head(10)

Unnamed: 0,year_election,filename,location,geo_type,office,general_elec,partisan,eh_partisan,level,type,...,per_asian,per_hispanic,per_ba,per_samehouse,per_forborn,lnpop,AK,AL,AM,local
0,2012,Oakland_11062012_SchoolDirectorDistrict7.csv,Oakland District 7,city district,School superintendant,1,0,0,1,B,...,16.700001,25.4,37.9,83.8,27.5,12.666471,1,0,0,1
1,2014,SanLeandro_11042014_CountyCouncilDistrict1.csv,San Leandro District 1,county district,County board of supervisors,1,0,0,1,R,...,32.200001,28.0,27.7,88.1,35.4,11.174498,1,0,0,1
2,2016,Oakland_11082016_SchoolDirectorDistrict5.csv,Oakland District 5,city district,School superintendant,1,0,0,1,B,...,16.0,26.700001,39.7,84.5,27.3,12.726639,1,0,0,1
3,2021,NewYorkCity_06222021_REPCouncilMember50thCounc...,New York City District 50,city district,City council,0,1,RP,1,R,...,14.2,28.9,39.6,90.1,36.3,15.776323,1,1,0,1
4,2022,TakomaPark_11082022_CityCouncilWard3.csv,Takoma Park Ward 3,city ward,City council,1,0,0,1,R,...,6.0,12.2,60.6,86.6,30.4,9.504576,1,0,0,1
5,2018,SanFrancisco_11062018_AssessorRecorder.csv,San Francisco,city,City assessor,1,0,0,1,O,...,34.5,27.299999,30.0,89.8,36.2,11.2134,1,0,0,1
6,2008,SanFrancisco_11042008_BoardofSupervisorsDistri...,San Francisco District 7,city district,City council,1,0,0,1,B,...,33.5,14.7,51.2,84.0,35.6,13.451013,1,0,0,1
7,2021,Minneapolis_11022021_CityCouncilWard9.csv,Minneapolis Ward 9,city ward,City council,1,0,0,1,R,...,5.7,9.8,52.6,77.3,14.8,12.762259,1,0,0,1
8,2016,SanLeandro_11082016_CountyCouncilDistrict2.csv,San Leandro District 2,county district,County board of supervisors,1,0,0,1,R,...,33.900002,15.3,54.8,85.4,34.9,13.523599,1,0,0,1
9,2015,SanFrancisco_11032015_Mayor.csv,San Francisco,city,Mayor,1,0,0,1,E,...,33.799999,15.3,53.8,84.8,35.1,13.513576,1,0,0,1


In [5]:
# Census API key, read from the environment so the secret is not stored in
# the notebook.
# NOTE(review): a key was previously committed in this cell; consider
# requesting a new one.
key = os.environ.get('CENSUS_API_KEY')
if not key:
    raise RuntimeError(
        "Set the CENSUS_API_KEY environment variable to your Census API key "
        "before running this cell."
    )

year = 2010

# HTML page listing every variable of the ACS 5-year dataset for `year`.
variable_url = f"https://api.census.gov/data/{year}/acs/acs5/variables.html"

# Fail fast on network problems or HTTP errors instead of parsing an error page.
response = requests.get(variable_url, timeout=60)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# The variable list is taken from the first <table> on the page.
table = soup.find('table')
if table is None:
    raise ValueError(f"No <table> element found at {variable_url}")

# One list of stripped cell texts per table row (header cells included).
data = [
    [cell.text.strip() for cell in table_row.find_all(['td', 'th'])]
    for table_row in table.find_all('tr')
]

# First extracted row supplies the column names; the rest are records.
variable_table = pd.DataFrame(data[1:], columns=data[0])

# Drop the first record, keeping the original index labels (they start at 1).
variable_table = variable_table.iloc[1:]

variable_table.head()

Unnamed: 0,Name,Label,Concept,Required,Attributes,Limit,Predicate Type,Group
1,AIANHH,American Indian Area/Alaska Native Area/Hawaii...,,not required,,0,(not a predicate),
2,ANRC,Alaska Native Regional Corporation,,not required,,0,(not a predicate),
3,B00001_001E,Estimate!!Total,,not required,B00001_001EA,0,(not a predicate),B00001
4,B00002_001E,Estimate!!Total,,not required,B00002_001EA,0,int,B00002
5,B01001_001E,Estimate!!Total,,not required,"B01001_001EA,\n B01001_001M,\n ...",0,int,B01001


In [16]:
# Census variable names to request, e.g. 'B01001_001E'. Fill in before running.
variables = []
# Geography predicate(s) for the API's `for=` argument, e.g. 'state:*'.
# NOTE(review): kept as a list to match the original placeholder; a plain
# string is accepted too. Confirm the intended shape.
geo_code = []

# Interpolating the empty placeholders would send `get=` and `for=[]`,
# which the API rejects; stop here with a clear message instead.
if not variables or not geo_code:
    raise ValueError(
        "Fill in `variables` and `geo_code` before querying the Census API."
    )

for_clause = geo_code if isinstance(geo_code, str) else ','.join(map(str, geo_code))

# NOTE(review): this queries the ACS 1-year endpoint (acs1), while the
# variable list above was read from the 5-year dataset (acs5). Confirm that
# this is intended.
query_url = f"https://api.census.gov/data/{year}/acs/acs1"
query_params = {
    'get': ','.join(variables),
    'for': for_clause,
    'key': key,
}

# `params=` lets requests URL-encode the values; raise on HTTP errors so a
# rejected query is not mistaken for data.
response = requests.get(query_url, params=query_params, timeout=60)
response.raise_for_status()