In [35]:
# Import modules
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
from us import states
import os
import regex as re
import requests
import json
from bs4 import BeautifulSoup

In [84]:
# open key_rows_with_tracts.csv
key_rows_with_tracts = pd.read_excel(r'key_rows_with_tracts.xlsx')

In [86]:
key = 'd0a5018fadae7c974ffc88620aed6fd71d275fa6'

years = {}

# iterate over rows of key_rows_with_tracts
for index, row in key_rows_with_tracts.iterrows():
    year = int(row['year'])

    if year in years.keys():
        continue
    else:
        years[year] = 0

    if year >= 2009:
        set = 'acs5'

    elif year >= 2007:
        set = 'acs3'
    elif year == 2004:
        continue
    else:
        set = 'acs1'

    variable_url = f"https://api.census.gov/data/{year}/acs/{set}/profile/variables.html"

    response = requests.get(variable_url)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table element by inspecting the HTML structure
    table = soup.find('table')

    # Extract data from the table
    data = []
    for row in table.find_all('tr'):
        cols = row.find_all(['td', 'th'])
        cols = [col.text.strip() for col in cols]
        # if the first element is Name or starts with DP02 or DP03 or DP04 AND it does not end in PE, make sure to evaluate if re.match returns None
        if cols and (cols[0] == 'Name' or re.match(r'DP02|DP03|DP05', cols[0]) is not None) and (re.search(r'\d(?:E|EA)$', cols[0]) is not None):
            data.append(cols)

    # Create a DataFrame from the extracted data
    variable_table = pd.DataFrame(data[1:], columns=data[0])

    print(len(variable_table))

    # save to variables folder as variables_{year}.csv
    # variable_table.to_csv(f'variables/variables_{set}_{year}.csv', index=False)

    years[year] = len(variable_table)


533
515
524
529
515
521
529
533
521
499
535
531
490
515
524
492
488


In [82]:
years

{2020: 1067,
 2012: 1031,
 2016: 1049,
 2017: 1059,
 2010: 1031,
 2014: 1043,
 2018: 1059,
 2021: 1067,
 2013: 1043,
 2009: 999,
 2022: 1071,
 2019: 1063,
 2007: 981,
 2011: 1031,
 2004: 0,
 2015: 1049,
 2008: 985,
 2006: 488}

In [39]:
variable_table

Unnamed: 0,Name,Label,Concept,Required,Attributes,Limit,Predicate Type,Group
0,DP02_0001E,Estimate!!HOUSEHOLDS BY TYPE!!Total households,Selected Social Characteristics in the United ...,not required,"DP02_0001M,\n DP02_0001MA,\n ...",0,(not a predicate),DP02
1,DP02_0002E,Estimate!!HOUSEHOLDS BY TYPE!!Total households...,Selected Social Characteristics in the United ...,not required,"DP02_0002M,\n DP02_0002MA,\n ...",0,int,DP02
2,DP02_0003E,Estimate!!HOUSEHOLDS BY TYPE!!Total households...,Selected Social Characteristics in the United ...,not required,"DP02_0003M,\n DP02_0003MA,\n ...",0,int,DP02
3,DP02_0004E,Estimate!!HOUSEHOLDS BY TYPE!!Total households...,Selected Social Characteristics in the United ...,not required,"DP02_0004M,\n DP02_0004MA,\n ...",0,int,DP02
4,DP02_0005E,Estimate!!HOUSEHOLDS BY TYPE!!Total households...,Selected Social Characteristics in the United ...,not required,"DP02_0005M,\n DP02_0005MA,\n ...",0,int,DP02
...,...,...,...,...,...,...,...,...
484,DP05_0077E,Estimate!!HISPANIC OR LATINO AND RACE!!Total p...,ACS Demographic and Housing Estimates: 2006,not required,"DP05_0077EA,\n DP05_0077M,\n ...",0,int,DP05
485,DP05_0078E,Estimate!!HISPANIC OR LATINO AND RACE!!Total p...,ACS Demographic and Housing Estimates: 2006,not required,"DP05_0078EA,\n DP05_0078M,\n ...",0,int,DP05
486,DP05_0079E,Estimate!!HISPANIC OR LATINO AND RACE!!Total p...,ACS Demographic and Housing Estimates: 2006,not required,"DP05_0079EA,\n DP05_0079M,\n ...",0,int,DP05
487,DP05_0080E,Estimate!!HISPANIC OR LATINO AND RACE!!Total p...,ACS Demographic and Housing Estimates: 2006,not required,"DP05_0080EA,\n DP05_0080M,\n ...",0,int,DP05


In [79]:
# read in D:\Users\User\Downloads\RA Work\acs\mye_profile.xls
mye_profile = pd.read_excel(r'mye_profile.xlsx')

concept = ''
subconcept = ''

names = []
labels = []
concepts = []

# # iterate over rows of mye_profile
for index, row in mye_profile.iterrows():
    # if PROFLN is NaN, continue
    if pd.isna(row['PROFLN']):
        continue

    # if PROFLN is 0, set concept to value in column STUB
    if row['PROFLN'] == 0:
        concept = row['STUB']
        subconcept = ''

    # if PROFLN ends in 0.3, continue
    elif str(row['PROFLN']).endswith('.3'):
        continue

    # if PROFLN ends in 0.5, set subconcept to value in column STUB
    elif str(row['PROFLN']).endswith('.5'):
        subconcept = row['STUB']

    else:
        label = concept + '!!' + subconcept + ('!!' if len(subconcept) > 0 else '') + str(row['STUB'])

        # set name to TBLID + '_' + PROFLN but remove . from PROFLN if there is one and add leading 0s to PROFLN to make it 4 digits 
        name = row['TBLID'] + '_' + str(int(row['PROFLN'])).zfill(4)

        # add values to names, labels, and concepts
        names.append(name)
        labels.append(label)
        concepts.append(concept)

# convert lists to df variable_table
variable_table = pd.DataFrame({'Name': names, 'Label': labels, 'Concept': concepts})

# save variable_table as to variables folder as variables_acs5_2004.csv
# variable_table.to_csv('variables/variables_acs5_2004.csv', index=False)