This notebook contains term dictionaries for each of the Census Bureau's products. Each dictionary is run independently, by model, for better accuracy.

In [0]:

## AMERICAN COMMUNITY SURVEY
# Literal search terms for the American Community Survey (ACS).
# Product terminology gathered from
# https://www.census.gov/acs/www/data_documentation/product_descriptions/
#
# NOTE: `acs_terms` is a set of plain strings (not a dict): ordering is not
# significant and duplicate entries collapse. Every entry is stored without
# leading/trailing whitespace so it can be used for exact matching.
acs_terms = {
    # Main tools for data use from the ACS
    'American Community Survey',
    'ACS',
    'My Congressional District',
    'My Tribal Area',
    'OnTheMap for Emergency Management',
    'Census Business Builder',
    'Narrative Profiles',
    'Data Profiles',
    'Comparison Profiles',
    'TIGER/Line Shapefiles with Selected Demographic and Economic Data',
    'ACS Public Use Microdata Sample',

    # ACS Subject Tables
    # (https://data.census.gov/table?d=ACS+1-Year+Estimates+Subject+Tables)
    # Grouped by table series; every ID carries the leading "S".
    'S0101', 'S0102', 'S0102PR', 'S0103', 'S0103PR',
    'S0501', 'S0502', 'S0502PR', 'S0503', 'S0504', 'S0505', 'S0506',
    'S0601', 'S0601PR',
    'S0701', 'S0701PR', 'S0702',
    'S0801', 'S0801PR', 'S0802', 'S0804',
    'S0902',
    'S1001', 'S1002',
    'S1101',
    'S1201', 'S1251',
    'S1301',
    'S1401',
    'S1501', 'S1502',
    'S1601', 'S1602', 'S1603',
    'S1701', 'S1702', 'S1703',
    'S1810', 'S1811',
    'S1901', 'S1902', 'S1903',
    'S2001', 'S2002',
    'S2101',
    'S2201',
    'S2301', 'S2302', 'S2303',
    'S2401', 'S2402', 'S2403', 'S2404', 'S2405', 'S2406', 'S2407',
    'S2408', 'S2409', 'S2410', 'S2411', 'S2412', 'S2413', 'S2414',
    'S2418', 'S2419',
    'S2501', 'S2502', 'S2503', 'S2504', 'S2505', 'S2506', 'S2507',
    'S2601A', 'S2601B', 'S2601C', 'S2601CPR', 'S2602', 'S2603',
    'S2701', 'S2702', 'S2702PR', 'S2703', 'S2704', 'S2713',
    'S2802',
    'S2901',
}

In [0]:
# Testing a different approach for fewer manual inputs: instead of listing
# every literal term, each feature is a regular expression keyed by name.
# All patterns start with inline flags ((?i) = case-insensitive,
# (?s) = "." also matches newlines).

# Shared fragments for the proximity pattern ("acs_near_table") so the two
# mirrored alternatives cannot drift apart.
# Citation marker: bounded on both sides so that "via" does not fire inside
# "trivia"/"viable" and "source" does not fire inside "resources"; the
# optional [sd] keeps "sources"/"sourced"/"cited" matching.
_ACS_CITE_GROUP = (
    r"\b(source[sd]?|according\s+to|data\s+from|using(?:\s+the)?\s+data\s+from"
    r"|as\s+reported\s+by|via|cite[sd]?)\b"
)
# ACS/Census mention: leading boundary only, so "ACS" no longer fires inside
# words such as "facsimile" while suffixed forms (S0102PR, B19013_001E, ACS5)
# are still accepted.
_ACS_MENTION_GROUP = (
    r"\b(census|american\s+community\s+survey|ACS|S\d{4}|DP0?\d{2,3}"
    r"|CP0?\d{2,3}|[BC]\d{5})"
)

acs_terms_2 = {
    # High-signal names/brands
    "acs_phrase": r"(?i)\bamerican\s+community\s+survey\b|\bACS\b",

    # Explicit Census/ACS URLs (works on WET too)
    "acs_links": r"(?i)\b(?:https?://)?(?:www\.)?(?:data\.)?census\.gov\S*|\bwww2\.census\.gov\S*|\bapi\.census\.gov\S*",

    # Subject/Profile/Comparison table IDs (S/DP/CP) with optional PR suffix
    # Sxxxx or SxxxxPR ; DP02/DP03/DP04… ; CP02/CP03…
    "acs_subject_tables": r"(?i)\bS\d{4}(?:PR)?\b|\bDP0?\d{2,3}\b|\bCP0?\d{2,3}\b",

    # Detailed tables: Bxxxxx or Cxxxxx (+ optional PR, E, M, MOE suffixes)
    # Examples: B01001, C27010, B23025PR, B19013E, B19013M, B19013_MOE
    "acs_detail_tables": r"(?i)\b[BC]\d{5}(?:PR)?(?:[A-Z]{1,3}|_?MOE)?\b",

    # Any literal "table <id>" phrasing (very common in citations)
    "acs_table_word": r"(?i)\btable(?:s)?\s*(?:S\d{4}(?:PR)?|DP0?\d{2,3}|CP0?\d{2,3}|[BC]\d{5}(?:PR)?(?:[A-Z]{1,3}|_?MOE)?)\b",

    # MOE mention
    "acs_near_moe": r"(?i)\b(MOE|margin\s+of\s+error)\b",

    # Common citation verbs/markers
    "acs_cite_verbs": r"(?i)\b(source|according\s+to|data\s+from|using(?:\s+the)?\s+data\s+from|as\s+reported\s+by|via|cites?|cite[d]?\s+by)\b",

    # Geography tokens frequently present in ACS tables / captions
    # ("place code" tolerates any whitespace run, like the other multi-word tokens)
    "acs_geo": r"(?i)\b(PUMA|ZCTA|tracts?|block\s+groups?|county\s+subdivisions?|place\s+code|CBSA|MSA|county|parish|borough|census\s+tract|state|place)\b",

    # Looser ACS-only shoutouts (tools & products)
    "acs_tools": r"(?i)\b(my\s+congressional\s+district|my\s+tribal\s+area|census\s+business\s+builder|narrative\s+profiles?|data\s+profiles?|comparison\s+profiles?)\b",

    # TIGER/Line & PUMS (often in methods sections)
    "acs_tiger_pums": r"(?i)\bTIGER/Line\b|\bPUMS\b|\bpublic\s+use\s+microdata\b",

    # Proximity pattern: citation words near ACS/Census mention (±250 chars),
    # in either order. Use this for has_citation_phrase style features.
    # Capture groups: 1 = cite, 2 = mention (cite first);
    #                 3 = mention, 4 = cite (mention first).
    "acs_near_table": (
        r"(?is)"
        + _ACS_CITE_GROUP + r".{0,250}" + _ACS_MENTION_GROUP
        + r"|"
        + _ACS_MENTION_GROUP + r".{0,250}" + _ACS_CITE_GROUP
    ),
}