# Street Names Dataset Extraction

This notebook extracts unique street names from each city's original data and creates clean datasets.


In [None]:
import re

import geopandas as gpd
import pandas as pd
from pathlib import Path

# Root folder (relative path) under which every city's source files are looked up.
data_dir = Path('data')


## Chicago


In [None]:
# Read the Chicago table and preview the columns relevant to street names.
df = pd.read_csv(data_dir / 'Chicago/original_data/Chicago_Street_Names_20251209.csv')
print(f"Shape: {df.shape} | Columns: {list(df.columns)}")
name_cols = [col for col in df.columns if 'name' in col.lower()]
# dict.fromkeys de-duplicates while keeping first-seen column order.
show_cols = list(dict.fromkeys(['Full Street Name', 'Direction', 'Street ', 'Suffix', *name_cols]))
display(df[show_cols].head())

# Unique, whitespace-trimmed, non-empty values of the chosen column, sorted.
col_used = 'Street '
chicago_streets = sorted(
    filter(None, df[col_used].dropna().str.strip().unique().tolist())
)
print(f"\n✓ Chicago: {len(chicago_streets)} unique street names (using '{col_used}')")


## Dallas


In [None]:
# Read the Dallas street layer and preview the columns relevant to street names.
df = pd.read_csv(data_dir / 'Dallas/original_data/SAN__STREET_LAYER.csv', low_memory=False)
print(f"Shape: {df.shape} | Columns: {list(df.columns)}")
name_cols = [col for col in df.columns if 'name' in col.lower()]
# dict.fromkeys de-duplicates while keeping first-seen column order.
show_cols = list(dict.fromkeys(['PREFIX', 'NAME', 'TYPE', 'SUFFIX', 'FULLSTREET', *name_cols]))
display(df[show_cols].head())

# Unique, trimmed names; drops blanks, the 'UNNAMED STREET' placeholder and
# any value containing a slash.
col_used = 'FULLSTREET'
dallas_streets = sorted(
    name
    for name in df[col_used].dropna().str.strip().unique().tolist()
    if name and name != 'UNNAMED STREET' and '/' not in name
)
print(f"\n✓ Dallas: {len(dallas_streets)} unique street names (using '{col_used}')")


## Houston


In [None]:
# Read the Houston road centerline table and preview the name-related columns.
df = pd.read_csv(data_dir / 'Houston/original_data/COH_RoadCenterline_-7101768773205832403.csv', low_memory=False)
print(f"Shape: {df.shape} | Columns: {list(df.columns)}")
name_cols = [col for col in df.columns if 'name' in col.lower()]
# dict.fromkeys de-duplicates while keeping first-seen column order.
show_cols = list(dict.fromkeys(['PREFIX', 'NAME', 'ST_TYPE', 'SUFFIX', 'FULL_NAME', *name_cols]))
display(df[show_cols].head())

# Unique, whitespace-trimmed, non-empty values of the chosen column, sorted.
col_used = 'FULL_NAME'
houston_streets = sorted(
    filter(None, df[col_used].dropna().str.strip().unique().tolist())
)
print(f"\n✓ Houston: {len(houston_streets)} unique street names (using '{col_used}')")


## Jacksonville


In [None]:
# The Jacksonville file is header-less raw text: one street name followed by a
# zip code per line, so it is read line by line instead of through pandas.
with open(data_dir / 'Jacksonville/original_data/jacksonville street names - Sheet1.csv', 'r') as f:
    lines = list(f)
print(f"Total lines: {len(lines)} | No columns (raw text format)")
print("Sample rows:")
for line in lines[:5]:
    print(f"  {line.strip()}")

# For each line: trim whitespace and surrounding quotes, cut at the LAST
# double-space, and keep the trimmed left-hand part. Blank results are dropped
# and the set comprehension de-duplicates before sorting.
jacksonville_streets = sorted({
    street
    for street in (
        raw.strip().strip('"').rsplit('  ', 1)[0].strip()
        for raw in lines
    )
    if street
})
print(f"\n✓ Jacksonville: {len(jacksonville_streets)} unique street names (using 'raw text parsing')")


## Los Angeles


In [None]:
# Read the Los Angeles table and preview the columns relevant to street names.
df = pd.read_csv(data_dir / 'LA/original_data/Street_Names_20251209.csv')
print(f"Shape: {df.shape} | Columns: {list(df.columns)}")
name_cols = [col for col in df.columns if 'name' in col.lower()]
# dict.fromkeys de-duplicates while keeping first-seen column order.
show_cols = list(dict.fromkeys(['Street Name', 'Street Suffix', 'Street Suffix Direction', 'Official Street Name', *name_cols]))
display(df[show_cols].head())

# Unique, whitespace-trimmed, non-empty values of the chosen column, sorted.
col_used = 'Official Street Name'
la_streets = sorted(
    filter(None, df[col_used].dropna().str.strip().unique().tolist())
)
print(f"\n✓ Los Angeles: {len(la_streets)} unique street names (using '{col_used}')")


## NYC


In [None]:
# Read the 'lion' layer of the NYC geodatabase and preview the name-related columns.
gdb_path = data_dir / 'NYC/original_data/lion.gdb'
df = gpd.read_file(gdb_path, layer='lion')
print(f"Shape: {df.shape} | Columns: {list(df.columns)}")
name_cols = [col for col in df.columns if 'name' in col.lower()]
# dict.fromkeys de-duplicates while keeping first-seen column order.
show_cols = list(dict.fromkeys(['Street', 'SAFStreetName', 'StreetCode', *name_cols]))
display(df[show_cols].head())

# Unique, whitespace-trimmed, non-empty values of the chosen column, sorted.
col_used = 'Street'
nyc_streets = sorted(
    filter(None, df[col_used].dropna().str.strip().unique().tolist())
)
print(f"\n✓ NYC: {len(nyc_streets)} unique street names (using '{col_used}')")


## Philadelphia


In [None]:
# Read the Philadelphia centerline table and preview the name-related columns.
df = pd.read_csv(data_dir / 'Philadelphia/original_data/Street_Centerline.csv')
print(f"Shape: {df.shape} | Columns: {list(df.columns)}")
name_cols = [col for col in df.columns if 'name' in col.lower()]
# dict.fromkeys de-duplicates while keeping first-seen column order.
show_cols = list(dict.fromkeys(['pre_dir', 'st_name', 'st_type', 'suf_dir', 'streetlabe', 'stname', *name_cols]))
display(df[show_cols].head())

# Unique, whitespace-trimmed, non-empty values of the chosen column, sorted.
col_used = 'st_name'
philly_streets = sorted(
    filter(None, df[col_used].dropna().str.strip().unique().tolist())
)
print(f"\n✓ Philadelphia: {len(philly_streets)} unique street names (using '{col_used}')")


## Phoenix


In [None]:
# Read the Phoenix street-label table and preview the name-related columns.
df = pd.read_csv(data_dir / 'Phoenix/original_data/Street_Name_Labels.csv')
print(f"Shape: {df.shape} | Columns: {list(df.columns)}")
name_cols = [col for col in df.columns if 'name' in col.lower()]
# dict.fromkeys de-duplicates while keeping first-seen column order.
show_cols = list(dict.fromkeys(['PREFIX', 'NAME', 'TYPE', 'SUFFIX', 'FULLNAME', 'ALIAS_NAME', *name_cols]))
display(df[show_cols].head())

# Unique, whitespace-trimmed, non-empty values of the chosen column, sorted.
col_used = 'FULLNAME'
phoenix_streets = sorted(
    filter(None, df[col_used].dropna().str.strip().unique().tolist())
)
print(f"\n✓ Phoenix: {len(phoenix_streets)} unique street names (using '{col_used}')")


## San Antonio


In [None]:
# Read the San Antonio streets table and preview the name-related columns.
df = pd.read_csv(data_dir / 'SanAntonio/original_data/Streets_-5650981568868605735.csv')
print(f"Shape: {df.shape} | Columns: {list(df.columns)}")
name_cols = [col for col in df.columns if 'name' in col.lower()]
# dict.fromkeys de-duplicates while keeping first-seen column order.
show_cols = list(dict.fromkeys(['MSAG_NAME', 'FROM_STREET', 'TO_STREET', *name_cols]))
display(df[show_cols].head())

# Unique, whitespace-trimmed, non-empty values of the chosen column, sorted.
col_used = 'MSAG_NAME'
sanantonio_streets = sorted(
    filter(None, df[col_used].dropna().str.strip().unique().tolist())
)
print(f"\n✓ San Antonio: {len(sanantonio_streets)} unique street names (using '{col_used}')")


## San Diego


In [None]:
# Read the San Diego roads table and preview the name-related columns.
df = pd.read_csv(data_dir / 'SanDiego/original_data/roads_datasd.csv')
print(f"Shape: {df.shape} | Columns: {list(df.columns)}")
name_cols = [col for col in df.columns if 'name' in col.lower()]
# dict.fromkeys de-duplicates while keeping first-seen column order.
show_cols = list(dict.fromkeys(['rd20full', 'rd30full', 'rd20pred', 'rd20name', 'rd20sfx', *name_cols]))
display(df[show_cols].head())

# Unique, trimmed names; drops blanks and the generic 'ALLEY' entry.
col_used = 'rd30full'
sandiego_streets = sorted(
    name
    for name in df[col_used].dropna().str.strip().unique().tolist()
    if name and name != 'ALLEY'
)
print(f"\n✓ San Diego: {len(sandiego_streets)} unique street names (using '{col_used}')")


## San Jose


In [None]:
# Read the San Jose streets table.
df = pd.read_csv(data_dir / 'SanJose/original_data/Streets.csv')
# Remove suffix: drop the last space-separated token of FULLNAME.
# The vectorised .str chain leaves missing FULLNAME values as NaN (removed by
# the dropna() below); the previous per-row lambda called .split() on the
# float NaN and raised AttributeError as soon as one name was missing.
# A one-word FULLNAME still yields an empty string, which is filtered out below.
df['corename'] = df['FULLNAME'].str.split(' ').str[:-1].str.join(' ')
print(f"Shape: {df.shape} | Columns: {list(df.columns)}")
name_cols = [c for c in df.columns if 'name' in c.lower()]
# dict.fromkeys de-duplicates while keeping first-seen column order.
show_cols = list(dict.fromkeys(['FULLNAME', 'STREETMASTERID', 'corename'] + name_cols))

display(df[show_cols].head())

# Unique, whitespace-trimmed, non-empty values of the derived column, sorted.
col_used = 'corename'
sanjose_streets = df[col_used].dropna().str.strip().unique().tolist()
sanjose_streets = sorted([s for s in sanjose_streets if s])
print(f"\n✓ San Jose: {len(sanjose_streets)} unique street names (using '{col_used}')")


## San Francisco


In [None]:
# Read the San Francisco table and preview the columns relevant to street names.
df = pd.read_csv(data_dir / 'SF/original_data/Street_Names_20251209.csv')
print(f"Shape: {df.shape} | Columns: {list(df.columns)}")
name_cols = [col for col in df.columns if 'name' in col.lower()]
# dict.fromkeys de-duplicates while keeping first-seen column order.
show_cols = list(dict.fromkeys(['FullStreetName', 'StreetName', 'StreetType', 'PostDirection', *name_cols]))
display(df[show_cols].head())

# Unique, whitespace-trimmed, non-empty values of the chosen column, sorted.
col_used = 'FullStreetName'
sf_streets = sorted(
    filter(None, df[col_used].dropna().str.strip().unique().tolist())
)
print(f"\n✓ San Francisco: {len(sf_streets)} unique street names (using '{col_used}')")


In [None]:
## Combine All Street Names

In [None]:
# Load USPS Street Suffix Abbreviations from CSV (source: Publication 28)
# https://pe.usps.com/text/pub28/28apc_002.htm
usps_suffixes_df = pd.read_csv(data_dir / 'usps_street_suffixes.csv')

# Build set of all valid suffixes (primary names, common abbreviations, and USPS standard).
# Missing cells are skipped explicitly: read_csv yields float NaN for them, and
# str(NaN) would otherwise add the bogus suffix 'NAN' (or crash on .upper()).
SUFFIXES = set()
for column in ('primary_name', 'usps_abbrev'):
    SUFFIXES.update(str(value).upper() for value in usps_suffixes_df[column].dropna())

# 'common_abbrev' holds several abbreviations in one cell, separated by ';'.
for cell in usps_suffixes_df['common_abbrev'].dropna():
    SUFFIXES.update(
        abbrev.strip().upper()
        for abbrev in str(cell).split(';')
        if abbrev.strip()
    )

print(f"Loaded {len(SUFFIXES)} unique street suffixes from USPS data")

# Compass directions recognised as a street-name prefix or postfix:
# the abbreviated forms together with their spelled-out equivalents.
DIRECTIONS = (
    {'N', 'S', 'E', 'W', 'NE', 'NW', 'SE', 'SW'}
    | {'NORTH', 'SOUTH', 'EAST', 'WEST',
       'NORTHEAST', 'NORTHWEST', 'SOUTHEAST', 'SOUTHWEST'}
)

# Matches a token that is purely a number with an optional ordinal ending
# (1ST, 2ND, 3RD, 4TH, 42ND, 100TH, or bare digits), case-insensitively.
NUMBERED_PATTERN = re.compile(r'^\d+(ST|ND|RD|TH)?$', re.IGNORECASE)

def parse_street_name(street_name):
    """Parse a street name into components: suffix, direction, corename, is_numbered.

    The name is upper-cased and split on whitespace. For names with more than
    one word, a leading token found in DIRECTIONS is taken as the prefix
    direction, a trailing token found in DIRECTIONS as the postfix direction
    (both are reported as "PREFIX-POSTFIX" when present), and the then-last
    token is taken as the street type if it is in SUFFIXES. Whatever remains
    is the core name.

    A one-word name is always returned as the core name, even when the word is
    itself a direction or suffix word (e.g. "SOUTH", "PARK"); some source
    columns contain only the bare name, and stripping the single token would
    leave no core name at all.

    Args:
        street_name: The raw street name. Non-string values (None, NaN) and
            empty/blank strings are treated as unparseable.

    Returns:
        tuple: ``(suffix, direction, corename, is_numbered)`` where the first
        three are upper-case strings or None, and ``is_numbered`` is True when
        the first core-name token matches NUMBERED_PATTERN.
    """
    # Guard against None / NaN / other non-strings, which have no .upper().
    if not isinstance(street_name, str) or not street_name:
        return None, None, None, False

    parts = street_name.upper().split()
    if not parts:
        return None, None, None, False

    direction = None
    suffix = None
    core_parts = list(parts)

    # Only multi-word names can carry direction/type tokens around a core name.
    if len(parts) > 1:
        # Check for prefix direction
        if core_parts[0] in DIRECTIONS:
            direction = core_parts.pop(0)

        # Check for suffix direction (at the end)
        if core_parts and core_parts[-1] in DIRECTIONS:
            trailing = core_parts.pop()
            # Both prefix and suffix direction are joined with a dash.
            direction = trailing if direction is None else f"{direction}-{trailing}"

        # Check for street type suffix
        if core_parts and core_parts[-1] in SUFFIXES:
            suffix = core_parts.pop()

    # Core name is what's left
    corename = ' '.join(core_parts) if core_parts else None

    # Numbered street: the first core token is digits with optional ordinal ending.
    is_numbered = bool(core_parts) and bool(NUMBERED_PATTERN.match(core_parts[0]))

    return suffix, direction, corename, is_numbered

# Per-city lists of unique street names produced by the cells above.
datasets = {
    'Chicago': chicago_streets,
    'Dallas': dallas_streets,
    'Houston': houston_streets,
    'Jacksonville': jacksonville_streets,
    'Los Angeles': la_streets,
    'NYC': nyc_streets,
    'Philadelphia': philly_streets,
    'Phoenix': phoenix_streets,
    'San Antonio': sanantonio_streets,
    'San Diego': sandiego_streets,
    'San Jose': sanjose_streets,
    'San Francisco': sf_streets,
}

# Parse every name and collect one record per (city, street). The records are
# turned into a DataFrame in a single step and written out as a TSV.
all_streets = []
for city, streets in datasets.items():
    for street in streets:
        suffix, direction, corename, is_numbered = parse_street_name(street)
        all_streets.append(dict(
            street_name=street,
            corename=corename,
            suffix=suffix,
            direction=direction,
            is_numbered=is_numbered,
            city=city,
        ))

combined_df = pd.DataFrame(all_streets)
combined_df.to_csv("combined_df.tsv", sep="\t", index=False)

# ============================================================================
# FILTERING: Remove problematic street names
# ============================================================================

# Filter tables are built (and regexes compiled) once here rather than on
# every call, since the function is applied to each row of the combined frame.

# Route codes at the very start of the name: I10, US101, SR99, HWY 6, ROUTE 9.
_HIGHWAY_CODE_RE = re.compile(r'^(?:I\d+|US\d+|SR\d+|HWY\s*\d+|ROUTE\s*\d+)')

# Rejected when found anywhere in the name (plain substring test).
_EXCLUDED_SUBSTRINGS = (
    'EXPWY', 'EXPY', 'EXPRESSWAY',  # Expressways
    'THRWY', 'THWY', 'THRUWAY', 'THROUGHWAY',  # Thruways
    'FRWY', 'FREEWAY', 'FWY',  # Freeways
    'RAMP',  # Ramps
    'EXIT', ' ET ',  # Exit indicators
    ' NB ', ' SB ', ' EB ', ' WB ',  # Directional traffic (northbound, etc.)
    ' TO ',  # "TO" connector (e.g., "NB TO SB")
    ' ON ', ' OFF ',  # On/off in middle
    '@',  # Highway interchange notation
    'UNNAMED',  # Unnamed streets
    'AIRTRAIN', 'SUBWAY', ' LINE ',  # Transit lines
    'CONNECTOR', 'INTERCHANGE',  # Infrastructure
    '//',  # Double slash
)

# Rejected when the name starts with one of these.
_EXCLUDED_PREFIXES = ('NB ', 'SB ', 'EB ', 'WB ')

# Rejected when the name ends with one of these.
_EXCLUDED_ENDING_RE = re.compile(r'(?: NB| SB| EB| WB| ON| OFF| LINE)$')

# Core name ending in a lone letter or letter pair ("... D", "... G-H").
_TRAILING_DESIGNATOR_RE = re.compile(r'\s+[A-Z](?:-[A-Z])?$')

# Core name that is only a code such as A143, 040, 8025B.
_CORENAME_CODE_RE = re.compile(r'^[A-Z]?\d{2,}[A-Z]?$')

# Full name ending in a 3+ digit number, optionally followed by one letter.
_TRAILING_NUMBER_RE = re.compile(r'\d{3,}[A-Z]?$')


def is_valid_street_name(row):
    """Check if a street name should be included (returns True if valid).

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'street_name', 'corename' and 'is_numbered'.

    Returns:
        bool: False for numbered streets, missing/blank or one-character core
        names, highway route codes, names containing an excluded keyword or a
        slash, core names ending in a single-letter designator, and
        alphanumeric codes; True otherwise.

    Note:
        A core name that is literally "NAN" is now accepted: missing values
        are already mapped to '' by the pd.notna guard, so the former
        ``== 'NAN'`` comparison could only reject a street really named "Nan".

        NOTE(review): keywords are matched as plain substrings, so a name that
        merely contains one (e.g. 'RAMP' inside 'RAMPART') is rejected too —
        confirm this is intended.
    """
    street = str(row['street_name']).upper() if pd.notna(row['street_name']) else ''
    corename = str(row['corename']).upper() if pd.notna(row['corename']) else ''

    # Exclude numbered streets
    if row['is_numbered']:
        return False

    # Exclude empty/blank core names (missing values were mapped to '' above)
    trimmed_core = corename.strip()
    if not trimmed_core:
        return False

    # Exclude single-letter street names (A ST, B ST, C ST, etc.)
    if len(trimmed_core) == 1:
        return False

    # Exclude highways identified by a route code at the start of the name
    if _HIGHWAY_CODE_RE.match(street):
        return False

    # Exclude highways / ramps / transit / infrastructure keywords
    if street.startswith(_EXCLUDED_PREFIXES):
        return False
    if _EXCLUDED_ENDING_RE.search(street):
        return False
    if any(keyword in street for keyword in _EXCLUDED_SUBSTRINGS):
        return False

    # Exclude streets with slashes (often combined routes/directions)
    if '/' in street:
        return False

    # Exclude streets ending in single letters like "D", "G-H" (unit/building designators)
    if _TRAILING_DESIGNATOR_RE.search(corename):
        return False

    # Exclude alphanumeric codes (like A143, 040, 8025B)
    if _CORENAME_CODE_RE.search(corename):
        return False
    if _TRAILING_NUMBER_RE.search(street):  # Ends with 3+ digit number
        return False

    return True

# Keep only the rows accepted by is_valid_street_name; .copy() detaches the
# result from combined_df.
filtered_df = combined_df[
    combined_df.apply(is_valid_street_name, axis=1)
].copy()

# Report how many streets the filter removed.
print(f"Original: {len(combined_df):,} streets")
print(f"After filtering: {len(filtered_df):,} streets")
print(f"Removed: {len(combined_df) - len(filtered_df):,} streets ({(len(combined_df) - len(filtered_df)) / len(combined_df) * 100:.1f}%)")

# Draw a seeded sample of 100 valid streets from every city and save it.
sampled_df = (
    filtered_df
    .groupby("city")
    .sample(n=100, random_state=589208)
)
sampled_df.to_csv("street_names_100_per_city_random_state_589208.tsv", sep="\t", index=False)
print(f"\nSampled: {len(sampled_df)} streets (100 per city)")
print(f"Unique corenames: {sampled_df['corename'].nunique()}")



In [None]:
# Phrase templates, read from the working directory; phrase_generator uses the
# 'prefix' and 'question' columns of this table.
prefixes = pd.read_csv("prefixes.csv")

# Running call counter used by phrase_generator to walk through the prefix
# rows in order, giving an even distribution of prefixes.
prefix_cycle_idx = [0]  # Use list to allow mutation in closure

def phrase_generator(x):
    """Build a phrase by putting the next prefix in the cycle before ``x``.

    Each call picks row ``prefix_cycle_idx[0] % len(prefixes)`` of the
    ``prefixes`` table and then advances the shared counter, so successive
    calls walk through the prefixes in order and wrap around.

    Args:
        x: Text appended after the prefix (joined with a single space).

    Returns:
        str: ``"<prefix> <x>?"`` when the row's 'question' value is "yes",
        otherwise ``"<prefix> <x>."``.
    """
    position = prefix_cycle_idx[0] % len(prefixes)
    prefix_cycle_idx[0] += 1

    chosen = prefixes.iloc[position]
    lead = chosen['prefix']
    # Question templates end with a question mark, everything else with a period.
    ending = "?" if chosen['question'] == "yes" else "."
    return lead + " " + x + ending


In [None]:
# Sample 30 per city for study, then shuffle the rows across cities.
# The shuffle is seeded as well: prefixes are assigned by row position, so an
# unseeded shuffle gave every run a different row order and a different
# street-to-prefix pairing despite the fixed sampling seed.
# Note: this replaces the 100-per-city `sampled_df` created earlier.
sampled_df = (
    filtered_df.groupby("city")
    .sample(n=30, random_state=9872)
    .sample(frac=1, random_state=9872)
    .reset_index(drop=True)
)
prefix_cycle_idx[0] = 0  # Reset counter before generating phrases
sampled_df['phrase'] = sampled_df['corename'].apply(phrase_generator)
sampled_df.to_csv("street_names_30_per_city_random_state_9872.tsv", sep="\t", index=False)
print(f"\nSampled: {len(sampled_df)} streets (30 per city) for study")
print(f"Unique corenames: {sampled_df['corename'].nunique()}")

# Validate even distribution of prefixes.
# Longest prefixes are tried first (and matched together with the separating
# space) so that a prefix which is the beginning of another one is not
# credited with the longer prefix's phrases.
known_prefixes = sorted(prefixes['prefix'], key=len, reverse=True)
prefix_counts = sampled_df['phrase'].apply(
    lambda phrase: next((p for p in known_prefixes if phrase.startswith(p + " ")), None)
).value_counts()
print(f"\nPrefix distribution (expected {len(sampled_df) // len(prefixes)} each):")
print(prefix_counts.to_string())
assert prefix_counts.nunique() == 1, f"Uneven distribution! Counts: {prefix_counts.unique()}"
print("✓ All prefixes used evenly")

In [None]:
# Show the study sample built in the previous cell (30 streets per city plus
# the generated 'phrase' column).
sampled_df