# PIN Location Geocoding Notebook

This notebook geocodes property PINs by fetching their coordinates from the Cook County Parcel Universe API.

In [15]:
# Import libraries
import pandas as pd
import geopandas as gpd
import os
import requests
import time
import io

# Define directories
DATA_DIR = '../data/raw/'
INTERIM_DIR = '../data/interim/'
CTA_FILE_PATH = os.path.join(DATA_DIR, 'cta_l_stops.geojson')
os.makedirs(INTERIM_DIR, exist_ok=True)

## Setup and Configuration

In [16]:
# Load CTA stations GeoJSON file
gdf_cta = gpd.read_file(CTA_FILE_PATH)
gdf_cta

Unnamed: 0,:id,:version,:created_at,:updated_at,station_id,longname,lines,address,ada,pknrd,point_x,point_y,legend,:@computed_region_vrxf_vc4k,:@computed_region_6mkv_f3dw,:@computed_region_bdys_3d7i,:@computed_region_8hcu_yrd4,:@computed_region_rpca_8um6,geometry
0,row-jtkh_kz3g_gsji,rv-shxb_fkis.xavt,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,970,Cicero-Congress,Blue Line (Congress),720 S. Cicero Avenue,False,False,1144440.97667316,1896352.78926296,Blue Line,26,22216,61,29,32,POINT (-87.74517 41.87161)
1,row-cfsn~mxvt_w4wf,rv-rw9y-rmif_beim,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,20,Harlem-Lake,Green Line (Lake),1 S. Harlem Avenue,True,False,1128608.76033842,1901803.39560403,Green Line,,26611,,,,POINT (-87.80318 41.88685)
2,row-f6sv~zxju-i63i,rv-emn6~sb3g.pgef,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,610,Ridgeland,Green Line (Lake),36 N. Ridgeland Avenue,False,False,1133921.86037537,1901950.19138017,Green Line,,26615,,,,POINT (-87.78366 41.88716)
3,row-yhzj~n2bj-tfza,rv-e26p_mg7s.gmcc,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,230,Cumberland,Blue Line,5800 N. Cumberland Avenue,True,True,1118914.13069739,1937256.04587705,Blue Line,75,22243,64,41,17,POINT (-87.83803 41.98429)
4,row-yh2w_ey4u_mpke,rv-aev4_pg73~6k3c,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,1700,Washington/Wabash,"Brown, Orange, Pink, Purple (Express), Green",29 N. Wabash,True,False,1176812.47724731,1900828.68763513,Multiple Lines,38,14310,580,34,41,POINT (-87.62619 41.88322)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,row-9fuj.xhhu~wk8m,rv-5ppg~fn4z_3udc,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,330,Grand/State,Red Line,521 N. State Street,True,False,1176288.17926282,1903902.00526999,Red Line,37,21182,626,42,6,POINT (-87.62802 41.89167)
141,row-msj5.9pq5~r66e,rv-5kgk-x8t6_3tp4,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,1040,Kedzie-Douglas,Pink Line,1944 S. Kedzie Avenue,True,False,1155312.29879529,1890060.7219291,Pink Line,30,21569,202,24,57,POINT (-87.70543 41.85413)
142,row-ahu9-afvq~yv3c,rv-c5i8_tuev.n457,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,1670,Conservatory-Central Park,Green Line (Lake),3631 W. Lake Street,True,False,1152209.0952084,1901252.71835552,Green Line,28,21572,176,28,30,POINT (-87.71652 41.8849)
143,row-p45f_f8v8-8qab,rv-8uqd-d2nn~cdbe,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,600,Kostner,Pink Line,2019 S. Kostner Avenue,True,False,1147729.86397223,1889868.06298492,Pink Line,30,21569,753,24,57,POINT (-87.73326 41.85375)


## Load and Process CTA Station Data

In [17]:
# Check available columns
print("Initial CTA Station Columns:", gdf_cta.columns.tolist())
gdf_cta

Initial CTA Station Columns: [':id', ':version', ':created_at', ':updated_at', 'station_id', 'longname', 'lines', 'address', 'ada', 'pknrd', 'point_x', 'point_y', 'legend', ':@computed_region_vrxf_vc4k', ':@computed_region_6mkv_f3dw', ':@computed_region_bdys_3d7i', ':@computed_region_8hcu_yrd4', ':@computed_region_rpca_8um6', 'geometry']


Unnamed: 0,:id,:version,:created_at,:updated_at,station_id,longname,lines,address,ada,pknrd,point_x,point_y,legend,:@computed_region_vrxf_vc4k,:@computed_region_6mkv_f3dw,:@computed_region_bdys_3d7i,:@computed_region_8hcu_yrd4,:@computed_region_rpca_8um6,geometry
0,row-jtkh_kz3g_gsji,rv-shxb_fkis.xavt,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,970,Cicero-Congress,Blue Line (Congress),720 S. Cicero Avenue,False,False,1144440.97667316,1896352.78926296,Blue Line,26,22216,61,29,32,POINT (-87.74517 41.87161)
1,row-cfsn~mxvt_w4wf,rv-rw9y-rmif_beim,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,20,Harlem-Lake,Green Line (Lake),1 S. Harlem Avenue,True,False,1128608.76033842,1901803.39560403,Green Line,,26611,,,,POINT (-87.80318 41.88685)
2,row-f6sv~zxju-i63i,rv-emn6~sb3g.pgef,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,610,Ridgeland,Green Line (Lake),36 N. Ridgeland Avenue,False,False,1133921.86037537,1901950.19138017,Green Line,,26615,,,,POINT (-87.78366 41.88716)
3,row-yhzj~n2bj-tfza,rv-e26p_mg7s.gmcc,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,230,Cumberland,Blue Line,5800 N. Cumberland Avenue,True,True,1118914.13069739,1937256.04587705,Blue Line,75,22243,64,41,17,POINT (-87.83803 41.98429)
4,row-yh2w_ey4u_mpke,rv-aev4_pg73~6k3c,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,1700,Washington/Wabash,"Brown, Orange, Pink, Purple (Express), Green",29 N. Wabash,True,False,1176812.47724731,1900828.68763513,Multiple Lines,38,14310,580,34,41,POINT (-87.62619 41.88322)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,row-9fuj.xhhu~wk8m,rv-5ppg~fn4z_3udc,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,330,Grand/State,Red Line,521 N. State Street,True,False,1176288.17926282,1903902.00526999,Red Line,37,21182,626,42,6,POINT (-87.62802 41.89167)
141,row-msj5.9pq5~r66e,rv-5kgk-x8t6_3tp4,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,1040,Kedzie-Douglas,Pink Line,1944 S. Kedzie Avenue,True,False,1155312.29879529,1890060.7219291,Pink Line,30,21569,202,24,57,POINT (-87.70543 41.85413)
142,row-ahu9-afvq~yv3c,rv-c5i8_tuev.n457,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,1670,Conservatory-Central Park,Green Line (Lake),3631 W. Lake Street,True,False,1152209.0952084,1901252.71835552,Green Line,28,21572,176,28,30,POINT (-87.71652 41.8849)
143,row-p45f_f8v8-8qab,rv-8uqd-d2nn~cdbe,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,600,Kostner,Pink Line,2019 S. Kostner Avenue,True,False,1147729.86397223,1889868.06298492,Pink Line,30,21569,753,24,57,POINT (-87.73326 41.85375)


In [18]:
# Extract and clean line colors from legend column
name_column = 'longname' 
lines_column = 'legend'
gdf_cta['Line_Colors'] = gdf_cta[lines_column].str.replace(' Line', '', regex=False).str.replace(', ', ',', regex=False)
gdf_cta[[name_column, lines_column, 'Line_Colors', 'geometry']].head()

Unnamed: 0,longname,legend,Line_Colors,geometry
0,Cicero-Congress,Blue Line,Blue,POINT (-87.74517 41.87161)
1,Harlem-Lake,Green Line,Green,POINT (-87.80318 41.88685)
2,Ridgeland,Green Line,Green,POINT (-87.78366 41.88716)
3,Cumberland,Blue Line,Blue,POINT (-87.83803 41.98429)
4,Washington/Wabash,Multiple Lines,Multiples,POINT (-87.62619 41.88322)


In [19]:
# Display CTA station statistics
print(f"Total unique station names: {gdf_cta[name_column].nunique()}")
print(f"Total rows in DataFrame: {len(gdf_cta)}")
print("Top 5 unique Line_Colors combinations:")
gdf_cta['Line_Colors'].value_counts().head(10)

Total unique station names: 145
Total rows in DataFrame: 145
Top 5 unique Line_Colors combinations:


Line_Colors
Blue         32
Multiples    30
Red          29
Green        23
Brown        11
Pink         11
Orange        7
Yellow        2
Name: count, dtype: int64

In [20]:
# Load sales data
df_sales = pd.read_csv(os.path.join(DATA_DIR, 'sales_data_raw.csv'))
df_sales.head()

  df_sales = pd.read_csv(os.path.join(DATA_DIR, 'sales_data_raw.csv'))


Unnamed: 0,pin,year,township_code,nbhd,class,sale_date,is_mydec_date,sale_price,doc_no,deed_type,mydec_deed_type,seller_name,is_multisale,num_parcels_sale,buyer_name,sale_type,sale_filter_same_sale_within_365,sale_filter_less_than_10k,sale_filter_deed_type,row_id
0,31012140340000,2000.0,32,32050,278,2000-04-01T00:00:00.000,False,177500.0,317676,Trustee,,,False,1.0,,LAND AND BUILDING,False,False,False,96289215
1,14291030261014,2000.0,73,73150,299,2000-02-01T00:00:00.000,False,315000.0,326770,Warranty,,,False,1.0,,LAND AND BUILDING,False,False,False,97142869
2,13164060180000,2000.0,71,71101,203,2000-06-01T00:00:00.000,False,192000.0,519440,Warranty,,,False,1.0,,LAND AND BUILDING,False,False,False,96585660
3,24233020370000,2014.0,39,39250,100,2014-06-01T00:00:00.000,False,500.0,1427529079,Other,,US BANK,False,1.0,MY OWN DOMINION LP,LAND,False,True,False,96639542
4,19354040600000,2016.0,72,72200,205,2016-08-01T00:00:00.000,False,1.0,1625129009,Warranty,,,True,5.0,,LAND AND BUILDING,True,True,False,98154864


## Load Sales Data

In [21]:
# Configure Cook County Universe API
UNIVERSE_DATA_ID = 'nj4t-kc8j'
UNIVERSE_API_URL = f'https://datacatalog.cookcountyil.gov/resource/{UNIVERSE_DATA_ID}.csv'
TEXT_FILE = os.path.join(INTERIM_DIR, 'universe_pin.txt')

## Setup PIN Universe API

In [22]:
# Remove invalid entries (None,None) from existing PIN file
if os.path.exists(TEXT_FILE):
    with open(TEXT_FILE, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    before = len(lines)
    cleaned = []
    for line in lines:
        s = line.strip()
        if not s:
            continue
        if 'None,None' in s:
            continue
        cleaned.append(s)

    # Write cleaned data back
    with open(TEXT_FILE, 'w', encoding='utf-8') as f:
        for ln in cleaned:
            f.write(f"{ln}\n")

    print(f"Cleaned {TEXT_FILE}: {before} -> {len(cleaned)} lines")
else:
    print(f"File not found: {TEXT_FILE}")

Cleaned ../data/interim/universe_pin.txt: 548508 -> 548508 lines


## Clean Existing PIN Data

In [23]:
# Load cleaned PINs and create set of unique pin10 values
if 'cleaned' in globals():
    src_lines = cleaned
else:
    with open(TEXT_FILE, 'r', encoding='utf-8') as f:
        src_lines = [ln.strip() for ln in f if ln.strip()]

unique_pin10 = set()
for ln in src_lines:
    # Skip header or empty lines
    if not ln or ln.lower().startswith('pin10'):
        continue
    parts = ln.split(',')
    if not parts:
        continue
    pin_raw = parts[0].strip()
    if not pin_raw or pin_raw.lower() == 'none':
        continue
    # Normalize trailing .0 (e.g. "2503106015.0" -> "2503106015")
    if pin_raw.endswith('.0'):
        pin = pin_raw[:-2]
    else:
        pin = pin_raw
    unique_pin10.add(pin)

print("Unique pin10 count:", len(unique_pin10))

Unique pin10 count: 543911


In [24]:
# Extract first 10 digits of PIN from sales data
df_sales['pin10'] = df_sales['pin'].astype(str).str[:10]
all_pins = df_sales['pin10'].dropna().unique().tolist()

## Identify Missing PINs

In [25]:
# Find PINs that need geocoding (not in existing dataset)
pins_to_check = [p for p in all_pins if p not in unique_pin10]

print(f"Total all_pins: {len(all_pins)}")
print(f"Pins already found (unique_pin10): {len(unique_pin10)}")
print(f"Missing pins_to_check count: {len(pins_to_check)}")

Total all_pins: 931789
Pins already found (unique_pin10): 543911
Missing pins_to_check count: 387878


In [26]:
# Define file path for tracking missing PINs
PINS_TO_CHECK_FILE = os.path.join(INTERIM_DIR, 'pins_to_check.txt')
# Uncomment below to save missing PINs to file
# with open(PINS_TO_CHECK_FILE, 'w', encoding='utf-8') as fh:
#     for pin in pins_to_check:
#         fh.write(f"{pin}\n")

In [29]:
# Reload pins_to_check from file if it exists
if os.path.exists(PINS_TO_CHECK_FILE):
    with open(PINS_TO_CHECK_FILE, 'r', encoding='utf-8') as fh:
        pins_to_check = [line.strip() for line in fh if line.strip()]
    print(f"Reloaded {len(pins_to_check)} pins from {PINS_TO_CHECK_FILE}")
else:
    print(f"File not found: {PINS_TO_CHECK_FILE}")

Reloaded 0 pins from ../data/interim/pins_to_check.txt


In [30]:
# Function to fetch PIN coordinates from Cook County API
def fetch_pin(pin10):
    params = {
        '$limit': 1,
        '$select': 'pin10, lon, lat',
        '$where': f"pin10 = '{pin10}'"
    }
    try:
        r = requests.get(UNIVERSE_API_URL, params=params, timeout=10)
        r.raise_for_status()
        df = pd.read_csv(io.StringIO(r.text))
        if not df.empty:
            row = df.iloc[0]
            return {'pin10': str(row.get('pin10')), 'lon': row.get('lon'), 'lat': row.get('lat')}
    except Exception:
        return None
    return None

## Fetch PIN Coordinates from API

In [31]:
# Loop through missing PINs and fetch coordinates
import random
for each_pin in pins_to_check:
    result = fetch_pin(each_pin)
    if result:
        # Append result to text file
        with open(TEXT_FILE, 'a', encoding='utf-8') as fh:
            fh.write(f"{result['pin10']},{result['lon']},{result['lat']}\n")
    pins_to_check.remove(each_pin)
    # Periodically save progress
    if random.randint(0, 30) == 0:
        os.remove(PINS_TO_CHECK_FILE)
        with open(PINS_TO_CHECK_FILE, 'w', encoding='utf-8') as fh:
            for pin in pins_to_check:
                fh.write(f"{pin}\n")

In [32]:
# Copy text file to CSV format for easier downstream processing
import shutil
CSV_FILE = os.path.join(INTERIM_DIR, 'universe_pin.csv')
shutil.copy(TEXT_FILE, CSV_FILE)

'../data/interim/universe_pin.csv'