# Geospatial Enrichment Notebook

This notebook enriches sales data with geospatial information by calculating the distance from each property to the nearest CTA rail station.

In [1]:
# Import required libraries
import pandas as pd
import geopandas as gpd
import numpy as np
import os
from scipy.spatial import KDTree
from shapely.geometry import Point

# Define directory paths
DATA_DIR = '../data/raw/'
INTERIM_DIR = '../data/interim/'
PROCESSED_DIR = '../data/processed/'

## Setup and Import Libraries

In [2]:
# Load PIN locations with coordinates
df_pin = pd.read_csv('../data/interim/universe_pin.csv')
df_pin['pin10'] = df_pin['pin10'].astype(int)
df_pin

Unnamed: 0,pin10,lon,lat
0,2503106015,-87.623728,41.733446
1,2510117011,-87.616102,41.717261
2,2022402038,-87.610924,41.771698
3,2034101010,-87.622505,41.750465
4,2502207033,-87.587231,41.735210
...,...,...,...
548502,1909325017,-87.760753,41.793282
548503,1330205028,-87.789191,41.936690
548504,1334121012,-87.738512,41.919665
548505,1307225013,-87.793550,41.977342


## Load PIN Location Data

In [3]:
# Load raw sales data
df_sales = pd.read_csv(os.path.join(DATA_DIR, 'sales_data_raw.csv'))
df_sales

  df_sales = pd.read_csv(os.path.join(DATA_DIR, 'sales_data_raw.csv'))


Unnamed: 0,pin,year,township_code,nbhd,class,sale_date,is_mydec_date,sale_price,doc_no,deed_type,mydec_deed_type,seller_name,is_multisale,num_parcels_sale,buyer_name,sale_type,sale_filter_same_sale_within_365,sale_filter_less_than_10k,sale_filter_deed_type,row_id
0,31012140340000,2000.0,32,32050,278,2000-04-01T00:00:00.000,False,177500.0,317676,Trustee,,,False,1.0,,LAND AND BUILDING,False,False,False,96289215
1,14291030261014,2000.0,73,73150,299,2000-02-01T00:00:00.000,False,315000.0,326770,Warranty,,,False,1.0,,LAND AND BUILDING,False,False,False,97142869
2,13164060180000,2000.0,71,71101,203,2000-06-01T00:00:00.000,False,192000.0,519440,Warranty,,,False,1.0,,LAND AND BUILDING,False,False,False,96585660
3,24233020370000,2014.0,39,39250,100,2014-06-01T00:00:00.000,False,500.0,1427529079,Other,,US BANK,False,1.0,MY OWN DOMINION LP,LAND,False,True,False,96639542
4,19354040600000,2016.0,72,72200,205,2016-08-01T00:00:00.000,False,1.0,1625129009,Warranty,,,True,5.0,,LAND AND BUILDING,True,True,False,98154864
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2620984,24311140010000,2025.0,39,39260,203,2025-09-17T00:00:00.000,True,393500.0,2529718179,Warranty,,KATHERINE SHIELDS,False,1.0,PAIGE GABRIEL,,False,False,False,7780596
2620985,24121210630000,2025.0,39,39100,234,2025-02-21T00:00:00.000,True,260000.0,2529722092,Trustee,,,False,1.0,MARTIN &amp; LAURA BERNAL,,False,False,False,7780598
2620986,24031310170000,2025.0,39,39060,210,2025-09-23T00:00:00.000,True,298000.0,2529722269,Warranty,,,False,1.0,DEREK ROCHE,,False,False,False,7780599
2620987,24161120310000,2025.0,39,39122,207,2025-09-22T00:00:00.000,True,430000.0,2529722273,Warranty,,STEVEN G. MCGOWAN,False,1.0,MARC RICE,,False,False,False,7780600


## Load Sales Data

In [4]:
# Extract first 10 digits of PIN for matching
df_sales['pin10'] = df_sales['pin'].astype(str).str[:10].astype(int)
df_sales.dtypes

pin                                   int64
year                                float64
township_code                         int64
nbhd                                  int64
class                                object
sale_date                            object
is_mydec_date                          bool
sale_price                          float64
doc_no                               object
deed_type                            object
mydec_deed_type                      object
seller_name                          object
is_multisale                           bool
num_parcels_sale                    float64
buyer_name                           object
sale_type                            object
sale_filter_same_sale_within_365       bool
sale_filter_less_than_10k              bool
sale_filter_deed_type                  bool
row_id                                int64
pin10                                 int64
dtype: object

## Merge Sales and Location Data

In [5]:
# Merge sales data with PIN coordinates
df_merged = pd.merge(df_sales, df_pin, on='pin10', how='left')
df_merged

Unnamed: 0,pin,year,township_code,nbhd,class,sale_date,is_mydec_date,sale_price,doc_no,deed_type,...,num_parcels_sale,buyer_name,sale_type,sale_filter_same_sale_within_365,sale_filter_less_than_10k,sale_filter_deed_type,row_id,pin10,lon,lat
0,31012140340000,2000.0,32,32050,278,2000-04-01T00:00:00.000,False,177500.0,317676,Trustee,...,1.0,,LAND AND BUILDING,False,False,False,96289215,3101214034,-87.677174,41.551951
1,14291030261014,2000.0,73,73150,299,2000-02-01T00:00:00.000,False,315000.0,326770,Warranty,...,1.0,,LAND AND BUILDING,False,False,False,97142869,1429103026,-87.660681,41.939158
2,13164060180000,2000.0,71,71101,203,2000-06-01T00:00:00.000,False,192000.0,519440,Warranty,...,1.0,,LAND AND BUILDING,False,False,False,96585660,1316406018,-87.748956,41.959141
3,24233020370000,2014.0,39,39250,100,2014-06-01T00:00:00.000,False,500.0,1427529079,Other,...,1.0,MY OWN DOMINION LP,LAND,False,True,False,96639542,2423302037,-87.714163,41.683795
4,24233020370000,2014.0,39,39250,100,2014-06-01T00:00:00.000,False,500.0,1427529079,Other,...,1.0,MY OWN DOMINION LP,LAND,False,True,False,96639542,2423302037,-87.714163,41.683795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2631684,24311140010000,2025.0,39,39260,203,2025-09-17T00:00:00.000,True,393500.0,2529718179,Warranty,...,1.0,PAIGE GABRIEL,,False,False,False,7780596,2431114001,-87.791331,41.655395
2631685,24121210630000,2025.0,39,39100,234,2025-02-21T00:00:00.000,True,260000.0,2529722092,Trustee,...,1.0,MARTIN &amp; LAURA BERNAL,,False,False,False,7780598,2412121063,,
2631686,24031310170000,2025.0,39,39060,210,2025-09-23T00:00:00.000,True,298000.0,2529722269,Warranty,...,1.0,DEREK ROCHE,,False,False,False,7780599,2403131017,-87.736519,41.728228
2631687,24161120310000,2025.0,39,39122,207,2025-09-22T00:00:00.000,True,430000.0,2529722273,Warranty,...,1.0,MARC RICE,,False,False,False,7780600,2416112031,,


In [6]:
# Check merged columns
df_merged.columns

Index(['pin', 'year', 'township_code', 'nbhd', 'class', 'sale_date',
       'is_mydec_date', 'sale_price', 'doc_no', 'deed_type', 'mydec_deed_type',
       'seller_name', 'is_multisale', 'num_parcels_sale', 'buyer_name',
       'sale_type', 'sale_filter_same_sale_within_365',
       'sale_filter_less_than_10k', 'sale_filter_deed_type', 'row_id', 'pin10',
       'lon', 'lat'],
      dtype='object')

In [7]:
# Remove properties without valid coordinates
df_merged = df_merged.dropna(subset=['lon', 'lat'])
print(f"Properties with valid coordinates: {len(df_merged):,}")
df_merged

Properties with valid coordinates: 1,785,902


Unnamed: 0,pin,year,township_code,nbhd,class,sale_date,is_mydec_date,sale_price,doc_no,deed_type,...,num_parcels_sale,buyer_name,sale_type,sale_filter_same_sale_within_365,sale_filter_less_than_10k,sale_filter_deed_type,row_id,pin10,lon,lat
0,31012140340000,2000.0,32,32050,278,2000-04-01T00:00:00.000,False,177500.0,317676,Trustee,...,1.0,,LAND AND BUILDING,False,False,False,96289215,3101214034,-87.677174,41.551951
1,14291030261014,2000.0,73,73150,299,2000-02-01T00:00:00.000,False,315000.0,326770,Warranty,...,1.0,,LAND AND BUILDING,False,False,False,97142869,1429103026,-87.660681,41.939158
2,13164060180000,2000.0,71,71101,203,2000-06-01T00:00:00.000,False,192000.0,519440,Warranty,...,1.0,,LAND AND BUILDING,False,False,False,96585660,1316406018,-87.748956,41.959141
3,24233020370000,2014.0,39,39250,100,2014-06-01T00:00:00.000,False,500.0,1427529079,Other,...,1.0,MY OWN DOMINION LP,LAND,False,True,False,96639542,2423302037,-87.714163,41.683795
4,24233020370000,2014.0,39,39250,100,2014-06-01T00:00:00.000,False,500.0,1427529079,Other,...,1.0,MY OWN DOMINION LP,LAND,False,True,False,96639542,2423302037,-87.714163,41.683795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2631671,24334031121032,2025.0,39,39210,299,2025-09-30T00:00:00.000,True,169400.0,2529722271,Warranty,...,2.0,,,False,False,False,7780580,2433403112,-87.744912,41.649975
2631675,24191200300000,2025.0,39,39151,205,2025-10-03T00:00:00.000,True,418000.0,2529724109,Warranty,...,1.0,GERARDO ALVAREZ,,False,False,False,7780587,2419120030,-87.795794,41.683298
2631683,24184210851002,2025.0,39,39160,299,2025-10-17T00:00:00.000,True,113000.0,2529720406,Warranty,...,1.0,DOROTHY KENDALL,,False,False,False,7780595,2418421085,-87.784802,41.692563
2631684,24311140010000,2025.0,39,39260,203,2025-09-17T00:00:00.000,True,393500.0,2529718179,Warranty,...,1.0,PAIGE GABRIEL,,False,False,False,7780596,2431114001,-87.791331,41.655395


In [8]:
# Load CTA station locations
CTA_FILE_PATH = os.path.join(DATA_DIR, 'cta_l_stops.geojson')
gdf_cta = gpd.read_file(CTA_FILE_PATH)
# Ensure consistent coordinate system (WGS84)
gdf_cta.to_crs(epsg=4326, inplace=True)
print(f"Total CTA stations loaded: {len(gdf_cta)}")
gdf_cta.head()

Total CTA stations loaded: 145


Unnamed: 0,:id,:version,:created_at,:updated_at,station_id,longname,lines,address,ada,pknrd,point_x,point_y,legend,:@computed_region_vrxf_vc4k,:@computed_region_6mkv_f3dw,:@computed_region_bdys_3d7i,:@computed_region_8hcu_yrd4,:@computed_region_rpca_8um6,geometry
0,row-jtkh_kz3g_gsji,rv-shxb_fkis.xavt,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,970,Cicero-Congress,Blue Line (Congress),720 S. Cicero Avenue,False,False,1144440.97667316,1896352.78926296,Blue Line,26.0,22216,61.0,29.0,32.0,POINT (-87.74517 41.87161)
1,row-cfsn~mxvt_w4wf,rv-rw9y-rmif_beim,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,20,Harlem-Lake,Green Line (Lake),1 S. Harlem Avenue,True,False,1128608.76033842,1901803.39560403,Green Line,,26611,,,,POINT (-87.80318 41.88685)
2,row-f6sv~zxju-i63i,rv-emn6~sb3g.pgef,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,610,Ridgeland,Green Line (Lake),36 N. Ridgeland Avenue,False,False,1133921.86037537,1901950.19138017,Green Line,,26615,,,,POINT (-87.78366 41.88716)
3,row-yhzj~n2bj-tfza,rv-e26p_mg7s.gmcc,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,230,Cumberland,Blue Line,5800 N. Cumberland Avenue,True,True,1118914.13069739,1937256.04587705,Blue Line,75.0,22243,64.0,41.0,17.0,POINT (-87.83803 41.98429)
4,row-yh2w_ey4u_mpke,rv-aev4_pg73~6k3c,2024-08-16 21:02:54.729000+00:00,2024-08-16 21:02:58.336000+00:00,1700,Washington/Wabash,"Brown, Orange, Pink, Purple (Express), Green",29 N. Wabash,True,False,1176812.47724731,1900828.68763513,Multiple Lines,38.0,14310,580.0,34.0,41.0,POINT (-87.62619 41.88322)


## Load CTA Station Data

In [9]:
# Create Point geometries from lon/lat coordinates
geometry = [Point(xy) for xy in zip(df_merged['lon'], df_merged['lat'])]

## Convert to Geospatial Data

In [10]:
# Create GeoDataFrame for spatial analysis
gdf_properties = gpd.GeoDataFrame(
    df_merged,
    geometry=geometry,
    crs="EPSG:4326"
)
print(f"Properties converted to GeoDataFrame. Shape: {gdf_properties.shape}")

Properties converted to GeoDataFrame. Shape: (1785902, 24)


In [11]:
# Prepare coordinate arrays for KDTree
cta_coordinates = np.array(list(zip(gdf_cta.geometry.x, gdf_cta.geometry.y)))
property_coordinates = np.array(list(zip(gdf_properties.geometry.x, gdf_properties.geometry.y)))
print(f"CTA stations: {len(cta_coordinates)}, Properties: {len(property_coordinates):,}")

CTA stations: 145, Properties: 1,785,902


## Calculate Distance to Nearest CTA Station

Using KDTree algorithm for efficient nearest neighbor search.

In [12]:
# Build KDTree for fast nearest neighbor search
tree = KDTree(cta_coordinates)
# Find nearest station for each property
distances, indices = tree.query(property_coordinates, k=1)
print(f"Distance calculation complete for {len(distances):,} properties")

Distance calculation complete for 1,785,902 properties


In [13]:
# Store distance in degrees and nearest station info
gdf_properties['min_distance_deg'] = distances
nearest_stations = gdf_cta.iloc[indices].reset_index(drop=True)
gdf_properties['nearest_cta_stop'] = nearest_stations['longname']
gdf_properties['nearest_cta_lines'] = nearest_stations['lines']

## Add Distance and Station Information

In [14]:
# Convert degrees to meters (approximate)
M_PER_DEGREE = 111111  # Approximate meters per degree at Chicago's latitude
gdf_properties['min_distance_meters'] = gdf_properties['min_distance_deg'] * M_PER_DEGREE
print(f"Distance range: {gdf_properties['min_distance_meters'].min():.0f}m to {gdf_properties['min_distance_meters'].max():.0f}m")
gdf_properties[['pin', 'sale_price', 'min_distance_meters', 'nearest_cta_stop', 'nearest_cta_lines']].head()

Distance range: 11m to 34498m


Unnamed: 0,pin,sale_price,min_distance_meters,nearest_cta_stop,nearest_cta_lines
0,31012140340000,177500.0,19822.74445,95th/Dan Ryan,Red Line
1,14291030261014,315000.0,610.954725,Southport,Brown Line
2,13164060180000,192000.0,647.953208,Montrose-O'Hare,Blue Line
3,24233020370000,500.0,10854.339658,95th/Dan Ryan,Red Line
4,24233020370000,500.0,10854.339658,95th/Dan Ryan,Red Line


In [15]:
# Convert back to regular DataFrame and save
df_final = pd.DataFrame(gdf_properties.drop(columns=['geometry', 'min_distance_deg']))
output_path = os.path.join(INTERIM_DIR, 'sales_data_enriched.csv')
df_final.to_csv(output_path, index=False)
print(f"Enriched data saved to {output_path}")
print(f"Total records: {len(df_final):,}")

Enriched data saved to ../data/interim/sales_data_enriched.csv
Total records: 1,785,902


## Summary Statistics

Quick analysis of distance distribution.

In [16]:
# Display distance statistics
print("Distance to nearest CTA station (meters):")
print(df_final['min_distance_meters'].describe())
print(f"\nMedian distance: {df_final['min_distance_meters'].median():.0f}m ({df_final['min_distance_meters'].median()/1000:.2f}km)")
print(f"Properties within 500m of station: {(df_final['min_distance_meters'] <= 500).sum():,} ({(df_final['min_distance_meters'] <= 500).sum()/len(df_final)*100:.1f}%)")

Distance to nearest CTA station (meters):
count    1.785902e+06
mean     5.237312e+03
std      7.154360e+03
min      1.061554e+01
25%      6.926717e+02
50%      1.710901e+03
75%      6.611820e+03
max      3.449764e+04
Name: min_distance_meters, dtype: float64

Median distance: 1711m (1.71km)
Properties within 500m of station: 282,132 (15.8%)


In [17]:
# Most common nearest stations
print("\nTop 10 most common nearest CTA stations:")
df_final['nearest_cta_stop'].value_counts().head(10)


Top 10 most common nearest CTA stations:


nearest_cta_stop
95th/Dan Ryan     194072
Midway Airport    129123
Forest Park        61684
Cottage Grove      29625
Ashland/63rd       27130
Harlem-O'Hare      20994
Clark/Division     20587
Jefferson Park     20326
Grand/State        18178
Chicago/State      18160
Name: count, dtype: int64