In [1]:
import pandas as pd
import geopandas as gpd
import os
import requests
import time
import io

DATA_DIR = '../data/raw/'
INTERIM_DIR = '../data/interim/'
CTA_FILE_PATH = os.path.join(DATA_DIR, 'cta_l_stops.geojson')
os.makedirs(INTERIM_DIR, exist_ok=True)

In [2]:
gdf_cta = gpd.read_file(CTA_FILE_PATH)
gdf_cta

Unnamed: 0,station_id,point_y,pknrd,legend,point_x,address,ada,longname,lines,geometry
0,970,1896352.78926296,False,Blue Line,1144440.97667316,720 S. Cicero Avenue,False,Cicero-Congress,Blue Line (Congress),POINT (-87.74517 41.87161)
1,20,1901803.39560403,False,Green Line,1128608.76033842,1 S. Harlem Avenue,True,Harlem-Lake,Green Line (Lake),POINT (-87.80318 41.88685)
2,610,1901950.19138017,False,Green Line,1133921.86037537,36 N. Ridgeland Avenue,False,Ridgeland,Green Line (Lake),POINT (-87.78366 41.88716)
3,230,1937256.04587705,True,Blue Line,1118914.13069739,5800 N. Cumberland Avenue,True,Cumberland,Blue Line,POINT (-87.83803 41.98429)
4,1700,1900828.68763513,False,Multiple Lines,1176812.47724731,29 N. Wabash,True,Washington/Wabash,"Brown, Orange, Pink, Purple (Express), Green",POINT (-87.62619 41.88322)
...,...,...,...,...,...,...,...,...,...,...
140,330,1903902.00526999,False,Red Line,1176288.17926282,521 N. State Street,True,Grand/State,Red Line,POINT (-87.62802 41.89167)
141,1040,1890060.7219291,False,Pink Line,1155312.29879529,1944 S. Kedzie Avenue,True,Kedzie-Douglas,Pink Line,POINT (-87.70543 41.85413)
142,1670,1901252.71835552,False,Green Line,1152209.0952084,3631 W. Lake Street,True,Conservatory-Central Park,Green Line (Lake),POINT (-87.71652 41.8849)
143,600,1889868.06298492,False,Pink Line,1147729.86397223,2019 S. Kostner Avenue,True,Kostner,Pink Line,POINT (-87.73326 41.85375)


In [3]:
print("Initial CTA Station Columns:", gdf_cta.columns.tolist())
gdf_cta

Initial CTA Station Columns: ['station_id', 'point_y', 'pknrd', 'legend', 'point_x', 'address', 'ada', 'longname', 'lines', 'geometry']


Unnamed: 0,station_id,point_y,pknrd,legend,point_x,address,ada,longname,lines,geometry
0,970,1896352.78926296,False,Blue Line,1144440.97667316,720 S. Cicero Avenue,False,Cicero-Congress,Blue Line (Congress),POINT (-87.74517 41.87161)
1,20,1901803.39560403,False,Green Line,1128608.76033842,1 S. Harlem Avenue,True,Harlem-Lake,Green Line (Lake),POINT (-87.80318 41.88685)
2,610,1901950.19138017,False,Green Line,1133921.86037537,36 N. Ridgeland Avenue,False,Ridgeland,Green Line (Lake),POINT (-87.78366 41.88716)
3,230,1937256.04587705,True,Blue Line,1118914.13069739,5800 N. Cumberland Avenue,True,Cumberland,Blue Line,POINT (-87.83803 41.98429)
4,1700,1900828.68763513,False,Multiple Lines,1176812.47724731,29 N. Wabash,True,Washington/Wabash,"Brown, Orange, Pink, Purple (Express), Green",POINT (-87.62619 41.88322)
...,...,...,...,...,...,...,...,...,...,...
140,330,1903902.00526999,False,Red Line,1176288.17926282,521 N. State Street,True,Grand/State,Red Line,POINT (-87.62802 41.89167)
141,1040,1890060.7219291,False,Pink Line,1155312.29879529,1944 S. Kedzie Avenue,True,Kedzie-Douglas,Pink Line,POINT (-87.70543 41.85413)
142,1670,1901252.71835552,False,Green Line,1152209.0952084,3631 W. Lake Street,True,Conservatory-Central Park,Green Line (Lake),POINT (-87.71652 41.8849)
143,600,1889868.06298492,False,Pink Line,1147729.86397223,2019 S. Kostner Avenue,True,Kostner,Pink Line,POINT (-87.73326 41.85375)


In [4]:
name_column = 'longname' 
lines_column = 'lines'
gdf_cta['Line_Colors'] = gdf_cta[lines_column].str.replace(' Line', '', regex=False).str.replace(', ', ',', regex=False)
gdf_cta[[name_column, lines_column, 'Line_Colors', 'geometry']].head()

Unnamed: 0,longname,lines,Line_Colors,geometry
0,Cicero-Congress,Blue Line (Congress),Blue (Congress),POINT (-87.74517 41.87161)
1,Harlem-Lake,Green Line (Lake),Green (Lake),POINT (-87.80318 41.88685)
2,Ridgeland,Green Line (Lake),Green (Lake),POINT (-87.78366 41.88716)
3,Cumberland,Blue Line,Blue,POINT (-87.83803 41.98429)
4,Washington/Wabash,"Brown, Orange, Pink, Purple (Express), Green","Brown,Orange,Pink, Purple (Express),Green",POINT (-87.62619 41.88322)


In [5]:
print(f"Total unique station names: {gdf_cta[name_column].nunique()}")
print(f"Total rows in DataFrame: {len(gdf_cta)}")
print("Top 5 unique Line_Colors combinations:")
gdf_cta['Line_Colors'].value_counts().head(10)

Total unique station names: 145
Total rows in DataFrame: 145
Top 5 unique Line_Colors combinations:


Line_Colors
Red                        29
Blue                       23
Green (Lake)               12
Brown                      11
Pink                       11
Green                      10
Purple,Evanston Express     8
Orange                      7
Brown,Purple (Express)      6
Blue (Congress)             5
Name: count, dtype: int64

In [9]:
df_sales = pd.read_csv(os.path.join(DATA_DIR, 'sales_data.csv'))
df_sales['pin'] = df_sales['pin'].astype(str).str[:10]
df_sales.head()

  df_sales = pd.read_csv(os.path.join(DATA_DIR, 'sales_data.csv'))


Unnamed: 0,pin,year,township_code,neighborhood_code,class,sale_date,is_mydec_date,sale_price,sale_document_num,sale_deed_type,mydec_deed_type,sale_seller_name,is_multisale,num_parcels_sale,sale_buyer_name,sale_type,sale_filter_same_sale_within_365,sale_filter_less_than_10k,sale_filter_deed_type,row_id
0,25031060150000,2024,70,70111,234,"January 16, 2024",True,"$260,000",2402413181,Warranty,Warranty Deed,CHRISTOPHER SHAW,False,1,CALVIN GRANDBERRY,,False,False,False,7573637
1,25101170110000,2024,70,70111,203,"January 25, 2024",True,"$299,900",2402913199,Trustee,Trustee Deed,9731 FOREST LAND TRUST,False,1,KIMBERLY J ADAMS,,False,False,False,7573642
2,20224020380000,2024,70,70030,211,"January 04, 2024",True,"$385,000",2402246046,Warranty,Warranty Deed,ANNA COUNTS,False,1,JOSHUA ISHMAEL HERNANDEZ,,False,False,False,7573651
3,20341010100000,2024,70,70111,203,"January 11, 2024",True,"$237,000",2401813310,Warranty,Warranty Deed,"NCRC HOUSING REHAB FUND, LLC",False,1,TRYNELL WILLIAMS,,False,False,False,7573654
4,25022070330000,2024,70,70080,241,"January 10, 2024",True,"$455,000",2402306427,Warranty,,HAROLD COLLINS,True,2,,,False,False,False,7573660


In [None]:
UNIVERSE_DATA_ID = 'nj4t-kc8j'
UNIVERSE_API_URL = f'https://datacatalog.cookcountyil.gov/resource/{UNIVERSE_DATA_ID}.csv'
TEXT_FILE = os.path.join(INTERIM_DIR, 'universe_pin_locations.txt')

df_sales['pin10'] = df_sales['pin'].astype(str).str[:10]
all_pins = df_sales['pin10'].dropna().unique().tolist()

def fetch_pin(pin10):
    params = {
        '$limit': 1,
        '$select': 'pin10, lon, lat',
        '$where': f"pin10 = '{pin10}'"
    }
    try:
        r = requests.get(UNIVERSE_API_URL, params=params, timeout=10)
        r.raise_for_status()
        df = pd.read_csv(io.StringIO(r.text))
        if not df.empty:
            row = df.iloc[0]
            return {'pin10': str(row.get('pin10')), 'lon': row.get('lon'), 'lat': row.get('lat')}
    except Exception:
        return None
    return None

new_rows = []
sleep_seconds = 10
for i, pin in enumerate(all_pins):
    res = fetch_pin(pin)
    if res:
        new_rows.append(res)
    else:
        new_rows.append({'pin10': pin, 'lon': None, 'lat': None})
    print(new_rows[-1])
    with open(TEXT_FILE, 'a') as f:
        f.write(f"{new_rows[-1]['pin10']},{new_rows[-1]['lon']},{new_rows[-1]['lat']}\n")
    if (i + 1) % 100 == 0:
        time.sleep(sleep_seconds)

{'pin10': '2327402004.0', 'lon': np.float64(-87.8418360871), 'lat': np.float64(41.6666971243)}
{'pin10': '2407405013.0', 'lon': np.float64(-87.7855892655), 'lat': np.float64(41.7114489742)}
{'pin10': '2411304035.0', 'lon': np.float64(-87.7195102974), 'lat': np.float64(41.7081476557)}
{'pin10': '2430104002.0', 'lon': np.float64(-87.7920314167), 'lat': np.float64(41.6742472442)}
{'pin10': '2501110047.0', 'lon': np.float64(-87.5843007459), 'lat': np.float64(41.7340715076)}
{'pin10': '2530208017.0', 'lon': np.float64(-87.6645803663), 'lat': np.float64(41.6745862495)}
{'pin10': '2605321032.0', 'lon': np.float64(-87.5368379369), 'lat': np.float64(41.7188519797)}
{'pin10': '2702314003.0', 'lon': np.float64(-87.8302325732), 'lat': np.float64(41.6354392607)}
{'pin10': '2927311037.0', 'lon': np.float64(-87.6106706879), 'lat': np.float64(41.5748236415)}
{'pin10': '3017122030.0', 'lon': np.float64(-87.533291002), 'lat': np.float64(41.6105499894)}
{'pin10': '3133309011.0', 'lon': np.float64(-87.745