In [None]:
import requests
import pandas as pd
import os
import json
from dotenv import load_dotenv

# Load environment variables from a local .env file so that os.getenv() below
# can pick up SODA_APP_TOKEN when it is defined there.
load_dotenv()

# Directory (relative to this notebook) where the raw downloads are written.
DATA_DIR = '../data/raw/'

# Sales data
sales_filename = 'sales_data_raw.csv'
# Row cap sent as the SODA `$limit` query parameter. Kept as a named constant
# so it is easy to lower for a quick test run or raise if the dataset grows.
# NOTE(review): rows beyond this cap are presumably not returned - confirm the
# dataset's current row count stays below it.
SALES_ROW_LIMIT = 3_000_000
sales_url = f'https://datacatalog.cookcountyil.gov/resource/wvhk-k5uv.csv?$limit={SALES_ROW_LIMIT}'

# CTA stations data
cta_filename = 'cta_l_stops.geojson'
cta_url = 'https://data.cityofchicago.org/api/v3/views/3tzw-cg4m/query.geojson'

# App token for the CTA request; None when the variable is not set.
SODA_APP_TOKEN = os.getenv("SODA_APP_TOKEN")

# Data Acquisition Notebook

This notebook downloads the raw datasets used in this project and saves them under `../data/raw/`:
1. Cook County Assessor - Parcel Sales (CSV)
2. CTA Rail Stations (GeoJSON)

In [None]:
# Download Cook County Sales Data
print("Downloading Cook County Sales data...")

# A fresh checkout may not contain the raw-data directory yet.
os.makedirs(DATA_DIR, exist_ok=True)

sales_filepath = os.path.join(DATA_DIR, sales_filename)
# Stream into a temporary file first so an interrupted transfer never leaves a
# truncated CSV at the final path for later cells to load by mistake.
sales_tmp_filepath = sales_filepath + '.part'

try:
    # timeout=(connect, read) in seconds: without it a stalled server would
    # block this cell indefinitely. The context manager releases the
    # connection even if writing fails part-way through.
    with requests.get(sales_url, stream=True, timeout=(10, 300)) as response:
        response.raise_for_status()

        with open(sales_tmp_filepath, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

    # Only publish the file once the whole body has been written.
    os.replace(sales_tmp_filepath, sales_filepath)
finally:
    # After a successful os.replace the temp file is gone; on failure remove
    # the partial download.
    if os.path.exists(sales_tmp_filepath):
        os.remove(sales_tmp_filepath)

print(f"Sales data downloaded successfully to {sales_filepath}")

In [None]:
# Download CTA Stations GeoJSON Data
print("Downloading CTA Stations GeoJSON data...")

# A fresh checkout may not contain the raw-data directory yet.
os.makedirs(DATA_DIR, exist_ok=True)

cta_filepath = os.path.join(DATA_DIR, cta_filename)

try:
    # SODA3 requires the App Token in the header for identification.
    # Only send the header when a token is actually configured, instead of
    # passing a None value.
    headers = {'X-App-Token': SODA_APP_TOKEN} if SODA_APP_TOKEN else {}

    # timeout=(connect, read) in seconds so a stalled server cannot hang the cell.
    response = requests.get(cta_url, headers=headers, timeout=(10, 120))
    response.raise_for_status()
    geojson_data = response.json()

    with open(cta_filepath, 'w', encoding='utf-8') as file:
        json.dump(geojson_data, file, indent=4)

    print(f"CTA data downloaded successfully to {cta_filepath}")
# The decode error must be handled before RequestException: in recent requests
# releases the error raised by response.json() is also a RequestException, so
# the more specific message would otherwise never be printed.
except json.JSONDecodeError:
    print("Error: Failed to decode JSON from CTA data response.")
except requests.exceptions.RequestException as e:
    print(f"Error downloading CTA data: {e}")

## Verify CTA Stations Data

In [None]:
# Load and preview CTA stations data
# Read with the same explicit UTF-8 encoding the download cell uses for writing,
# rather than the platform default.
with open(os.path.join(DATA_DIR, cta_filename), 'r', encoding='utf-8') as f:
    cta_data = json.load(f)

# Fail with a descriptive message when the file holds no features, instead of
# an opaque KeyError / IndexError from the preview below.
cta_features = cta_data.get('features') if isinstance(cta_data, dict) else None
if not cta_features:
    raise ValueError(
        f"No GeoJSON features found in {os.path.join(DATA_DIR, cta_filename)}; "
        "re-run the CTA download cell and check its output."
    )

print(f"Number of CTA stations: {len(cta_features)}")
print("\nFirst station:")
print(json.dumps(cta_features[0], indent=2))

In [None]:
# Read the raw sales CSV back from disk, then report its dimensions and show
# the leading rows as the cell output.
df_sales = pd.read_csv(os.path.join(DATA_DIR, sales_filename))
print("Sales data shape:", df_sales.shape)
print()
print("First few rows:")
df_sales.head()

## Verify Sales Data

In [None]:
# Count the null entries in each column of the sales frame; the resulting
# Series is displayed as the cell output.
print("Missing values per column:")
df_sales.isna().sum()