In [None]:
# Setup
from pathlib import Path
import os

# Locate the project root. When the working directory path mentions
# "homework", the root is taken to be three levels above it; otherwise the
# working directory itself is used.
_cwd = Path.cwd()
if "homework" in str(_cwd):
    PROJECT_ROOT = _cwd.resolve().parents[2]
else:
    PROJECT_ROOT = _cwd.resolve()

# Standard project layout, all anchored at PROJECT_ROOT.
_data_dir      = PROJECT_ROOT / "data"
DATA_RAW       = _data_dir / "raw"
DATA_INTERIM   = _data_dir / "interim"
DATA_PROCESSED = _data_dir / "processed"
MODELS_DIR     = PROJECT_ROOT / "models"
REPORTS_DIR    = PROJECT_ROOT / "docs" / "reports"
SRC_DIR        = PROJECT_ROOT / "src"

# Create every directory up front so later cells can write without checking.
for directory in (DATA_RAW, DATA_INTERIM, DATA_PROCESSED, MODELS_DIR, REPORTS_DIR, SRC_DIR):
    directory.mkdir(parents=True, exist_ok=True)

# Minimum package versions this notebook was written against
# (informational only; nothing in this cell installs them).
REQ = [
    "pandas>=2.0.0",
    "polars>=0.20.0",
    "pyarrow>=15.0.0",
    "duckdb>=1.0.0",
    "scikit-learn>=1.4.0",
    "matplotlib>=3.8.0",
    "mlflow>=2.16.0",
]

print("Project root:", PROJECT_ROOT)
print("Data paths ready.")

Project root: /Users/liuphoebe/Desktop/bootcamp4/bootcamp_Kexu_Liu/project/docs
Data paths ready.


In [None]:
# Ingestion
import requests
import pandas as pd
from pathlib import Path

# Location of the cached raw download, relative to the working directory.
RAW_PARQ = Path("data/raw/nycdot_speeds.parquet")
def fetch_nycdot_speeds():
    """Download speed records and cache them as JSON Lines and Parquet.

    Requests up to 50,000 rows (the ``$limit`` in the URL) from the
    data.cityofnewyork.us endpoint, writes one JSON object per line to
    ``data/raw/nycdot_speeds.json``, then re-reads that file with pandas and
    stores the resulting frame at ``RAW_PARQ``.

    If the server answers with anything other than HTTP 200, a failure
    message is printed and no file is written.

    Returns:
        None. The function is used for its side effects (files on disk).

    Raises:
        requests.exceptions.RequestException: if the connection fails or the
            request exceeds the timeout.
    """
    # Local import: the stdlib encoder replaces the private pandas helper
    # (pd.io.json.dumps), which is not a stable public API.
    import json

    url = "https://data.cityofnewyork.us/resource/i4gi-tjb9.json?$limit=50000"
    save_path = "data/raw/nycdot_speeds.json"

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=60)
    if response.status_code != 200:
        print("‚ùå Failed to fetch data from API")
        return

    all_rows = response.json()

    # Both targets are relative to the working directory, which is not
    # guaranteed to contain data/raw yet.
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)
    RAW_PARQ.parent.mkdir(parents=True, exist_ok=True)

    # JSON Lines: one record per line, so pandas can read it with lines=True.
    with open(save_path, "w", encoding="utf-8") as f:
        for row in all_rows:
            f.write(json.dumps(row) + "\n")
    print(f"üíæ Saved {len(all_rows)} rows to {save_path}")

    frame = pd.read_json(save_path, lines=True)
    frame.to_parquet(RAW_PARQ, index=False)
    print(f"‚úÖ Converted and saved to {RAW_PARQ}")
# Reuse the cached parquet file when it exists; only call the API otherwise.
if RAW_PARQ.exists():
    print("‚úÖ Found parquet:", RAW_PARQ)
else:
    fetch_nycdot_speeds()

In [None]:
# data cleaning
# Load the cached raw data, then in one pass: drop exact duplicate rows,
# drop rows missing either key field, and normalise the two key dtypes.
# NOTE(review): assumes the feed exposes `speed` and `segmentid` columns —
# confirm against the API schema.
df = (
    pd.read_parquet(RAW_PARQ)
    .drop_duplicates()
    .dropna(subset=['speed', 'segmentid'])
    .assign(
        speed=lambda frame: frame['speed'].astype(float),
        segmentid=lambda frame: frame['segmentid'].astype(str),
    )
)

print(f"üßπ Cleaned data shape: {df.shape}")
df.head()

In [None]:
# EDA (Exploratory Data Analysis)
import matplotlib.pyplot as plt
import seaborn as sns

# Summary statistics. A bare expression in the middle of a cell is evaluated
# and discarded (only a cell's last expression is echoed), so the table is
# passed to display() explicitly. `display` is provided by the Jupyter/IPython
# kernel without an import.
summary_stats = df.describe()
display(summary_stats)

# Histogram of the speed column with a KDE overlay, drawn on an explicit Axes
# so the figure does not depend on pyplot's implicit "current figure" state.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['speed'], bins=50, kde=True, ax=ax)
ax.set_title("Distribution of Traffic Speed")
ax.set_xlabel("Speed (mph)")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Feature Engineering
# Derive hour-of-day and day-of-week features from the `datetime` column.
if 'datetime' in df.columns:
    df['datetime'] = pd.to_datetime(df['datetime'])
    df['hour'] = df['datetime'].dt.hour
    df['weekday'] = df['datetime'].dt.weekday  # Monday=0 ... Sunday=6
else:
    # Skipping silently would leave later cells to fail with a KeyError on
    # 'hour'; say so here, where the cause is visible.
    # NOTE(review): the feed may expose its timestamp under another column
    # name — confirm against the API schema.
    print("Warning: no 'datetime' column found; 'hour' and 'weekday' were not created.")
    print("Available columns:", list(df.columns))

# Report only the feature columns that actually exist in the frame.
print("üß™ Feature columns:", [col for col in ['hour', 'weekday'] if col in df.columns])

In [None]:
# Save Processed Data
# Destination for the cleaned, feature-enriched frame. PROCESSED_PARQ was
# referenced here without being defined anywhere in the notebook, which is a
# NameError on a fresh kernel.
# NOTE(review): the file name mirrors RAW_PARQ — adjust if the project expects
# a different name.
PROCESSED_PARQ = DATA_PROCESSED / "nycdot_speeds.parquet"
PROCESSED_PARQ.parent.mkdir(parents=True, exist_ok=True)

df.to_parquet(PROCESSED_PARQ, index=False)
print(f"‚úÖ Saved processed data to {PROCESSED_PARQ}")

In [None]:
# Simple Modeling
# Mean speed for each hour of the day (requires the `hour` feature column).
hourly_speed = df['speed'].groupby(df['hour']).mean()

# Line chart of the hourly averages on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.lineplot(x=hourly_speed.index, y=hourly_speed.values, ax=ax)
ax.set(
    title="Average Speed by Hour of Day",
    xlabel="Hour",
    ylabel="Average Speed (mph)",
)
plt.show()

# Reflection
### 💡 Hypothesis
Traffic speed is significantly lower during rush hours (8–10am, 5–7pm).

### 🧠 Reflection
- ✅ The API provided sufficient granularity to capture temporal patterns.
- ⚠️ Some segments had missing or zero speeds; these require domain knowledge to filter.
- ✅ Parquet format helped optimize storage and downstream processing speed.