In [None]:
import requests
import pandas as pd
from paths import RAW_DATA_DIR, TRANSFORMED_DATA_DIR
from typing import Optional, List, Tuple


def fetch_chicago_data(year=None, month=None, day=None):
    """
    Fetches Chicago data from the API with an option to filter by year, month, or day.
    """
    BASE_URL = "https://data.cityofchicago.org/resource/ajtu-isnz.json"
    LIMIT = 50000  # Maximum rows per request
    OFFSET = 0
    write_header = True  # Ensures the header is written only once
    
    # Build date filter based on provided parameters
    filters = []
    if year:
        start_date = f"{year}-01-01"
        end_date = f"{year}-12-31"
        if month:
            start_date = f"{year}-{month:02d}-01"
            if month == 12:
                end_date = f"{year}-12-31"
            else:
                end_date = f"{year}-{month+1:02d}-01"
        if day:
            start_date = f"{year}-{month:02d}-{day:02d}"
            end_date = start_date  # Only fetch this specific day
        filters.append(f"trip_start_timestamp >= '{start_date}T00:00:00.000'")
        filters.append(f"trip_start_timestamp <= '{end_date}T23:59:59.999'")
    where_clause = " AND ".join(filters) if filters else None
    
    # Generate dynamic file name based on filters
    file_name = "rides"
    if year:
        file_name += f"_{year}"
        if month:
            file_name += f"_{month:02d}"
            if day:
                file_name += f"_{day:02d}"
    file_name += ".csv"
    
    all_columns = None
    
    while True:
        print(f"🔍 Fetching records from {OFFSET} to {OFFSET + LIMIT}...")
        
        params = {"$limit": LIMIT, "$offset": OFFSET}
        if where_clause:
            params["$where"] = where_clause  # Apply date filter
        
        response = requests.get(BASE_URL, params=params)
        
        if response.status_code != 200:
            print(f"❌ Error {response.status_code}: {response.text}")
            break
        
        batch = response.json()
        
        if not batch:
            print("✅ All available records within the date range have been downloaded.")
            break
        
        df_batch = pd.DataFrame(batch)
        
        if all_columns is None:
            all_columns = df_batch.columns.tolist()
        
        df_batch = df_batch.reindex(columns=all_columns, fill_value="")
        df_batch = df_batch.astype(str)
        
        df_batch.to_csv(RAW_DATA_DIR / file_name, mode='a', index=False, header=write_header, sep=',')
        print(f"📂 {OFFSET + len(df_batch)} records stored in {file_name}.")
        write_header = False  # Ensure only the first batch writes the header
        if len(batch) < LIMIT:
            break  # If fewer records than the limit, stop fetching
        
        OFFSET += LIMIT
    
    print(f"📊 Download complete. Data saved in '{file_name}'.")

🔍 Descargando registros desde 0 hasta 50000...
📂 22318 registros almacenados en chicago_data_filtered.csv.
📊 Descarga completa. Datos guardados en 'chicago_data_filtered.csv'.


In [3]:
df = pd.read_csv('chicago_data_progresivo.csv')
df

  df = pd.read_csv('chicago_data_progresivo.csv')


Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,...,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,pickup_census_tract,dropoff_census_tract
0,7aa6b85d7fe272c88fa0e7a53218a51229abc2c0,074ebefb524b3c9c38e7f04026cb045b9536f14db29362...,2025-02-01T00:00:00.000,2025-02-01T00:00:00.000,537.0,3.88,8.0,6.0,12.50,2.6,...,Mobile,City Service,41.899602,-87.633308,"{'type': 'Point', 'coordinates': [-87.63330803...",41.944227,-87.655998,"{'type': 'Point', 'coordinates': [-87.65599818...",,
1,fe0c5c8735c9a7d872673e924a76a8824b626084,00e34f77f4495a6128d5b3312099c8f2f4ea25c2650e83...,2025-02-01T00:00:00.000,2025-02-01T00:15:00.000,1020.0,13.20,28.0,,33.50,0.0,...,Unknown,Taxi Affiliation Services,41.874005,-87.663518,"{'type': 'Point', 'coordinates': [-87.66351754...",,,,,
2,7d2a6a8dd2b459a4a67b1ec89f85991928c76c73,8307cf9433f0293eee99c6944aeab484521d9cd9b1fce5...,2025-02-01T00:00:00.000,2025-02-01T00:00:00.000,5.0,0.00,21.0,22.0,25.00,5.1,...,Credit Card,City Service,41.938666,-87.711211,"{'type': 'Point', 'coordinates': [-87.71121059...",41.922761,-87.699155,"{'type': 'Point', 'coordinates': [-87.69915534...",,
3,699178e9b3f493f778aa2b88e9aa40a74e7cb6cc,8ef1056519939d511d24008e394f83e925d2539d668a00...,2025-02-01T00:00:00.000,2025-02-01T00:15:00.000,512.0,0.85,8.0,28.0,6.71,0.0,...,Mobile,5 Star Taxi,41.899602,-87.633308,"{'type': 'Point', 'coordinates': [-87.63330803...",41.874005,-87.663518,"{'type': 'Point', 'coordinates': [-87.66351754...",,
4,000fe0c99a77703932cb66b52837663a08dd5a9d,ff60dabe17243a25435dcaf430a8b31615374bfc6be841...,2025-02-01T00:00:00.000,2025-02-01T00:00:00.000,367.0,2.49,8.0,7.0,9.25,1.5,...,Credit Card,Flash Cab,41.899602,-87.633308,"{'type': 'Point', 'coordinates': [-87.63330803...",41.922686,-87.649489,"{'type': 'Point', 'coordinates': [-87.64948872...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6905283,e44504084381143fd8a7e72a1f770cc4d9066a83,50fcee6711df1d794e4f337c99f44abe8109795ec69474...,2024-01-01T00:00:00.000,2024-01-01T00:15:00.000,648.0,1.49,,,8.00,28.0,...,1,12.25,Credit Card,Medallion Leasin,41.899602111,-87.633308,"{'type': 'Point', 'coordinates': [-87.63330803...",41.874005383,-87.663518,"{'type': 'Point', 'coordinates': [-87.66351754..."
6905284,e925f509aa7d6b6c34df3a2ab35f51fcb3b1c4bb,b41adca9ab700712805f97bb45aa12c1163013bc7c842c...,2024-01-01T00:00:00.000,2024-01-01T00:00:00.000,19.0,0.00,,,32.00,32.0,...,26.5,30.25,Credit Card,Flash Cab,41.878865584,-87.625192,"{'type': 'Point', 'coordinates': [-87.62519214...",41.878865584,-87.625192,"{'type': 'Point', 'coordinates': [-87.62519214..."
6905285,aa65afad59435bc4e81358e393295cb0dc487600,fb0ce19e30e712c77c57cfdb6ef729c2d2ad73225d9ec3...,2024-01-01T00:00:00.000,2024-01-01T00:15:00.000,929.0,7.03,,,8.00,77.0,...,1.5,25.8,Credit Card,Flash Cab,41.899602111,-87.633308,"{'type': 'Point', 'coordinates': [-87.63330803...",41.9867118,-87.663416,"{'type': 'Point', 'coordinates': [-87.66341640..."
6905286,67cbf4af40b12db55b3a3e4efa09f358288c0cf4,57c40509cae37a0e5e536a657cdb7f8c6824314bc466a7...,2024-01-01T00:00:00.000,2024-01-01T00:00:00.000,0.0,0.00,,,7.00,7.0,...,7,10.25,Cash,Taxi Affiliation Services,41.922686284,-87.649489,"{'type': 'Point', 'coordinates': [-87.64948872...",41.922686284,-87.649489,"{'type': 'Point', 'coordinates': [-87.64948872..."


In [2]:
import pandas as pd
df = pd.read_csv('taxi_trips.csv')

In [9]:
df.head()

Unnamed: 0,Trip ID,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Pickup Community Area,Dropoff Community Area,...,Extras,Trip Total,Payment Type,Company,Pickup Centroid Latitude,Pickup Centroid Longitude,Pickup Centroid Location,Dropoff Centroid Latitude,Dropoff Centroid Longitude,Dropoff Centroid Location
0,0000184e7cd53cee95af32eba49c44e4d20adcd8,f538e6b729d1aaad4230e9dcd9dc2fd9a168826ddadbd6...,01/19/2024 05:00:00 PM,01/19/2024 06:00:00 PM,4051.0,17.12,17031980000.0,17031320000.0,76.0,32.0,...,4.0,60.0,Credit Card,Flash Cab,41.979071,-87.90304,POINT (-87.9030396611 41.9790708201),41.884987,-87.620993,POINT (-87.6209929134 41.8849871918)
1,000072ee076c9038868e239ca54185eb43959db0,e51e2c30caec952b40b8329a68b498e18ce8a1f40fa75c...,01/28/2024 02:30:00 PM,01/28/2024 03:00:00 PM,1749.0,12.7,,,6.0,,...,0.0,33.75,Cash,Flash Cab,41.944227,-87.655998,POINT (-87.6559981815 41.9442266014),,,
2,000074019d598c2b1d6e77fbae79e40b0461a2fc,aeb280ef3be3e27e081eb6e76027615b0d40925b84d3eb...,01/05/2024 09:00:00 AM,01/05/2024 09:00:00 AM,517.0,3.39,,,6.0,8.0,...,1.0,14.69,Mobile,Taxicab Insurance Agency Llc,41.944227,-87.655998,POINT (-87.6559981815 41.9442266014),41.899602,-87.633308,POINT (-87.6333080367 41.899602111)
3,00007572c5f92e2ff067e6f838a5ad74e83665d3,7d21c2ca227db8f27dda96612bfe5520ab408fa9a462c8...,01/22/2024 08:45:00 AM,01/22/2024 09:30:00 AM,2050.0,15.06,,,76.0,,...,5.5,56.56,Credit Card,Globe Taxi,41.980264,-87.913625,POINT (-87.913624596 41.9802643146),,,
4,00007c3e7546e2c7d15168586943a9c22c3856cf,8ef1056519939d511d24008e394f83e925d2539d668a00...,01/18/2024 07:15:00 PM,01/18/2024 07:30:00 PM,1004.0,1.18,17031840000.0,17031840000.0,32.0,32.0,...,0.0,19.66,Mobile,5 Star Taxi,41.880994,-87.632746,POINT (-87.6327464887 41.8809944707),41.880994,-87.632746,POINT (-87.6327464887 41.8809944707)


In [11]:
rides = df.copy()[['Trip Start Timestamp', 'Pickup Centroid Latitude', 'Pickup Centroid Longitude']]
rides.rename(columns={'Trip Start Timestamp':'pickup_datetime', 'Pickup Centroid Latitude':'pickup_latitude', 'Pickup Centroid Longitude':'pickup_longitude'}, inplace=True)
rides

Unnamed: 0,pickup_datetime,pickup_latitude,pickup_longitude
0,01/19/2024 05:00:00 PM,41.979071,-87.903040
1,01/28/2024 02:30:00 PM,41.944227,-87.655998
2,01/05/2024 09:00:00 AM,41.944227,-87.655998
3,01/22/2024 08:45:00 AM,41.980264,-87.913625
4,01/18/2024 07:15:00 PM,41.880994,-87.632746
...,...,...,...
6905283,12/31/2024 11:30:00 AM,41.899602,-87.633308
6905284,12/31/2024 03:15:00 PM,41.954028,-87.763399
6905285,12/31/2024 10:45:00 AM,,
6905286,12/31/2024 02:00:00 PM,41.979071,-87.903040


In [15]:
rides.sort_values(by='pickup_datetime')

Unnamed: 0,pickup_datetime,pickup_latitude,pickup_longitude
301950,01/01/2024 01:00:00 AM,41.980264,-87.913625
339417,01/01/2024 01:00:00 AM,41.965812,-87.655879
148688,01/01/2024 01:00:00 AM,41.944227,-87.655998
144493,01/01/2024 01:00:00 AM,41.892042,-87.631864
81893,01/01/2024 01:00:00 AM,41.899602,-87.633308
...,...,...,...
6662917,12/31/2024 12:45:00 PM,41.899602,-87.633308
6561591,12/31/2024 12:45:00 PM,41.953582,-87.723452
6888469,12/31/2024 12:45:00 PM,41.878866,-87.625192
6824025,12/31/2024 12:45:00 PM,41.899602,-87.633308


In [17]:
rides['pickup_datetime'].max(), rides['pickup_datetime'].min()

('12/31/2024 12:45:00 PM', '01/01/2024 01:00:00 AM')

In [18]:
rides.to_csv('../data/transformed/validated_rides_2024.csv')