## Get Taxi Data, Web Scraping

In [2]:
# Web scraping: get taxi data for a specific taxi ride or for a whole day
# https://data.cityofchicago.org/Transportation/Taxi-Trips-2024-/ajtu-isnz/about_data

In [3]:
# Module to send HTTP requests (like GET, POST, etc.) to fetch web pages or APIs
import requests


In [4]:
# Query the City of Chicago open data portal API for a single taxi trip,
# identified by its trip_id. The filter is passed via `params` so that
# requests builds and URL-encodes the query string for us.
url = "https://data.cityofchicago.org/resource/ajtu-isnz.json"
response = requests.get(
    url,
    params={"trip_id": "000001dd3888f74eaf131b0627d367e8c5b73cc4"},
    timeout=30,  # requests never times out by default; avoid hanging the kernel
)
# Fail loudly on an HTTP error instead of parsing an error payload as data
response.raise_for_status()
# Parse the JSON response into a Python list of dictionaries
data = response.json()
data


[{'trip_id': '000001dd3888f74eaf131b0627d367e8c5b73cc4',
  'taxi_id': 'd5fc0adc436b269c2ce07e37eff1af604a963c8538536bd6be39f0b399c0a6b14bafdaca01d34d939738ae4df46ddbeaf7b83aeb4506199bb645112e791e07e9',
  'trip_start_timestamp': '2024-02-26T14:30:00.000',
  'trip_end_timestamp': '2024-02-26T15:15:00.000',
  'trip_seconds': '2640',
  'trip_miles': '16.3',
  'pickup_community_area': '76',
  'dropoff_community_area': '5',
  'fare': '42.75',
  'tips': '12.05',
  'tolls': '0',
  'extras': '5',
  'trip_total': '59.8',
  'payment_type': 'Credit Card',
  'company': 'Taxi Affiliation Services',
  'pickup_centroid_latitude': '41.980264315',
  'pickup_centroid_longitude': '-87.913624596',
  'pickup_centroid_location': {'type': 'Point',
   'coordinates': [-87.913624596, 41.9802643146]},
  'dropoff_centroid_latitude': '41.947791586',
  'dropoff_centroid_longitude': '-87.683834942',
  'dropoff_centroid_location': {'type': 'Point',
   'coordinates': [-87.6838349425, 41.9477915865]}}]

In [5]:
# Send a GET request with filters and parse the JSON response.
# $where restricts the result to trips that started on October 31, 2024,
# between 00:00:00 and 23:59:59. $limit=30000 returns up to 30,000 records
# (because APIs often limit results unless you ask for more).
# The filters are passed via `params` so that requests URL-encodes the
# spaces, quotes and comparison operators in the $where clause.
url = "https://data.cityofchicago.org/resource/ajtu-isnz.json"
query_params = {
    "$where": (
        "trip_start_timestamp >= '2024-10-31T00:00:00' "
        "AND trip_start_timestamp <= '2024-10-31T23:59:59'"
    ),
    "$limit": 30000,
}
# requests never times out by default; avoid hanging the kernel
response = requests.get(url, params=query_params, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error payload as data
response.raise_for_status()
data = response.json()

len(data)


17992

In [6]:
# requests - used to send HTTP requests to get data from an API.
# datetime - handles date and time.
# relativedelta - allows more flexible date manipulations
#                 (e.g., subtracting months).
import requests
from datetime import datetime
from dateutil.relativedelta import relativedelta

# Calculate the date two months before today and format it as a string
# like '2025-02-20'. Note: despite its name, current_datetime holds the
# shifted date (two months ago), not the present moment.
current_datetime = datetime.now() - relativedelta(months=2)
formatted_datetime = current_datetime.strftime("%Y-%m-%d")

# Build the query, send the request and parse the response. The filters are
# passed via `params` so that requests URL-encodes the $where clause.
url = "https://data.cityofchicago.org/resource/ajtu-isnz.json"
query_params = {
    "$where": (
        f"trip_start_timestamp >= '{formatted_datetime}T00:00:00' "
        f"AND trip_start_timestamp <= '{formatted_datetime}T23:59:59'"
    ),
    "$limit": 30000,
}
# requests never times out by default; avoid hanging the kernel
response = requests.get(url, params=query_params, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error payload as data
response.raise_for_status()
data = response.json()


In [7]:
# Display a small sample of the result; the full list can hold up to
# 30,000 records, which is too much to dump into the cell output.
data[:2]


[{'trip_id': '4e39f4210373d96df0c2792554f2f4b483a46f39',
  'taxi_id': '477e2191cd2213e2db9413d810936b2f24a73c8e5270594c724f811dbe4cfea5b58519d789ac8fb30d3ac6ee375721019e2e793a889be528f59327e0cc32ea94',
  'trip_start_timestamp': '2025-02-23T23:45:00.000',
  'trip_end_timestamp': '2025-02-24T00:00:00.000',
  'trip_seconds': '830',
  'trip_miles': '0',
  'dropoff_community_area': '33',
  'fare': '13.25',
  'tips': '0',
  'tolls': '0',
  'extras': '0',
  'trip_total': '15.75',
  'payment_type': 'Cash',
  'company': 'Tac - Yellow Cab Association',
  'dropoff_centroid_latitude': '41.857183858',
  'dropoff_centroid_longitude': '-87.620334624',
  'dropoff_centroid_location': {'type': 'Point',
   'coordinates': [-87.6203346241, 41.8571838585]}},
 {'trip_id': '51d44b4e2e3cf13652298f7266c8420ac753e0c0',
  'taxi_id': 'd4e172502a18df80190c19176a06c0d9344f71abdbe521e964ad99caae1c73860f3f1f7cc96e1dcac410fbe9aba385af639ef03c028b0453429bbfb67e78010b',
  'trip_start_timestamp': '2025-02-23T23:45:00.000'

In [8]:
# Formulate this as a standalone function
from typing import Any, Dict, List


def get_taxi_data(
    formatted_datetime: str,
    limit: int = 30000,
    timeout: float = 30.0,
) -> List[Dict[str, Any]]:
    """Fetch the taxi trips that started on a given day.

    Builds the query for the City of Chicago taxi trips endpoint, sends
    the GET request and parses the JSON response.

    Args:
        formatted_datetime (str):
            Date to retrieve taxi data for, as a string like '2025-02-20'.
        limit (int):
            Maximum number of records to request (sent as ``$limit``).
            Defaults to 30000.
        timeout (float):
            Seconds to wait for the server before giving up. Defaults
            to 30.0.

    Returns:
        List[Dict[str, Any]]:
            Parsed JSON content of the response: a list with one
            dictionary per trip.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    taxi_url = "https://data.cityofchicago.org/resource/ajtu-isnz.json"
    # Pass the filters via `params` so that requests URL-encodes the spaces,
    # quotes and comparison operators of the $where clause.
    query_params = {
        "$where": (
            f"trip_start_timestamp >= '{formatted_datetime}T00:00:00' "
            f"AND trip_start_timestamp <= '{formatted_datetime}T23:59:59'"
        ),
        "$limit": limit,
    }
    # requests never times out by default; an explicit timeout keeps a
    # stalled connection from blocking the caller forever.
    response = requests.get(taxi_url, params=query_params, timeout=timeout)
    # Fail loudly on an HTTP error instead of returning an error payload
    # as if it were trip data.
    response.raise_for_status()
    taxi_data = response.json()
    return taxi_data
