# 6. Data Manipulation IV - Combining Data - Quiz

## 6.0. Import `polars` and Load Data

In [6]:
import polars as pl

In [7]:
# Map the source file's mixed-case column names to snake_case so that the
# ride columns follow one naming convention for the rest of the notebook.
yellow_rides_column_rename_mapping = {
    "VendorID": "vendor_id",
    "RatecodeID": "ratecode_id",
    "PULocationID": "pu_location_id",
    "DOLocationID": "do_location_id",
    "Airport_fee": "airport_fee",
}
# Load the March 2024 yellow-taxi rides and apply the mapping defined above.
# The mapping must be referenced by its full name: a bare
# `column_rename_mapping` is never assigned before this cell and would raise
# a NameError on a fresh kernel.
march_yellow_rides_df = (
    pl.read_parquet("../data/yellow_tripdata_2024-03.parquet")
    .rename(yellow_rides_column_rename_mapping)
)

In [9]:
# Snake-case replacements for the capitalised columns of the zone lookup file.
zones_column_rename_mapping = {
    "LocationID": "location_id",
    "Borough": "borough",
    "Zone": "zone",
}
# Load the taxi zone lookup table and normalise its column names.
zones_df = pl.read_parquet("../data/taxi_zone_lookup.parquet").rename(
    zones_column_rename_mapping
)

## 6.1. Question 1

Using the `zones_df` combined with the `march_yellow_rides_df`, which `pu_zone` `do_zone` pair had the most rides?

In [35]:
# Question 1: the (pu_zone, do_zone) pair with the most rides.
result = (
    march_yellow_rides_df
    # Attach pick-up zone details. `select` + `name.prefix` yields ONLY the
    # prefixed columns (pu_location_id, pu_zone, ...), so the join key lines
    # up with the rides' `pu_location_id` and no unprefixed columns leak in.
    .join(
        zones_df.select(pl.all().name.prefix("pu_")),
        on="pu_location_id",
    )
    # Attach drop-off zone details the same way. `select` (rather than
    # `with_columns`) is used so the unprefixed `location_id` / `borough` /
    # `zone` columns are not carried along next to their `do_` copies.
    .join(
        zones_df.select(pl.all().name.prefix("do_")),
        on="do_location_id",
    )
    #### YOUR CODE HERE
    # Count rides per pick-up/drop-off zone pair and keep the busiest one.
    .group_by(["pu_zone", "do_zone"])
    .agg(pl.len().alias("num_rides"))
    .sort("num_rides", descending=True)
    .head(1)
)
print(result)

shape: (1, 3)
┌───────────────────────┬───────────────────────┬───────────┐
│ pu_zone               ┆ do_zone               ┆ num_rides │
│ ---                   ┆ ---                   ┆ ---       │
│ str                   ┆ str                   ┆ u32       │
╞═══════════════════════╪═══════════════════════╪═══════════╡
│ Upper East Side South ┆ Upper East Side North ┆ 21477     │
└───────────────────────┴───────────────────────┴───────────┘


1. (Upper East Side South, Upper East Side North) - correct
2. (Erasmus, Astoria) - incorrect - make sure to get the `pu_zone` `do_zone` pair with the most rides, not the least.
3. (Upper East Side North, Upper East Side South) - incorrect - make sure to get the combination with the most rides, not the second most rides.
4. (Midtown Center, Upper East Side North) - incorrect - make sure to aggregate by the right column.

## 6.2. Question 2

Find the average `passenger_count` of the rides that started in the zone "Midtown Center" and ended in the zone "Upper East Side North". What was the average `passenger_count` for those rides?

In [37]:
# Question 2: mean passenger_count for rides from "Midtown Center" to
# "Upper East Side North".
result = (
    march_yellow_rides_df
    #### YOUR CODE HERE
    # Attach pick-up zone details; `select` + `name.prefix` yields only the
    # `pu_`-prefixed columns, matching the rides' `pu_location_id` key.
    .join(
        zones_df.select(pl.all().name.prefix("pu_")),
        on="pu_location_id",
    )
    # Attach drop-off zone details the same way. `select` (rather than
    # `with_columns`) avoids dragging the unprefixed zone columns through
    # the join alongside their `do_` copies.
    .join(
        zones_df.select(pl.all().name.prefix("do_")),
        on="do_location_id",
    )
    # Keep only rides on the requested pick-up -> drop-off route.
    .filter(
        pl.col("pu_zone").eq("Midtown Center")
        .and_(pl.col("do_zone").eq("Upper East Side North"))
    )
    # Reduce to a single value: the mean passenger count on that route.
    .select(pl.col("passenger_count").mean())
)
print(result)

shape: (1, 1)
┌─────────────────┐
│ passenger_count │
│ ---             │
│ f64             │
╞═════════════════╡
│ 1.277752        │
└─────────────────┘


1. 13.526644 - incorrect - you might be measuring average `fare_amount` instead of average `passenger_count`...
2. 1.932876 - incorrect - you might be measuring average `trip_distance` instead of average `passenger_count`...
3. 1.277752 - correct
4. 21.211721 - incorrect - you might be measuring average `total_amount` instead of average `passenger_count`...

## 6.3. Question 3