In [2]:
import pandas as pd
import numpy as np
import polars as pl


In [40]:
df = pd.read_csv('data/nyc_taxi_2019-01.csv', 
                 usecols=['passenger_count', 'trip_distance', 'total_amount'])

In [3]:
df = pl.read_csv('data/nyc_taxi_2019-01.csv', 
                 columns=['passenger_count', 'trip_distance', 'total_amount'])

In [41]:
%%timeit
# the average cost of the 20 longest (in distance) taxirides in January 2019 (descending)
(
    df
    .sort_values(by=["trip_distance"], ascending=False)[:20]["total_amount"]
    .mean()
)

748 ms ± 12.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
%%timeit
(
    df
    .sort(by=["trip_distance"], descending=True)[:20]
    .select(pl.col("total_amount"))
    .mean()
)

178 ms ± 1.76 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [42]:
%%timeit
# the average cost of the 20 longest (in distance) taxirides in January 2019 (ascending)
(
    df
    .sort_values(by=["trip_distance"])[-20:]["total_amount"]
    .mean()
)

719 ms ± 14.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
%%timeit
(
    df
    .sort(by=["trip_distance"], descending=False)[-20:]
    .select(pl.col("total_amount"))
    .mean()
)

173 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [43]:
%%timeit
# Sort by ascending passenger count and descending trip distance
# Average price paid for the top 50 rides

(
    df
    .sort_values(
        by=["passenger_count", "trip_distance"], 
        ascending=[True, False])[:50]["total_amount"]
    .mean()
)

344 ms ± 14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
%%timeit
(
    df
    .sort(by=["passenger_count", "trip_distance"], descending=[False, True])[:50]
    .select(pl.col("total_amount"))
    .mean()
)

257 ms ± 4.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [44]:
%%timeit
# In which five rides did people pay the most per mile? 
# How far did people go on those trips?
(
    df
    .replace(0, np.nan)
    .dropna()
    .assign(cost_per_mile=df["total_amount"]/df["trip_distance"])
    .sort_values(by="cost_per_mile", ascending=False)[:5]
    [["trip_distance", "cost_per_mile"]]
)

1.17 s ± 27.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
%%timeit
(
    df.filter((pl.col("total_amount") != 0) & (pl.col("trip_distance") != 0))
    .with_columns(
        (pl.col("total_amount")/pl.col("trip_distance")).alias("cost_per_mile"))
    .sort(by="cost_per_mile", descending=True)[:5]
    .select("trip_distance", "cost_per_mile")
)

197 ms ± 4.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [46]:
%%timeit
# Assume that multipassenger rides are split evenly among the passengers.
# In which 10 multipassenger rides did each individual pay the greatest amount?

(
    df[(df['total_amount'] != 0) & (df['passenger_count'] != 0)]
    .assign(cost_per_passenger=df["total_amount"]/df["passenger_count"])
    .sort_values(by="cost_per_passenger", ascending=False)[:10]
)

1.01 s ± 24.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%%timeit
(
    df.filter((pl.col("total_amount") != 0) & (pl.col("passenger_count") != 0))
    .with_columns(
        (pl.col("total_amount")/pl.col("passenger_count")).alias("cost_per_passenger"))
    .sort(by="cost_per_passenger", descending=True)[:10]
)

182 ms ± 5.29 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [47]:
df=None

In [38]:
import pyarrow as pa
import pyarrow.csv
import pyarrow.compute

In [None]:
co = pa.csv.ConvertOptions(include_columns=['passenger_count', 'trip_distance', 'total_amount'])
df = pa.csv.read_csv('data/nyc_taxi_2019-01.csv', convert_options=co)


In [None]:
%%timeit
pa.compute.mean(
    pa.compute.take(
    df,
    pa.compute.sort_indices(df, sort_keys=[("trip_distance", "descending")])
    )["total_amount"][:20]
)

1.14 s ± 27.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Part 2

In [8]:
df = pd.read_csv('data/nyc_taxi_2019-01.csv', 
                 usecols=['passenger_count', 'trip_distance', 'total_amount'])

In [25]:
df = pl.read_csv('data/nyc_taxi_2019-01.csv', 
                 columns=['passenger_count', 'trip_distance', 'total_amount'])

In [39]:
co = pa.csv.ConvertOptions(include_columns=['passenger_count', 'trip_distance', 'total_amount'])
df = pa.csv.read_csv('data/nyc_taxi_2019-01.csv', convert_options=co)

In [22]:
%%timeit
# For each number of passengers, find the mean cost of a taxi ride. Sort this
# result from lowest (i.e., cheapest) to highest (i.e., most expensive)

(
    df.groupby("passenger_count")
    .agg(mean_cost=("total_amount", "mean"))
    .sort_values(by="mean_cost", ascending=True)
)

88.1 ms ± 153 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%timeit
(
    df.group_by(pl.col("passenger_count"))
    .agg(pl.col("total_amount").mean().alias("mean_cost"))
    .sort(by="mean_cost")
)

14.9 ms ± 631 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [46]:
%%timeit
(
    df.group_by("passenger_count")
    .aggregate([("total_amount", "mean")])
    .sort_by([("total_amount_mean", "descending")])
)

8.39 ms ± 31.5 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [23]:
%%timeit
# Sort again by increasing the number of passengers
(
    df.groupby("passenger_count")
    .agg(mean_cost=("total_amount", "mean"))
    .sort_values(by=["passenger_count"], ascending=True)
)

88.9 ms ± 1.13 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%timeit
(
    df.group_by(pl.col("passenger_count"))
    .agg(pl.col("total_amount").mean().alias("mean_cost"))
    .sort(by="passenger_count")
)

14.4 ms ± 311 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [47]:
%%timeit
(
    df.group_by("passenger_count")
    .aggregate([("total_amount", "mean")])
    .sort_by([("passenger_count", "descending")])
)

8.67 ms ± 386 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
%%timeit
# Create a new column, trip_distance_group in which the values are short (< 2
# miles), medium ( 2 miles and 10 miles), and long (> 10 miles). What is the
# average number of passengers per trip length category? Sort this result from
# highest (most passengers) to lowest (fewest passengers)

(
    df
    .assign(
        trip_distance_group = pd.cut(df['trip_distance'], 
        bins=[0, 2, 10, float("inf")], 
        labels=['short', 'medium', 'long'], 
        include_lowest=True))
    .groupby("trip_distance_group", observed=True)
    .agg(mean_pass=("passenger_count", "mean"))
    .sort_values(by="mean_pass", ascending=False)
 )
 

201 ms ± 3.06 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [36]:
%%timeit
(
    df.with_columns(
        pl.col("trip_distance")
        .cut([2, 10], labels=["short", "medium", "long"])
        .alias("trip_distance_group"))
    .group_by(pl.col("trip_distance_group"))
    .agg(pl.col("passenger_count").mean().alias("mean_pass"))
    .sort(by="mean_pass")
)

294 ms ± 5.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
