In [1]:
import pandas as pd
import numpy as np
import polars as pl


In [40]:
df = pd.read_csv('data/nyc_taxi_2019-01.csv', 
                 usecols=['passenger_count', 'trip_distance', 'total_amount'])

In [15]:
df = pl.read_csv('data/nyc_taxi_2019-01.csv', 
                 columns=['passenger_count', 'trip_distance', 'total_amount'])

In [41]:
%%timeit
# the average cost of the 20 longest (in distance) taxirides in January 2019 (descending)
(
    df
    .sort_values(by=["trip_distance"], ascending=False)[:20]["total_amount"]
    .mean()
)

748 ms ± 12.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
%%timeit
(
    df
    .sort(by=["trip_distance"], descending=True)[:20]
    .select(pl.col("total_amount"))
    .mean()
)

178 ms ± 1.76 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [42]:
%%timeit
# the average cost of the 20 longest (in distance) taxirides in January 2019 (ascending)
(
    df
    .sort_values(by=["trip_distance"])[-20:]["total_amount"]
    .mean()
)

719 ms ± 14.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
%%timeit
(
    df
    .sort(by=["trip_distance"], descending=False)[-20:]
    .select(pl.col("total_amount"))
    .mean()
)

173 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [43]:
%%timeit
# Sort by ascending passenger count and descending trip distance
# Average price paid for the top 50 rides

(
    df
    .sort_values(
        by=["passenger_count", "trip_distance"], 
        ascending=[True, False])[:50]["total_amount"]
    .mean()
)

344 ms ± 14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
%%timeit
(
    df
    .sort(by=["passenger_count", "trip_distance"], descending=[False, True])[:50]
    .select(pl.col("total_amount"))
    .mean()
)

257 ms ± 4.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [44]:
%%timeit
# In which five rides did people pay the most per mile? 
# How far did people go on those trips?
(
    df
    .replace(0, np.nan)
    .dropna()
    .assign(cost_per_mile=df["total_amount"]/df["trip_distance"])
    .sort_values(by="cost_per_mile", ascending=False)[:5]
    [["trip_distance", "cost_per_mile"]]
)

1.17 s ± 27.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [36]:
%%timeit
(
    df.filter((pl.col("total_amount") != 0) & (pl.col("trip_distance") != 0))
    .with_columns(
        (pl.col("total_amount")/pl.col("trip_distance")).alias("cost_per_mile"))
    .sort(by="cost_per_mile", descending=True)[:5]
    .select("trip_distance", "cost_per_mile")
)

196 ms ± 6.28 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [46]:
%%timeit
# Assume that multipassenger rides are split evenly among the passengers.
# In which 10 multipassenger rides did each individual pay the greatest amount?

(
    df[(df['total_amount'] != 0) & (df['passenger_count'] != 0)]
    .assign(cost_per_passenger=df["total_amount"]/df["passenger_count"])
    .sort_values(by="cost_per_passenger", ascending=False)[:10]
)

1.01 s ± 24.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [39]:
%%timeit
(
    df.filter((pl.col("total_amount") != 0) & (pl.col("passenger_count") != 0))
    .with_columns(
        (pl.col("total_amount")/pl.col("passenger_count")).alias("cost_per_passenger"))
    .sort(by="cost_per_passenger", descending=True)[:10]
)

187 ms ± 4.81 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
