# Queries

NB! Run `0_data_enrichment.ipynb` first to get the `../output/output.parquet` file!

In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, unix_timestamp, lag, sum, avg, round
from pyspark.sql.window import Window

In [2]:
spark = (
    SparkSession.builder
        .appName('NYC Taxi queries')
        .getOrCreate()
)

In [3]:
df = spark.read.load('../output/output.parquet')

In [5]:
df.show()

+--------------------+--------------+---------------+----------+----------+
|           medallion|pickup_borough|dropoff_borough| pickup_ts|dropoff_ts|
+--------------------+--------------+---------------+----------+----------+
|FFE1AB76511357473...| Staten Island|         Queens|1358097060|1358098260|
|7FA4613170593EB05...|      Brooklyn|      Manhattan|1358064540|1358065560|
|F1EF8290A54338B13...|      Brooklyn|         Queens|1357373760|1357374600|
|590D90160BD7C2673...|      Brooklyn|      Manhattan|1357597260|1357598280|
|70D0498946F93A8DB...|      Brooklyn|      Manhattan|1358071620|1358073420|
|2FEA0AF4B4BDD3A9B...|      Brooklyn|      Manhattan|1358071800|1358073180|
|E520117674DF8949E...|      Brooklyn|         Queens|1358068560|1358070000|
|F5F6C2E6AC24A2DB6...|      Brooklyn|         Queens|1358061060|1358061900|
|B6D017202CE202513...|      Brooklyn|         Queens|1358052060|1358053020|
|47537578B06C874F6...|      Brooklyn|      Manhattan|1358059140|1358060040|
|38010D372ED

In [6]:
# Preprocessing for query 1 and 2

# Order by prickup_ts to get sequential trips
# Group by medallion (per taxi)
window = Window.partitionBy('medallion').orderBy('pickup_ts')

df_ts = (df
         .withColumn('dropoff_ts_prev', lag('dropoff_ts').over(window)) # lag() returns previous window row value*
         .withColumn('idle_time', col('pickup_ts') - col('dropoff_ts_prev'))
         .withColumn('trip_time', col('dropoff_ts') - col('pickup_ts'))
        )

# Remove neg idle_time and idle_time > 4h
HOURS_4 = 4 * 60 * 60 # 14400 seconds
df_ts_wo_exceptions = df_ts.filter((col('idle_time') > 0) & (col('idle_time') <= 14400))

## Query 1

In [7]:
# Group by taxi (using medallion col)
df_medal = df_ts_wo_exceptions.groupBy('medallion').agg(
    avg(col('trip_time')).alias('total_trip_time'),
    avg(col('idle_time')).alias('total_idle_time')
)

# Calculate utilization col
df_util = df_medal.withColumn(
    'utilization', col('total_trip_time') / (col('total_trip_time') + col('total_idle_time'))
)

In [13]:
df_util.show()

+--------------------+-----------------+------------------+-------------------+
|           medallion|  total_trip_time|   total_idle_time|        utilization|
+--------------------+-----------------+------------------+-------------------+
|000318C2E3E638158...|649.0909090909091| 654.5454545454545|0.49790794979079506|
|002E3B405B6ABEA23...|886.6666666666666|1793.3333333333333| 0.3308457711442786|
|0030AD2648D81EE87...|           1440.0|             720.0| 0.6666666666666666|
|0036961468659D0BF...|603.5294117647059|1161.1764705882354|0.34199999999999997|
|0038EF45118925A51...|517.8947368421053| 795.7894736842105| 0.3942307692307693|
|003EEA559FA618008...|643.6363636363636|1756.3636363636363| 0.2681818181818182|
|0053334C798EC6C8E...|833.3333333333334|2493.3333333333335|  0.250501002004008|
|005DED7D6E6C45441...|            895.0|             980.0|0.47733333333333333|
|005F00B38F46E2100...|            715.2|            1687.2| 0.2977022977022977|
|00790C7BAD30B7A9E...|           663.75|

## Query 2

In [20]:
# calculates the average idle time before the next ride per destination borough in seconds
df_time_to_next = df_ts_wo_exceptions.groupBy('dropoff_borough').agg(
    round(avg(col('idle_time')) / 1000, 2).alias('avg_time_for_next_ride_secs')
)

In [21]:
df_time_to_next.show()

+---------------+---------------------------+
|dropoff_borough|avg_time_for_next_ride_secs|
+---------------+---------------------------+
|         Queens|                       2.07|
|       Brooklyn|                       1.96|
|      Manhattan|                       1.15|
|          Bronx|                       2.24|
|  Staten Island|                       1.28|
+---------------+---------------------------+



## Query 3

In [22]:
# exclude columns that have null values and count rows that have same dropoff and pickup borough
c_trips_end_start_loc_same = df.where(
    (col("pickup_borough").isNotNull()) & 
    (col("dropoff_borough").isNotNull()) & 
    (col("pickup_borough") == col("dropoff_borough"))
).count()

In [23]:
c_trips_end_start_loc_same

86074

## Query 4

In [24]:
# exclude null columns with null values and count rows that have different dropoff and pickup borough
c_trips_end_start_loc_not_same = df.where(
    (col("pickup_borough").isNotNull()) & 
    (col("dropoff_borough").isNotNull()) & 
    (col("pickup_borough") != col("dropoff_borough"))
).count()

In [25]:
c_trips_end_start_loc_not_same

11433

## Miscellaneous

In [26]:
print(f"Number of trips with location: {df.count()}")

Number of trips with location: 97507


In [15]:
# validate that number of trips in query 3 and query 4 match the total 
is_count_eq = df.count() == c_trips_end_start_loc_same + c_trips_end_start_loc_not_same
print(f"Q3+Q4 equals total sum? {is_count_eq}")

Q3+Q4 equals total sum? True
