In [None]:
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-10.parquet

# Q1: Install Spark and PySpark

- Install Spark
- Run PySpark
- Create a local spark session
- Execute spark.version.

What's the output?

In [1]:
import pyspark
from pyspark.sql import SparkSession

# Q1 asks for the version reported by a running local session, not the version
# of the installed Python package, so start the session here. The settings
# mirror the session cell further down; its getOrCreate() call will simply
# return this same session.
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()

spark.version

'4.1.1'

# Q2: Yellow October 2024

Read the October 2024 Yellow taxi trip data into a Spark DataFrame.

Repartition the Dataframe to 4 partitions and save it to parquet.

What is the average size (in MB) of the Parquet files (those ending with the `.parquet` extension) that were created? Select the answer which most closely matches.

In [1]:
from pyspark.sql import SparkSession

# Local session that uses every available core; getOrCreate() hands back the
# already-running session if this kernel has one.
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()

# Parquet files embed their own schema, so no reader options are required.
# (The "header" option belongs to the CSV reader and has no effect on Parquet.)
df = spark.read.parquet("yellow_tripdata_2024-10.parquet")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/19 12:39:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [8]:
# Preview the first rows of the trip data; these are show()'s default
# settings, spelled out so the row limit and truncation are visible.
df.show(n=20, truncate=True)



+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2024-10-07 16:40:43|  2024-10-07 18:10:56|              1|         14.8|        99|                 N|         127|         225|           1|       47.5|  0.0|    0.5|       0.

                                                                                

In [6]:
# Redistribute the trips over exactly 4 partitions so that the write below
# produces 4 part files.
df = df.repartition(4)

# mode("overwrite") keeps this cell re-runnable: with the default save mode the
# write raises an error as soon as yellow/2024/10/ exists from an earlier run.
df.write.mode("overwrite").parquet("yellow/2024/10/")

                                                                                

In [7]:
# Report the disk usage, in human-readable units, of every entry Spark wrote
# to the output directory so the sizes of the part files can be compared.
!du -h yellow/2024/10/*

  0B	yellow/2024/10/_SUCCESS
 23M	yellow/2024/10/part-00000-13ebd7e4-526a-402d-b0b0-b8d80fbfb0fb-c000.snappy.parquet
 23M	yellow/2024/10/part-00001-13ebd7e4-526a-402d-b0b0-b8d80fbfb0fb-c000.snappy.parquet
 23M	yellow/2024/10/part-00002-13ebd7e4-526a-402d-b0b0-b8d80fbfb0fb-c000.snappy.parquet
 23M	yellow/2024/10/part-00003-13ebd7e4-526a-402d-b0b0-b8d80fbfb0fb-c000.snappy.parquet


# Q3: Count records 

How many taxi trips were there on the 15th of October?

Consider only trips that started on the 15th of October.

- 85,567
- 105,567
- 125,567
- 145,567

In [11]:
from pyspark.sql import functions as F

# Tag every trip with the calendar date of its pickup timestamp.
trips_with_date = df.withColumn(
    'pickup_date', F.to_date(F.col('tpep_pickup_datetime'))
)

# Keep only the trips whose pickup falls on 15 October 2024.
trips_on_15th = trips_with_date.where(F.col('pickup_date') == '2024-10-15')

# Display the count of pickup_date values among the remaining rows.
trips_on_15th.select(F.count('pickup_date')).show()

+------------------+
|count(pickup_date)|
+------------------+
|            128893|
+------------------+



# Q4: Longest trip

What is the length of the longest trip in the dataset in hours?

In [17]:
# Trip duration in hours: the difference between the drop-off and pickup
# timestamps in epoch seconds, divided by 3600.
pickup_secs = F.unix_timestamp('tpep_pickup_datetime')
dropoff_secs = F.unix_timestamp('tpep_dropoff_datetime')
duration_hours = (dropoff_secs - pickup_secs) / 3600

# Longest trips first; the top row of the displayed table answers the question.
(
    df
    .withColumn('diff', duration_hours)
    .select("diff")
    .orderBy(F.desc("diff"))
    .show()
)



+------------------+
|              diff|
+------------------+
|162.61777777777777|
|           143.325|
|137.76055555555556|
|114.83472222222223|
| 89.89833333333333|
| 89.44611111111111|
| 70.29916666666666|
| 67.57333333333334|
| 66.06666666666666|
|           46.4225|
| 42.30888888888889|
| 38.47416666666667|
| 33.95111111111111|
| 26.29861111111111|
| 25.29138888888889|
|25.238333333333333|
|             24.47|
|23.996666666666666|
|23.995277777777776|
|23.994722222222222|
+------------------+
only showing top 20 rows


                                                                                

# Q5: User Interface

Spark's User Interface, which shows the application's dashboard, runs on which local port?

Answer: `4040`

# Q6: Least frequent pickup location zone

Load the zone lookup data into a temp view in Spark:

```bash
wget https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv
```

Using the zone lookup data and the Yellow October 2024 data, what is the name of the LEAST frequent pickup location Zone?

In [None]:
# Zone lookup table; the first line of the CSV supplies the column names.
df_lookup = spark.read.csv("taxi_zone_lookup.csv", header="true")

# Register both tables as temp views, as the exercise asks.
df_lookup.createOrReplaceTempView("zone_lookup")
df.createOrReplaceTempView("trip_data")

# Attach the zone of each pickup location, count trips per zone name, and
# sort ascending so the least frequent pickup zones come first.
pickups_by_zone = (
    df
    .join(df_lookup, on=df["PULocationID"] == df_lookup["LocationID"])
    .groupBy("Zone")
    .count()
    .orderBy("count")
)
pickups_by_zone.show()

+--------------------+-----+
|                Zone|count|
+--------------------+-----+
|Governor's Island...|    1|
|       Rikers Island|    2|
|       Arden Heights|    2|
|         Jamaica Bay|    3|
| Green-Wood Cemetery|    3|
|   Rossville/Woodrow|    4|
|       West Brighton|    4|
|       Port Richmond|    4|
|Eltingville/Annad...|    4|
|Charleston/Totten...|    4|
|         Great Kills|    6|
|        Crotona Park|    6|
|     Mariners Harbor|    7|
|Heartland Village...|    7|
|Saint George/New ...|    9|
|             Oakwood|    9|
|New Dorp/Midland ...|   10|
|       Broad Channel|   10|
|         Westerleigh|   12|
|     Pelham Bay Park|   12|
+--------------------+-----+
only showing top 20 rows
