In [1]:
# Load libraries
import pandas as pd
import pyspark
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, DoubleType

## Week 5 Homework

In this homework we'll put what we learned about Spark in practice.

For this homework we will be using the FHV 2019-10 data found here. [FHV Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz)

## Questions

### Question 1:

**Install Spark and PySpark**

- Install Spark [OK]
- Run PySpark [OK]
- Create a local spark session [OK]
- Execute spark.version [OK]

In [2]:
# create a spark session
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()

In [3]:
pyspark.__version__

'3.5.0'

In [4]:
spark.version

'3.5.0'

### Question 2:

**FHV October 2019**

Read the October 2019 FHV into a Spark Dataframe with a schema as we did in the lessons.

Repartition the Dataframe to 6 partitions and save it to parquet.

What is the average size of the Parquet (ending with .parquet extension) Files that were created (in MB)? Select the answer which most closely matches.

- 1MB
- 6MB
- 25MB
- 87MB

R= **6MB**

In [5]:
# Load data with pandas for obtain the schema
pdf = pd.read_csv('data/raw/fhv/2019/10/fhv_tripdata_2019-10.csv.gz', compression='gzip', nrows=1000)

In [6]:
# dtypes
pdf.dtypes

dispatching_base_num       object
pickup_datetime            object
dropOff_datetime           object
PUlocationID              float64
DOlocationID              float64
SR_Flag                   float64
Affiliated_base_number     object
dtype: object

In [7]:
# first 3 elements
pdf.head(3)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2019-10-01 00:23:00,2019-10-01 00:35:00,264.0,264.0,,B00009
1,B00013,2019-10-01 00:11:29,2019-10-01 00:13:22,264.0,264.0,,B00013
2,B00014,2019-10-01 00:11:43,2019-10-01 00:37:20,264.0,264.0,,B00014


In [8]:
# Get the schema
spark.createDataFrame(pdf).schema

StructType([StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', StringType(), True), StructField('dropOff_datetime', StringType(), True), StructField('PUlocationID', DoubleType(), True), StructField('DOlocationID', DoubleType(), True), StructField('SR_Flag', DoubleType(), True), StructField('Affiliated_base_number', StringType(), True)])

In [9]:
# fhv schema
fhv_schema = StructType([
    StructField('dispatching_base_num', StringType(), True), 
    StructField('pickup_datetime', TimestampType(), True), 
    StructField('dropOff_datetime', TimestampType(), True), 
    StructField('PUlocationID', IntegerType(), True), 
    StructField('DOlocationID', IntegerType(), True), 
    StructField('SR_Flag', DoubleType(), True), 
    StructField('Affiliated_base_number', StringType(), True)
])

In [10]:
# load csv data
fhv_df = spark.read \
    .option("header", "true") \
    .schema(fhv_schema) \
    .csv('data/raw/fhv/2019/10/fhv_tripdata_2019-10.csv.gz')

In [11]:
# Schema
fhv_df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropOff_datetime: timestamp (nullable = true)
 |-- PUlocationID: integer (nullable = true)
 |-- DOlocationID: integer (nullable = true)
 |-- SR_Flag: double (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [12]:
# Repartition the Dataframe to 6 partitions and save it to parquet
fhv_df \
    .repartition(6) \
    .write.parquet('data/pq/fhv/2019/10', mode="overwrite")

In [19]:
!ls -lh data/pq/fhv/2019/10

total 39M
-rw-r--r-- 1 kiramishima 197609    0 Feb 26 00:49 _SUCCESS
-rw-r--r-- 1 kiramishima 197609 6.4M Feb 26 00:49 part-00000-d136129f-f205-451c-93d1-d1badffbdc33-c000.snappy.parquet
-rw-r--r-- 1 kiramishima 197609 6.4M Feb 26 00:49 part-00001-d136129f-f205-451c-93d1-d1badffbdc33-c000.snappy.parquet
-rw-r--r-- 1 kiramishima 197609 6.4M Feb 26 00:49 part-00002-d136129f-f205-451c-93d1-d1badffbdc33-c000.snappy.parquet
-rw-r--r-- 1 kiramishima 197609 6.4M Feb 26 00:49 part-00003-d136129f-f205-451c-93d1-d1badffbdc33-c000.snappy.parquet
-rw-r--r-- 1 kiramishima 197609 6.4M Feb 26 00:49 part-00004-d136129f-f205-451c-93d1-d1badffbdc33-c000.snappy.parquet
-rw-r--r-- 1 kiramishima 197609 6.4M Feb 26 00:49 part-00005-d136129f-f205-451c-93d1-d1badffbdc33-c000.snappy.parquet


### Question 3:

**Count records**

How many taxi trips were there on the 15th of October?

Consider only trips that started on the 15th of October.

- 108,164
- 12,856
- 452,470
- 62,610

R= **62610**

In [20]:
# Read data from parquet
df_fhv = spark.read.parquet('data/pq/fhv/2019/10')

In [21]:
# check schema
df_fhv.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropOff_datetime: timestamp (nullable = true)
 |-- PUlocationID: integer (nullable = true)
 |-- DOlocationID: integer (nullable = true)
 |-- SR_Flag: double (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [22]:
# new columns
df_fhv = df_fhv \
    .withColumn('pickup_date', F.to_date(df_fhv['pickup_datetime'])) \
    .withColumn('dropoff_date', F.to_date(df_fhv['dropOff_datetime']))

In [23]:
df_fhv.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropOff_datetime: timestamp (nullable = true)
 |-- PUlocationID: integer (nullable = true)
 |-- DOlocationID: integer (nullable = true)
 |-- SR_Flag: double (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)
 |-- pickup_date: date (nullable = true)
 |-- dropoff_date: date (nullable = true)



In [24]:
df_fhv \
    .filter(df_fhv['pickup_date'] == F.lit('2019-10-15') ) \
    .count()

62610

In [26]:
# Alternative
df_fhv.createOrReplaceTempView('trips_data')

In [27]:
spark.sql("""
SELECT
    COUNT(1) AS total_records
FROM trips_data
WHERE pickup_date == '2019-10-15'
""").show()

+-------------+
|total_records|
+-------------+
|        62610|
+-------------+



### Question 4:

**Longest trip for each day**

What is the length of the longest trip in the dataset in hours?

- 631,152.50 Hours
- 243.44 Hours
- 7.68 Hours
- 3.32 Hours

R= **631,152.50**

In [28]:
df_fhv.show()

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+-----------+------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PUlocationID|DOlocationID|SR_Flag|Affiliated_base_number|pickup_date|dropoff_date|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+-----------+------------+
|              B01843|2019-10-29 16:49:00|2019-10-29 20:29:00|         264|         264|   NULL|                  NULL| 2019-10-29|  2019-10-29|
|              B01559|2019-10-24 08:37:30|2019-10-24 08:55:30|         264|         264|   NULL|                B01559| 2019-10-24|  2019-10-24|
|              B00911|2019-10-18 15:43:00|2019-10-18 17:50:37|         264|         244|   NULL|                B00911| 2019-10-18|  2019-10-18|
|              B02795|2019-10-26 11:06:00|2019-10-26 12:20:39|          39|          49|   NULL|                B02795| 2019-10-26

In [29]:
df_fhv2 = df_fhv \
    .withColumn('trip_duration', \
        (F.col("dropOff_datetime").cast("long") - \
        F.col('pickup_datetime').cast("long")) / 3600)

In [30]:
df_fhv2 \
    .select('trip_duration') \
    .orderBy(F.desc('trip_duration')) \
    .show(5)

+-----------------+
|    trip_duration|
+-----------------+
|         631153.5|
|         631152.5|
|87672.44083333333|
|70129.02805555555|
|           8794.0|
+-----------------+
only showing top 5 rows



In [31]:
df_fhv2.createOrReplaceTempView('trips_data_duration')

In [32]:
spark.sql("""
SELECT MAX(trip_duration)
FROM trips_data_duration
""").show()

# The answers don't have 631153.5, so, the correct answer is 631152.5

+------------------+
|max(trip_duration)|
+------------------+
|          631153.5|
+------------------+



### Question 5:

**User Interface**

Spark’s User Interface which shows the application's dashboard runs on which local port?

- 80
- 443
- 4040
- 8080

R= **4040**

### Question 6:

**Least frequent pickup location zone**

Load the zone lookup data into a temp view in Spark  
[Zone Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv)

Using the zone lookup data and the FHV October 2019 data, what is the name of the LEAST frequent pickup location Zone?  

- East Chelsea
- Jamaica Bay
- Union Sq
- Crown Heights North

R= **Jamaica Bay**

In [33]:
zdf = pd.read_csv('https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv')
spark.createDataFrame(zdf).schema

StructType([StructField('LocationID', LongType(), True), StructField('Borough', StringType(), True), StructField('Zone', StringType(), True), StructField('service_zone', StringType(), True)])

In [34]:
# Zones Schema
zones_schema = StructType([
    StructField('LocationID', IntegerType(), True), 
    StructField('Borough', StringType(), True), 
    StructField('Zone', StringType(), True), 
    StructField('service_zone', StringType(), True)
])

In [35]:
# load csv data
df_zones = spark.read \
    .option("header", "true") \
    .schema(zones_schema) \
    .csv('data/raw/zones/taxi_zone_lookup.csv')

In [36]:
# create temp view
df_zones.createTempView('zones_lookup')

In [37]:
# Show data
spark.sql("""
SELECT * FROM zones_lookup
LIMIT 10
""").show()

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
+----------+-------------+--------------------+------------+



In [38]:
# Solution 1
spark.sql("""
SELECT
    pickup_location.Zone,
    COUNT(1)
FROM trips_data
INNER JOIN zones_lookup AS pickup_location
    ON trips_data.PUlocationID = pickup_location.LocationID
GROUP BY 1
ORDER BY 2 ASC
""").show()

+--------------------+--------+
|                Zone|count(1)|
+--------------------+--------+
|         Jamaica Bay|       1|
|Governor's Island...|       2|
| Green-Wood Cemetery|       5|
|       Broad Channel|       8|
|     Highbridge Park|      14|
|        Battery Park|      15|
|Saint Michaels Ce...|      23|
|Breezy Point/Fort...|      25|
|Marine Park/Floyd...|      26|
|        Astoria Park|      29|
|    Inwood Hill Park|      39|
|       Willets Point|      47|
|Forest Park/Highl...|      53|
|  Brooklyn Navy Yard|      57|
|        Crotona Park|      62|
|        Country Club|      77|
|     Freshkills Park|      89|
|       Prospect Park|      98|
|     Columbia Street|     105|
|  South Williamsburg|     110|
+--------------------+--------+
only showing top 20 rows



In [40]:
# Solution 2: Join fhv_df with df_zones
fhv_zones_df = fhv_df.join(df_zones, fhv_df.PUlocationID == df_zones.LocationID)
fhv_zones_df.show()

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+----------+-------+---------------+------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PUlocationID|DOlocationID|SR_Flag|Affiliated_base_number|LocationID|Borough|           Zone|service_zone|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+----------+-------+---------------+------------+
|              B00009|2019-10-01 00:23:00|2019-10-01 00:35:00|         264|         264|   NULL|                B00009|       264|Unknown|             NV|         N/A|
|              B00013|2019-10-01 00:11:29|2019-10-01 00:13:22|         264|         264|   NULL|                B00013|       264|Unknown|             NV|         N/A|
|              B00014|2019-10-01 00:11:43|2019-10-01 00:37:20|         264|         264|   NULL|                B00014|       264|Unknown|             NV|      

In [41]:
fhv_zones_df.createOrReplaceTempView('fhv_zones')

In [43]:
spark.sql("""
SELECT Zone, count(*)
FROM fhv_zones
GROUP BY Zone
ORDER BY 2 ASC
""").show()

+--------------------+--------+
|                Zone|count(1)|
+--------------------+--------+
|         Jamaica Bay|       1|
|Governor's Island...|       2|
| Green-Wood Cemetery|       5|
|       Broad Channel|       8|
|     Highbridge Park|      14|
|        Battery Park|      15|
|Saint Michaels Ce...|      23|
|Breezy Point/Fort...|      25|
|Marine Park/Floyd...|      26|
|        Astoria Park|      29|
|    Inwood Hill Park|      39|
|       Willets Point|      47|
|Forest Park/Highl...|      53|
|  Brooklyn Navy Yard|      57|
|        Crotona Park|      62|
|        Country Club|      77|
|     Freshkills Park|      89|
|       Prospect Park|      98|
|     Columbia Street|     105|
|  South Williamsburg|     110|
+--------------------+--------+
only showing top 20 rows

