In [4]:
import pandas as pd
import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql import types

In [5]:
# Start (or reuse) a Spark session with master "local[*]" and app name "test".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("test")
    .getOrCreate()
)

### Question 1:

In [6]:
# Display the version string of the installed pyspark package.
pyspark.__version__

'3.0.3'

### Question 2:

In [7]:
!wget https://nyc-tlc.s3.amazonaws.com/trip+data/fhvhv_tripdata_2021-02.csv

--2022-02-26 19:17:51--  https://nyc-tlc.s3.amazonaws.com/trip+data/fhvhv_tripdata_2021-02.csv
Resolving nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)... 52.217.134.41
Connecting to nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)|52.217.134.41|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 733822658 (700M) [text/csv]
Saving to: ‘fhvhv_tripdata_2021-02.csv.1’


2022-02-26 19:18:13 (32.3 MB/s) - ‘fhvhv_tripdata_2021-02.csv.1’ saved [733822658/733822658]



In [8]:
# Explicit schema for the FHVHV trip CSV: (column name, Spark type) pairs,
# each turned into a nullable StructField in the listed order.
fhvhv_schema = types.StructType([
    types.StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("hvfhs_license_num", types.StringType()),
        ("dispatching_base_num", types.StringType()),
        ("pickup_datetime", types.TimestampType()),
        ("dropoff_datetime", types.TimestampType()),
        ("PULocationID", types.IntegerType()),
        ("DOLocationID", types.IntegerType()),
        ("SR_Flag", types.StringType()),
    ]
])

In [9]:
# Parse the trip CSV with the explicit schema; the first line is the header.
df_fhvhv = (
    spark.read
    .schema(fhvhv_schema)
    .option("header", "true")
    .csv("fhvhv_tripdata_2021-02.csv")
)

# Write the data back out as Parquet after repartitioning to 24 partitions,
# overwriting any output left by a previous run.
(
    df_fhvhv
    .repartition(24)
    .write
    .mode("overwrite")
    .parquet("fhvhv_tripdata_2021-02.parquet")
)

                                                                                

### Question 3:

In [27]:
# Expose the trips DataFrame to Spark SQL under the name "fhvhv_tripdata".
# createOrReplaceTempView replaces registerTempTable, which has been deprecated
# since Spark 2.0; re-running the cell simply replaces the existing view.
df_fhvhv.createOrReplaceTempView('fhvhv_tripdata')

In [28]:
# Count the trips whose pickup date is 2021-02-15.
# The query result is bound first and shown in a separate call: DataFrame.show()
# returns None, so chaining it onto the assignment left df_result set to None.
df_result = spark.sql("""
SELECT
    COUNT(*)
FROM
    fhvhv_tripdata
WHERE
    DATE(pickup_datetime) = '2021-02-15'
""")
df_result.show()



+--------+
|count(1)|
+--------+
|  367170|
+--------+



                                                                                

### Question 4:

In [53]:
# Add two derived columns:
#   trip_length - drop-off minus pickup, each timestamp cast to a long first
#   day         - calendar date of the drop-off timestamp
# NOTE(review): "day" is derived from dropoff_datetime, not pickup_datetime, so a
# trip that crosses midnight is attributed to the day it ended - confirm intended.
df_fhvhv = (
    df_fhvhv
    .withColumn(
        "trip_length",
        F.col("dropoff_datetime").cast("long") - F.col("pickup_datetime").cast("long"),
    )
    .withColumn("day", F.to_date("dropoff_datetime"))
)

In [57]:
# Longest trip (maximum trip_length) for each day.
# The chain is wrapped in parentheses so it can span several lines; without
# them the indented ".groupBy" continuation line is a syntax error.
df = (
    df_fhvhv
    .groupBy("day")
    .max("trip_length")
)

In [66]:
# Show the days ordered by their longest trip, largest first.
df.orderBy(F.desc("max(trip_length)")).show()



+----------+----------------+
|       day|max(trip_length)|
+----------+----------------+
|2021-02-12|           75540|
|2021-02-18|           57221|
|2021-02-21|           44039|
|2021-02-04|           40653|
|2021-02-20|           37577|
|2021-02-26|           35010|
|2021-02-19|           34612|
|2021-02-11|           34169|
|2021-02-10|           32476|
|2021-02-25|           32439|
|2021-02-22|           32223|
|2021-02-06|           31447|
|2021-02-02|           30913|
|2021-02-09|           30732|
|2021-02-08|           30106|
|2021-02-14|           29777|
|2021-02-03|           29126|
|2021-02-28|           27170|
|2021-02-23|           26878|
|2021-02-15|           25874|
+----------+----------------+
only showing top 20 rows



                                                                                

### Question 5:

In [71]:
# Row count for each dispatching_base_num value.
df = (
    df_fhvhv
    .groupBy("dispatching_base_num")
    .count()
)

In [72]:
# Show the groups ordered by row count, largest first.
df.orderBy(F.desc("count")).show()



+--------------------+-----+
|dispatching_base_num|count|
+--------------------+-----+
|              B02869|    1|
|              B02876|    1|
|              B02877|    1|
|              B03136|    1|
|              B02883|    1|
|              B02835|    1|
|              B02884|    1|
|              B02880|    1|
|              B02836|    1|
|              B02878|    1|
|              B02512|    1|
|              B02866|    1|
|              B02867|    1|
|              B02872|    1|
|              B02871|    1|
|              B02844|    1|
|              B02889|    1|
|              B02888|    1|
|              B02510|    1|
|              B02682|    1|
+--------------------+-----+
only showing top 20 rows



                                                                                

### Question 6:

In [7]:
!wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv

--2022-02-25 09:15:19--  https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.85.29
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.85.29|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12322 (12K) [application/octet-stream]
Saving to: ‘taxi+_zone_lookup.csv’


2022-02-25 09:15:20 (46.8 MB/s) - ‘taxi+_zone_lookup.csv’ saved [12322/12322]



In [17]:
# Load the taxi zone lookup CSV, using its first line as the column names.
df_zones = (
    spark.read
    .option("header", "true")
    .csv("taxi+_zone_lookup.csv")
)

In [28]:
# Expose both DataFrames to Spark SQL. createOrReplaceTempView replaces
# registerTempTable, which has been deprecated since Spark 2.0; re-running the
# cell simply replaces the existing views.
df_fhvhv.createOrReplaceTempView("fhvhv_tripdata")
df_zones.createOrReplaceTempView("zone_data")

In [50]:
# Count trips per "pickup zone / dropoff zone" pair, most frequent first.
# zone_data is joined twice: as `pu` on the pickup location id and as `do` on
# the drop-off location id.
# The query result is bound first and shown in a separate call: DataFrame.show()
# returns None, so chaining it onto the assignment left df_result set to None.
df_result = spark.sql("""
SELECT
    CONCAT(pu.Zone, ' / ' , do.Zone) AS `pickup-dropoff`, COUNT(*) AS count
FROM
    fhvhv_tripdata
JOIN
    zone_data AS pu
ON
    PULocationID = pu.LocationID
JOIN
    zone_data AS do
ON
    DOLocationID = do.LocationID
GROUP BY
    `pickup-dropoff`
ORDER BY
    COUNT(*) DESC;
""")
df_result.show()



+--------------------+-----+
|      pickup-dropoff|count|
+--------------------+-----+
|East New York / E...|45041|
|Borough Park / Bo...|37329|
| Canarsie / Canarsie|28026|
|Crown Heights Nor...|25976|
|Bay Ridge / Bay R...|17934|
|Jackson Heights /...|14688|
|   Astoria / Astoria|14688|
|Central Harlem No...|14481|
|Bushwick South / ...|14424|
|Flatbush/Ditmas P...|13976|
|South Ozone Park ...|13716|
|Brownsville / Bro...|12829|
|    JFK Airport / NA|12542|
|Prospect-Lefferts...|11814|
|Forest Hills / Fo...|11548|
|Bushwick North / ...|11491|
|Bushwick South / ...|11487|
|Crown Heights Nor...|11462|
|Crown Heights Nor...|11342|
|Prospect-Lefferts...|11308|
+--------------------+-----+
only showing top 20 rows



                                                                                

In [14]:
# List the column names currently present on the trips DataFrame.
df_fhvhv.columns

['hvfhs_license_num',
 'dispatching_base_num',
 'pickup_datetime',
 'dropoff_datetime',
 'PULocationID',
 'DOLocationID',
 'SR_Flag']

In [42]:
# Attach the pickup zone name: join trips to the zone lookup on the pickup
# location id, drop the other lookup columns, and rename Zone to pickup_zone.
df = (
    df_fhvhv
    .join(df_zones, on=df_fhvhv["PULocationID"] == df_zones["LocationID"])
    .drop("LocationID", "Borough", "service_zone")
    .withColumnRenamed("Zone", "pickup_zone")
)

In [43]:
# Attach the drop-off zone name the same way: join on the drop-off location id,
# drop the other lookup columns, and rename Zone to dropoff_zone.
df = (
    df
    .join(df_zones, on=df_fhvhv["DOLocationID"] == df_zones["LocationID"])
    .drop("LocationID", "Borough", "service_zone")
    .withColumnRenamed("Zone", "dropoff_zone")
)

In [45]:
# Build the "pickup-dropoff" label as "<pickup_zone> / <dropoff_zone>".
df = df.withColumn(
    "pickup-dropoff",
    F.concat(F.col("pickup_zone"), F.lit(" / "), F.col("dropoff_zone")),
)

In [46]:
# Peek at the first five generated labels.
df.select(F.col("pickup-dropoff")).show(n=5)

+--------------------+
|      pickup-dropoff|
+--------------------+
|Brownsville / Can...|
|Canarsie / Browns...|
|Canarsie / Flatlands|
|Flatlands / Sunse...|
|Hunts Point / Wes...|
+--------------------+
only showing top 5 rows



In [47]:
# Row count for each pickup-dropoff label.
df = (
    df
    .groupBy("pickup-dropoff")
    .count()
)

In [48]:
# Show the first 20 (unordered) label counts.
df.show(n=20)



+--------------------+-----+
|      pickup-dropoff|count|
+--------------------+-----+
|  Midtown South / NA| 3380|
|Hamilton Heights ...| 1452|
|Hamilton Heights ...|  960|
|Bedford Park / Be...| 5063|
|Bushwick South / ...|  372|
|    Inwood / Norwood|  664|
|East Flatbush/Rem...| 5659|
|Alphabet City / C...|  430|
|Park Slope / Park...| 9771|
|East Harlem South...|  107|
|Homecrest / Grave...| 1522|
|Eastchester / Hun...|  338|
|TriBeCa/Civic Cen...|  566|
|Norwood / Kensington|    4|
|Greenwich Village...|  675|
|Upper West Side N...| 1415|
|Kew Gardens Hills...|  157|
|Long Island City/...| 1487|
|Brownsville / Two...|  160|
|Soundview/Castle ...|   64|
+--------------------+-----+
only showing top 20 rows



                                                                                

In [49]:
# Show the label counts ordered from most to least frequent.
df.orderBy(F.desc("count")).show()



+--------------------+-----+
|      pickup-dropoff|count|
+--------------------+-----+
|East New York / E...|45041|
|Borough Park / Bo...|37329|
| Canarsie / Canarsie|28026|
|Crown Heights Nor...|25976|
|Bay Ridge / Bay R...|17934|
|Jackson Heights /...|14688|
|   Astoria / Astoria|14688|
|Central Harlem No...|14481|
|Bushwick South / ...|14424|
|Flatbush/Ditmas P...|13976|
|South Ozone Park ...|13716|
|Brownsville / Bro...|12829|
|    JFK Airport / NA|12542|
|Prospect-Lefferts...|11814|
|Forest Hills / Fo...|11548|
|Bushwick North / ...|11491|
|Bushwick South / ...|11487|
|Crown Heights Nor...|11462|
|Crown Heights Nor...|11342|
|Prospect-Lefferts...|11308|
+--------------------+-----+
only showing top 20 rows



                                                                                