In [42]:
from pyspark.sql import SparkSession

# Start the Spark session used by every cell below; `getOrCreate()` hands back
# an already-running session instead of building a second one.
spark = (
    SparkSession.builder
    .appName("MAST30034 Project 1")
    # Eager evaluation: a DataFrame left as a cell's last expression is rendered.
    .config("spark.sql.repl.eagerEval.enabled", True)
    # Parquet metadata caching option.
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Pin the session time zone to UTC.
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

In [43]:
# Load the November 2022 yellow-taxi trip records from the local data folder.
sdf_yellow_2022_11 = spark.read.format("parquet").load('../data/yellow_2022_11.parquet')

In [44]:
# Preview the first 30 trips as a horizontal table with untruncated values.
# The frame loaded in the previous cell is `sdf_yellow_2022_11`; the bare name
# `sdf` is not defined by any cell shown before this one, so using it raises
# NameError when the notebook is run top to bottom on a fresh kernel.
sdf_yellow_2022_11.show(30, vertical=False, truncate=False)


+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|1       |2022-11-01 00:51:22 |2022-11-01 00:56:24  |1.0            |0.6          |1.0       |N                 |151         |151         |2           |4.5        |0.5  |0.5    |0.0      

In [45]:
# Print each column's name, data type and nullability as read from the parquet file.
sdf_yellow_2022_11.printSchema()


root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [46]:
# Total number of trip records before any cleaning.
sdf_yellow_2022_11.count()

3252717

In [56]:
# Summary statistics (count, mean, stddev, min, max) for the raw data.
(
    sdf_yellow_2022_11
    .describe()
    .limit(25)
)


                                                                                

summary,VendorID,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
count,3252717.0,3130759.0,3252717.0,3130759.0,3130759,3252717.0,3252717.0,3252717.0,3252717.0,3252717.0,3252717.0,3252717.0,3252717.0,3252717.0,3252717.0,3130759.0,3130759.0
mean,1.7780406349522568,1.392939220169933,6.346791663707013,1.3670432633109095,,166.3048928019253,163.84076266087703,1.186218167765594,15.159420884146163,0.8339995179414624,0.4868340990009275,2.845867221156059,0.57589755579959,0.2952756111439574,22.0699391888512,2.282593693733692,0.1055331151327841
stddev,0.4477424705273056,0.9331503742908472,654.6889397367644,5.203414537904876,,64.68464062996028,69.78416570825458,0.5433899017125433,14.56426273845912,1.1523850651845435,0.1009482668837168,3.313788963801358,2.1340756707827007,0.0525978529455015,18.221865467737025,0.7589888672951636,0.3517424773732697
min,1.0,0.0,0.0,1.0,N,1.0,1.0,0.0,-1274.0,-5.5,-0.5,-81.6,-58.25,-0.3,-1277.8,-2.5,-1.25
max,6.0,9.0,305756.36,99.0,Y,265.0,265.0,5.0,1274.0,14.35,16.55,333.0,655.55,1.0,1277.8,2.5,1.25


In [48]:
from pyspark.sql.functions import col, sum

# Count the nulls in every column in one pass: each null flag is cast to 0/1
# and summed. `sum` here is the Spark aggregate imported in this cell, not the
# Python builtin.
sdf_yellow_2022_11.select(
    *(
        sum(col(name).isNull().cast("int")).alias(name)
        for name in sdf_yellow_2022_11.columns
    )
).limit(25)

                                                                                

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,0,0,121958,0,121958,121958,0,0,0,0,0,0,0,0,0,0,121958,121958


In [59]:
# Keep only complete records: a row is discarded as soon as any one of its
# columns is null. The variable is rebound, so cells below see the cleaned frame.
sdf_yellow_2022_11 = sdf_yellow_2022_11.na.drop(how="any")

In [60]:
# Report the cardinality of every column, one Spark job per column.
print('Unique values in each column')
for column in sdf_yellow_2022_11.columns:
    single_column = sdf_yellow_2022_11.select(column)
    distinct_count = single_column.distinct().count()
    print(f"{column}: {distinct_count}")


Unique values in each column


                                                                                

VendorID: 2


                                                                                

tpep_pickup_datetime: 1600702


                                                                                

tpep_dropoff_datetime: 1599628


                                                                                

passenger_count: 10


                                                                                

trip_distance: 4313


                                                                                

RatecodeID: 7


                                                                                

store_and_fwd_flag: 2


                                                                                

PULocationID: 260


                                                                                

DOLocationID: 261


                                                                                

payment_type: 5


                                                                                

fare_amount: 2635


                                                                                

extra: 65


                                                                                

mta_tax: 6


                                                                                

tip_amount: 3547


                                                                                

tolls_amount: 746


                                                                                

improvement_surcharge: 4


                                                                                

total_amount: 11922


                                                                                

congestion_surcharge: 5




airport_fee: 3


                                                                                

In [64]:
# Trips per vendor, least frequent vendor first.
(
    sdf_yellow_2022_11
    .groupBy('VendorID')
    .count()
    .orderBy('count', ascending=True)
    .limit(25)
)


                                                                                

VendorID,count
1,707300
2,2423459


In [68]:
# Frequency of each passenger count, most common first.
(
    sdf_yellow_2022_11
    .groupBy('passenger_count')
    .count()
    .orderBy('count', ascending=False)
    .limit(25)
)

                                                                                

passenger_count,count
1.0,2334176
2.0,483229
3.0,122423
4.0,57386
5.0,53592
0.0,47373
6.0,32533
7.0,27
8.0,17
9.0,3


In [69]:
# Frequency of each rate code, most common first.
(
    sdf_yellow_2022_11
    .groupBy('RatecodeID')
    .count()
    .orderBy('count', ascending=False)
    .limit(25)
)

                                                                                

RatecodeID,count
1.0,2945260
2.0,132584
5.0,31064
3.0,9026
99.0,8797
4.0,4007
6.0,21


In [70]:
# Frequency of each MTA tax value, most common first.
(
    sdf_yellow_2022_11
    .groupBy('mta_tax')
    .count()
    .orderBy('count', ascending=False)
    .limit(25)
)

                                                                                

mta_tax,count
0.5,3069652
0.0,37145
-0.5,23955
3.3,5
16.55,1
3.0,1


In [71]:
# Frequency of each improvement surcharge value, most common first.
(
    sdf_yellow_2022_11
    .groupBy('improvement_surcharge')
    .count()
    .orderBy('count', ascending=False)
    .limit(25)
)

                                                                                

improvement_surcharge,count
0.3,3104411
-0.3,24734
0.0,1601
1.0,13


In [72]:
# Frequency of each airport fee value, most common first.
(
    sdf_yellow_2022_11
    .groupBy('airport_fee')
    .count()
    .orderBy('count', ascending=False)
    .limit(25)
)

                                                                                

airport_fee,count
0.0,2860542
1.25,267268
-1.25,2949


In [79]:
# Keep only rows whose categorical fee/code columns hold one of the accepted
# values; the filters are applied one after another, so a row must pass all four.
df_filtered = (
    sdf_yellow_2022_11
    # RatecodeID must lie in the inclusive range 1..6
    .filter(col("RatecodeID").between(1, 6))
    # mta_tax must be exactly 0 or 0.5
    .filter(col("mta_tax").isin(0.0, 0.5))
    # improvement_surcharge must be exactly 0.0 or 0.3
    .filter(col("improvement_surcharge").isin(0.0, 0.3))
    # airport_fee must be exactly 0.0 or 1.25
    .filter(col("airport_fee").isin(0.0, 1.25))
)

# Summary statistics of the filtered frame
df_filtered.describe().limit(25)

                                                                                

summary,VendorID,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
count,3097208.0,3097208.0,3097208.0,3097208.0,3097208,3097208.0,3097208.0,3097208.0,3097208.0,3097208.0,3097208.0,3097208.0,3097208.0,3097208,3097208.0,3097208.0,3097208.0
mean,1.774474623596478,1.3944494525391902,3.6104518488905777,1.0910804182347456,,166.53383886390583,164.30299837789389,1.21619794343809,15.081872157117648,0.876585624859551,0.4941397219689475,2.848891010870782,0.5710938367740269,0.2998516083028367,22.06694750192032,2.323452364193816,0.107866100694561
stddev,0.417927909328277,0.9354457793072846,63.1888193056804,0.4628578945029232,,64.22490972102952,69.67629340765417,0.4592950010422423,14.13479805360274,1.1620632342370278,0.0538126109001875,3.303089385112848,2.108685853447565,0.006670495341739...,17.7518531287869,0.6404726845501721,0.350995113877848
min,1.0,0.0,0.0,1.0,N,1.0,1.0,1.0,-95.0,-1.0,0.0,0.0,0.0,0.0,-97.5,-2.5,0.0
max,2.0,9.0,103319.46,6.0,Y,265.0,265.0,4.0,1274.0,14.35,0.5,333.0,655.55,0.3,1277.8,2.5,1.25


24/08/04 03:37:12 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 2118760 ms exceeds timeout 120000 ms
24/08/04 03:37:12 WARN SparkContext: Killing executors is not supported by current scheduler.
24/08/04 03:37:15 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$