In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType


In [2]:
spark = (SparkSession.builder
                    .appName('project2')
                    .getOrCreate()
        )

In [3]:
schema = StructType([
    StructField("medallion", StringType(), True),
    StructField("hack_license", StringType(), True),
    StructField("pickup_datetime", TimestampType(), True),
    StructField("dropoff_datetime", TimestampType(), True),
    StructField("trip_time_in_secs", IntegerType(), True),
    StructField("trip_distance", DoubleType(), True),
    StructField("pickup_longitude", DoubleType(), True),
    StructField("pickup_latitude", DoubleType(), True),
    StructField("dropoff_longitude", DoubleType(), True),
    StructField("dropoff_latitude", DoubleType(), True),
    StructField("payment_type", StringType(), True),
    StructField("fare_amount", DoubleType(), True),
    StructField("surcharge", DoubleType(), True),
    StructField("mta_tax", DoubleType(), True),
    StructField("tip_amount", DoubleType(), True),
    StructField("tolls_amount", DoubleType(), True),
    StructField("total_amount", DoubleType(), True)
])

In [4]:
#original, unmodified dataset
rides_df = (spark.read
            .schema(schema)
            .csv("input/minified_sorted_data.csv") #path
            )

In [5]:
rides_df.printSchema()

root
 |-- medallion: string (nullable = true)
 |-- hack_license: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- trip_time_in_secs: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- surcharge: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- total_amount: double (nullable = true)



In [6]:
# remove rows with null values
initial_count = rides_df.count()
rides_df = rides_df.dropna()

print(f"Removed {initial_count - rides_df.count()} lines")

Removed 30 lines


In [7]:
# check if medallion and hash license are valid md5, remove the row otherwise

MD5_PATTERN = r"^[a-fA-F0-9]{32}$"

initial_count = rides_df.count()

rides_df = rides_df.filter(
    col("medallion").rlike(MD5_PATTERN) &
    col("hack_license").rlike(MD5_PATTERN)
)

print(f"Removed {initial_count - rides_df.count()} lines")

Removed 0 lines


In [8]:
# remove rows with invalid pickup of dropoff times

initial_count = rides_df.count()

rides_df = rides_df.filter(
    col("dropoff_datetime") > col("pickup_datetime")
)

print(f"Removed {initial_count - rides_df.count()} lines")

Removed 17848 lines


In [9]:
# check for illogical numeric values

initial_count = rides_df.count()

rides_df = rides_df.filter(
    (col("trip_time_in_secs") > 0) &
    (col("trip_distance") > 0) &
    (col("fare_amount") >= 0) &
    (col("surcharge") >= 0) &
    (col("mta_tax") >= 0) &
    (col("tip_amount") >= 0) &
    (col("tolls_amount") >= 0)
)

print(f"Removed {initial_count - rides_df.count()} lines")

Removed 31626 lines


In [10]:
# remove lines with invalid fare calculation, eg where total_amount does not equal the sum of all fees

initial_count = rides_df.count()

rides_df = rides_df.filter(
    col("total_amount") ==
    (
        col("fare_amount") + 
        col("surcharge") + 
        col("mta_tax") + 
        col("tip_amount") + 
        col("tolls_amount")
    )
)

print(f"Removed {initial_count - rides_df.count()} lines")

Removed 135788 lines


In [11]:
rides_df.count()

7314708