In [None]:
from pyspark.sql import SparkSession
from pathlib import Path
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col

# Build (or reuse) the Spark session for this notebook, running in local mode.
spark = (
    SparkSession.builder
    .appName("NYC Taxi Data")
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Directory holding the raw trip records in Parquet format.
raw_data_path = "/home/ariso/Documents/bigDataAssignment/NYC_Taxi_Trip_Data_Analysis/data/rawData"
df = spark.read.parquet(raw_data_path)


In [None]:
# First look at the raw data: column types, a five-row sample, and the size.
df.printSchema()
df.show(n=5)
total_rows = df.count()
print("Total rows:", total_rows)

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------

In [None]:
# Columns that must be populated for a trip record to be kept.
required_cols = [
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "fare_amount",
    "PULocationID",
    "DOLocationID",
    "trip_distance",
    "passenger_count",
]

# Drop every row that is missing a value in at least one required column.
df_cleaned = df.dropna(how="any", subset=required_cols)

In [None]:
# Row count once records with missing required fields have been removed.
rows_after_dropna = df_cleaned.count()
print("Total rows:", rows_after_dropna)



Total rows: 37078488


                                                                                

In [None]:
# Summary statistics for the fare; min and max give a first idea of outliers.
df_cleaned.describe("fare_amount").show()



+-------+-----------------+
|summary|      fare_amount|
+-------+-----------------+
|  count|         37078488|
|   mean|19.20668875062649|
| stddev|80.71790255018425|
|    min|          -2261.2|
|    max|        335544.44|
+-------+-----------------+



                                                                                

In [None]:
from pyspark.sql.functions import col, unix_timestamp

# Keep only plausible fares. Both bounds are strict, so fares of exactly
# 3 or exactly 400 are dropped as well.
df_cleaned = df_cleaned.filter(
    (col("fare_amount") > 3) &  # the lowest base fare
    (col("fare_amount") < 400)
)
# No separate count() here: as a non-final expression in the cell its result
# would never be displayed, yet it would still trigger a full pass over the
# data. The describe() summary below already reports the row count.
df_cleaned.select("fare_amount").describe().show()



+-------+------------------+
|summary|       fare_amount|
+-------+------------------+
|  count|          36270722|
|   mean| 19.95070509155673|
| stddev|18.486513247902597|
|    min|              3.01|
|    max|            399.99|
+-------+------------------+



                                                                                

In [None]:
# Summary statistics for the trip distance, to spot implausible values.
df_cleaned.describe("trip_distance").show()



+-------+-----------------+
|summary|    trip_distance|
+-------+-----------------+
|  count|         36270722|
|   mean|3.516674062622483|
| stddev|86.26801177718178|
|    min|              0.0|
|    max|        160244.91|
+-------+-----------------+



                                                                                

In [None]:
# Keep trips whose distance lies strictly between 0.62 and 120.
# NOTE(review): 0.62 is presumably 1 km expressed in miles -- confirm.
df_cleaned = df_cleaned.filter(
    (col("trip_distance") > 0.62) & (col("trip_distance") < 120)
)
# No separate count() here: as a non-final expression in the cell its result
# would never be displayed, yet it would still trigger a full pass over the
# data. The describe() summary below already reports the row count.
df_cleaned.select("trip_distance").describe().show()



+-------+------------------+
|summary|     trip_distance|
+-------+------------------+
|  count|          32879037|
|   mean|3.7118411993049785|
| stddev|  4.66060959861165|
|    min|              0.63|
|    max|            119.83|
+-------+------------------+



                                                                                

In [None]:
# Summary statistics for the passenger count, to see the range of values.
df_cleaned.describe("passenger_count").show()



+-------+------------------+
|summary|   passenger_count|
+-------+------------------+
|  count|          32879037|
|   mean|1.3374203143480146|
| stddev|  0.81966492060485|
|    min|                 0|
|    max|                 9|
+-------+------------------+



                                                                                

In [None]:

# Keep trips that carried between 1 and 4 passengers (both ends inclusive).
df_cleaned = df_cleaned.filter(
    (col("passenger_count") > 0) & (col("passenger_count") <= 4)
)
# No separate count() here: as a non-final expression in the cell its result
# would never be displayed, yet it would still trigger a full pass over the
# data. The describe() summary below already reports the row count.
df_cleaned.select("passenger_count").describe().show()



+-------+------------------+
|summary|   passenger_count|
+-------+------------------+
|  count|          32045905|
|   mean|1.2890427965757247|
| stddev|0.6403213072011842|
|    min|                 1|
|    max|                 4|
+-------+------------------+



                                                                                

In [None]:
# A trip is kept only when its pickup time is strictly earlier than its
# drop-off time; zero-length and negative-duration records are removed.
pickup_before_dropoff = col("tpep_pickup_datetime") < col("tpep_dropoff_datetime")
df_cleaned = df_cleaned.where(pickup_before_dropoff)
df_cleaned.count()

                                                                                

32044504

In [None]:
# Summary statistics for the pickup location IDs, to check their range.
df_cleaned.describe("PULocationID").show()



+-------+-----------------+
|summary|     PULocationID|
+-------+-----------------+
|  count|         32044504|
|   mean|164.3305944133197|
| stddev|63.16955579303938|
|    min|                1|
|    max|              265|
+-------+-----------------+



                                                                                

In [None]:
# Summary statistics for the drop-off location IDs, to check their range.
df_cleaned.describe("DOLocationID").show()



+-------+------------------+
|summary|      DOLocationID|
+-------+------------------+
|  count|          32044504|
|   mean|163.54524644850176|
| stddev| 69.75035600012627|
|    min|                 1|
|    max|               265|
+-------+------------------+



                                                                                

In [None]:
# Summary statistics for the tolls, to spot implausible values.
df_cleaned.describe("tolls_amount").show()



+-------+------------------+
|summary|      tolls_amount|
+-------+------------------+
|  count|          32044504|
|   mean|0.6783849951268689|
| stddev|2.3813431464706625|
|    min|               0.0|
|    max|           1702.88|
+-------+------------------+



                                                                                

In [None]:
# Keep trips whose tolls are non-negative and strictly below 100.
tolls = col("tolls_amount")
df_cleaned = df_cleaned.where((tolls >= 0) & (tolls < 100))
df_cleaned.count()

                                                                                

32044478

In [None]:
# Summary statistics for the tips, to spot implausible values.
df_cleaned.describe("tip_amount").show()



+-------+------------------+
|summary|        tip_amount|
+-------+------------------+
|  count|          32044478|
|   mean|3.8413911510761305|
| stddev|4.1839528485615585|
|    min|               0.0|
|    max|            999.99|
+-------+------------------+



                                                                                

In [None]:
# Keep trips whose tip is non-negative and strictly below 50.
tip = col("tip_amount")
df_cleaned = df_cleaned.where((tip >= 0) & (tip < 50))

In [None]:
# Keep trips whose total charge is strictly positive and strictly below 500,
# then report how many rows remain.
total = col("total_amount")
df_cleaned = df_cleaned.where((total > 0) & (total < 500))
df_cleaned.count()

                                                                                

32036273

In [None]:
# Persist the cleaned trips as Parquet, replacing any output already present
# at the destination.
cleaned_data_path = "/home/ariso/Documents/BData/cleanedData"
parquet_writer = df_cleaned.write.mode("overwrite")
parquet_writer.parquet(cleaned_data_path)

# Shut down the Spark session now that the data has been written.
spark.stop()
