In [1]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import *


In [2]:
spark = SparkSession.builder.appName("TLC Cleaning").getOrCreate()


25/07/20 17:38:17 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
df_yellow = spark.read.parquet("yellow_tripdata_2024-01.parquet")
df_green = spark.read.parquet("green_tripdata_2024-01.parquet")

In [4]:
df_yellow = df_yellow \
    .withColumnRenamed("tpep_pickup_datetime", "pickup_datetime") \
    .withColumnRenamed("tpep_dropoff_datetime", "dropoff_datetime") \
    .withColumn("taxi_type", lit("yellow"))

df_green = df_green \
    .withColumnRenamed("lpep_pickup_datetime", "pickup_datetime") \
    .withColumnRenamed("lpep_dropoff_datetime", "dropoff_datetime") \
    .withColumn("taxi_type", lit("green"))

common_cols = list(set(df_yellow.columns).intersection(set(df_green.columns)))
df_yellow = df_yellow.select(common_cols)
df_green = df_green.select(common_cols)

df = df_yellow.unionByName(df_green)



In [5]:
df.printSchema()

root
 |-- DOLocationID: integer (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- taxi_type: string (nullable = false)
 |-- total_amount: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- VendorID: integer (nullable = true)
 |-- extra: double (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- fare_amount: double (nullable = true)



In [6]:
df = df.drop('store_and_fwd_flag')

In [7]:
df.count()

3021175

In [8]:
for i in df.columns :
    print(f"=================================== {i} ============================== ")
    df.select(
        sum(when(col(i).isNull(), 1).otherwise(0)).alias("null_count")
    ).show()

+----------+
|null_count|
+----------+
|         0|
+----------+

+----------+
|null_count|
+----------+
|    143577|
+----------+

+----------+
|null_count|
+----------+
|         0|
+----------+

+----------+
|null_count|
+----------+
|         0|
+----------+

+----------+
|null_count|
+----------+
|         0|
+----------+

+----------+
|null_count|
+----------+
|         0|
+----------+

+----------+
|null_count|
+----------+
|      3415|
+----------+

+----------+
|null_count|
+----------+
|         0|
+----------+

+----------+
|null_count|
+----------+
|         0|
+----------+

+----------+
|null_count|
+----------+
|         0|
+----------+

+----------+
|null_count|
+----------+
|         0|
+----------+

+----------+
|null_count|
+----------+
|         0|
+----------+

+----------+
|null_count|
+----------+
|         0|
+----------+

+----------+
|null_count|
+----------+
|    143577|
+----------+

+----------+
|null_count|
+----------+
|         0|
+----------+

+---------

In [9]:
for i in df.columns:
    print(f"================================= {i} ============================================")
    df.groupBy(i).agg(count("*").alias("value_count")).orderBy("value_count", ascending=False).show()

+------------+-----------+
|DOLocationID|value_count|
+------------+-----------+
|         236|     144769|
|         237|     131188|
|         161|     112304|
|         230|      90820|
|         142|      90482|
|         239|      90400|
|         170|      86959|
|         162|      85522|
|         141|      84350|
|         238|      75750|
|          68|      74736|
|         163|      73114|
|          48|      72453|
|         234|      68366|
|         263|      66372|
|         140|      65888|
|         186|      64432|
|         229|      64222|
|         164|      59962|
|          79|      59226|
+------------+-----------+
only showing top 20 rows
+--------------------+-----------+
|congestion_surcharge|value_count|
+--------------------+-----------+
|                 2.5|    2577898|
|                 0.0|     255976|
|                NULL|     143577|
|                -2.5|      28824|
|                2.75|      14890|
|               -2.75|          4|
|           

                                                                                

+-------------------+-----------+
|   dropoff_datetime|value_count|
+-------------------+-----------+
|2024-01-05 00:00:00|         21|
|2024-01-28 00:00:00|         19|
|2024-01-21 00:00:00|         19|
|2024-01-27 00:00:00|         18|
|2024-01-06 00:00:00|         18|
|2024-01-07 00:00:00|         18|
|2024-01-09 00:00:00|         18|
|2024-01-12 00:00:00|         17|
|2024-01-19 00:00:00|         16|
|2024-01-13 00:00:00|         16|
|2024-01-29 00:00:00|         16|
|2024-01-02 00:00:00|         15|
|2024-01-04 00:00:00|         14|
|2024-01-03 00:00:00|         14|
|2024-01-26 00:00:00|         14|
|2024-01-15 00:00:00|         14|
|2024-01-10 00:00:00|         13|
|2024-01-30 00:00:00|         13|
|2024-01-25 00:00:00|         12|
|2024-01-20 23:43:25|         12|
+-------------------+-----------+
only showing top 20 rows
+---------+-----------+
|taxi_type|value_count|
+---------+-----------+
|   yellow|    2964624|
|    green|      56551|
+---------+-----------+

+------------+

                                                                                

+-------------------+-----------+
|    pickup_datetime|value_count|
+-------------------+-----------+
|2024-01-04 21:37:27|         11|
|2024-01-17 18:17:00|         11|
|2024-01-06 22:09:45|         11|
|2024-01-17 16:25:48|         11|
|2024-01-12 15:28:25|         11|
|2024-01-04 18:20:34|         11|
|2024-01-24 18:38:59|         11|
|2024-01-26 22:04:27|         11|
|2024-01-18 20:05:17|         11|
|2024-01-05 11:55:24|         10|
|2024-01-30 17:05:12|         10|
|2024-01-17 18:21:27|         10|
|2024-01-08 19:16:46|         10|
|2024-01-24 19:20:06|         10|
|2024-01-17 17:55:00|         10|
|2024-01-11 18:22:17|         10|
|2024-01-09 18:37:21|         10|
|2024-01-28 12:50:50|         10|
|2024-01-04 18:17:36|         10|
|2024-01-30 15:55:02|         10|
+-------------------+-----------+
only showing top 20 rows
+------------+-----------+
|PULocationID|value_count|
+------------+-----------+
|         132|     145264|
|         161|     143471|
|         237|     14270

In [10]:
before_cleaning=df.count()


In [11]:
#============================================== drop Nulls =========================================================
df = df.dropna(subset=[
    "congestion_surcharge",
    "payment_type",
    "RatecodeID",
    "passenger_count",
   
    
])

In [12]:
#============================================== dropDuplicates =========================================================

df = df.dropDuplicates([
    "pickup_datetime", 
    "dropoff_datetime", 
    "passenger_count", 
    "trip_distance", 
    "PULocationID", 
    "DOLocationID"
])

In [13]:

df = df.withColumn("trip_duration", (unix_timestamp("dropoff_datetime") - unix_timestamp("pickup_datetime")))
df = df.withColumn("trip_speed_mph", round(col("trip_distance") / (col("trip_duration") / 3600), 2))

In [14]:
#============================================== filter =========================================================

df = df.filter(
    (col("passenger_count") > 0) &
    (col("passenger_count") <= 6) &
    (col("trip_distance") > 0) &
    (col("trip_distance") < 150) &
    (col("fare_amount") >= 0) &
    (col("total_amount") >= 0) &
    (col("tip_amount") >= 0) &
    (col("extra") >= 0) &
    (col("mta_tax") >= 0) &
    (col("improvement_surcharge") >= 0) &
    (col("congestion_surcharge") >= 0) &
    (col("tolls_amount") >= 0) &
    (col("RatecodeID").isin([1, 2, 3, 4, 5, 6])) &  
    (col("payment_type").isin([1, 2, 3, 4, 5, 6])) &
    (col("trip_duration") >= 60) & 
    (col("trip_duration") <= 10800) &
    (col("trip_speed_mph") >= 1) &
    (col("trip_speed_mph") <= 80) &
    (col("PULocationID").between(1, 263)) &
    (col("DOLocationID").between(1, 263))
)


In [15]:
#============================================== Encoding =========================================================
# VendorID => Vendor
df = df.withColumn("Vendor", when(col("VendorID") == 1, "Creative Mobile Technologies")
                             .when(col("VendorID") == 2, "Curb Mobility")
                             .when(col("VendorID") == 6, "Myle Technologies")
                             .when(col("VendorID") == 7, "Helix")
                             .otherwise("Unknown"))

# RatecodeID => RateCode
df = df.withColumn("RateCode", when(col("RatecodeID") == 1, "Standard rate")
                              .when(col("RatecodeID") == 2, "JFK")
                              .when(col("RatecodeID") == 3, "Newark")
                              .when(col("RatecodeID") == 4, "Nassau or Westchester")
                              .when(col("RatecodeID") == 5, "Negotiated fare")
                              .when(col("RatecodeID") == 6, "Group ride")
                              .when(col("RatecodeID") == 99, "Unknown")
                              .otherwise("Unknown"))

# payment_type => PaymentType
df = df.withColumn("PaymentType", when(col("payment_type") == 0, "Flex Fare")
                                 .when(col("payment_type") == 1, "Credit card")
                                 .when(col("payment_type") == 2, "Cash")
                                 .when(col("payment_type") == 3, "No charge")
                                 .when(col("payment_type") == 4, "Dispute")
                                 .when(col("payment_type") == 5, "Unknown")
                                 .when(col("payment_type") == 6, "Voided trip")
                                 .otherwise("Unknown"))




In [16]:
#for i in df.columns:
#    print(f"================================= {i} ============================================")
#    df.groupBy(i).agg(count("*").alias("value_count")).orderBy("value_count", ascending=False).show()

In [17]:
zones = spark.read.csv("taxi_zone_lookup.csv", header=True, inferSchema=True)
zones.show()


+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
|        11|     Brooklyn|          Bath Beach|   Boro Zone|
|        12|    Manhattan|        Battery Park| Yellow Zone|
|        13|    Manhattan|   Battery Park City| Yellow Zone|
|        14|     Brookly

In [18]:
df.createOrReplaceTempView("trips")
zones.createOrReplaceTempView("zones")

df = spark.sql("""
SELECT
    trips.*,
    pu.Borough         AS PUBorough,
    pu.Zone            AS PUZone,
    pu.service_zone    AS PU_service_zone,
    do.Borough         AS DOBorough,
    do.Zone            AS DOZone,
    do.service_zone    AS DO_service_zone
FROM trips
LEFT JOIN zones pu
    ON trips.PULocationID = pu.LocationID
LEFT JOIN zones do
    ON trips.DOLocationID = do.LocationID
""")



In [19]:
#============================================== Add Columns =========================================================
#===================================================================================================================================
df = df.withColumn("pickup_hour", hour(col("pickup_datetime")))
df = df.withColumn(
    "pickup_period",
    when(col("pickup_hour").between(0, 5), "Late Night")
    .when(col("pickup_hour").between(6, 11), "Morning")
    .when(col("pickup_hour").between(12, 16), "Afternoon")
    .when(col("pickup_hour").between(17, 20), "Evening")
    .otherwise("Night")
)
    
df = df.withColumn("day_of_week_name", date_format(col("pickup_datetime"), "E"))
df = df.withColumn("is_weekend", (col("day_of_week_name") == "Fri" ) | (col("day_of_week_name") == "Sat"))



df = df.withColumn("trip_length_category",
                   when(col("trip_duration") < 300, "Very Short")
                   .when((col("trip_duration") >= 300) & (col("trip_duration") < 900), "Short")
                   .when((col("trip_duration") >= 900) & (col("trip_duration") < 1800), "Medium")
                   .otherwise("Long"))

df = df.withColumn("IsWeekendText", 
    when(col("is_weekend") == True, "Weekend")
    .otherwise("Weekday"))
#===================================================================================================================================
# 1. fare_per_mile
df = df.withColumn("fare_per_mile", 
                   when(col("trip_distance") > 0, col("fare_amount") / col("trip_distance")))

# 2. fare_per_minute (duration بالثواني)
df = df.withColumn("fare_per_minute", 
                   when(col("trip_duration") > 0, col("fare_amount") / (col("trip_duration") / 60)))

# 3. tip_ratio
df = df.withColumn("tip_ratio", 
                   when(col("total_amount") > 0, col("tip_amount") / col("total_amount")))

# 4. trip_efficiency (miles per second)
df = df.withColumn("trip_efficiency", 
                   when(col("trip_duration") > 0, col("trip_distance") / col("trip_duration")))

# 6. is_airport_trip (based on pickup/dropoff borough)
df = df.withColumn("is_airport_trip",
                   (col("PUBorough").contains("Airport")) | (col("DOBorough").contains("Airport")))

# 7. is_suspicious (مسافة صغيرة جدًا ومبلغ عالي)
df = df.withColumn("is_suspicious",
                   (col("trip_distance") < 0.2) & (col("total_amount") > 30))

In [20]:
df.printSchema()

root
 |-- DOLocationID: integer (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- taxi_type: string (nullable = false)
 |-- total_amount: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- VendorID: integer (nullable = true)
 |-- extra: double (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- trip_duration: long (nullable = true)
 |-- trip_speed_mph: double (nullable = true)
 |-- Vendor: string (nullable = false)
 |-- RateCode: string (nullable = false)
 |-- PaymentType: 

In [21]:
after_cleaning=df.count()


25/07/20 17:38:38 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [22]:
print((after_cleaning / before_cleaning)*100 )

88.79204283101774


In [23]:
#===============================================================================================================
#===============================================================================================================
#===============================================================================================================
#====================================== Modeling ===============================================================
#===============================================================================================================
#===============================================================================================================
#===============================================================================================================
#===============================================================================================================
#===============================================================================================================

In [24]:
dim_vendor = df.select("VendorID", "Vendor").dropDuplicates()

In [25]:
dim_rate_code = df.select("RatecodeID", "RateCode").dropDuplicates()

In [26]:
dim_payment_type = df.select("payment_type", "PaymentType").dropDuplicates()

In [27]:
dim_pickup_location = df.select(
    col("PULocationID"),
    col("PUBorough"),
    col("PUZone"),
    col("PU_service_zone")
).dropDuplicates()


In [28]:
dim_dropoff_location = df.select(
    col("DOLocationID"),
    col("DOBorough"),
    col("DOZone"),
    col("DO_service_zone")
).dropDuplicates()


In [29]:
dim_time = df.select(
    col("pickup_datetime").alias("pickup_datetime"),
    year("pickup_datetime").alias("year"),
    month("pickup_datetime").alias("month"),
    dayofmonth("pickup_datetime").alias("day"),
    hour("pickup_datetime").alias("hour"),
    col("pickup_hour"),
    col("pickup_period"),
    col("day_of_week_name"),
    col("is_weekend"),
    col("IsWeekendText")
).dropDuplicates()

dim_time = dim_time.withColumn("time_id", monotonically_increasing_id())

In [30]:
dim_trip_category = df.select(
    "trip_length_category",
    "is_airport_trip",
    "is_suspicious"
).dropDuplicates().withColumn("trip_category_id", monotonically_increasing_id())


In [31]:
fact_trips = df.select(
    "pickup_datetime", "dropoff_datetime", "trip_distance", "trip_duration", "trip_speed_mph", 
    "trip_efficiency", "fare_amount", "tip_amount", "tolls_amount", "total_amount", "passenger_count",
    "fare_per_mile", "fare_per_minute", "congestion_surcharge", "mta_tax", "extra", "improvement_surcharge",
    "RatecodeID", "VendorID", "payment_type", "PULocationID", "DOLocationID",
    
    # المهمين للربط
    "trip_length_category", "is_airport_trip", "is_suspicious"
)
fact_trips = fact_trips.withColumn("trip_id", monotonically_increasing_id())




In [32]:
fact_trips.printSchema()

root
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- trip_duration: long (nullable = true)
 |-- trip_speed_mph: double (nullable = true)
 |-- trip_efficiency: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- fare_per_mile: double (nullable = true)
 |-- fare_per_minute: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- VendorID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 

In [33]:
#============================================== Lookup ==============================================
fact_trips = fact_trips.join(
    dim_time.select("pickup_datetime", "time_id"),
    fact_trips.pickup_datetime == dim_time.pickup_datetime,
    "left"
).drop("pickup_datetime")

fact_trips = fact_trips.join(
    dim_trip_category,
    on=["trip_length_category", "is_airport_trip", "is_suspicious"],
    how="left"
).drop("trip_length_category", "is_airport_trip", "is_suspicious")


In [34]:
fact_trips.printSchema()

root
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- trip_duration: long (nullable = true)
 |-- trip_speed_mph: double (nullable = true)
 |-- trip_efficiency: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- fare_per_mile: double (nullable = true)
 |-- fare_per_minute: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- VendorID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- trip_id: long (nullable = false)
 |-- time_id: lon

In [None]:
for i in df.columns :
    print(f"=================================== {i} ============================== ")
    df.select(
        sum(when(col(i).isNull(), 1).otherwise(0)).alias("null_count")
    ).show()



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                

+----------+
|null_count|
+----------+
|         0|
+----------+



                                                                                