In [39]:
!pip install numpy






In [40]:
from pyspark.sql import SparkSession
from pathlib import Path
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col
from pyspark.sql.functions import hour, dayofweek, dayofmonth, avg, count

# Start (or reuse) a Spark session running in local mode; the "local[*]"
# master runs Spark inside this process using all available cores.
spark = (
    SparkSession.builder
    .appName("FareTipModeling")
    .master("local[*]")
    .getOrCreate()
)


In [41]:
# Directory that holds the cleaned parquet files.
folder_path = Path(r"D:\L4S2\Big_Data\Assignment\NYC_Taxi_Trip_Data_Analysis\data\clean\cleanedData1")

# Collect the top-level *.parquet files and order them by path so the
# preview below is printed in a stable order.
parquet_files = list(folder_path.glob("*.parquet"))
parquet_files.sort()

print(f"Found {len(parquet_files)} parquet files:")

# Preview the first rows of every file on its own.
# NOTE: `df` is rebound on each pass, so after the loop it refers to the
# last file only.
for f in parquet_files:
    print(f"\nReading: {f.name}")
    df = spark.read.parquet(str(f))
    df.show(5)  # first 5 rows

Found 9 parquet files:

Reading: part-00000-fde1b9ee-d448-4a3f-bbd6-7aff4e42f554-c000.snappy.parquet
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2024-08-01 00:21:00|  2024-08-01 00:36:13|              1|          7.4|   

In [42]:
# -------------------------------------
# 1. Trip Statistics: Average fare, distance
# -------------------------------------

# Folder with the cleaned Parquet files (replace the placeholder with your own path)
folder_path = Path(r"\Your-path\data\cleanedData")
parquet_files = sorted(folder_path.glob("*.parquet"))

# Fail early with a clear message: if the folder is wrong the glob matches
# nothing, and calling spark.read.parquet() with no paths fails with an error
# that does not mention the folder at all.
if not parquet_files:
    raise FileNotFoundError(f"No .parquet files found in {folder_path}")

# Read all Parquet files together into one DataFrame
all_files_path = [str(f) for f in parquet_files]
df = spark.read.parquet(*all_files_path)

# Show schema to verify column names
df.printSchema()

# Compute the mean fare and mean trip distance over every trip.
# Update the column names here if your schema differs.
avg_stats = df.select(
    avg("fare_amount").alias("average_fare"),
    avg("trip_distance").alias("average_distance")
)

# Show the result
avg_stats.show()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)

+------------------+------------------+
|      average_fare|  average_distance|
+------------------+------------------+
|20.8

In [43]:
pip install setuptools pandas numpy


Note: you may need to restart the kernel to use updated packages.




In [44]:
!pip install matplotlib






In [45]:
# -------------------------------------
# 2. Temporal Patterns: Hourly, Daily, Weekly Trends
# -------------------------------------

import os
import matplotlib.pyplot as plt
from pathlib import Path

# Define input and output paths (replace the placeholders with your own locations)
input_path = Path(r"\Your-path\data\cleanedData")
output_path = Path(r"\Your-path\output\member2_eda_output")
output_path.mkdir(parents=True, exist_ok=True)

# Read all Parquet files
all_files = sorted(input_path.glob("*.parquet"))
if not all_files:
    # A wrong folder makes the glob match nothing; stop here with a clear
    # message instead of letting spark.read.parquet() fail with no paths.
    raise FileNotFoundError(f"No .parquet files found in {input_path}")
df = spark.read.parquet(*[str(f) for f in all_files])

# Extract time features from the pickup timestamp.
# dayofweek() numbers the days 1 (Sunday) through 7 (Saturday).
df = (
    df.withColumn("hour", hour("tpep_pickup_datetime"))
      .withColumn("day_of_week", dayofweek("tpep_pickup_datetime"))
      .withColumn("day_of_month", dayofmonth("tpep_pickup_datetime"))
)

def _trip_stats_by(trips, group_col):
    """Return mean fare, mean distance and trip count per value of ``group_col``.

    Args:
        trips: Spark DataFrame containing ``fare_amount``, ``trip_distance``
            and the column named by ``group_col``.
        group_col: Name of the column to group by and sort on.

    Returns:
        Spark DataFrame with the columns ``group_col``, ``avg_fare``,
        ``avg_distance`` and ``trip_count``, ordered by ``group_col``.
    """
    return trips.groupBy(group_col).agg(
        avg("fare_amount").alias("avg_fare"),
        avg("trip_distance").alias("avg_distance"),
        count("*").alias("trip_count")
    ).orderBy(group_col)


def _save_chart(kind, x, y, title, xlabel, ylabel, filename, **style):
    """Draw a single 10x5 line or bar chart and save it under ``output_path``.

    Args:
        kind: ``"line"`` to draw with ``ax.plot`` or ``"bar"`` to draw with ``ax.bar``.
        x: Values for the horizontal axis.
        y: Values for the vertical axis.
        title: Figure title.
        xlabel: Label of the horizontal axis.
        ylabel: Label of the vertical axis.
        filename: Name of the image file written inside ``output_path``.
        **style: Extra keyword arguments forwarded to the drawing call
            (for example ``color`` or ``marker``).

    Raises:
        ValueError: If ``kind`` is neither ``"line"`` nor ``"bar"``.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    if kind == "line":
        ax.plot(x, y, **style)
    elif kind == "bar":
        ax.bar(x, y, **style)
    else:
        plt.close(fig)
        raise ValueError(f"Unsupported chart kind: {kind!r}")
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True)
    fig.savefig(output_path / filename)
    # Close the figure so it is neither rendered inline nor kept in memory.
    plt.close(fig)


# ==============================
# Hourly Trends
# ==============================
hourly_df = _trip_stats_by(df, "hour")
hourly_pd = hourly_df.toPandas()

# Plot: Average fare by hour
_save_chart(
    "line", hourly_pd["hour"], hourly_pd["avg_fare"],
    "Average Fare by Hour of Day", "Hour of Day", "Average Fare",
    "avg_fare_by_hour.png", marker='o',
)

# Plot: Average distance by hour
_save_chart(
    "line", hourly_pd["hour"], hourly_pd["avg_distance"],
    "Average Trip Distance by Hour of Day", "Hour of Day", "Average Distance",
    "avg_distance_by_hour.png", marker='o', color='green',
)

# Plot: Trip count by hour
_save_chart(
    "bar", hourly_pd["hour"], hourly_pd["trip_count"],
    "Trip Count by Hour of Day", "Hour of Day", "Number of Trips",
    "trip_count_by_hour.png", color='orange',
)

# ==============================
# Daily of Week Trends
# ==============================
dow_df = _trip_stats_by(df, "day_of_week")
dow_pd = dow_df.toPandas()

# Plot: Trip count by day of week
_save_chart(
    "bar", dow_pd["day_of_week"], dow_pd["trip_count"],
    "Trip Count by Day of Week (1=Sun ... 7=Sat)", "Day of Week", "Number of Trips",
    "trip_count_by_dayofweek.png", color='purple',
)

# ==============================
# Day of Month Trends
# ==============================
dom_df = _trip_stats_by(df, "day_of_month")
dom_pd = dom_df.toPandas()

# Plot: Trip count by day of month
_save_chart(
    "bar", dom_pd["day_of_month"], dom_pd["trip_count"],
    "Trip Count by Day of Month", "Day of Month", "Number of Trips",
    "trip_count_by_dayofmonth.png", color='teal',
)

In [46]:
# -------------------------------------
# 3. Patterns by Passenger Count, Vendor, Distance
# -------------------------------------

# Output folder (created if missing so the figures below can be saved into it)
output_folder = r"\Your-path\output\member2_eda_output"
os.makedirs(output_folder, exist_ok=True)

# =======================================
# 1. Average Fare by Passenger Count
# =======================================
# Mean fare and number of trips for each passenger_count value, in ascending order.
passenger_df = (
    df.groupBy("passenger_count")
    .agg(
        avg("fare_amount").alias("avg_fare"),
        count("*").alias("trip_count"),
    )
    .orderBy("passenger_count")
)
passenger_pd = passenger_df.toPandas()

# Bar chart of the mean fare per passenger count, written to disk and then closed.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(passenger_pd["passenger_count"], passenger_pd["avg_fare"], color="orange")
ax.set_title("Average Fare by Passenger Count")
ax.set_xlabel("Passenger Count")
ax.set_ylabel("Average Fare ($)")
ax.grid(axis='y')
fig.savefig(os.path.join(output_folder, "avg_fare_by_passenger_count.png"))
plt.close(fig)

# =======================================
# 2. Average Fare by Vendor
# =======================================
# Mean fare and number of trips for each VendorID, in ascending order.
vendor_df = (
    df.groupBy("VendorID")
    .agg(
        avg("fare_amount").alias("avg_fare"),
        count("*").alias("trip_count"),
    )
    .orderBy("VendorID")
)
vendor_pd = vendor_df.toPandas()

# Bar chart of the mean fare per vendor, written to disk and then closed.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(vendor_pd["VendorID"], vendor_pd["avg_fare"], color="skyblue")
ax.set_title("Average Fare by Vendor ID")
ax.set_xlabel("Vendor ID")
ax.set_ylabel("Average Fare ($)")
ax.grid(axis='y')
fig.savefig(os.path.join(output_folder, "avg_fare_by_vendor.png"))
plt.close(fig)

# =======================================
# 3. Fare vs. Distance (Scatter Plot)
# =======================================
# Fixed seed so the sampled points (and therefore the saved figure) are
# repeatable between runs on the same data instead of changing every time.
scatter_seed = 42

# Keep only complete rows within a plausible range, then sample a small
# fraction (0.1%) to avoid plotting millions of points.
scatter_sample_df = df.select("trip_distance", "fare_amount").dropna().filter(
    (col("trip_distance") > 0) & (col("trip_distance") < 50) &
    (col("fare_amount") > 0) & (col("fare_amount") < 300)
).sample(withReplacement=False, fraction=0.001, seed=scatter_seed)  # Adjust fraction if needed

scatter_pd = scatter_sample_df.toPandas()

plt.figure(figsize=(10, 6))
# Small, semi-transparent markers keep dense regions readable.
plt.scatter(scatter_pd["trip_distance"], scatter_pd["fare_amount"], alpha=0.3, s=10)
plt.title("Fare Amount vs Trip Distance")
plt.xlabel("Trip Distance (miles)")
plt.ylabel("Fare Amount ($)")
plt.grid(True)
plt.savefig(os.path.join(output_folder, "fare_vs_distance_scatter.png"))
plt.close()

In [47]:
# -------------------------------------
# End Spark session
# -------------------------------------
# Stop the `spark` session once all analysis is done. Run this last:
# DataFrames built from this session cannot be evaluated after it is stopped.
spark.stop()