### Big data course project
<strong>T4: Initial EDA</strong>

Jovana Videnovic & Haris Kupinic

In [None]:
import pandas as pd
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import os
import duckdb
import geopandas as gpd
import numpy as np
import time
import dask.dataframe as dd
import duckdb
from dask_sql import Context

In [None]:
import seaborn as sb

In [None]:
from dask.distributed import Client, LocalCluster
from pathlib import Path

In [None]:
# Global matplotlib styling shared by every figure in this notebook.
plt.rcParams.update({
    # draw tick marks on the bottom and left axes
    'xtick.bottom': True,
    'ytick.left': True,
    # typography
    'font.family': 'serif',
    'axes.labelsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    # white plotting area
    'axes.facecolor': 'w',
})

### Loading the df

In [None]:
# Service whose trips are analysed; the cells below use it to pick the column
# names and the input/output sub-directories.
service_type = "fhvhv"

In [None]:
# "fhvhv" names its distance / fare / tip columns differently from the other
# service types, so resolve the three column names once here.
if service_type == "fhvhv":
    trip_dist_col, fare_amount_col, tip_amount_col = "trip_miles", "driver_pay", "tips"
else:
    trip_dist_col, fare_amount_col, tip_amount_col = "trip_distance", "fare_amount", "tip_amount"

In [None]:
# Input: root of the partitioned parquet data (one sub-directory per service type).
part_df_path = Path("/d/hpc/projects/FRI/bigdata/students/jv8043/partitioned_data")
# Outputs: figures and tables, each in a sub-directory named after the service type.
vis_path = Path("/d/hpc/home/jv8043/BD/project/T4/T4_vis", service_type)
tables_path = Path("/d/hpc/home/jv8043/BD/project/T4/T4_tables", service_type)

In [None]:
# Make sure both output directories exist (including parents); no error if they already do.
for output_dir in (vis_path, tables_path):
    output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Local Dask cluster: 4 workers, one thread each, 32GB memory limit per worker.
cluster = LocalCluster(
    n_workers=4,
    threads_per_worker=1,
    memory_limit="32GB",
)
# Client connected to that cluster.
client = Client(cluster)

In [None]:
# Show the client summary as the cell output.
client

In [None]:
# Open the partitioned parquet dataset of the selected service type as a Dask DataFrame.
df = dd.read_parquet(part_df_path / service_type, engine="pyarrow", assume_missing=True)

In [None]:
# sample 1% of the data (frac=0.01); the fixed random_state keeps the sample reproducible
df = df.sample(frac=0.01, random_state=42)

In [None]:
# Show the (lazy) Dask DataFrame structure.
display(df)

In [None]:
# Basic size information about the sampled dataset.
# `nrows` is reused later to turn missing-value counts into percentages.
nrows = df.shape[0].compute()
for label, value in (
    ("Number of partitions:", df.npartitions),
    ("Number of rows:", nrows),
    ("Number of columns:", len(df.columns)),
    ("Column names:", df.columns.tolist()),
):
    print(label, value)

In [None]:
# Zone lookup table (pandas); merged on its LocationID column further below to obtain zone names.
taxi_zone_lookup = pd.read_csv("/d/hpc/home/jv8043/BD/project/add_data/taxi_zone_lookup.csv")

#### Starting EDA

<strong>Checking for missing values</strong>

In [None]:
# Number of missing values in every column (lazy Dask expression) ...
missing_per_column = df.isna().sum()
# ... materialised so it can be post-processed in the next cell.
missing_per_column_result = missing_per_column.compute()

In [None]:
# Share of missing values per column, in percent with two decimals, largest first.
missing_per_column_percentage = (
    missing_per_column_result
    .div(nrows)
    .mul(100)
    .round(2)
    .sort_values(ascending=False)
)
print("Missing values per column:")
print(missing_per_column_percentage)

# Save the table alongside the other outputs of this service type.
missing_per_column_percentage.to_csv(tables_path / "missing_per_column.csv", header=True)

In [None]:
# Drop the rows where any of the four pickup/dropoff coordinate columns is missing.
df = df.dropna(subset=["pickup_lat", "pickup_lon", "dropoff_lat", "dropoff_lon"])

In [None]:
# Drop the rows with a missing 'pickup_datetime' or 'dropoff_datetime';
# the duration computations below subtract these two columns.
df = df.dropna(subset=['pickup_datetime', 'dropoff_datetime'])

<strong>Trip distance & duration analysis</strong>

In [None]:
# Share of very short rides.
# The duration expression is built once and the row count is computed once, so the
# two percentages do not each trigger their own extra full pass over the data.
ride_duration = df["dropoff_datetime"] - df["pickup_datetime"]
total_rides = len(df)

share_under_1_min = len(df[ride_duration < pd.Timedelta(minutes=1)]) / total_rides * 100
share_under_5_min = len(df[ride_duration < pd.Timedelta(minutes=5)]) / total_rides * 100

print("Percentage of rides that lasted less than 1 minute:", round(share_under_1_min, 2), "%")
print("Percentage of rides that lasted less than 5 minutes:", round(share_under_5_min, 2), "%")

In [None]:
# Share of rides longer than one hour.
long_ride_mask = df["dropoff_datetime"] - df["pickup_datetime"] > pd.Timedelta(minutes=60)
long_ride_share = len(df[long_ride_mask]) / len(df) * 100
print("Percentage of rides that lasted more than 60 minutes:", round(long_ride_share, 2), "%")

In [None]:
# Work on a 1% sub-sample of the rides for the duration binning below.
df_copy = df.sample(frac=0.01, random_state=42).copy()
# Derive the duration from the sampled frame's own columns. Assigning a series built
# from the full `df` would rely on implicit alignment between two different frames.
df_copy["trip_duration_mins"] = (
    df_copy["dropoff_datetime"] - df_copy["pickup_datetime"]
).dt.total_seconds() / 60
# Truncate to whole minutes.
df_copy["trip_duration_mins"] = df_copy["trip_duration_mins"].astype(int)
# Keep only rides whose (truncated) duration lies between 0 and 60 minutes.
df_copy = df_copy[df_copy["trip_duration_mins"] <= 60]
df_copy = df_copy[df_copy["trip_duration_mins"] >= 0]

In [None]:
# Lower edge of the 10-minute bin each ride falls into (0, 10, 20, ...), stored as int.
df_copy["duration_bin"] = (
    df_copy["trip_duration_mins"]
    .floordiv(10)
    .mul(10)
    .astype(int)
)

In [None]:
# skip for now
# # Group by duration bin label and count
# bin_counts = df_copy[["duration_bin"]].dropna().compute()
# # Compute total number of rows for percentage calculation
# total_count = len(bin_counts)

# bin_counts = bin_counts["duration_bin"].value_counts()

# # Compute both
# bin_counts = bin_counts
# total_count = total_count

# # Calculate percentage
# bin_percentages = (bin_counts / total_count) * 100

# # Print sorted by bin
# bin_percentages = bin_percentages.sort_index()
# print(bin_percentages)

In [None]:
# skip for now
# fig, ax = plt.subplots(figsize=(6, 3))
# ax.bar(bin_percentages.index.astype(str), bin_percentages.values, color='salmon', edgecolor='black')

# ax.set_xlabel("Duration (min)")
# ax.set_ylabel("Rides (%)")
# ax.set_xticklabels(bin_percentages.index.astype(str), rotation=45, ha='right')

# for sp in ["left", "bottom"]:
#     ax.spines[sp].set_color('0.5')

# ax.spines['top'].set_visible(False)
# ax.spines['right'].set_visible(False)

# # annotate the percentage on top of each bar
# for i, v in enumerate(bin_percentages.values):
#     ax.text(i, v + 0.5, f"{v:.2f}%", ha='center', va='bottom', fontsize=8)

# fig.tight_layout()
# fig.savefig(vis_path / "duration_bins_percentage.pdf", dpi=300, bbox_inches='tight')
# plt.show()

In [None]:
# Mean and standard deviation of the trip distance (skipped for "fhv").
if service_type != "fhv":
    trip_distances = df[trip_dist_col]
    trip_distance_mean = trip_distances.mean().compute()
    trip_distance_std = trip_distances.std().compute()

    print("Average trip distance, STD:", round(trip_distance_mean, 2), round(trip_distance_std, 2), "miles")

# Mean and standard deviation of the trip duration, expressed in minutes.
trip_duration = (df["dropoff_datetime"] - df["pickup_datetime"]).dt.total_seconds().div(60)

trip_duration_mean = trip_duration.mean().compute()
trip_duration_std = trip_duration.std().compute()

print("Average trip duration, STD:", round(trip_duration_mean, 2), round(trip_duration_std, 2), "minutes")

In [None]:
# Longest trips (top 1% by distance) with readable pickup/dropoff zone names.
# Skipped for "fhv".
if service_type not in ["fhv"]:
    # Keep trips of at most 200 miles and exclude location IDs 264 and 265
    # (presumably the "unknown" placeholder zones of the lookup table — TODO confirm).
    df_copy = df[
        (df[trip_dist_col] <= 200) &
        (~df["pulocationid"].isin([264, 265])) &
        (~df["dolocationid"].isin([264, 265]))
    ].copy()

    # Distance at the 99th percentile; trips at or above it form the top 1%.
    top_1_percent_threshold = df_copy[trip_dist_col].quantile(0.99).compute()
    print("Top 1% trip distance threshold:", round(top_1_percent_threshold, 2), "miles")
    top_1_percent_trips = df_copy[df_copy[trip_dist_col] >= top_1_percent_threshold]

    # Attach the pickup zone name. LocationID is dropped immediately so the second
    # merge does not produce LocationID_x / LocationID_y columns.
    top_1_percent_trips = top_1_percent_trips.merge(
        taxi_zone_lookup,
        left_on="pulocationid",
        right_on="LocationID",
        how="left"
    ).rename(columns={"Zone": "pickup_zone"}).drop("LocationID", axis=1)

    # Attach the dropoff zone name in the same way.
    top_1_percent_trips = top_1_percent_trips.merge(
        taxi_zone_lookup,
        left_on="dolocationid",
        right_on="LocationID",
        how="left"
    ).rename(columns={"Zone": "dropoff_zone"}).drop("LocationID", axis=1)

    # Only the two zone names and the distance are reported.
    top_1_percent_trips = top_1_percent_trips[["pickup_zone", "dropoff_zone", trip_dist_col]]

    # Materialise in pandas, then order from longest to shortest trip.
    top_1_percent_trips = top_1_percent_trips.compute()
    top_1_percent_trips = top_1_percent_trips.sort_values(
        by=trip_dist_col, ascending=False
    ).reset_index(drop=True)

    # Show only the 10 longest trips; the full table goes to the CSV file.
    display(top_1_percent_trips.head(10))
    top_1_percent_trips.to_csv(tables_path / "top_1_percent_trips.csv", index=False)

<strong>Fare amount analysis</strong>

In [None]:
# Get avg tip amount
avg_tip_amount = df[tip_amount_col].mean().compute()
print("Average tip amount", avg_tip_amount, "$")
# Get avg fare amount
avg_fare_amount = df[fare_amount_col].mean().compute()
print("Average fare amount", avg_fare_amount, "$")

In [None]:
# 1st / 99th percentile and maximum of the fare column, rounded to whole dollars.
fare_values = df[fare_amount_col]
fare_p01 = round(fare_values.quantile(0.01).compute())
fare_p99 = round(fare_values.quantile(0.99).compute())
print("Lowest / highest 1% of fare amount:", fare_p01, "$ /", fare_p99, "$")

fare_max = round(fare_values.max().compute())
print("Highest fare amount:", fare_max)

In [None]:
# Small random sample for the fare-vs-distance scatter plot.
data_ = df.copy().sample(frac=0.0001, random_state=42)

# Keep fares in [0, 100] and distances in [0, 31] (both bounds inclusive).
fare_in_range = data_[fare_amount_col].between(0, 100)
distance_in_range = data_[trip_dist_col].between(0, 31)
data_ = data_[fare_in_range & distance_in_range]

In [None]:
# Drop rows with missing values in fare_amount or trip_distance
data_ = data_.dropna(subset=[fare_amount_col, trip_dist_col])

# Sample and compute to pandas
data_pd = data_[[trip_dist_col, fare_amount_col]].compute()

# Plot
fig, ax = plt.subplots(figsize=(4, 3))
ax.scatter(data_pd[trip_dist_col], data_pd[fare_amount_col], alpha=0.5, color='salmon', s=1, rasterized=True)
ax.set_xlabel("Trip distance (miles)")
ax.set_ylabel("Fare amount ($)")
ax.spines['left'].set_color('0.5')
ax.spines['bottom'].set_color('0.5')
ax.set_title(service_type.capitalize())
fig.tight_layout()
fig.savefig(vis_path / "scatter_trip_distance_fare.pdf", dpi=300, bbox_inches='tight')
plt.show()

##### Temporal aggregation

In this part, we aggregate the rides by time, using the pickup and dropoff timestamps. We first analyse how the rides are distributed over the pickup and dropoff hours of the day.

In [None]:
# Copy data
data_ = df.copy()

# Extract pickup and dropoff hour (0–23)
data_["pickup_hour"] = data_["pickup_datetime"].dt.hour
data_["dropoff_hour"] = data_["dropoff_datetime"].dt.hour

# Compute value counts
pickup_counts = data_["pickup_hour"].value_counts().compute().sort_index()
dropoff_counts = data_["dropoff_hour"].value_counts().compute().sort_index()

# Convert counts to percentages
pickup_percent = (pickup_counts / pickup_counts.sum()) * 100
dropoff_percent = (dropoff_counts / dropoff_counts.sum()) * 100

# Plot
fig, axs = plt.subplots(1, 2, figsize=(12, 4), sharey=True)

# Pickup plot
axs[0].bar(pickup_percent.index, pickup_percent.values, color='salmon', edgecolor='black')
axs[0].set_xlabel("Pickup hour")
axs[0].set_ylabel("Percentage of rides (%)")
axs[0].set_xticks(range(24))
axs[0].set_xticklabels([f"{h:02d}" for h in range(24)], rotation=45, ha='right')
axs[0].set_title("Pickup time distribution")

# Dropoff plot
axs[1].bar(dropoff_percent.index, dropoff_percent.values, color='salmon', edgecolor='black')
axs[1].set_xlabel("Dropoff hour")
axs[1].set_xticks(range(24))
axs[1].set_xticklabels([f"{h:02d}" for h in range(24)], rotation=45, ha='right')
axs[1].set_title("Dropoff time distribution")

# Style
for ax in axs:
    for sp in ["left", "bottom"]:
        ax.spines[sp].set_color('0.5')

fig.tight_layout()
fig.savefig(vis_path / "pickup_dropoff_hour_distribution.pdf", dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Copy the Dask DataFrame
data_ = df.copy()

# Extract pickup and dropoff hour (0–23)
data_["pickup_hour"] = data_["pickup_datetime"].dt.hour
data_["dropoff_hour"] = data_["dropoff_datetime"].dt.hour

# Group by pickup hour
median_distance_pickup = data_.groupby("pickup_hour")[trip_dist_col].median().compute()
pickup_counts = data_["pickup_hour"].value_counts().compute().sort_index()

# Group by dropoff hour
median_distance_dropoff = data_.groupby("dropoff_hour")[trip_dist_col].median().compute()
dropoff_counts = data_["dropoff_hour"].value_counts().compute().sort_index()

# Convert counts to percentages
pickup_percentages = (pickup_counts / pickup_counts.sum()) * 100
dropoff_percentages = (dropoff_counts / dropoff_counts.sum()) * 100

# Sort by hour
median_distance_pickup = median_distance_pickup.sort_index()
median_distance_dropoff = median_distance_dropoff.sort_index()
pickup_percentages = pickup_percentages.sort_index()
dropoff_percentages = dropoff_percentages.sort_index()

# Plot
fig, axs = plt.subplots(1, 2, figsize=(11, 4), sharey=True)

# Pickup plot
ax = axs[0]
median_distance_pickup.plot(kind='line', marker='o', ax=ax, color='salmon')
ax.set_xlabel("Pickup hour")
ax.set_ylabel("Median trip distance (miles)")
ax.set_xticks(range(24))
ax.set_xticklabels([f"{h:02d}" for h in range(24)], rotation=45, ha='right')
# show with dotted line also median overall from all the data
trip_overall_median = data_[trip_dist_col].median_approximate().compute()
ax.axhline(trip_overall_median, color='gray', linestyle='--', label='Overall median')
ax.legend()

# for i, (val, pct) in enumerate(zip(median_distance_pickup, pickup_percentages)):
#     ax.annotate(f"{pct:.1f}%", (i, val), textcoords="offset points", xytext=(0, 6), ha='center', fontsize=8)

# Dropoff plot
ax = axs[1]
median_distance_dropoff.plot(kind='line', marker='o', ax=ax, color='salmon')
ax.set_xlabel("Dropoff hour")
ax.set_xticks(range(24))
ax.set_xticklabels([f"{h:02d}" for h in range(24)], rotation=45, ha='right')
# for i, (val, pct) in enumerate(zip(median_distance_dropoff, dropoff_percentages)):
#     ax.annotate(f"{pct:.1f}%", (i, val), textcoords="offset points", xytext=(0, 6), ha='center', fontsize=8)
# show with dotted line also median overall from all the data
ax.axhline(trip_overall_median, color='gray', linestyle='--', label='Overall median')
ax.legend()

# Style
for ax in axs:
    for sp in ["left", "bottom"]:
        ax.spines[sp].set_color('0.5')

fig.tight_layout()
fig.savefig(vis_path / "median_trip_distance_by_hour.pdf", dpi=300, bbox_inches='tight')
plt.show()

##### Spatial aggregation

Next, we aggregate the rides spatially, by pickup and dropoff taxi zone, and show the results directly on a map of NYC.

In [None]:
# Attach pickup and dropoff zone names to every ride and keep only the columns
# needed below. `data_` stays a lazy Dask frame and is reused by the next cell.
data_ = (
    df.copy()
    # pickup zone name
    .merge(taxi_zone_lookup, left_on="pulocationid", right_on="LocationID", how="left")
    .rename(columns={"Zone": "pickup_zone"})
    .drop("LocationID", axis=1)
    # dropoff zone name
    .merge(taxi_zone_lookup, left_on="dolocationid", right_on="LocationID", how="left")
    .rename(columns={"Zone": "dropoff_zone"})
    .drop("LocationID", axis=1)
)
data_ = data_[["pickup_zone", "dropoff_zone", trip_dist_col, "pickup_datetime", "dropoff_datetime"]]

# Materialise in pandas and order from longest to shortest trip.
sampled = (
    data_.compute()
    .sort_values(by=trip_dist_col, ascending=False)
    .reset_index(drop=True)
)

# Show the 10 longest trips.
display(sampled.head(10))

In [None]:
# Count pickups and dropoffs by zone
pickup_counts = data_["pickup_zone"].value_counts()
dropoff_counts = data_["dropoff_zone"].value_counts()

# Compute them (convert to pandas)
pickup_counts = pickup_counts.compute()
dropoff_counts = dropoff_counts.compute()

# Normalize to percentages
pickup_percentages = (pickup_counts / pickup_counts.sum()) * 100
dropoff_percentages = (dropoff_counts / dropoff_counts.sum()) * 100

# Get top zones and percentages
top_pickup_zone = pickup_percentages.idxmax()
top_pickup_pct = round(pickup_percentages.max(), 2)

top_dropoff_zone = dropoff_percentages.idxmax()
top_dropoff_pct = round(dropoff_percentages.max(), 2)

# Print summary
print(f"Locations with the most pickups and dropoffs: {top_pickup_zone} ({top_pickup_pct}%) / {top_dropoff_zone} ({top_dropoff_pct}%).")

In [None]:
# print top 5 pickup zones and dropoff zones
# first sort the percentages
pickup_percentages = pickup_percentages.sort_values(ascending=False)
dropoff_percentages = dropoff_percentages.sort_values(ascending=False)
print("Top 5 pickup zones:")
display(pickup_percentages.head(5))
print("Top 5 dropoff zones:")
display(dropoff_percentages.head(5))

# Save the pickup and dropoff zone percentages to CSV files
pickup_percentages.to_csv(tables_path / "pickup_zone_percentages.csv", header=True, index=True)
dropoff_percentages.to_csv(tables_path / "dropoff_zone_percentages.csv", header=True, index=True)

In [None]:
# Create a Dask copy
data_ = df.copy()

# Compute pickup counts
pickup_counts = data_['pulocationid'].value_counts().compute().reset_index()
pickup_counts.columns = ['LocationID', 'pickup_count']
total_pickups = pickup_counts['pickup_count'].sum()
pickup_counts['pickup_percent'] = 100 * pickup_counts['pickup_count'] / total_pickups

# Compute dropoff counts
dropoff_counts = data_['dolocationid'].value_counts().compute().reset_index()
dropoff_counts.columns = ['LocationID', 'dropoff_count']
total_dropoffs = dropoff_counts['dropoff_count'].sum()
dropoff_counts['dropoff_percent'] = 100 * dropoff_counts['dropoff_count'] / total_dropoffs

# Load taxi zones shapefile
zones = gpd.read_file("/d/hpc/home/jv8043/BD/project/T4/taxi_zones.shp")
zones["LocationID"] = zones["LocationID"].astype(int)

# Merge pickup and dropoff percentages
zones = zones.merge(pickup_counts[['LocationID', 'pickup_percent']], on="LocationID", how="left")
zones = zones.merge(dropoff_counts[['LocationID', 'dropoff_percent']], on="LocationID", how="left")

# Filter to Manhattan and Harlem
# manhattan_zones = zones[zones['borough'].str.upper().isin(["MANHATTAN", "HARLEM"])]

if service_type in ["green", "fhv", "fhvhv"]:
    zones = zones[zones['borough'].str.upper().isin(["MANHATTAN", "HARLEM", "BROOKLYN", "QUEENS"])]
elif service_type == "yellow":
    zones = zones[zones['borough'].str.upper().isin(["MANHATTAN", "HARLEM"])]

# Shared color scale range
vmin = 0
vmax = max(
    zones['pickup_percent'].max(),
    zones['dropoff_percent'].max()
)

# Plot
fig, axs = plt.subplots(1, 2, figsize=(14, 8))

# Pickup heatmap
pickup_ax = axs[0]
pickup_plot = zones.plot(
    column="pickup_percent",
    cmap="OrRd",
    linewidth=0.5,
    edgecolor="black",
    legend=False,
    ax=pickup_ax,
    missing_kwds={"color": "lightgrey", "label": "No data"},
    vmin=vmin,
    vmax=vmax
)
pickup_ax.set_title("Pickup Percentage", fontsize=14)
pickup_ax.axis("off")

# Dropoff heatmap
dropoff_ax = axs[1]
dropoff_plot = zones.plot(
    column="dropoff_percent",
    cmap="OrRd",
    linewidth=0.5,
    edgecolor="black",
    legend=False,
    ax=dropoff_ax,
    missing_kwds={"color": "lightgrey", "label": "No data"},
    vmin=vmin,
    vmax=vmax
)
dropoff_ax.set_title("Dropoff Percentage", fontsize=14)
dropoff_ax.axis("off")

# Add shared colorbar
fig.colorbar(dropoff_plot.get_children()[0], ax=dropoff_ax, orientation='vertical', fraction=0.02, pad=0.04)

fig.tight_layout()
fig.savefig(vis_path / "pickup_dropoff_heatmap.pdf", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Copy and filter using Dask
data_ = df.copy()
fare_threshold = data_[fare_amount_col].quantile(0.95).compute()
data_ = data_[data_[fare_amount_col] < fare_threshold]

In [None]:
# Group by and compute median using Dask, then convert to pandas
pickup_median_fare = data_.groupby('pulocationid')[fare_amount_col].median().compute().reset_index(name='pickup_median_fare')
dropoff_median_fare = data_.groupby('dolocationid')[fare_amount_col].median().compute().reset_index(name='dropoff_median_fare')

In [None]:
# Read shapefile with GeoPandas
zones = gpd.read_file("/d/hpc/home/jv8043/BD/project/T4/taxi_zones.shp")
zones["LocationID"] = zones["LocationID"].astype(int)

# Merge with pickup and dropoff median fares
zones = zones.merge(pickup_median_fare, left_on='LocationID', right_on='pulocationid', how='left')
zones = zones.merge(dropoff_median_fare, left_on='LocationID', right_on='dolocationid', how='left')

# Filter to Manhattan and Harlem
# manhattan_zones = zones[zones['borough'].str.upper().isin(["MANHATTAN", "HARLEM"])]
if service_type in ["green", "fhv", "fhvhv"]:
    zones = zones[zones['borough'].str.upper().isin(["MANHATTAN", "HARLEM", "BROOKLYN", "QUEENS"])]
elif service_type == "yellow":
    zones = zones[zones['borough'].str.upper().isin(["MANHATTAN", "HARLEM"])]

# Define color scale
vmin = min(zones['pickup_median_fare'].min(), zones['dropoff_median_fare'].min())
vmax = max(zones['pickup_median_fare'].max(), zones['dropoff_median_fare'].max())

# Plot
fig, axs = plt.subplots(1, 2, figsize=(14, 8))

# Pickup
ax = axs[0]
pickup_plot = zones.plot(column="pickup_median_fare", cmap="OrRd", linewidth=0.5, edgecolor="black",
                                   legend=False, ax=ax, missing_kwds={"color": "lightgrey", "label": "No data"},
                                   vmin=vmin, vmax=vmax)
ax.set_title("Median fare per pickup zone", fontsize=14)
ax.axis("off")

# Dropoff
ax = axs[1]
dropoff_plot = zones.plot(column="dropoff_median_fare", cmap="OrRd", linewidth=0.5, edgecolor="black",
                                    legend=False, ax=ax, missing_kwds={"color": "lightgrey", "label": "No data"},
                                    vmin=vmin, vmax=vmax)
ax.set_title("Median fare per dropoff zone", fontsize=14)
ax.axis("off")

# Shared colorbar
fig.colorbar(dropoff_plot.get_children()[0], ax=ax, orientation='vertical', fraction=0.02, pad=0.04)
fig.tight_layout()
fig.savefig(vis_path / "pickup_dropoff_median_fare.pdf", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Shutdown the Dask client and cluster
client.shutdown()
cluster.close()