In [2]:
!pip install azure-storage-blob


Collecting azure-storage-blob
  Downloading azure_storage_blob-12.25.1-py3-none-any.whl.metadata (26 kB)
Collecting azure-core>=1.30.0 (from azure-storage-blob)
  Downloading azure_core-1.33.0-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting isodate>=0.6.1 (from azure-storage-blob)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Downloading azure_storage_blob-12.25.1-py3-none-any.whl (406 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m407.0/407.0 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading azure_core-1.33.0-py3-none-any.whl (207 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.1/207.1 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate, azure-core, azure-storage-blob
Successfully installed azure-core-1.33.0 a

In [8]:
# -------------------------
# 1. Connection to Azure Blob Storage
# -------------------------

from azure.storage.blob import BlobServiceClient
import os
# Imports for Azure Blob connection and file handling
from azure.storage.blob import BlobServiceClient
import os

# PySpark imports
from pyspark.sql.functions import col, to_timestamp, window, avg, count
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise start a new one
# registered under the application name "UberAnalyticsBlob".
spark = (
    SparkSession.builder
    .appName("UberAnalyticsBlob")
    .getOrCreate()
)

# Credentials
# The storage connection string embeds the account key, so it must never be
# stored in the notebook. It is read from the environment instead; export
# AZURE_STORAGE_CONNECTION_STRING before starting the kernel.
# NOTE(review): a key that was previously committed in this cell should be
# treated as compromised and rotated in the storage account.
conn_str = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
if not conn_str:
    raise RuntimeError(
        "AZURE_STORAGE_CONNECTION_STRING is not set; export the storage "
        "account connection string before running this notebook."
    )

# Container and blob-name prefixes of the two Parquet datasets.
container_name = "group4"
folder_prefix_rides = "ride_stream/part"
folder_prefix_traffic = "traffic_stream/part"

# Client
blob_service_client = BlobServiceClient.from_connection_string(conn_str)
container_client = blob_service_client.get_container_client(container_name)

# Download Parquet files
def download_parquet(folder_prefix, dest_dir="", container=None):
    """Download every ``.parquet`` blob under a name prefix to local disk.

    Args:
        folder_prefix: Blob-name prefix forwarded to
            ``list_blobs(name_starts_with=...)``.
        dest_dir: Local directory to write the files into; it is created when
            missing. The default (empty string) writes into the current
            working directory, so the returned paths are bare file names.
        container: Container client to list and download from. When omitted,
            the module-level ``container_client`` is used.

    Returns:
        list[str]: Local paths of the downloaded files, in listing order.
        Empty when no blob under the prefix ends with ``.parquet``.
    """
    if container is None:
        container = container_client
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)

    parquet_files = []
    for blob in container.list_blobs(name_starts_with=folder_prefix):
        # Skip anything under the prefix that is not a Parquet file.
        if not blob.name.endswith(".parquet"):
            continue
        print(f"⬇Downloading: {blob.name}")
        # Only the final path component is kept for the local file name.
        local_path = os.path.join(dest_dir, os.path.basename(blob.name))
        blob_client = container.get_blob_client(blob)
        with open(local_path, "wb") as f:
            # Stream straight into the file instead of buffering the whole
            # blob in memory first.
            blob_client.download_blob().readinto(f)
        parquet_files.append(local_path)
    return parquet_files

# Download rides and traffic
parquet_rides = download_parquet(folder_prefix_rides)
parquet_traffic = download_parquet(folder_prefix_traffic)

# Fail fast with a readable message: calling spark.read.parquet() with no
# paths would otherwise surface as an opaque Spark error further down.
if not parquet_rides:
    raise FileNotFoundError(
        f"No .parquet blobs found under prefix '{folder_prefix_rides}'"
    )
if not parquet_traffic:
    raise FileNotFoundError(
        f"No .parquet blobs found under prefix '{folder_prefix_traffic}'"
    )

# -------------------------
# 2. Load data in Spark
# -------------------------

# Read downloaded Parquet files from blob
# Each list of local paths is unpacked so Spark reads all parts of a dataset
# into a single DataFrame.
df_rides = spark.read.parquet(*parquet_rides)
df_traffic = spark.read.parquet(*parquet_traffic)

# Quick sanity check: schemas first, then a five-row preview of each dataset.
df_rides.printSchema()
df_traffic.printSchema()

df_rides.show(5)
df_traffic.show(5)

# -------------------------
# 3. Transform with Spark SQL / PySpark API
# -------------------------

# Cast the event-time columns to Spark timestamps in case they were not
# already read with that type; the columns are replaced under the same names.
df_rides = df_rides.withColumn(
    "timestamp_event",
    to_timestamp("timestamp_event"),
)
df_traffic = df_traffic.withColumn(
    "timestamp",
    to_timestamp("timestamp"),
)

# -------------------------
# 4. Analysis and Insights
# -------------------------

# Ride events bucketed into 10-minute windows on the event timestamp.
print("Total trips every 10 minutes:\n")
(
    df_rides
    .groupBy(window(col("timestamp_event"), "10 minutes"))
    .agg(count("*").alias("num_rides"))
    .sort("window")
    .show(truncate=False)
)

# Same windows, additionally split by event type.
print("Total trips per event type every 10 minutes:\n")
(
    df_rides
    .groupBy(window(col("timestamp_event"), "10 minutes"), col("event_type"))
    .agg(count("*").alias("num_events"))
    .sort("window", "event_type")
    .show(truncate=False)
)

# Mean price per Uber type, restricted to "Start car ride" events,
# most expensive first.
print("Average price per Uber type:\n")
(
    df_rides
    .where(col("event_type") == "Start car ride")
    .groupBy("uber_type")
    .agg(avg("price").alias("avg_price"))
    .orderBy(col("avg_price").desc())
    .show(truncate=False)
)

# Row count of the traffic dataset per zone, busiest zones first.
print("Number of traffic alerts per zone:\n")
(
    df_traffic
    .groupBy("zone_id")
    .agg(count("*").alias("num_alerts"))
    .orderBy(col("num_alerts").desc())
    .show(truncate=False)
)

# Mean surge multiplier for each traffic level.
print("Average surge multiplier per traffic level:\n")
(
    df_traffic
    .groupBy("traffic_level")
    .agg(avg("surge_multiplier").alias("avg_surge"))
    .sort("traffic_level")
    .show(truncate=False)
)

# -------------------------
# EXTENSIONS FOR INTERMEDIATE / ADVANCED LEVEL
# -------------------------

# NOTE(review): importing `abs` from pyspark.sql.functions rebinds the Python
# builtin `abs` to the Spark column function for every later cell. None of the
# code visible here uses it; consider `import pyspark.sql.functions as F`.
from pyspark.sql.functions import unix_timestamp, stddev, mean, when, lit, abs, count, avg

# Event volume per event type, sorted alphabetically by type.
print("Trips per event type:")
df_rides.groupBy("event_type").count().orderBy("event_type").show()

# Timing metrics are only derivable when all three timestamp columns exist.
if all(col_name in df_rides.columns for col_name in ["pickup_time", "dropoff_time", "request_time"]):
    # unix_timestamp() yields epoch seconds, so both differences are in seconds.
    df_rides = df_rides \
        .withColumn("driver_response", unix_timestamp("pickup_time") - unix_timestamp("request_time")) \
        .withColumn("ride_duration", unix_timestamp("dropoff_time") - unix_timestamp("pickup_time"))

    print("Trip duration and driver response time:")
    df_rides.select("ride_id", "driver_response", "ride_duration").show(5)

# Share of rows whose event type is "cancelled" among all ride rows.
# NOTE(review): other filters in this notebook match labels such as
# "Start car ride"; confirm that "cancelled" is the exact label in the data.
total_rides = df_rides.count()
cancelled_rides = df_rides.filter(col("event_type") == "cancelled").count()
# Guard against an empty dataset, which would otherwise raise ZeroDivisionError.
cancellation_rate = (cancelled_rides / total_rides) * 100 if total_rides else 0.0
print(f"Cancelations: {cancelled_rides} of {total_rides} trips ({cancellation_rate:.2f}%)")

# Top five zones by mean surge multiplier (skipped if the column is absent).
if "surge_multiplier" in df_traffic.columns:
    print("Zones with high surge price (historical):")
    df_traffic.groupBy("zone_id").agg(avg("surge_multiplier").alias("avg_surge")) \
        .orderBy("avg_surge", ascending=False).show(5)

# Top five start locations by number of ride events (skipped if absent).
if "start_location" in df_rides.columns:
    print("Locations with unusually frequent events:")
    df_rides.groupBy("start_location").agg(count("*").alias("event_count")) \
        .orderBy("event_count", ascending=False).show(5)

# Ride events per Uber type, most frequent first.
print("\n1. Number of events per Uber type:")
df_rides.groupBy("uber_type").count().orderBy("count", ascending=False).show()

# Mean price per Uber type, restricted to "Start car ride" events.
print("\n2. Average price per Uber type:")
df_rides.filter(col("event_type") == "Start car ride").groupBy("uber_type").agg(avg("price").alias("avg_price")).orderBy("avg_price", ascending=False).show()

# Bucket each event by hour of day: 06-11 Morning, 12-17 Afternoon,
# 18-23 Evening, everything else (00-05) Early Morning.
print("\n3. Distribution of trips per time range:")
from pyspark.sql.functions import hour, when
df_rides = df_rides.withColumn("hour", hour("timestamp_event"))
df_rides = df_rides.withColumn("time_slot", when((col("hour") >= 6) & (col("hour") < 12), "Morning")
                                .when((col("hour") >= 12) & (col("hour") < 18), "Afternoon")
                                .when((col("hour") >= 18) & (col("hour") < 24), "Evening")
                                .otherwise("Early Morning"))
df_rides.groupBy("time_slot").count().orderBy("time_slot").show()

# Ten most frequent (start, end) location pairs. The heading used to repeat
# section number 3 and call these "events", although the grouping is by route.
print("\n4. Most frequent routes:")
df_rides.groupBy("start_location", "end_location").count().orderBy("count", ascending=False).show(10)

# Mean price of "Start car ride" events per start location, top ten.
print("\n5. Average price per start zone:")
df_rides.filter(col("event_type") == "Start car ride").groupBy("start_location").agg(avg("price").alias("avg_price")).orderBy("avg_price", ascending=False).show(10)

# Traffic rows per traffic event type.
print("\n6. Number of alerts per event type:")
df_traffic.groupBy("event_type").count().orderBy("count", ascending=False).show()

# Ten zones with the most traffic rows.
print("\n7. Zones with more traffic alerts:")
df_traffic.groupBy("zone_id").count().orderBy("count", ascending=False).show(10)

# Ten zones with the highest mean surge multiplier.
print("\n8. Average congestion level (surge multiplier) per zone:")
df_traffic.groupBy("zone_id").agg(avg("surge_multiplier").alias("avg_surge")) \
    .orderBy("avg_surge", ascending=False).show(10)

# Rides that start in any zone that has at least one "severe" traffic row.
# distinct() keeps the inner join from multiplying ride rows.
# NOTE(review): assumes start_location and zone_id share the same identifiers.
print("\n9. Type of Uber mostly used in areas with severe traffic:")
severe_zones = df_traffic.filter(col("traffic_level") == "severe").select("zone_id").distinct()
df_rides_severe = df_rides.join(severe_zones, df_rides["start_location"] == severe_zones["zone_id"])
df_rides_severe.groupBy("uber_type").count().orderBy("count", ascending=False).show()


⬇Downloading: ride_stream/part-00000-49a9a2de-9db2-490b-b6d4-ab157299a5e1-c000.snappy.parquet
⬇Downloading: ride_stream/part-00000-5ee4e9f2-a419-4935-92dc-e5fca90de581-c000.snappy.parquet
⬇Downloading: ride_stream/part-00000-8c572137-38fd-410c-9517-9ad24b5faf1c-c000.snappy.parquet
⬇Downloading: ride_stream/part-00000-b5e3df0f-e2d5-4162-9dc0-9eedf8f0bd86-c000.snappy.parquet
⬇Downloading: ride_stream/part-00000-e611da15-f07f-4d3c-bfad-8552a164d778-c000.snappy.parquet
⬇Downloading: traffic_stream/part-00000-9c627dd1-f19d-4eed-97b5-7e8c090e4d77-c000.snappy.parquet
⬇Downloading: traffic_stream/part-00000-9ca600c0-041d-4dc1-9933-ee0353739c8d-c000.snappy.parquet
⬇Downloading: traffic_stream/part-00000-aafee65b-4951-4683-85d5-f6608c4b8218-c000.snappy.parquet
⬇Downloading: traffic_stream/part-00000-ca4b202e-fce1-472c-be7d-58c44fbbc5d8-c000.snappy.parquet
⬇Downloading: traffic_stream/part-00000-fd8e2b10-f331-44a7-90c4-a6bc6f5740da-c000.snappy.parquet
root
 |-- event_id: string (nullable = true)
