In [0]:
# Databricks-ready PySpark code for dataset: ott_user_behavior_1
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

spark = SparkSession.builder.appName("OTT_Analytics").getOrCreate()

# ---------- 1) Load dataset (try Spark table, else CSV fallback) ----------
dataset_name = "ott_user_behavior_1"

try:
    df = spark.table(dataset_name)
    print(f"Loaded Spark table: {dataset_name}")
except Exception as e:
    print(f"Table {dataset_name} not found, trying CSV at /FileStore/tables/{dataset_name}.csv")
    df = spark.read.option("header", "true").option("inferSchema", "true") \
            .csv(f"/FileStore/tables/{dataset_name}.csv")

# quick clean / cast commonly-needed columns
# adjust column names if your schema differs (case-sensitive in Spark)
# Ensure numeric columns are numeric
num_cols = ["age", "avg_watch_time_per_day", "total_watch_time", "watch_duration",
            "completion_rate", "rating_given", "total_sessions_per_week",
            "avg_stream_quality", "network_speed_mbps", "buffering_count",
            "monthly_spend", "peak_watch_hour", "churn_flag"]
for c in num_cols:
    if c in df.columns:
        df = df.withColumn(c, col(c).cast(DoubleType()))

# Fill NA for numeric cols with 0 for safe model training (simple strategy)
numeric_present = [c for c in num_cols if c in df.columns]
df = df.fillna({c: 0.0 for c in numeric_present})

Loaded Spark table: ott_user_behavior_1


In [0]:
from pyspark.sql.functions import col, lit, least, greatest, explode
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.recommendation import ALS

# ✅ Load directly from catalog, not CSV
df = spark.table("workspace.default.ott_user_behavior_1")

In [0]:
# ---------- USE CASE 3: CHURN PREDICTION (Final Fixed Version) ----------

from pyspark.sql.functions import (
    col, when, avg, round, countDistinct, desc, regexp_replace
)

# ✅ Step 1: Load dataset
df = spark.table("workspace.default.ott_user_behavior_1")

# ✅ Step 2: Clean and standardize avg_stream_quality
# Convert quality labels ('480p', '720p', '1080p', '4K') → numeric resolution
quality_mapping = {
    "480p": 480,
    "720p": 720,
    "1080p": 1080,
    "4K": 2160,
    "2160p": 2160
}

# Map text qualities to numbers
df = df.withColumn(
    "avg_stream_quality",
    when(col("avg_stream_quality").isin(list(quality_mapping.keys())),
         when(col("avg_stream_quality") == "480p", 480)
         .when(col("avg_stream_quality") == "720p", 720)
         .when(col("avg_stream_quality") == "1080p", 1080)
         .when(col("avg_stream_quality") == "4K", 2160)
         .when(col("avg_stream_quality") == "2160p", 2160))
    .otherwise(
        when(col("avg_stream_quality").rlike("^[0-9.]+$"), col("avg_stream_quality").cast("double"))
    )
)

# ✅ Clean other numeric columns safely
for c in ["network_speed_mbps", "buffering_count"]:
    if c in df.columns:
        df = df.withColumn(
            c,
            when(col(c).rlike("^[0-9.]+$"), col(c).cast("double")).otherwise(None)
        )

# ✅ Step 3: Define churn
df_churn = df.withColumn(
    "churn_flag",
    when((col("avg_watch_time_per_day") < 1) | (col("completion_rate") < 0.3), 1).otherwise(0)
)

# ✅ Step 4: Churn rate by region
if "region" in df_churn.columns:
    churn_by_region = (
        df_churn.groupBy("region")
        .agg(
            round(avg("churn_flag"), 3).alias("avg_churn_rate"),
            round(avg("avg_watch_time_per_day"), 2).alias("avg_watch_time_per_day"),
            round(avg("completion_rate"), 2).alias("avg_completion_rate"),
            countDistinct("user_id").alias("unique_users")
        )
        .orderBy(desc("avg_churn_rate"))
    )
    display(churn_by_region)

# ✅ Step 5: Churn by subscription type
if "subscription_type" in df_churn.columns:
    churn_by_plan = (
        df_churn.groupBy("subscription_type")
        .agg(
            round(avg("churn_flag"), 3).alias("avg_churn_rate"),
            round(avg("avg_watch_time_per_day"), 2).alias("avg_watch_time_per_day"),
            round(avg("completion_rate"), 2).alias("avg_completion_rate")
        )
        .orderBy(desc("avg_churn_rate"))
    )
    display(churn_by_plan)

# ✅ Step 6: Behavioral comparison
if all(c in df_churn.columns for c in ["avg_watch_time_per_day", "completion_rate", "avg_stream_quality"]):
    churn_behavior = (
        df_churn.groupBy("churn_flag")
        .agg(
            round(avg("avg_watch_time_per_day"), 2).alias("avg_watch_time_per_day"),
            round(avg("completion_rate"), 2).alias("avg_completion_rate"),
            round(avg("avg_stream_quality"), 2).alias("avg_stream_quality"),
            round(avg("network_speed_mbps"), 2).alias("avg_network_speed"),
            countDistinct("user_id").alias("users")
        )
        .orderBy("churn_flag")
    )
    display(churn_behavior)

# ✅ Step 7: Save results
try:
    df_churn.write.mode("overwrite").format("delta").saveAsTable("ott_churn_predictions")
    print("✅ Churn summary written successfully to: ott_churn_predictions")
except Exception as e:
    print(f"⚠️ Could not save churn table: {e}")

# ✅ Step 8: Recommended Databricks Visualizations
# 1️⃣ Bar Chart: region vs avg_churn_rate
# 2️⃣ Pie Chart: subscription_type vs avg_churn_rate
# 3️⃣ Scatter Plot: avg_watch_time_per_day vs completion_rate (color by churn_flag)
# 4️⃣ Table View: churn_behavior (compare churned vs active users)


region,avg_churn_rate,avg_watch_time_per_day,avg_completion_rate,unique_users
West Bengal,0.0,101.23,68.5,616
Karnataka,0.0,107.4,70.23,629
Tamil Nadu,0.0,106.81,70.88,624
Uttar Pradesh,0.0,104.06,70.11,612
Maharashtra,0.0,106.64,69.14,601
Delhi,0.0,105.8,70.35,622
Kerala,0.0,105.47,69.54,635
Bihar,0.0,105.75,69.01,661


subscription_type,avg_churn_rate,avg_watch_time_per_day,avg_completion_rate
Standard,0.0,105.76,70.15
Premium,0.0,105.73,69.29
Basic,0.0,104.71,69.7


churn_flag,avg_watch_time_per_day,avg_completion_rate,avg_stream_quality,avg_network_speed,users
0,105.4,69.72,1114.7,52.3,5000


✅ Churn summary written successfully to: ott_churn_predictions
