In [0]:
try:
    import prophet
    print("Prophet already installed.")
except ImportError:
    print("Prophet not found. Installing...")
    %pip install prophet

Prophet already installed.


In [0]:
# ============================================================
# CONFIGURATION & IMPORTS
# ============================================================

import warnings
warnings.filterwarnings('ignore')

# Core libraries
import pandas as pd
from pyspark.sql.functions import *
from pyspark.sql.types import *
from prophet import Prophet

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from prophet.plot import plot_plotly, plot_components_plotly

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

print("All libraries imported successfully")
print("Configuration complete")

All libraries imported successfully
Configuration complete


---
## 1. Data Loading & Validation

In [0]:
# ============================================================
# DATA LOADING
# ============================================================

print("Loading datasets...")

# Campaign Related
campaign_desc = spark.table("campaign_desc")
campaign_table = spark.table("campaign_table")

# Coupon Related
coupon_redempt = spark.table("coupon_redempt")

# Transaction Related
transaction_data = spark.table("transaction_data")

print(f"Campaign Descriptions: {campaign_desc.count():,} rows")
print(f"Campaign Table: {campaign_table.count():,} rows")
print(f"Coupon Redemptions: {coupon_redempt.count():,} rows")
print(f"Transaction Data: {transaction_data.count():,} rows")

Loading datasets...
Campaign Descriptions: 30 rows
Campaign Table: 7,208 rows
Coupon Redemptions: 2,318 rows
Transaction Data: 2,595,732 rows


In [0]:
# ============================================================
# DATA QUALITY VALIDATION
# ============================================================

print("\n" + "="*60)
print("DATA QUALITY CHECKS")
print("="*60)

# Clean transaction data - remove anomalies
initial_count = transaction_data.count()

transaction_data = transaction_data.filter(
    (col("QUANTITY") != 0) &
    (col("RETAIL_DISC") <= 0)
)

cleaned_count = transaction_data.count()
removed = initial_count - cleaned_count

print(f"Transactions cleaned: {removed:,} invalid records removed")
print(f"Valid transactions: {cleaned_count:,} rows")

# Validate campaign dates
invalid_dates = campaign_desc.filter(col("START_DAY") > col("END_DAY")).count()
if invalid_dates == 0:
    print("Campaign dates validated: All START_DAY <= END_DAY")
else:
    print(f"Warning: {invalid_dates} campaigns with invalid date ranges")

print("\n Data quality validation complete")


DATA QUALITY CHECKS
Transactions cleaned: 14,472 invalid records removed
Valid transactions: 2,581,260 rows
Campaign dates validated: All START_DAY <= END_DAY

 Data quality validation complete


---
## 2. Feature Engineering

In [0]:
# ============================================================
# CAMPAIGN ENRICHMENT
# ============================================================

print("Building campaign enrichment...")

# Combine campaign table with campaign descriptions
combined_campaign = (
    campaign_table.alias("t")
    .join(
        campaign_desc.select("CAMPAIGN", "START_DAY", "END_DAY").alias("d"),
        on="CAMPAIGN",
        how="left"
    )
)

print(f"Combined campaign data: {combined_campaign.count():,} rows")

Building campaign enrichment...
Combined campaign data: 7,208 rows


In [0]:
# ============================================================
# TRANSACTION AGGREGATION - DAILY HOUSEHOLD LEVEL
# ============================================================

print("Aggregating transactions to daily household level...")

t = transaction_data.alias("t")
c = combined_campaign.alias("c")

# Join transactions with campaigns
joined = (
    t.join(
        c,
        (col("t.household_key") == col("c.household_key")) &
        (col("t.DAY") >= col("c.START_DAY")) &
        (col("t.DAY") <= col("c.END_DAY")),
        how="left"
    )
    .select(
        col("t.household_key").alias("household_key"),
        col("t.DAY").alias("DAY"),
        col("t.QUANTITY").alias("QUANTITY"),
        col("t.SALES_VALUE").alias("SALES_VALUE"),
        col("t.RETAIL_DISC").alias("RETAIL_DISC"),
        col("t.COUPON_DISC").alias("COUPON_DISC"),
        col("c.CAMPAIGN").alias("CAMPAIGN")
    )
)

# Aggregate to daily household level with campaign flag
transaction_daily = (
    joined
    .groupBy("household_key", "DAY")
    .agg(
        first("QUANTITY").alias("QUANTITY"),
        first("SALES_VALUE").alias("SALES_VALUE"),
        first("RETAIL_DISC").alias("RETAIL_DISC"),
        first("COUPON_DISC").alias("COUPON_DISC"),
        max(when(col("CAMPAIGN").isNotNull(), 1).otherwise(0)).alias("campaign_flag")
    )
)

print(f"Daily household transactions: {transaction_daily.count():,} rows")

Aggregating transactions to daily household level...
Daily household transactions: 225,344 rows


In [0]:
# ============================================================
# ADD COUPON REDEMPTION FLAGS
# ============================================================

print("Adding coupon redemption flags...")

transaction_daily = (
    transaction_daily.alias("t")
    .join(
        coupon_redempt
            .select("household_key", "DAY")
            .dropDuplicates()
            .withColumn("coupon_redempt_flag", lit(1))
            .alias("c"),
        on=["household_key", "DAY"],
        how="left"
    )
    .withColumn(
        "coupon_redempt_flag",
        coalesce(col("coupon_redempt_flag"), lit(0))
    )
)

print(f"Transaction daily with coupon flags: {transaction_daily.count():,} rows")

Adding coupon redemption flags...
Transaction daily with coupon flags: 225,344 rows


In [0]:
# ============================================================
# BUSINESS DAILY AGGREGATION
# ============================================================

print("Aggregating to business daily metrics...")

business_daily = (
    transaction_daily
    .groupBy("DAY")
    .agg(
        round(sum("SALES_VALUE"), 2).alias("total_sales"),
        round(sum("RETAIL_DISC"), 2).alias("total_retail_disc"),
        round(sum("COUPON_DISC"), 2).alias("total_coupon_disc"),
        countDistinct(
            when(col("campaign_flag") == 1, col("household_key"))
        ).alias("campaign_count"),
        countDistinct(
            when(col("coupon_redempt_flag") == 1, col("household_key"))
        ).alias("coupon_redempt_count"),
    )
    .fillna(0)
    .orderBy("DAY")
)

print(f"Business daily metrics: {business_daily.count():,} days")
print("\n Feature engineering complete")

Aggregating to business daily metrics...
Business daily metrics: 711 days

 Feature engineering complete


---
## 3. Anomaly Detection - Sales

In [0]:
# ============================================================
# PROPHET MODEL - SALES ANOMALY DETECTION
# ============================================================

print("\n" + "="*60)
print("SALES ANOMALY DETECTION")
print("="*60)

# Prepare data for Prophet
sales_pd = business_daily.select("DAY", "total_sales").orderBy("DAY").toPandas()
sales_pd["ds"] = pd.to_datetime(sales_pd["DAY"], unit="D", origin="1960-01-01")
sales_pd["y"] = sales_pd["total_sales"]
sales_pd = sales_pd[["ds", "y"]]

print(f"Training Prophet model on {len(sales_pd)} days of sales data...")

# Train Prophet model
m_sales = Prophet(
    yearly_seasonality=True,
    weekly_seasonality=True,
    daily_seasonality=False,
    interval_width=0.95
)
m_sales.fit(sales_pd)

# Generate forecast
future_sales = m_sales.make_future_dataframe(periods=30)
forecast_sales = m_sales.predict(future_sales)

# Detect anomalies
merged_sales = sales_pd.merge(
    forecast_sales[["ds", "yhat", "yhat_lower", "yhat_upper"]],
    on="ds",
    how="left"
)

merged_sales["is_anomaly"] = (
    (merged_sales["y"] > merged_sales["yhat_upper"]) | 
    (merged_sales["y"] < merged_sales["yhat_lower"])
)

# Separate upper and lower bound anomalies
merged_sales["anomaly_upper"] = merged_sales["y"] > merged_sales["yhat_upper"]
merged_sales["anomaly_lower"] = merged_sales["y"] < merged_sales["yhat_lower"]

# Count anomalies
num_sales_anomalies = merged_sales["is_anomaly"].sum()
num_sales_upper = merged_sales["anomaly_upper"].sum()
num_sales_lower = merged_sales["anomaly_lower"].sum()
pct_sales_anomalies = (num_sales_anomalies / len(merged_sales)) * 100

print(f"\n Sales anomalies detected: {num_sales_anomalies} days ({pct_sales_anomalies:.2f}%)")
print(f"  - Above upper bound: {num_sales_upper} days")
print(f"  - Below lower bound: {num_sales_lower} days")

# Store results
anomaly_sales_pd = merged_sales[merged_sales["is_anomaly"] == True][["ds", "y", "yhat_upper", "yhat_lower", "anomaly_upper", "anomaly_lower"]].copy()
anomaly_sales_pd["DAY"] = (anomaly_sales_pd["ds"] - pd.Timestamp("1960-01-01")).dt.days


SALES ANOMALY DETECTION


08:49:11 - cmdstanpy - INFO - Chain [1] start processing
08:49:11 - cmdstanpy - INFO - Chain [1] done processing


Training Prophet model on 711 days of sales data...

 Sales anomalies detected: 34 days (4.78%)
  - Above upper bound: 22 days
  - Below lower bound: 12 days


---
## 4. Anomaly Detection - Retail Discount

In [0]:
# ============================================================
# PROPHET MODEL - RETAIL DISCOUNT ANOMALY DETECTION
# ============================================================

print("\n" + "="*60)
print("RETAIL DISCOUNT ANOMALY DETECTION")
print("="*60)

# Prepare data
retail_pd = (
    business_daily
    .select("DAY", "total_retail_disc", "campaign_count")
    .orderBy("DAY")
    .toPandas()
)

retail_pd["ds"] = pd.to_datetime(retail_pd["DAY"], unit="D", origin="1960-01-01")
retail_pd["y"] = retail_pd["total_retail_disc"].abs()
retail_pd["campaign_count"] = retail_pd["campaign_count"]
retail_pd = retail_pd[["ds", "y", "campaign_count"]]

print(f"Training Prophet model on {len(retail_pd)} days of retail discount data...")

# Train model with campaign_count as regressor
m_retail = Prophet(
    yearly_seasonality=True,
    weekly_seasonality=True,
    daily_seasonality=False,
    interval_width=0.95
)
m_retail.add_regressor("campaign_count")
m_retail.fit(retail_pd)

# Forecast
future_retail = m_retail.make_future_dataframe(periods=30)
last_campaign = retail_pd["campaign_count"].iloc[-1]
future_retail["campaign_count"] = last_campaign
forecast_retail = m_retail.predict(future_retail)

# Detect anomalies
merged_retail = retail_pd.merge(
    forecast_retail[["ds", "yhat", "yhat_lower", "yhat_upper"]],
    on="ds",
    how="left"
)

merged_retail["is_anomaly"] = (
    (merged_retail["y"] > merged_retail["yhat_upper"]) |
    (merged_retail["y"] < merged_retail["yhat_lower"])
)

# Separate upper and lower bound anomalies
merged_retail["anomaly_upper"] = merged_retail["y"] > merged_retail["yhat_upper"]
merged_retail["anomaly_lower"] = merged_retail["y"] < merged_retail["yhat_lower"]

# Count anomalies
num_retail_anomalies = merged_retail["is_anomaly"].sum()
num_retail_upper = merged_retail["anomaly_upper"].sum()
num_retail_lower = merged_retail["anomaly_lower"].sum()
pct_retail_anomalies = (num_retail_anomalies / len(merged_retail)) * 100

print(f"\n Retail discount anomalies detected: {num_retail_anomalies} days ({pct_retail_anomalies:.2f}%)")
print(f"  - Above upper bound: {num_retail_upper} days")
print(f"  - Below lower bound: {num_retail_lower} days")

# Store results
retail_anom_pd = merged_retail[merged_retail["is_anomaly"] == True][["ds", "y", "yhat_upper", "yhat_lower", "anomaly_upper", "anomaly_lower"]].copy()
retail_anom_pd["DAY"] = (retail_anom_pd["ds"] - pd.Timestamp("1960-01-01")).dt.days


RETAIL DISCOUNT ANOMALY DETECTION


08:49:13 - cmdstanpy - INFO - Chain [1] start processing
08:49:13 - cmdstanpy - INFO - Chain [1] done processing


Training Prophet model on 711 days of retail discount data...

 Retail discount anomalies detected: 28 days (3.94%)
  - Above upper bound: 24 days
  - Below lower bound: 4 days


---
## 5. Anomaly Detection - Coupon Discount

In [0]:
# ============================================================
# PROPHET MODEL - COUPON DISCOUNT ANOMALY DETECTION
# ============================================================

print("\n" + "="*60)
print("COUPON DISCOUNT ANOMALY DETECTION")
print("="*60)

# Prepare data
coupon_pd = (
    business_daily
    .select("DAY", "total_coupon_disc", "campaign_count", "coupon_redempt_count")
    .orderBy("DAY")
    .toPandas()
)

coupon_pd["ds"] = pd.to_datetime(coupon_pd["DAY"], unit="D", origin="1960-01-01")
coupon_pd["y"] = coupon_pd["total_coupon_disc"].abs()
coupon_pd["campaign_count"] = coupon_pd["campaign_count"]
coupon_pd["coupon_redempt_count"] = coupon_pd["coupon_redempt_count"]
coupon_pd = coupon_pd[["ds", "y", "campaign_count", "coupon_redempt_count"]]

print(f"Training Prophet model on {len(coupon_pd)} days of coupon discount data...")

# Train model with regressors
m_coupon = Prophet(
    yearly_seasonality=True,
    weekly_seasonality=True,
    daily_seasonality=False,
    interval_width=0.95
)
m_coupon.add_regressor("campaign_count")
m_coupon.add_regressor("coupon_redempt_count")
m_coupon.fit(coupon_pd)

# Forecast
future_coupon = m_coupon.make_future_dataframe(periods=30)
future_coupon["campaign_count"] = coupon_pd["campaign_count"].iloc[-1]
future_coupon["coupon_redempt_count"] = coupon_pd["coupon_redempt_count"].iloc[-1]
forecast_coupon = m_coupon.predict(future_coupon)

# Detect anomalies
merged_coupon = coupon_pd.merge(
    forecast_coupon[["ds", "yhat", "yhat_lower", "yhat_upper"]],
    on="ds",
    how="left"
)

merged_coupon["is_anomaly"] = (
    (merged_coupon["y"] > merged_coupon["yhat_upper"]) |
    (merged_coupon["y"] < merged_coupon["yhat_lower"])
)

# Separate upper and lower bound anomalies
merged_coupon["anomaly_upper"] = merged_coupon["y"] > merged_coupon["yhat_upper"]
merged_coupon["anomaly_lower"] = merged_coupon["y"] < merged_coupon["yhat_lower"]

# Count anomalies
num_coupon_anomalies = merged_coupon["is_anomaly"].sum()
num_coupon_upper = merged_coupon["anomaly_upper"].sum()
num_coupon_lower = merged_coupon["anomaly_lower"].sum()
pct_coupon_anomalies = (num_coupon_anomalies / len(merged_coupon)) * 100

print(f"\n Coupon discount anomalies detected: {num_coupon_anomalies} days ({pct_coupon_anomalies:.2f}%)")
print(f"  - Above upper bound: {num_coupon_upper} days")
print(f"  - Below lower bound: {num_coupon_lower} days")

# Store results
coupon_anom_pd = merged_coupon[merged_coupon["is_anomaly"] == True][["ds", "y", "yhat_upper", "yhat_lower", "anomaly_upper", "anomaly_lower"]].copy()
coupon_anom_pd["DAY"] = (coupon_anom_pd["ds"] - pd.Timestamp("1960-01-01")).dt.days


COUPON DISCOUNT ANOMALY DETECTION


08:49:15 - cmdstanpy - INFO - Chain [1] start processing
08:49:15 - cmdstanpy - INFO - Chain [1] done processing


Training Prophet model on 711 days of coupon discount data...

 Coupon discount anomalies detected: 39 days (5.49%)
  - Above upper bound: 39 days
  - Below lower bound: 0 days


---
## 6. Executive Summary

In [0]:
# ============================================================
# EXECUTIVE SUMMARY - KEY METRICS
# ============================================================

print("\n" + "="*60)
print("EXECUTIVE SUMMARY - ANOMALY DETECTION REPORT")
print("="*60)

# Calculate summary statistics
total_days = len(sales_pd)
date_range_start = sales_pd["ds"].min().strftime("%Y-%m-%d")
date_range_end = sales_pd["ds"].max().strftime("%Y-%m-%d")

# Sales metrics
avg_daily_sales = merged_sales["y"].mean()
total_sales = merged_sales["y"].sum()
sales_anomaly_days = merged_sales[merged_sales["is_anomaly"] == True]

# Retail discount metrics
avg_retail_disc = merged_retail["y"].mean()
total_retail_disc = merged_retail["y"].sum()
retail_anomaly_days = merged_retail[merged_retail["is_anomaly"] == True]

# Coupon discount metrics
avg_coupon_disc = merged_coupon["y"].mean()
total_coupon_disc = merged_coupon["y"].sum()
coupon_anomaly_days = merged_coupon[merged_coupon["is_anomaly"] == True]

print(f"\n ANALYSIS PERIOD")
print(f"   Date Range: {date_range_start} to {date_range_end}")
print(f"   Total Days Analyzed: {total_days:,}")

print(f"\n SALES METRICS")
print(f"   Total Sales: ${total_sales:,.2f}")
print(f"   Average Daily Sales: ${avg_daily_sales:,.2f}")
print(f"   Anomaly Days: {num_sales_anomalies} ({pct_sales_anomalies:.2f}%)")
print(f"      - Above upper bound: {num_sales_upper} days")
print(f"      - Below lower bound: {num_sales_lower} days")
if num_sales_anomalies > 0:
    max_sales_anomaly = sales_anomaly_days.loc[sales_anomaly_days["y"].idxmax()]
    min_sales_anomaly = sales_anomaly_days.loc[sales_anomaly_days["y"].idxmin()]
    print(f"   Highest Anomaly: ${max_sales_anomaly['y']:,.2f} on {max_sales_anomaly['ds'].strftime('%Y-%m-%d')}")
    print(f"   Lowest Anomaly: ${min_sales_anomaly['y']:,.2f} on {min_sales_anomaly['ds'].strftime('%Y-%m-%d')}")

print(f"\n RETAIL DISCOUNT METRICS")
print(f"   Total Retail Discounts: ${total_retail_disc:,.2f}")
print(f"   Average Daily Discount: ${avg_retail_disc:,.2f}")
print(f"   Anomaly Days: {num_retail_anomalies} ({pct_retail_anomalies:.2f}%)")
print(f"      - Above upper bound: {num_retail_upper} days")
print(f"      - Below lower bound: {num_retail_lower} days")
if num_retail_anomalies > 0:
    max_retail_anomaly = retail_anomaly_days.loc[retail_anomaly_days["y"].idxmax()]
    print(f"   Highest Anomaly: ${max_retail_anomaly['y']:,.2f} on {max_retail_anomaly['ds'].strftime('%Y-%m-%d')}")

print(f"\n COUPON DISCOUNT METRICS")
print(f"   Total Coupon Discounts: ${total_coupon_disc:,.2f}")
print(f"   Average Daily Discount: ${avg_coupon_disc:,.2f}")
print(f"   Anomaly Days: {num_coupon_anomalies} ({pct_coupon_anomalies:.2f}%)")
print(f"      - Above upper bound: {num_coupon_upper} days")
print(f"      - Below lower bound: {num_coupon_lower} days")
if num_coupon_anomalies > 0:
    max_coupon_anomaly = coupon_anomaly_days.loc[coupon_anomaly_days["y"].idxmax()]
    print(f"   Highest Anomaly: ${max_coupon_anomaly['y']:,.2f} on {max_coupon_anomaly['ds'].strftime('%Y-%m-%d')}")

print("\n" + "="*60)
print("Executive Summary Complete")
print("="*60)


EXECUTIVE SUMMARY - ANOMALY DETECTION REPORT

 ANALYSIS PERIOD
   Date Range: 1960-01-02 to 1961-12-12
   Total Days Analyzed: 711

 SALES METRICS
   Total Sales: $1,099,515.60
   Average Daily Sales: $1,546.44
   Anomaly Days: 34 (4.78%)
      - Above upper bound: 22 days
      - Below lower bound: 12 days
   Highest Anomaly: $2,482.58 on 1961-01-31
   Lowest Anomaly: $2.00 on 1961-10-05

 RETAIL DISCOUNT METRICS
   Total Retail Discounts: $125,347.98
   Average Daily Discount: $176.30
   Anomaly Days: 28 (3.94%)
      - Above upper bound: 24 days
      - Below lower bound: 4 days
   Highest Anomaly: $344.46 on 1960-09-03

 COUPON DISCOUNT METRICS
   Total Coupon Discounts: $1,939.18
   Average Daily Discount: $2.73
   Anomaly Days: 39 (5.49%)
      - Above upper bound: 39 days
      - Below lower bound: 0 days
   Highest Anomaly: $20.74 on 1960-05-08

Executive Summary Complete


---
## 7. Visualizations - Sales Anomalies

In [0]:
# ============================================================
# VISUALIZATION 1: SALES FORECAST WITH ANOMALIES
# ============================================================

fig_sales = plot_plotly(m_sales, forecast_sales)
fig_sales.update_layout(
    title="Daily Sales Forecast with Confidence Intervals",
    height=600,
    showlegend=True
)
fig_sales.show()

In [0]:
# ============================================================
# VISUALIZATION 2: SALES ANOMALIES HIGHLIGHTED
# ============================================================

fig_sales_scatter = px.scatter(
    merged_sales,
    x="ds",
    y="y",
    color="is_anomaly",
    color_discrete_map={False: "blue", True: "red"},
    title="Sales Anomalies - Red Points Indicate Outliers",
    labels={"ds": "Date", "y": "Total Daily Sales ($)"},
    height=500
)

fig_sales_scatter.add_trace(
    go.Scatter(
        x=merged_sales["ds"],
        y=merged_sales["yhat_upper"],
        mode="lines",
        line=dict(dash="dash", color="gray"),
        name="Upper Bound"
    )
)

fig_sales_scatter.add_trace(
    go.Scatter(
        x=merged_sales["ds"],
        y=merged_sales["yhat_lower"],
        mode="lines",
        line=dict(dash="dash", color="gray"),
        name="Lower Bound"
    )
)

fig_sales_scatter.update_layout(
    xaxis_title="Date",
    yaxis_title="Sales ($)",
    hovermode="x unified"
)

fig_sales_scatter.show()

In [0]:
# ============================================================
# VISUALIZATION 3: SALES TREND COMPONENTS
# ============================================================

fig_sales_components = plot_components_plotly(m_sales, forecast_sales)
fig_sales_components.update_layout(
    title="Sales Trend Decomposition - Yearly & Weekly Patterns",
    height=600
)
fig_sales_components.show()

In [0]:
# ============================================================
# VISUALIZATION 4: CAMPAIGNS AND REDEMPTIONS PER DAY
# ============================================================

# Prepare data
campaigns_redemptions_pd = business_daily.select("DAY", "campaign_count", "coupon_redempt_count").orderBy("DAY").toPandas()
campaigns_redemptions_pd["ds"] = pd.to_datetime(campaigns_redemptions_pd["DAY"], unit="D", origin="1960-01-01")

# Create dual-axis plot
fig_campaigns = make_subplots(
    rows=2, cols=1,
    subplot_titles=("Daily Campaign Participation", "Daily Coupon Redemptions"),
    vertical_spacing=0.12
)

# Campaign count
fig_campaigns.add_trace(
    go.Scatter(
        x=campaigns_redemptions_pd["ds"],
        y=campaigns_redemptions_pd["campaign_count"],
        mode="lines",
        name="Campaign Count",
        line=dict(color="blue", width=1.5),
        fill="tozeroy",
        fillcolor="rgba(0, 100, 255, 0.2)"
    ),
    row=1, col=1
)

# Coupon redemptions
fig_campaigns.add_trace(
    go.Scatter(
        x=campaigns_redemptions_pd["ds"],
        y=campaigns_redemptions_pd["coupon_redempt_count"],
        mode="lines",
        name="Redemption Count",
        line=dict(color="purple", width=1.5),
        fill="tozeroy",
        fillcolor="rgba(128, 0, 128, 0.2)"
    ),
    row=2, col=1
)

fig_campaigns.update_xaxes(title_text="Date", row=2, col=1)
fig_campaigns.update_yaxes(title_text="# of Households", row=1, col=1)
fig_campaigns.update_yaxes(title_text="# of Households", row=2, col=1)

fig_campaigns.update_layout(
    title_text="Campaign Participation & Coupon Redemption Trends",
    height=700,
    showlegend=True,
    hovermode="x unified"
)

fig_campaigns.show()

---
## 8. Visualizations - Retail Discount Anomalies

In [0]:
# ============================================================
# VISUALIZATION 4: RETAIL DISCOUNT FORECAST
# ============================================================

fig_retail = plot_plotly(m_retail, forecast_retail)
fig_retail.update_layout(
    title="Daily Retail Discount Forecast with Confidence Intervals",
    height=600
)
fig_retail.show()

In [0]:
# ============================================================
# VISUALIZATION 5: RETAIL DISCOUNT ANOMALIES HIGHLIGHTED
# ============================================================

fig_retail_scatter = px.scatter(
    merged_retail,
    x="ds",
    y="y",
    color="is_anomaly",
    color_discrete_map={False: "green", True: "red"},
    title="Retail Discount Anomalies - Red Points Indicate Outliers",
    labels={"ds": "Date", "y": "Daily Retail Discount ($)"},
    height=500
)

fig_retail_scatter.add_trace(
    go.Scatter(
        x=merged_retail["ds"],
        y=merged_retail["yhat_upper"],
        mode="lines",
        line=dict(dash="dash", color="gray"),
        name="Upper Bound"
    )
)

fig_retail_scatter.add_trace(
    go.Scatter(
        x=merged_retail["ds"],
        y=merged_retail["yhat_lower"],
        mode="lines",
        line=dict(dash="dash", color="gray"),
        name="Lower Bound"
    )
)

fig_retail_scatter.update_layout(
    xaxis_title="Date",
    yaxis_title="Retail Discount ($)",
    hovermode="x unified"
)

fig_retail_scatter.show()

---
## 9. Visualizations - Coupon Discount Anomalies

In [0]:
# ============================================================
# VISUALIZATION 6: COUPON DISCOUNT FORECAST
# ============================================================

fig_coupon = plot_plotly(m_coupon, forecast_coupon)
fig_coupon.update_layout(
    title="Daily Coupon Discount Forecast with Confidence Intervals",
    height=600
)
fig_coupon.show()

In [0]:
# ============================================================
# VISUALIZATION 7: COUPON DISCOUNT ANOMALIES HIGHLIGHTED
# ============================================================

fig_coupon_scatter = px.scatter(
    merged_coupon,
    x="ds",
    y="y",
    color="is_anomaly",
    color_discrete_map={False: "purple", True: "red"},
    title="Coupon Discount Anomalies - Red Points Indicate Outliers",
    labels={"ds": "Date", "y": "Daily Coupon Discount ($)"},
    height=500
)

fig_coupon_scatter.add_trace(
    go.Scatter(
        x=merged_coupon["ds"],
        y=merged_coupon["yhat_upper"],
        mode="lines",
        line=dict(dash="dash", color="gray"),
        name="Upper Bound"
    )
)

fig_coupon_scatter.add_trace(
    go.Scatter(
        x=merged_coupon["ds"],
        y=merged_coupon["yhat_lower"],
        mode="lines",
        line=dict(dash="dash", color="gray"),
        name="Lower Bound"
    )
)

fig_coupon_scatter.update_layout(
    xaxis_title="Date",
    yaxis_title="Coupon Discount ($)",
    hovermode="x unified"
)

fig_coupon_scatter.show()

---
## 10. Combined Anomaly Overview

In [0]:
# ============================================================
# VISUALIZATION 8: COMBINED ANOMALY HEATMAP
# ============================================================

# Create combined anomaly dataset
all_dates = pd.DataFrame({"ds": merged_sales["ds"]})

all_dates["sales_anomaly"] = merged_sales["is_anomaly"].astype(int)
all_dates["retail_anomaly"] = merged_retail["is_anomaly"].astype(int)
all_dates["coupon_anomaly"] = merged_coupon["is_anomaly"].astype(int)
all_dates["total_anomalies"] = all_dates["sales_anomaly"] + all_dates["retail_anomaly"] + all_dates["coupon_anomaly"]

# Multi-anomaly days (days with 2+ anomalies)
multi_anomaly_days = all_dates[all_dates["total_anomalies"] >= 2]

print(f"\n COMBINED ANOMALY INSIGHTS")
print(f"   Days with multiple anomalies: {len(multi_anomaly_days)}")
if len(multi_anomaly_days) > 0:
    print(f"   Dates with multiple anomalies:")
    for _, row in multi_anomaly_days.head(10).iterrows():
        anomaly_types = []
        if row["sales_anomaly"]: anomaly_types.append("Sales")
        if row["retail_anomaly"]: anomaly_types.append("Retail")
        if row["coupon_anomaly"]: anomaly_types.append("Coupon")
        print(f"      {row['ds'].strftime('%Y-%m-%d')}: {', '.join(anomaly_types)}")

# Heatmap visualization
fig_heatmap = go.Figure()

fig_heatmap.add_trace(go.Scatter(
    x=all_dates["ds"],
    y=[1]*len(all_dates),
    mode="markers",
    marker=dict(
        size=10,
        color=all_dates["sales_anomaly"],
        colorscale=[[0, "lightgray"], [1, "red"]],
        showscale=False
    ),
    name="Sales Anomalies",
    hovertemplate="Date: %{x}<br>Sales Anomaly: %{marker.color}<extra></extra>"
))

fig_heatmap.add_trace(go.Scatter(
    x=all_dates["ds"],
    y=[2]*len(all_dates),
    mode="markers",
    marker=dict(
        size=10,
        color=all_dates["retail_anomaly"],
        colorscale=[[0, "lightgray"], [1, "red"]],
        showscale=False
    ),
    name="Retail Anomalies",
    hovertemplate="Date: %{x}<br>Retail Anomaly: %{marker.color}<extra></extra>"
))

fig_heatmap.add_trace(go.Scatter(
    x=all_dates["ds"],
    y=[3]*len(all_dates),
    mode="markers",
    marker=dict(
        size=10,
        color=all_dates["coupon_anomaly"],
        colorscale=[[0, "lightgray"], [1, "red"]],
        showscale=False
    ),
    name="Coupon Anomalies",
    hovertemplate="Date: %{x}<br>Coupon Anomaly: %{marker.color}<extra></extra>"
))

fig_heatmap.update_layout(
    title="Anomaly Timeline - All Metrics (Red = Anomaly Detected)",
    xaxis_title="Date",
    yaxis=dict(
        tickvals=[1, 2, 3],
        ticktext=["Sales", "Retail Discount", "Coupon Discount"]
    ),
    height=400,
    showlegend=True
)

fig_heatmap.show()


 COMBINED ANOMALY INSIGHTS
   Days with multiple anomalies: 16
   Dates with multiple anomalies:
      1960-06-12: Sales, Retail
      1960-09-03: Sales, Retail
      1960-09-19: Sales, Retail
      1960-10-03: Sales, Retail
      1960-10-05: Sales, Retail
      1960-11-15: Retail, Coupon
      1961-03-06: Retail, Coupon
      1961-03-09: Retail, Coupon
      1961-03-13: Sales, Retail
      1961-04-14: Retail, Coupon


In [0]:
# ============================================================
# VISUALIZATION 9: ANOMALY SUMMARY DASHBOARD
# ============================================================

# Create summary bar chart
summary_data = pd.DataFrame({
    "Metric": ["Sales", "Retail Discount", "Coupon Discount"],
    "Anomaly Count": [num_sales_anomalies, num_retail_anomalies, num_coupon_anomalies],
    "Anomaly %": [pct_sales_anomalies, pct_retail_anomalies, pct_coupon_anomalies]
})

fig_summary = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Anomaly Count by Metric", "Anomaly Percentage by Metric"),
    specs=[[{"type": "bar"}, {"type": "bar"}]]
)

fig_summary.add_trace(
    go.Bar(
        x=summary_data["Metric"],
        y=summary_data["Anomaly Count"],
        marker_color=["#1f77b4", "#2ca02c", "#9467bd"],
        text=summary_data["Anomaly Count"],
        textposition="auto"
    ),
    row=1, col=1
)

fig_summary.add_trace(
    go.Bar(
        x=summary_data["Metric"],
        y=summary_data["Anomaly %"],
        marker_color=["#1f77b4", "#2ca02c", "#9467bd"],
        text=[f"{x:.2f}%" for x in summary_data["Anomaly %"]],
        textposition="auto"
    ),
    row=1, col=2
)

fig_summary.update_xaxes(title_text="Metric", row=1, col=1)
fig_summary.update_xaxes(title_text="Metric", row=1, col=2)
fig_summary.update_yaxes(title_text="Number of Days", row=1, col=1)
fig_summary.update_yaxes(title_text="Percentage (%)", row=1, col=2)

fig_summary.update_layout(
    title_text="Anomaly Detection Summary Dashboard",
    showlegend=False,
    height=400
)

fig_summary.show()

In [0]:
# ============================================================
# VISUALIZATION 10: UPPER vs LOWER BOUND ANOMALY BREAKDOWN
# ============================================================

# Create detailed breakdown data
breakdown_data = pd.DataFrame({
    "Metric": ["Sales", "Sales", "Retail Disc.", "Retail Disc.", "Coupon Disc.", "Coupon Disc."],
    "Type": ["Above Upper", "Below Lower", "Above Upper", "Below Lower", "Above Upper", "Below Lower"],
    "Count": [num_sales_upper, num_sales_lower, num_retail_upper, num_retail_lower, num_coupon_upper, num_coupon_lower]
})

fig_breakdown = px.bar(
    breakdown_data,
    x="Metric",
    y="Count",
    color="Type",
    barmode="group",
    title="Anomaly Breakdown: Upper vs Lower Bound Violations",
    labels={"Count": "Number of Days", "Metric": "Metric Type"},
    color_discrete_map={"Above Upper": "#FF6B6B", "Below Lower": "#4ECDC4"},
    text="Count",
    height=500
)

fig_breakdown.update_traces(textposition="outside")
fig_breakdown.update_layout(
    showlegend=True,
    legend_title="Anomaly Type",
    xaxis_title="Metric",
    yaxis_title="Number of Anomalous Days"
)

fig_breakdown.show()

# Print detailed statistics
print("\n" + "="*60)
print("UPPER vs LOWER BOUND ANOMALY STATISTICS")
print("="*60)

print(f"\n SALES:")
print(f"   Above upper bound: {num_sales_upper} days ({(num_sales_upper/len(merged_sales)*100):.2f}%)")
print(f"   Below lower bound: {num_sales_lower} days ({(num_sales_lower/len(merged_sales)*100):.2f}%)")

print(f"\n RETAIL DISCOUNT:")
print(f"   Above upper bound: {num_retail_upper} days ({(num_retail_upper/len(merged_retail)*100):.2f}%)")
print(f"   Below lower bound: {num_retail_lower} days ({(num_retail_lower/len(merged_retail)*100):.2f}%)")

print(f"\n COUPON DISCOUNT:")
print(f"   Above upper bound: {num_coupon_upper} days ({(num_coupon_upper/len(merged_coupon)*100):.2f}%)")
print(f"   Below lower bound: {num_coupon_lower} days ({(num_coupon_lower/len(merged_coupon)*100):.2f}%)")

print("\n" + "="*60)


UPPER vs LOWER BOUND ANOMALY STATISTICS

 SALES:
   Above upper bound: 22 days (3.09%)
   Below lower bound: 12 days (1.69%)

 RETAIL DISCOUNT:
   Above upper bound: 24 days (3.38%)
   Below lower bound: 4 days (0.56%)

 COUPON DISCOUNT:
   Above upper bound: 39 days (5.49%)
   Below lower bound: 0 days (0.00%)



---
## 11. Anomaly Data Export

In [0]:
# ============================================================
# EXPORT ANOMALY DETAILS
# ============================================================

print("\n" + "="*60)
print("ANOMALY DETAILS - EXPORTABLE DATAFRAMES")
print("="*60)

# Sales anomalies
if len(anomaly_sales_pd) > 0:
    print(f"\n SALES ANOMALIES ({len(anomaly_sales_pd)} days)")
    print(f"   - Above upper bound: {anomaly_sales_pd['anomaly_upper'].sum()} days")
    print(f"   - Below lower bound: {anomaly_sales_pd['anomaly_lower'].sum()} days")
    print("\n   Details:")
    display_sales = anomaly_sales_pd.copy()
    display_sales["anomaly_type"] = display_sales.apply(
        lambda row: "Above Upper" if row["anomaly_upper"] else "Below Lower", axis=1
    )
    print(display_sales[["ds", "y", "yhat_upper", "yhat_lower", "anomaly_type"]].to_string(index=False))
else:
    print("\n SALES ANOMALIES: None detected")

# Retail discount anomalies
if len(retail_anom_pd) > 0:
    print(f"\n RETAIL DISCOUNT ANOMALIES ({len(retail_anom_pd)} days)")
    print(f"   - Above upper bound: {retail_anom_pd['anomaly_upper'].sum()} days")
    print(f"   - Below lower bound: {retail_anom_pd['anomaly_lower'].sum()} days")
    print("\n   Details:")
    display_retail = retail_anom_pd.copy()
    display_retail["anomaly_type"] = display_retail.apply(
        lambda row: "Above Upper" if row["anomaly_upper"] else "Below Lower", axis=1
    )
    print(display_retail[["ds", "y", "yhat_upper", "yhat_lower", "anomaly_type"]].to_string(index=False))
else:
    print("\n RETAIL DISCOUNT ANOMALIES: None detected")

# Coupon discount anomalies
if len(coupon_anom_pd) > 0:
    print(f"\n COUPON DISCOUNT ANOMALIES ({len(coupon_anom_pd)} days)")
    print(f"   - Above upper bound: {coupon_anom_pd['anomaly_upper'].sum()} days")
    print(f"   - Below lower bound: {coupon_anom_pd['anomaly_lower'].sum()} days")
    print("\n   Details:")
    display_coupon = coupon_anom_pd.copy()
    display_coupon["anomaly_type"] = display_coupon.apply(
        lambda row: "Above Upper" if row["anomaly_upper"] else "Below Lower", axis=1
    )
    print(display_coupon[["ds", "y", "yhat_upper", "yhat_lower", "anomaly_type"]].to_string(index=False))
else:
    print("\n COUPON DISCOUNT ANOMALIES: None detected")

print("\n" + "="*60)
print("Anomaly Detection Report Complete")
print("="*60)

# Convert to Spark DataFrames for database storage
# anomaly_sales_spark = spark.createDataFrame(anomaly_sales_pd[["DAY", "y", "yhat_upper", "yhat_lower"]])
# retail_anom_spark = spark.createDataFrame(retail_anom_pd[["DAY", "y", "yhat_upper", "yhat_lower"]])
# coupon_anom_spark = spark.createDataFrame(coupon_anom_pd[["DAY", "y", "yhat_upper", "yhat_lower"]])


ANOMALY DETAILS - EXPORTABLE DATAFRAMES

 SALES ANOMALIES (34 days)
   - Above upper bound: 22 days
   - Below lower bound: 12 days

   Details:
        ds       y  yhat_upper  yhat_lower anomaly_type
1960-05-22 2043.21 2035.636821 1171.909905  Above Upper
1960-06-12 2274.54 2140.821308 1322.514489  Above Upper
1960-07-11 2249.96 2156.557854 1315.375553  Above Upper
1960-09-03 2296.01 1990.849229 1131.161662  Above Upper
1960-09-07 1163.03 2007.059821 1207.681875  Below Lower
1960-09-19 1256.74 2091.660687 1299.970469  Below Lower
1960-10-01 2123.09 1978.418638 1147.360631  Above Upper
1960-10-03 2306.31 2128.018706 1290.961036  Above Upper
1960-10-05    5.59 2031.979077 1226.650284  Below Lower
1960-10-12 1087.12 1980.757166 1141.582396  Below Lower
1960-11-24 2087.01 1968.353384 1134.437012  Above Upper
1960-12-20 1363.75 2247.486408 1412.728798  Below Lower
1961-01-31 2482.58 2247.415688 1432.096637  Above Upper
1961-02-02 2169.64 2124.831042 1296.577523  Above Upper
1961-02-21 247

---
## Summary

This production notebook has successfully:
1. Loaded and validated retail transaction data
2. Performed data quality checks and cleaning
3. Engineered features for anomaly detection
4. Trained Prophet models for Sales, Retail Discount, and Coupon Discount
5. Detected anomalies using 95% confidence intervals
6. Generated executive summary with key insights
7. Created comprehensive visualizations
8. Exported anomaly details for further analysis