# MMM Real Estate — Exploration Notebook

This notebook walks through:
1. Installing dependencies
2. Generating synthetic data
3. Exploring & visualizing the data
4. Fitting the marketing mix model (fast Ridge baseline or Bayesian MMM)
5. Analyzing results
6. Launching the dashboard

In [None]:
# 1. Clone repo & install dependencies (Colab)
%cd /content
!rm -rf /content/mmm-dashboard-immo
!git clone https://github.com/kofekod23/mmm-dashboard-immo.git /content/mmm-dashboard-immo
%cd /content/mmm-dashboard-immo/notebooks
!pip install -q pandas numpy plotly scikit-learn matplotlib seaborn

In [None]:
import sys
sys.path.insert(0, '/content/mmm-dashboard-immo')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from src.data_generator import generate_national_data, generate_regional_data
from src.utils import MEDIA_CHANNELS, CHANNEL_LABELS, REGIONS

## 2. Generate Data

In [None]:
# Build the national dataset first: the regional generator takes it as input.
national_df = generate_national_data()
regional_df = generate_regional_data(national_df)
# Report both shapes, then show the first national rows as the cell output.
print(f"National: {national_df.shape}, Regional: {regional_df.shape}")
national_df.head()

## 3. Explore & Visualize

In [None]:
# Leads (top panel) and app downloads (bottom panel) on a shared date axis
fig, (ax_leads, ax_downloads) = plt.subplots(nrows=2, ncols=1, figsize=(14, 8), sharex=True)
dates = national_df['date']

ax_leads.plot(dates, national_df['leads'], label='Leads')
ax_leads.set(ylabel='Leads', title='Weekly Leads & App Downloads')
ax_leads.legend()

ax_downloads.plot(dates, national_df['app_downloads'], color='orange', label='Downloads')
ax_downloads.set(ylabel='Downloads')
ax_downloads.legend()

fig.tight_layout()
plt.show()

In [None]:
# Media spend as a stacked area chart: one trace per channel, all in the same stack group
spend_traces = [
    go.Scatter(
        x=national_df['date'],
        y=national_df[f'spend_{ch}'],
        name=CHANNEL_LABELS[ch],
        stackgroup='one',
    )
    for ch in MEDIA_CHANNELS
]
fig = go.Figure(data=spend_traces)
fig.update_layout(title='Weekly Media Spend', yaxis_title='€')
fig.show()

In [None]:
# Macro variables: interest rate on the left axis, Pinel coefficient on a twin right axis
fig, ax_rate = plt.subplots(figsize=(14, 4))
macro_dates = national_df['date']

ax_rate.plot(macro_dates, national_df['interest_rate_20y'], 'b-', label='Interest Rate 20y')
ax_rate.set_ylabel('Rate (%)', color='b')
ax_rate.set_title('Macro Environment')

ax_pinel = ax_rate.twinx()
ax_pinel.plot(macro_dates, national_df['pinel'], 'r--', label='Pinel')
ax_pinel.set_ylabel('Pinel Coeff', color='r')

# A figure-level legend gathers the labelled lines of both axes in one box.
fig.legend(loc='upper right', bbox_to_anchor=(0.9, 0.9))
fig.tight_layout()
plt.show()

In [None]:
# Correlation matrix of the targets, the per-channel spends and the macro variables
corr_cols = [
    'leads',
    'app_downloads',
    *(f'spend_{ch}' for ch in MEDIA_CHANNELS),
    'interest_rate_20y',
    'pinel',
]
corr_matrix = national_df[corr_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='RdBu_r', center=0, ax=ax)
ax.set_title('Correlation Matrix')
fig.tight_layout()
plt.show()

In [None]:
# Total leads per region, sorted ascending, drawn as a horizontal bar chart
leads_by_region = regional_df.groupby('region')['leads'].sum()
region_totals = leads_by_region.sort_values()

fig = px.bar(
    x=region_totals.to_numpy(),
    y=region_totals.index,
    orientation='h',
    title='Total Leads by Region',
    labels={'x': 'Leads', 'y': 'Region'},
)
fig.show()

## 4. Fit MMM Model

In [None]:
## Option A: Ridge fallback (fast, no GPU needed)

from src.mmm_model import fit_mmm, save_results

results = fit_mmm(national_df)

# The report is printed only when fit_mmm handed back a plain dict.
if isinstance(results, dict):
    ridge_r2 = results['r2']
    print(f"Model R²: {ridge_r2:.3f}")
    ridge_intercept = results['intercept']
    print(f"Intercept: {ridge_intercept:.1f}")
    for coef_name, coef_value in results['coefficients'].items():
        print(f"  {coef_name}: {coef_value:.2f}")

In [None]:
## Option B: Bayesian MMM with PyMC-Marketing (requires Colab GPU)
## Run this cell on Google Colab with GPU runtime enabled

# ── Step 1: Install dependencies ──
!pip install -q pymc pymc-marketing arviz jax[cuda12] numpyro

import pymc as pm
import arviz as az
import json
import numpy as np
from datetime import datetime
from pathlib import Path
from pymc_marketing.mmm import MMM, GeometricAdstock, LogisticSaturation

# ── Step 2: Prepare data ──
channel_columns = [f'spend_{name}' for name in ('tv', 'radio', 'ratp_display', 'google_ads')]
control_columns = ['interest_rate_20y', 'pinel', 'covid_impact']

# Features are the date plus every channel and control column; rows with any
# missing value in the features or the target are dropped.
feature_columns = ['date', *channel_columns, *control_columns]
model_df = national_df[feature_columns + ['leads']].dropna().copy()
X = model_df[feature_columns]
y = model_df['leads'].to_numpy(dtype=float)

first_date, last_date = X['date'].min(), X['date'].max()
print(f"Data shape: X={X.shape}, y={y.shape}")
print(f"Date range: {first_date} → {last_date}")

# ── Step 3: Define and fit the Bayesian MMM ──
# Media effects are modelled with a geometric adstock (l_max=8) and a logistic
# saturation; the control columns are passed to the model as-is.
mmm = MMM(
    adstock=GeometricAdstock(l_max=8),
    saturation=LogisticSaturation(),
    date_column='date',
    channel_columns=channel_columns,
    control_columns=control_columns,
)

# Fixed seed so that Restart & Run All reproduces the same posterior draws.
RANDOM_SEED = 42

# Fit: 4 chains × 2000 draws (after 1000 tuning steps) with the NumPyro NUTS sampler
mmm.fit(X=X, y=y, target_accept=0.85, chains=4, draws=2000, tune=1000,
        nuts_sampler="numpyro", random_seed=RANDOM_SEED)

print("Fit complete!")

# ── Step 4: Convergence diagnostics ──
summary = az.summary(mmm.fit_result)
print(summary)

# Worst-case values across every row of the summary table
r_hat_max = float(summary['r_hat'].max())
ess_bulk_min = float(summary['ess_bulk'].min())
divergences = int(mmm.idata["sample_stats"]["diverging"].sum().item())

# Pass/fail marker for each threshold check
status_icon = {True: '✅', False: '❌'}
r_hat_ok = r_hat_max < 1.05
ess_ok = ess_bulk_min > 400
no_divergences = divergences == 0

print("\n=== Convergence Diagnostics ===")
print(f"r_hat max:      {r_hat_max:.4f}  {status_icon[r_hat_ok]}")
print(f"ESS bulk min:   {ess_bulk_min:.0f}  {status_icon[ess_ok]}")
print(f"Divergences:    {divergences}  {status_icon[no_divergences]}")

# ── Step 5: Posterior predictive ──
# sample_posterior_predictive returns the draws back-transformed to the original
# target scale (original_scale=True is the default), whereas the group it adds
# to mmm.idata["posterior_predictive"] stays on the model's internally scaled
# target. Everything compared with `y` below must therefore come from the
# returned samples, not from mmm.idata.
posterior_pred = mmm.sample_posterior_predictive(X, extend_idata=True)

# With the default combined=True, chain and draw are stacked into one `sample`
# dimension; put it first so the array is (n_samples, n_dates).
y_pp = posterior_pred['y'].transpose('sample', ...)
y_pp_flat = y_pp.values
y_mean = y_pp_flat.mean(axis=0).tolist()
# The *_hdi_* names are kept for the exported files, but these bounds are
# equal-tailed 3rd/97th percentiles, not highest-density intervals.
y_hdi_3 = np.percentile(y_pp_flat, 3, axis=0).tolist()
y_hdi_97 = np.percentile(y_pp_flat, 97, axis=0).tolist()

# R² of the posterior-predictive mean against the observed target
from sklearn.metrics import r2_score as sklearn_r2
r2_mean = float(sklearn_r2(y, y_mean))
print(f"R² (point estimate): {r2_mean:.4f}")

# ── Step 6: Channel contributions ──
contributions = mmm.compute_channel_contribution_original_scale()

channel_names = ['tv', 'radio', 'ratp_display', 'google_ads']
# Spend is summed over model_df, i.e. the rows that were actually fitted, so the
# ROAS denominator covers the same rows as the contributions even when dropna()
# removed some rows of national_df.
total_spend = {ch: float(model_df[f'spend_{ch}'].sum()) for ch in channel_names}


def _interval_summary(samples):
    """Return mean, std and the 3rd/97th percentiles of a 1-D array of samples.

    The percentile keys are named ``hdi_3``/``hdi_97`` to match the exported
    schema; they are equal-tailed percentiles, not highest-density bounds.
    """
    return {
        "mean": float(np.mean(samples)),
        "std": float(np.std(samples)),
        "hdi_3": float(np.percentile(samples, 3)),
        "hdi_97": float(np.percentile(samples, 97)),
    }


channels_dict = {}
contributions_over_time = {}

for ch in channel_names:
    ch_contrib = contributions.sel(channel=f'spend_{ch}').values
    # Collapse every leading axis into one sample axis, keeping the last axis
    # (presumably time — TODO confirm against the pymc-marketing version in use).
    ch_flat = ch_contrib.reshape(-1, ch_contrib.shape[-1])

    # One total contribution (and one ROAS) per posterior sample
    total_per_sample = ch_flat.sum(axis=1)
    roas_per_sample = total_per_sample / total_spend[ch]

    contributions_over_time[ch] = {
        "mean": ch_flat.mean(axis=0).tolist(),
        "hdi_3": np.percentile(ch_flat, 3, axis=0).tolist(),
        "hdi_97": np.percentile(ch_flat, 97, axis=0).tolist(),
    }

    channels_dict[ch] = {
        "total_contribution": _interval_summary(total_per_sample),
        "roas": _interval_summary(roas_per_sample),
    }

print("Steps 1-6 done. Run next cell for posterior extraction + export.")

In [None]:
# ── Step 7: Extract parameter posteriors ──
posterior = mmm.idata["posterior"]

param_names = list(posterior.data_vars)
print(f"Posterior parameters: {param_names}")

# For each exported key: the exact variable names to try first, then a looser
# substring test kept as a fallback for other naming schemes.
# NOTE(review): the exact names follow pymc-marketing's adstock/saturation
# component naming (`saturation_beta` replaced the older `beta_channel`);
# confirm against the "Posterior parameters" line printed above.
CHANNEL_PARAM_LOOKUPS = {
    "coefficient": (
        ('saturation_beta', 'beta_channel'),
        lambda name: 'beta_channel' in name or 'channel_coeff' in name,
    ),
    "adstock_alpha": (
        ('adstock_alpha',),
        lambda name: 'adstock' in name.lower() or 'alpha' in name.lower(),
    ),
    "saturation_lam": (
        ('saturation_lam',),
        lambda name: 'saturation' in name.lower() or 'lam' in name.lower(),
    ),
}
# Names claimed exactly by one key must never be picked up by another key's
# substring fallback (e.g. `saturation_beta` matching 'saturation').
_RESERVED_PARAM_NAMES = {
    name for exact_names, _ in CHANNEL_PARAM_LOOKUPS.values() for name in exact_names
}


def _posterior_stats(samples):
    """Summarise posterior samples as mean, std and 3rd/97th percentiles."""
    flat = np.asarray(samples, dtype=float).ravel()
    return {
        "mean": float(np.mean(flat)),
        "std": float(np.std(flat)),
        "hdi_3": float(np.percentile(flat, 3)),
        "hdi_97": float(np.percentile(flat, 97)),
    }


def _channel_param_stats(ch_col, exact_names, fallback_match):
    """Return summary stats of the first matching posterior variable, else None.

    Exact names win over substring matches. Variables carrying any dimension
    other than chain/draw/channel (e.g. time-indexed deterministics) are
    skipped because they are not per-channel parameters.
    """
    candidates = [name for name in exact_names if name in param_names]
    candidates += [
        name for name in param_names
        if name not in candidates
        and name not in _RESERVED_PARAM_NAMES
        and fallback_match(name)
    ]
    for pname in candidates:
        var = posterior[pname]
        if not set(var.dims) <= {'chain', 'draw', 'channel'}:
            continue
        if 'channel' in var.dims:
            try:
                var = var.sel(channel=ch_col)
            except KeyError:
                # Best effort: report the miss and try the next candidate.
                print(f"⚠️  {pname}: no entry for channel {ch_col!r}, trying next candidate")
                continue
        return _posterior_stats(var.values)
    return None


for ch in channel_names:
    ch_col = f'spend_{ch}'
    for out_key, (exact_names, fallback_match) in CHANNEL_PARAM_LOOKUPS.items():
        stats = _channel_param_stats(ch_col, exact_names, fallback_match)
        if stats is None:
            # The key is left out of the export, but the gap is made visible.
            print(f"⚠️  No posterior variable found for {out_key!r} ({ch})")
        else:
            channels_dict[ch][out_key] = stats

# Intercept: summarise the first posterior variable whose name contains
# "intercept" (case-insensitive); the dict stays empty when none exists.
intercept_name = next((name for name in param_names if 'intercept' in name.lower()), None)

intercept_dict = {}
if intercept_name is not None:
    intercept_samples = posterior[intercept_name].values.flatten()
    intercept_dict = {
        "mean": float(np.mean(intercept_samples)),
        "std": float(np.std(intercept_samples)),
        "hdi_3": float(np.percentile(intercept_samples, 3)),
        "hdi_97": float(np.percentile(intercept_samples, 97)),
    }

# Control coefficients: one summary per control column.
# Exact names are tried first, then any variable whose name contains "control".
# NOTE(review): `gamma_control` is the name pymc-marketing is expected to use —
# confirm against the printed posterior parameter list.
_control_candidates = [name for name in ('gamma_control', 'beta_control') if name in param_names]
_control_candidates += [
    name for name in param_names
    if name not in _control_candidates and 'control' in name.lower()
]
# Variables with any dimension besides chain/draw/control (e.g. a time-indexed
# `control_contributions`) are contribution series, not coefficients: skip them.
_control_candidates = [
    name for name in _control_candidates
    if set(posterior[name].dims) <= {'chain', 'draw', 'control'}
]

control_coefficients = {}
for ctrl in control_columns:
    for pname in _control_candidates:
        var = posterior[pname]
        if 'control' in var.dims:
            try:
                var = var.sel(control=ctrl)
            except KeyError:
                # Best effort: report the miss and try the next candidate.
                print(f"⚠️  {pname}: no entry for control {ctrl!r}, trying next candidate")
                continue
        v = var.values.flatten()
        control_coefficients[ctrl] = {
            "mean": float(np.mean(v)),
            "std": float(np.std(v)),
            "hdi_3": float(np.percentile(v, 3)),
            "hdi_97": float(np.percentile(v, 97)),
        }
        break
    else:
        # Nothing matched: the control is left out of the export, visibly.
        print(f"⚠️  No posterior coefficient found for control {ctrl!r}")

# ── Step 8: Build and export JSON ──
# R² uncertainty is derived from the posterior-predictive draws instead of a
# hard-coded ±0.02 band. For each draw the Bayesian R² is
# var(prediction) / (var(prediction) + var(residual)), which lies in [0, 1].
# NOTE(review): this is the distribution of the per-draw Bayesian R², so the
# interval does not have to contain `r2_mean` (the classical R² of the mean
# prediction) — confirm how the dashboard presents the three numbers.
_pp_draws = np.asarray(y_pp_flat, dtype=float)           # (n_samples, n_dates)
_var_fit = _pp_draws.var(axis=1)
_var_resid = (_pp_draws - np.asarray(y, dtype=float)).var(axis=1)
r2_draws = _var_fit / (_var_fit + _var_resid)

bayesian_output = {
    "model_type": "bayesian",
    "fit_date": datetime.now().isoformat(),
    "convergence": {
        "r_hat_max": r_hat_max,
        "ess_bulk_min": ess_bulk_min,
        "divergences": divergences,
    },
    "channels": channels_dict,
    "intercept": intercept_dict,
    "control_coefficients": control_coefficients,
    "posterior_predictive": {
        "y_mean": y_mean,
        "y_hdi_3": y_hdi_3,
        "y_hdi_97": y_hdi_97,
    },
    "channel_contributions_over_time": contributions_over_time,
    "model_diagnostics": {
        "r2_mean": r2_mean,
        "r2_hdi_3": float(np.percentile(r2_draws, 3)),
        "r2_hdi_97": float(np.percentile(r2_draws, 97)),
    },
}

import os

# Use /content when that directory exists; otherwise fall back to the
# repo-relative model directory and create it if needed.
colab_root = Path('/content')
if not os.path.exists(colab_root):
    output_dir = Path('../data/model')
    output_dir.mkdir(parents=True, exist_ok=True)
else:
    output_dir = colab_root

json_path = output_dir / 'bayesian_posteriors.json'
csv_path = output_dir / 'predictions_bayesian.csv'

with open(json_path, 'w') as json_file:
    json.dump(bayesian_output, json_file, indent=2)
print(f"\n✅ Bayesian posteriors exported to {json_path}")
size_kb = json_path.stat().st_size / 1024
print(f"   File size: {size_kb:.1f} KB")

# ── Step 9: Export predictions CSV ──
# Collect every output column in insertion order, then attach them to the date
# column in a single assign() call.
prediction_columns = {
    'y_actual': y,
    'y_pred': y_mean,
    'y_hdi_3': y_hdi_3,
    'y_hdi_97': y_hdi_97,
}
for ch in channel_names:
    ch_series = contributions_over_time[ch]
    prediction_columns[f'contrib_{ch}'] = ch_series['mean']
    prediction_columns[f'contrib_{ch}_hdi_3'] = ch_series['hdi_3']
    prediction_columns[f'contrib_{ch}_hdi_97'] = ch_series['hdi_97']

pred_df = model_df[['date']].assign(**prediction_columns)

pred_df.to_csv(csv_path, index=False)
print(f"✅ Predictions exported to {csv_path}")

# One line per channel: total contribution and ROAS, each with its 3–97 percentile range
print("\n=== Summary ===")
for ch in channel_names:
    contrib_stats = channels_dict[ch]['total_contribution']
    roas_stats = channels_dict[ch]['roas']
    contrib_part = (
        f"contrib={contrib_stats['mean']:,.0f} "
        f"[{contrib_stats['hdi_3']:,.0f} – {contrib_stats['hdi_97']:,.0f}]"
    )
    roas_part = (
        f"ROAS={roas_stats['mean']:.5f} "
        f"[{roas_stats['hdi_3']:.5f} – {roas_stats['hdi_97']:.5f}]"
    )
    print(f"{ch:20s}  {contrib_part}  {roas_part}")

In [None]:
# Download the exported files through the browser when running on Colab.
# Outside Colab the google.colab import fails and a hint is printed instead.
EXPORTED_FILES = (
    '/content/bayesian_posteriors.json',
    '/content/predictions_bayesian.csv',
)
try:
    from google.colab import files
    for exported_file in EXPORTED_FILES:
        files.download(exported_file)
    print("Downloads triggered — check your browser downloads folder.")
    print("Then copy bayesian_posteriors.json into data/model/ in your local repo.")
except ImportError:
    print("Not running on Colab — files saved locally in data/model/")

## 5. Analyze Results

In [None]:
if isinstance(results, dict):
    # Actual vs Predicted: both series share the default index-based x axis
    fit_traces = [
        go.Scatter(y=results['y_actual'], name='Actual', mode='lines'),
        go.Scatter(y=results['y_pred'], name='Predicted', mode='lines', line={'dash': 'dash'}),
    ]
    fig = go.Figure(data=fit_traces)
    fig.update_layout(title='Model Fit', yaxis_title='Leads')
    fig.show()

    # ROAS: one row per media channel
    roas_rows = [
        (
            CHANNEL_LABELS[ch],
            results['total_spend'][ch],
            results['total_contributions'][ch],
            results['roas'][ch],
        )
        for ch in MEDIA_CHANNELS
    ]
    roas_df = pd.DataFrame(
        roas_rows,
        columns=['Channel', 'Total Spend', 'Total Contribution', 'ROAS'],
    )
    display(roas_df)

In [None]:
# Save results for dashboard
# Only a dict result is passed to save_results; any other return type from
# fit_mmm is skipped silently.
if isinstance(results, dict):
    save_results(results)
    print('Results saved.')

## 6. Launch Dashboard

```bash
# From the project root:
streamlit run app.py
```