Scenario #01

In [4]:
import yfinance as yf
import pandas as pd
import numpy as np
import pymc as pm
import arviz as az
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# 1. Load Data
df = yf.download('AAPL', start='2020-09-21', end='2025-09-21')
df['LogReturn'] = np.log(df['Close'] / df['Close'].shift(1))
returns = df['LogReturn'].dropna().values  # ~1,260 daily returns

# 2. Bayesian Volatility Model
with pm.Model() as model:
    # Priors: Normal-Inverse-Gamma for mean (mu) and variance (sigma2)
    mu = pm.Normal('mu', mu=0, sigma=0.01)
    sigma = pm.InverseGamma('sigma', alpha=2, beta=0.1)
    # Likelihood
    returns_obs = pm.Normal('returns_obs', mu=mu, sigma=sigma, observed=returns)
    # Predictive variable for future returns
    pred_returns = pm.Normal('pred_returns', mu=mu, sigma=sigma, shape=30)  # 30-day forecast
    # Sample posterior
    trace = pm.sample(1000, tune=1000, return_inferencedata=True)

# 3. Posterior Predictive Sampling
with model:
    pred_trace = pm.sample_posterior_predictive(trace, var_names=['pred_returns'])

# 4. Extract Posterior and Predictive Samples
posterior = az.extract(trace)
mu_samples = posterior['mu'].values  # Shape: (4000,) after flattening chains
sigma_samples = posterior['sigma'].values  # Shape: (4000,)
pred_samples = pred_trace.posterior_predictive['pred_returns'].values  # Shape: (chains, draws, 30)

# Save trace for reproducibility
az.to_netcdf(trace, 'trace.nc')
az.to_netcdf(pred_trace, 'pred_trace.nc')

# 5. Interactive Visualization
fig = make_subplots(rows=2, cols=2, subplot_titles=('Prior vs Posterior: Mu', 'Prior vs Posterior: Sigma',
                                                    'MCMC Trace: Mu', 'Predictive Returns (30 Days)'))

# Prior vs Posterior: Mu
fig.add_trace(go.Histogram(x=np.random.normal(0, 0.01, 1000), name='Prior Mu', opacity=0.5), row=1, col=1)
fig.add_trace(go.Histogram(x=mu_samples, name='Posterior Mu', opacity=0.5), row=1, col=1)

# Prior vs Posterior: Sigma
fig.add_trace(go.Histogram(x=np.random.gamma(2, 0.1, 1000), name='Prior Sigma', opacity=0.5), row=1, col=2)
fig.add_trace(go.Histogram(x=sigma_samples, name='Posterior Sigma', opacity=0.5), row=1, col=2)

# MCMC Trace: Mu
fig.add_trace(go.Scatter(x=np.arange(len(mu_samples)), y=mu_samples, mode='lines', name='Mu Trace'), row=2, col=1)

# Predictive Returns: Mean and 95% CI
pred_mean = pred_samples.mean(axis=(0, 1))  # Mean over chains and draws
pred_ci = np.percentile(pred_samples, [2.5, 97.5], axis=(0, 1))  # 95% credible interval
fig.add_trace(go.Scatter(x=np.arange(30), y=pred_mean, mode='lines', name='Mean Pred Returns'), row=2, col=2)
fig.add_trace(go.Scatter(x=np.arange(30), y=pred_ci[0], mode='lines', name='95% CI Lower', line=dict(dash='dash')), row=2, col=2)
fig.add_trace(go.Scatter(x=np.arange(30), y=pred_ci[1], mode='lines', name='95% CI Upper', line=dict(dash='dash')), row=2, col=2)

fig.update_layout(title='Bayesian Volatility Analysis for AAPL', showlegend=True)
fig.write_html('volatility_viz.html')  # Export for GitHub Pages

  df = yf.download('AAPL', start='2020-09-21', end='2025-09-21')
[*********************100%***********************]  1 of 1 completed
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [mu, sigma, pred_returns]


Output()

Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 3 seconds.
Sampling: [pred_returns]


Output()

In [3]:
pred_trace 

---
Scenario #02

In [6]:
import pandas as pd
import numpy as np
import pymc as pm
import arviz as az
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# 1. Load and Preprocess Data (Updated URL)
from ucimlrepo import fetch_ucirepo
online_retail = fetch_ucirepo(id=352)  # UCI ID for Online Retail
df = online_retail.data.original  # Raw dataframe
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Compute time-to-next-purchase
df = df.sort_values(['CustomerID', 'InvoiceDate'])
df['NextPurchase'] = df.groupby('CustomerID')['InvoiceDate'].shift(-1)
df['TimeToNext'] = (df['NextPurchase'] - df['InvoiceDate']).dt.days
df['Censored'] = df['TimeToNext'].isna().astype(int)  # 1 if no next purchase (censored)
df['TimeToNext'] = df['TimeToNext'].fillna(365)  # Censor at 1 year
df['TimeToNext'] = df['TimeToNext'].clip(lower=0.1)  # Ensure times > 0 (add 0.1 for same-day purchases)

# Aggregate by customer, select top countries
df['Country'] = df['Country'].replace(['EIRE', 'Channel Islands'], 'Other')
top_countries = df['Country'].value_counts().head(5).index
df = df[df['Country'].isin(top_countries)]
customer_data = df.groupby(['CustomerID', 'Country']).agg({
    'TimeToNext': 'min',
    'Censored': 'min'
}).reset_index()

# Encode countries and validate shapes
customer_data = customer_data.reset_index(drop=True)
country_idx = pd.Categorical(customer_data['Country']).codes
times = customer_data['TimeToNext'].values
censored = customer_data['Censored'].values
n_customers = len(customer_data)
assert len(country_idx) == len(times) == len(censored) == n_customers, "Shape mismatch in data arrays"
assert all(times > 0), f"Invalid times: {times[times <= 0]}"  # Ensure all times positive

# 2. Bayesian Weibull Survival Model
with pm.Model() as survival_model:
    # Hyperpriors for country-level parameters
    mu_alpha = pm.HalfNormal('mu_alpha', sigma=2)  # Positive prior
    sigma_alpha = pm.HalfNormal('sigma_alpha', sigma=1)
    mu_beta = pm.HalfNormal('mu_beta', sigma=2)    # Positive prior
    sigma_beta = pm.HalfNormal('sigma_beta', sigma=1)
    
    # Country-specific parameters (positive)
    n_countries = len(np.unique(country_idx))
    alpha = pm.HalfNormal('alpha', sigma=sigma_alpha, shape=n_countries)
    beta = pm.HalfNormal('beta', sigma=sigma_beta, shape=n_countries)
    
    # Map parameters to customers
    alpha_i = alpha[country_idx]
    beta_i = beta[country_idx]
    
    # Weibull likelihood for uncensored data
    pm.Weibull('t_uncensored', alpha=alpha_i[censored == 0], beta=beta_i[censored == 0], 
               observed=times[censored == 0])
    
    # Log-survival for censored data
    censored_logsurv = -((times[censored == 1] / beta_i[censored == 1]) ** alpha_i[censored == 1])
    pm.Potential('censored_logsurv', censored_logsurv.sum())
    
    # Predictive survival times
    pred_times = pm.Weibull('pred_times', alpha=alpha_i, beta=beta_i, shape=n_customers)
    
    # Sample posterior
    trace = pm.sample(1000, tune=1000, return_inferencedata=True, target_accept=0.9)

# 3. Posterior Predictive Sampling
with survival_model:
    pred_trace = pm.sample_posterior_predictive(trace, var_names=['pred_times'])

# 4. Extract Posterior and Predictive Samples
posterior = az.extract(trace)
alpha_samples = posterior['alpha'].values  # Shape: (chains*draws, n_countries)
beta_samples = posterior['beta'].values    # Shape: (chains*draws, n_countries)
pred_times_samples = pred_trace.posterior_predictive['pred_times'].values  # Shape: (chains, draws, n_customers)

# Save traces
az.to_netcdf(trace, 'survival_trace.nc')
az.to_netcdf(pred_trace, 'survival_pred_trace.nc')

# 5. Interactive Visualization
fig = make_subplots(rows=1, cols=2, subplot_titles=('Posterior Alpha by Country', 'Survival Curves by Country'))

# Posterior Alpha
countries = pd.Categorical(customer_data['Country']).categories
colors = ['blue', 'red', 'green', 'orange', 'purple']
for i, country in enumerate(countries):
    fig.add_trace(go.Histogram(x=alpha_samples[:, i], name=f'Alpha: {country}', 
                               marker_color=colors[i], opacity=0.6), row=1, col=1)

# Survival Curves with Slider
t = np.linspace(0.1, 365, 100)  # Start at 0.1 to avoid log(0)
traces = []
for i, country in enumerate(countries):
    alpha_mean = alpha_samples[:, i].mean()
    beta_mean = beta_samples[:, i].mean()
    survival = np.exp(-((t / beta_mean) ** alpha_mean))
    trace = go.Scatter(x=t, y=survival, mode='lines', name=f'Survival: {country}',
                       line=dict(color=colors[i]), visible=(i == 0))
    traces.append(trace)
    fig.add_trace(trace, row=1, col=2)

# Slider for country selection
steps = [
    {'method': 'restyle', 'label': country, 'args': [{'visible': [j == i for j in range(len(countries))]}]}
    for i, country in enumerate(countries)
]
fig.update_layout(
    title='Bayesian Survival Analysis for Customer Lifetime Value',
    xaxis2_title='Days to Next Purchase', yaxis2_title='Survival Probability',
    sliders=[{'steps': steps, 'active': 0, 'currentvalue': {'prefix': 'Country: '}}],
    showlegend=True
)
fig.write_html('survival_viz.html')

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [mu_alpha, sigma_alpha, mu_beta, sigma_beta, alpha, beta, pred_times]


Output()

Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 223 seconds.
  pred_trace = pm.sample_posterior_predictive(trace, var_names=['pred_times'])
Sampling: [pred_times]


Output()