<a href="https://colab.research.google.com/github/jingjieyuan573-bite/Composite_Distribution_Monte_Carlo_simulation/blob/main/Composite_Monte_Carlo_Simulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Composite Monte Carlo Simulation (Fixed, Corrected, and Sensitivity-Enhanced)

This script runs Monte Carlo comparisons between three models:
- Composite (center: skew-normal; tails: skew-t)
- Skew-t
- Skew-normal

Features:
- monte_carlo_sim(M,n): one Monte Carlo experiment
- bootstrap_ci: bootstrap 95% CI for grouped means
- run_sample_size_sensitivity: run experiments for multiple n and print per-model summary tables with bootstrap 95% CIs

This version fixes all syntax issues and unterminated string literals and adds full sample-size sensitivity analysis with bootstrap CIs, directly displaying results.
"""

import time
import numpy as np
import pandas as pd
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# ---------- RNG helper ----------

def ensure_rng(rs=None):
    """Coerce ``rs`` into a ``numpy.random.RandomState``.

    - An existing ``RandomState`` is returned as-is (the same object, so the
      caller and callee share one random stream).
    - A Python or NumPy integer is used as the seed of a new ``RandomState``.
    - ``None`` or any other value yields a new, unseeded ``RandomState``.
    """
    if isinstance(rs, np.random.RandomState):
        return rs
    # Anything that is not an integer seed (including None) means "unseeded".
    seed = int(rs) if isinstance(rs, (int, np.integer)) else None
    return np.random.RandomState(seed)

# ---------- Skew-t sampling ----------

def skewt_rvs(xi=0.0, omega=1.0, alpha=0.0, nu=8.0, size=1, random_state=None):
    """Draw skew-t variates as a skew-normal divided by a scaled chi-square root.

    Parameters
    ----------
    xi, omega : location added to, and scale multiplied into, the result.
    alpha : shape parameter passed to ``stats.skewnorm.rvs``.
    nu : degrees of freedom of the chi-square mixing variable.
    size : output size forwarded to both underlying samplers.
    random_state : seed or ``RandomState``; normalised via ``ensure_rng``.

    Returns
    -------
    ``xi + omega * U * sqrt(nu / S)`` where ``U`` is a standard skew-normal
    draw and ``S`` a chi-square(``nu``) draw from the same random stream.
    """
    generator = ensure_rng(random_state)
    # Draw order matters for reproducibility: skew-normal first, chi-square second.
    skew_normal = stats.skewnorm.rvs(alpha, loc=0.0, scale=1.0, size=size, random_state=generator)
    chi_square = generator.chisquare(nu, size=size)
    tail_factor = np.sqrt(nu / chi_square)
    return xi + omega * skew_normal * tail_factor

# ---------- Composite sampling ----------

def composite_rvs(size=1, xi=0.0, omega=1.0, alpha=15.0, nu=6.0,
                  theta1=-0.3, theta2=0.3, left_frac=0.08, right_frac=0.08, random_state=None):
    """Draw ``size`` variates from the three-region composite distribution.

    Every draw is first assigned to a region: the left tail with probability
    ``left_frac``, the right tail with probability ``right_frac`` and the
    centre otherwise. Values for each region come from rejection sampling:

    - left tail : ``skewt_rvs`` draws kept when ``<= theta1``;
    - centre    : skew-normal draws kept when strictly between the thresholds;
    - right tail: ``skewt_rvs`` draws kept when ``>= theta2``.

    Parameters
    ----------
    size : number of variates (converted with ``int``).
    xi, omega, alpha : location, scale and shape shared by both proposals.
    nu : degrees of freedom of the skew-t proposal.
    theta1, theta2 : lower and upper region thresholds.
    left_frac, right_frac : mixture weights of the two tails.
    random_state : seed or ``RandomState``; normalised via ``ensure_rng``.

    Returns
    -------
    1-D float ``ndarray`` of length ``size``.

    Raises
    ------
    ValueError
        If ``size`` is negative, a fraction lies outside ``[0, 1]``, the
        fractions sum to more than 1, or ``theta1 >= theta2`` while the centre
        region has positive weight (its acceptance region would be empty and
        the rejection loop could never finish).
    """
    n = int(size)
    if n < 0:
        raise ValueError("size must be non-negative")
    if not (0.0 <= left_frac <= 1.0 and 0.0 <= right_frac <= 1.0):
        raise ValueError("left_frac and right_frac must lie in [0, 1]")
    # Small tolerance so that e.g. 0.9 + 0.1 is not rejected by rounding noise.
    if left_frac + right_frac > 1.0 + 1e-12:
        raise ValueError("left_frac + right_frac must not exceed 1")
    if theta1 >= theta2 and left_frac + right_frac < 1.0:
        raise ValueError("theta1 must be smaller than theta2 when the centre region has positive weight")

    rng = ensure_rng(random_state)
    samples = np.empty(n, dtype=float)
    u = rng.rand(n)
    # Region label per draw: 0 = left tail, 1 = centre, 2 = right tail.
    centre_upper = left_frac + (1 - left_frac - right_frac)
    comp = np.where(u < left_frac, 0, np.where(u < centre_upper, 1, 2))

    for k in (0, 1, 2):
        idx = np.flatnonzero(comp == k)
        need = idx.size
        if need == 0:
            continue
        batch_size = max(need * 2, 500)
        accepted = []
        n_accepted = 0
        # Rejection sampling: keep proposing batches until the region is filled.
        # This can take many batches when the region is unlikely under its proposal.
        while n_accepted < need:
            if k == 1:
                batch = stats.skewnorm.rvs(alpha, loc=xi, scale=omega, size=batch_size, random_state=rng)
                valid = batch[(batch > theta1) & (batch < theta2)]
            else:
                batch = skewt_rvs(xi=xi, omega=omega, alpha=alpha, nu=nu, size=batch_size, random_state=rng)
                valid = batch[batch <= theta1] if k == 0 else batch[batch >= theta2]
            accepted.append(valid)
            n_accepted += valid.size
        # Surplus accepted draws from the final batch are discarded.
        samples[idx] = np.concatenate(accepted)[:need]
    return samples

# ---------- Empirical CDF factory ----------

def empirical_cdf_factory(sample):
    """Build the empirical CDF of ``sample`` and return it as a callable.

    The returned function maps ``x`` (scalar or array-like) to the fraction of
    sample points that are less than or equal to ``x``.
    """
    ordered = np.sort(np.asarray(sample))
    total = float(len(ordered))

    def cdf(x):
        # side='right' counts ties, i.e. this evaluates P(X <= x).
        counts = np.searchsorted(ordered, np.asarray(x), side='right')
        return counts / total

    return cdf

# ---------- Tail KS ----------

def compute_tailks(cdf_func, data_sorted, upper_pct, lower_pct):
    """Return the largest CDF discrepancy over the tail observations.

    Tail observations are the points of ``data_sorted`` at or below its
    ``lower_pct`` percentile together with those at or above its ``upper_pct``
    percentile. At each of them ``cdf_func`` is compared with the empirical CDF
    of ``data_sorted`` (fraction of points ``<=`` the value), and the maximum
    absolute difference is returned as a float. ``nan`` is returned when no
    tail observation is selected.

    Note the argument order: ``upper_pct`` comes before ``lower_pct``.
    """
    lo_cut, hi_cut = (np.percentile(data_sorted, pct) for pct in (lower_pct, upper_pct))
    lower_tail = data_sorted[data_sorted <= lo_cut]
    upper_tail = data_sorted[data_sorted >= hi_cut]
    tails = np.concatenate([lower_tail, upper_tail])
    if tails.size == 0:
        return np.nan

    model_cdf = np.asarray(cdf_func(tails))
    ranks = np.searchsorted(data_sorted, tails, side='right')
    empirical_cdf = ranks / float(len(data_sorted))
    return float(np.abs(model_cdf - empirical_cdf).max())

# ---------- Log-likelihood ----------

def loglikelihood(model_name, x):
    """Per-observation log-density of ``x`` under a fixed, canonical model.

    - ``'Composite'``: Student-t (df=6) density for ``x <= -0.3`` and
      ``x >= 0.3``, skew-normal (a=15) density in between; densities are
      floored at ``1e-12`` before taking the log.
    - ``'Skew-t'``: log of a Student-t (df=5) density (no skewness term).
    - any other name: log of a skew-normal (a=6) density.

    All distributions use ``loc=0`` and ``scale=1``.
    NOTE(review): the composite pieces are not weighted by region
    probabilities, so the piecewise function is not a normalised density —
    confirm this is intended before comparing values across models.
    """
    x = np.asarray(x)

    if model_name == 'Skew-t':
        return np.log(stats.t.pdf(x, df=5, loc=0, scale=1))
    if model_name != 'Composite':
        return np.log(stats.skewnorm.pdf(x, a=6, loc=0, scale=1))

    density = np.zeros_like(x, dtype=float)
    # Both tails share the same Student-t density, so they are filled together.
    in_tails = (x <= -0.3) | (x >= 0.3)
    in_centre = (x > -0.3) & (x < 0.3)
    if in_tails.any():
        density[in_tails] = stats.t.pdf(x[in_tails], df=6, loc=0, scale=1)
    if in_centre.any():
        density[in_centre] = stats.skewnorm.pdf(x[in_centre], a=15, loc=0, scale=1)
    # Values in neither region (e.g. NaN) keep density 0 and hit the floor.
    return np.log(np.maximum(density, 1e-12))

# ---------- Monte Carlo simulation ----------

# Column order of the frame returned by monte_carlo_sim.
_MC_COLUMNS = ['rep', 'model', 'avg_loglike', 'tailks', 'bias_mean', 'mse_mean']


def _fit_or_default(dist, sample, default, min_obs=0):
    """Fit ``dist`` to ``sample``; return ``default`` if too few points or the fit raises."""
    if len(sample) < min_obs:
        return default
    try:
        shape, loc, scale = dist.fit(sample)
    except Exception:
        # Deliberate best-effort: a failed optimiser run falls back to canonical values.
        return default
    return shape, loc, scale


def _fitted_composite(data, region_masks, n, mc_sample_size, rng):
    """Fit the three-piece composite to ``data``; return (parametric sample, avg log-density)."""
    left_mask, mid_mask, right_mask = region_masks
    left_obs, mid_obs, right_obs = data[left_mask], data[mid_mask], data[right_mask]

    # Region weights estimated by observed proportions, floored away from zero.
    r1_est = max(1e-6, len(left_obs) / float(n))
    r2_est = max(1e-6, len(right_obs) / float(n))
    rmid_est = max(1e-6, 1.0 - r1_est - r2_est)

    # Each piece is fitted only when its region holds at least 10 observations.
    a_c, loc_c, scale_c = _fit_or_default(stats.skewnorm, mid_obs, (15.0, 0.0, 1.0), min_obs=10)
    df_l, loc_l, scale_l = _fit_or_default(stats.t, left_obs, (6.0, 0.0, 1.0), min_obs=10)
    df_r, loc_r, scale_r = _fit_or_default(stats.t, right_obs, (6.0, 0.0, 1.0), min_obs=10)

    n_left_s = max(1, int(mc_sample_size * r1_est))
    n_mid_s = max(1, int(mc_sample_size * rmid_est))
    n_right_s = max(1, mc_sample_size - n_left_s - n_mid_s)

    # Draw order (left, centre, right) is kept fixed so seeded runs stay reproducible.
    left_sample = stats.t.rvs(df_l, loc=loc_l, scale=scale_l, size=n_left_s, random_state=rng)
    mid_sample = stats.skewnorm.rvs(a_c, loc=loc_c, scale=scale_c, size=n_mid_s, random_state=rng)
    right_sample = stats.t.rvs(df_r, loc=loc_r, scale=scale_r, size=n_right_s, random_state=rng)
    param_sample = np.concatenate([left_sample, mid_sample, right_sample])

    # NOTE(review): the piecewise densities below are not multiplied by the
    # estimated region weights, so this is not a normalised density and the
    # value may not be comparable with the other models' log-likelihoods.
    dens = np.zeros_like(data, dtype=float)
    if np.any(left_mask):
        dens[left_mask] = stats.t.pdf(data[left_mask], df=df_l, loc=loc_l, scale=scale_l)
    if np.any(mid_mask):
        dens[mid_mask] = stats.skewnorm.pdf(data[mid_mask], a=a_c, loc=loc_c, scale=scale_c)
    if np.any(right_mask):
        dens[right_mask] = stats.t.pdf(data[right_mask], df=df_r, loc=loc_r, scale=scale_r)
    avg_loglike = float(np.mean(np.log(np.maximum(dens, 1e-12))))
    return param_sample, avg_loglike


def _fitted_model(model_name, data, region_masks, n, mc_sample_size, rng):
    """Fit ``model_name`` to ``data``; return (parametric sample, avg log-likelihood)."""
    if model_name == 'Composite':
        return _fitted_composite(data, region_masks, n, mc_sample_size, rng)
    if model_name == 'Skew-t':
        # The 'Skew-t' candidate is approximated by a (symmetric) Student-t fit.
        df_t, loc_t, scale_t = _fit_or_default(stats.t, data, (6.0, 0.0, 1.0))
        sample = stats.t.rvs(df_t, loc=loc_t, scale=scale_t, size=mc_sample_size, random_state=rng)
        return sample, float(np.mean(stats.t.logpdf(data, df=df_t, loc=loc_t, scale=scale_t)))
    a_sn, loc_sn, scale_sn = _fit_or_default(stats.skewnorm, data, (6.0, 0.0, 1.0))
    sample = stats.skewnorm.rvs(a_sn, loc=loc_sn, scale=scale_sn, size=mc_sample_size, random_state=rng)
    return sample, float(np.mean(stats.skewnorm.logpdf(data, a_sn, loc=loc_sn, scale=scale_sn)))


def _canonical_model(model_name, data, mc_sample_size, rng):
    """Fallback with fixed canonical parameters; return (sample, avg log-likelihood)."""
    if model_name == 'Composite':
        sample = composite_rvs(size=mc_sample_size, random_state=rng)
    elif model_name == 'Skew-t':
        sample = skewt_rvs(size=mc_sample_size, random_state=rng)
    else:
        sample = stats.skewnorm.rvs(a=6.0, loc=0.0, scale=1.0, size=mc_sample_size, random_state=rng)
    return sample, float(np.mean(loglikelihood(model_name, data)))


def monte_carlo_sim(M=50, n=2000, seed=1, rng=None, mc_sample_size=20000):
    """Run ``M`` Monte Carlo replications comparing three candidate models.

    For each replication:
      - one dataset of size ``n`` is drawn from the composite DGP
        (``composite_rvs`` with thresholds -0.3 / 0.3 and 8% per tail);
      - each candidate ('Composite', 'Skew-t', 'Skew-normal') is fitted to the
        dataset; if anything in the fit/sampling step raises, canonical
        parameters are used instead;
      - a parametric sample of about ``mc_sample_size`` points from the fitted
        model defines the fitted CDF used for the tail KS statistic
        (5th / 95th percentile tails);
      - ``bias_mean`` and ``mse_mean`` are the mean and mean square of ``n``
        points resampled from that parametric sample (both relative to 0);
      - ``avg_loglike`` is the mean log-density of the observed data.

    Parameters
    ----------
    M : number of replications.
    n : observations per simulated dataset.
    seed : seed used only when ``rng`` is ``None``.
    rng : optional seed or ``RandomState``; takes precedence over ``seed``.
    mc_sample_size : size of the parametric sample per fitted model.

    Returns
    -------
    ``pandas.DataFrame`` with one row per (replication, model) and columns
    ``rep, model, avg_loglike, tailks, bias_mean, mse_mean``. The columns are
    present even when ``M`` is 0, so downstream ``groupby('model')`` works.
    """
    reps = int(M)
    if reps <= 0:
        return pd.DataFrame(columns=_MC_COLUMNS)

    rng = ensure_rng(rng if rng is not None else seed)
    theta1, theta2 = -0.3, 0.3
    rows = []

    for rep in range(reps):
        data = composite_rvs(size=n, xi=0.0, omega=1.0, alpha=15.0, nu=6.0,
                             theta1=theta1, theta2=theta2, left_frac=0.08, right_frac=0.08,
                             random_state=rng)
        data_sorted = np.sort(data)
        # Region membership of the observed data, computed once per replication.
        region_masks = (data <= theta1, (data > theta1) & (data < theta2), data >= theta2)

        for model_name in ('Composite', 'Skew-t', 'Skew-normal'):
            try:
                param_sample, avg_loglike = _fitted_model(
                    model_name, data, region_masks, n, mc_sample_size, rng)
            except Exception:
                # Best-effort by design: one failed replication must not abort the study.
                param_sample, avg_loglike = _canonical_model(model_name, data, mc_sample_size, rng)

            # Fitted CDF is the empirical CDF of the parametric sample.
            fitted_cdf = empirical_cdf_factory(param_sample)
            tailks = compute_tailks(fitted_cdf, data_sorted, 95, 5)

            # Resample n model-implied points; with replacement only if the pool is too small.
            sim_draw = rng.choice(param_sample, size=n, replace=len(param_sample) < n)
            bias_mean = float(np.mean(sim_draw))
            mse_mean = float(np.mean((sim_draw - 0.0) ** 2))

            rows.append({'rep': int(rep), 'model': model_name, 'avg_loglike': avg_loglike,
                         'tailks': tailks, 'bias_mean': bias_mean, 'mse_mean': mse_mean})

    return pd.DataFrame(rows, columns=_MC_COLUMNS)

# ---------- Bootstrap CI ----------

def bootstrap_ci(df, model, col, B=500, alpha=0.05, rng=None):
    """Percentile-bootstrap confidence interval for one model's mean of ``col``.

    Rows of ``df`` whose ``'model'`` equals ``model`` are selected, their
    ``col`` values are resampled with replacement ``B`` times, and the
    ``alpha/2`` and ``1 - alpha/2`` percentiles of the resampled means are
    returned as ``(lo, hi)``. ``(nan, nan)`` is returned when no row matches.
    ``rng`` is normalised via ``ensure_rng``.
    """
    rng = ensure_rng(rng)
    values = df.loc[df['model'] == model, col].values
    count = len(values)
    if count == 0:
        return (np.nan, np.nan)

    resampled_means = [
        np.mean(rng.choice(values, size=count, replace=True))
        for _ in range(int(B))
    ]
    lower_q = 100 * alpha / 2
    upper_q = 100 * (1 - alpha / 2)
    lo, hi = np.percentile(resampled_means, [lower_q, upper_q])
    return float(lo), float(hi)

# ---------- Run sample-size sensitivity with bootstrap ----------

def run_sample_size_sensitivity(sample_sizes=(500, 2000, 5000), M=500, B_boot=500, seed=1234, quick=False):
    """Run the Monte Carlo study for several sample sizes and print the results.

    For every ``n`` in ``sample_sizes`` this runs ``monte_carlo_sim``, prints
    the per-model means of the four metrics and then prints a bootstrap 95% CI
    per model and metric. One random stream (seeded by ``seed``) is shared by
    all simulations and bootstraps.

    Parameters
    ----------
    sample_sizes : iterable of dataset sizes.
    M : Monte Carlo replications per sample size.
    B_boot : bootstrap resamples per confidence interval.
    seed : seed of the shared random stream.
    quick : if true, use ``max(10, M // 10)`` replications and
        ``max(50, B_boot // 10)`` bootstrap resamples instead.

    Returns
    -------
    ``(df, summary)`` for the LAST sample size only: the raw per-replication
    frame and its per-model mean summary. Empty frames (with the usual
    columns) are returned when ``sample_sizes`` is empty.
    """
    metric_cols = ['avg_loglike', 'tailks', 'bias_mean', 'mse_mean']
    # Labels are padded exactly as they appear in the printed report.
    ci_labels = (('Avg LOGLIKE', 'avg_loglike'), ('Tail KS   ', 'tailks'),
                 ('Bias      ', 'bias_mean'), ('MSE       ', 'mse_mean'))

    sample_sizes = list(sample_sizes)
    if not sample_sizes:
        # Nothing to run: return well-formed empty results instead of failing.
        return (pd.DataFrame(columns=['rep', 'model'] + metric_cols),
                pd.DataFrame(columns=['model'] + metric_cols))

    rng = ensure_rng(seed)
    if quick:
        M_use, B_use = max(10, int(M // 10)), max(50, int(B_boot // 10))
    else:
        M_use, B_use = int(M), int(B_boot)

    for n in sample_sizes:
        print(f"\nRunning Monte Carlo simulation with sample size {n}")
        df = monte_carlo_sim(M=M_use, n=n, seed=seed, rng=rng)

        summary = df.groupby('model')[metric_cols].mean().reset_index()
        print("\nMonte Carlo results (averaged over repetitions):")
        print(summary)

        for model in summary['model']:
            # Compute all four intervals first (fixed RNG order), then print.
            intervals = [(label, bootstrap_ci(df, model, col, B=B_use, rng=rng))
                         for label, col in ci_labels]
            print(f"\n{model} 95% CI:")
            for label, ci in intervals:
                print(f"  {label}: {ci}")

    return df, summary

if __name__ == "__main__":
    # Small-scale test run so that output appears quickly.
    df, summary = run_sample_size_sensitivity(
        sample_sizes=[500, 2000],  # can be changed back to [500, 2000, 5000]
        M=50,                      # can be changed back to 500
        B_boot=100,                # can be changed back to 500
        seed=1234,
        quick=True                 # quick=True runs only a few repetitions to show results quickly
    )



Running Monte Carlo simulation with sample size 500

Monte Carlo results (averaged over repetitions):
         model  avg_loglike    tailks  bias_mean  mse_mean
0    Composite     0.852765  0.011545   0.196687  0.222386
1  Skew-normal    -0.293357  0.081285   0.229782  0.173250
2       Skew-t     0.126865  0.028655   0.144405  0.510046

Composite 95% CI:
  Avg LOGLIKE: (0.8300208926767781, 0.8801674344583343)
  Tail KS   : (0.01012375, 0.015071375)
  Bias      : (0.18646065655867167, 0.2109603242468765)
  MSE       : (0.16500600204345614, 0.3103000304804865)

Skew-normal 95% CI:
  Avg LOGLIKE: (-0.34018158269922316, -0.2165260576940988)
  Tail KS   : (0.03389912500000002, 0.13128499999999996)
  Bias      : (0.21433213170218593, 0.2495441182729316)
  MSE       : (0.15268569079522235, 0.19174285450631293)

Skew-t 95% CI:
  Avg LOGLIKE: (0.07947304582532919, 0.1754090825075892)
  Tail KS   : (0.024692500000000017, 0.03316112500000003)
  Bias      : (0.12982767701058265, 0.155627207460451