# Problem 3

## Problem (a)

### 1. Download (or generate) the dynamics dataset

In [None]:
# Simulating a 1D CLG model ensemble takes a long time,
# so this notebook uses a pre-computed ensemble of runs
# (generated on my local machine), which this cell downloads.
!wget -O clg_finals_dynamics_10000samples.npy https://github.com/kimmw3002/CLP_HW1/raw/refs/heads/main/clg_finals_dynamics_10000samples.npy

In [None]:
# clg.py
# on windows, make this a separate .py file to avoid issues with multiprocessing
# can be run directly as a cell in colab

# simulating a 1D CLG model ensemble takes a long time
# so will use a pre-computed ensemble of runs
# done on my local machine, which can be downloaded from the cell above
# in theory, you could run this to generate the ensemble on colab
import numpy as np
import warnings
from concurrent.futures import ProcessPoolExecutor

# ---------- single‑run simulator ----------
def run_CLG_until_absorbing(L, N, max_steps=1_000_000, rng=None):
    """
    Simulate one run of the 1D lattice gas on a ring until no particle can move.

    N particles are placed on distinct, uniformly chosen sites of a periodic
    lattice of length L.  At every step one "movable" particle -- an occupied
    site with at least one occupied neighbour and at least one empty
    neighbour -- is picked uniformly at random and hops to one of its empty
    neighbouring sites (picked uniformly when both are empty).

    Parameters
    ----------
    L : int
        Number of lattice sites (periodic boundary conditions).
    N : int
        Number of particles to place.
    max_steps : int, optional
        Maximum number of hops to perform.
    rng : numpy.random.Generator, optional
        Source of randomness; a fresh unseeded generator is created when omitted.

    Returns
    -------
    numpy.ndarray
        int8 array of shape (L,) holding 1 for occupied and 0 for empty sites.

    Warns
    -----
    RuntimeWarning
        Only if the configuration still has a movable particle after
        `max_steps` hops.  A run that becomes absorbing exactly on its last
        permitted hop returns silently.
    """
    if rng is None:
        rng = np.random.default_rng()
    conf = np.zeros(L, dtype=np.int8)
    conf[rng.choice(L, N, replace=False)] = 1

    hops = 0
    while True:
        occ = conf == 1
        left_occ  = np.roll(occ,  1)    # left_occ[i]  <-> site i-1 is occupied
        right_occ = np.roll(occ, -1)    # right_occ[i] <-> site i+1 is occupied
        # conf only ever holds 0/1, so "empty" is the complement of "occupied"
        left_emp  = ~left_occ
        right_emp = ~right_occ
        movable = np.where(occ & (left_occ | right_occ) & (left_emp | right_emp))[0]
        if movable.size == 0:
            return conf
        # Test the hop budget only AFTER the absorbing check so that a run
        # finishing on its final hop is not reported as unfinished.
        if hops >= max_steps:
            break
        i = rng.choice(movable)
        targets = [(i - 1) % L] if left_emp[i] else []
        if right_emp[i]:
            targets.append((i + 1) % L)
        j = rng.choice(targets)
        conf[j], conf[i] = 1, 0
        hops += 1
    warnings.warn("max_steps reached before absorbing", RuntimeWarning)
    return conf

# ---------- helper for the pool ----------
def _worker(args):
    """
    Process-pool entry point: run one simulation from a packed argument tuple.

    `args` is `(L, N, max_steps, seed)`; a dedicated generator is built from
    `seed` so that every task draws from its own random stream.
    """
    lattice_size, n_particles, step_cap, seed = args
    generator = np.random.default_rng(seed)
    return run_CLG_until_absorbing(lattice_size, n_particles, step_cap, rng=generator)

# ---------- public ensemble ----------
def run_ensemble_cpu(L, N, runs, max_steps=1_000_000, n_workers=None, seed=None):
    """
    Run `runs` independent simulations in a process pool and stack the results.

    A master generator seeded with `seed` draws one integer seed per run; each
    run is then dispatched to `_worker` as an `(L, N, max_steps, seed)` tuple.
    `n_workers` is forwarded to `ProcessPoolExecutor(max_workers=...)`.

    Returns the final configurations as a single array, one row per run.
    """
    seed_source = np.random.default_rng(seed)
    run_seeds = seed_source.integers(0, 2**63 - 1, size=runs)

    jobs = []
    for run_seed in run_seeds:
        jobs.append((L, N, max_steps, run_seed))

    with ProcessPoolExecutor(max_workers=n_workers) as executor:
        # map() yields results in submission order, so row k belongs to run_seeds[k]
        finals = list(executor.map(_worker, jobs, chunksize=1))

    return np.array(finals)            # shape (runs, L)

# ---------- run the ensemble ----------
# Script entry point: generate the dynamics ensemble and save it to disk.
# The guard keeps this from running when the file is imported rather than
# executed (see the note at the top of this cell about multiprocessing on Windows).
if __name__ == "__main__":
    L, rho, R = 10_000, 0.4, 10000    # lattice, density, #runs
    # Particle number is int(L*rho); seed=42 fixes the master generator that
    # draws the per-run seeds, and n_workers=None leaves the pool size to the
    # executor's default.
    finals = run_ensemble_cpu(L, int(L*rho), R, n_workers=None, seed=42)
    # Same file name as the pre-computed ensemble fetched by the download cell.
    np.save("clg_finals_dynamics_10000samples.npy", finals)
    print("CPU ensemble done & saved.")

### 2. Generate combinatorics dataset

In [3]:
import numpy as np

def run_dataset2_fast(L, N, C, rng=None):
    """
    Returns a (C, L) int8 array of 0/1 rows, each with N ones and no two 1s adjacent.

    Each row is built by choosing N distinct "slots" out of M = L - N + 1 and
    shifting the k-th smallest slot right by k.  The shift forces a gap of at
    least one 0 between consecutive 1s, and the slot-set -> row mapping is
    one-to-one, so uniformly chosen slot sets give uniformly chosen rows.

    Note: adjacency is enforced along the open row only.  The first and last
    sites are NOT treated as neighbours, so a row may have a 1 at both index 0
    and index L-1.

    Parameters
    ----------
    L : int
        Row length.
    N : int
        Number of ones per row; must satisfy 0 <= N <= L - N + 1.
    C : int
        Number of rows to draw.
    rng : numpy.random.Generator, optional
        Source of randomness for reproducible sampling.  When omitted, the
        legacy global `np.random` state is used, exactly as before.

    Raises
    ------
    ValueError
        If L is negative or N ones cannot be placed without touching.
    """
    M = L - N + 1
    if L < 0 or N < 0 or N > M:
        raise ValueError(
            f"cannot place N={N} non-adjacent ones in a row of length L={L}"
        )

    # 1) pick N "slots" in [0..M-1] per row: the indices of the N smallest of
    #    M i.i.d. uniforms form a uniformly random N-subset
    rand = np.random.rand(C, M) if rng is None else rng.random((C, M))
    idxs = np.argpartition(rand, N-1, axis=1)[:, :N]
    idxs.sort(axis=1)

    # 2) shift so no two 1's touch
    offsets   = np.arange(N)
    positions = idxs + offsets  # (C, N), values in [0, L-1]

    # 3) scatter into zero array
    data = np.zeros((C, L), dtype=np.int8)
    rows = np.arange(C)[:, None]
    data[rows, positions] = 1

    return data

L, N, C = 10000, 4000, 10000   # row length, ones per row, number of rows
data = run_dataset2_fast(L, N, C)

# Name the output file once so the save call and the message cannot drift apart.
comb_out_file = "clg_finals_combinatorics_10000samples.npy"
# save to .npy
np.save(comb_out_file, data)
print(f"Saved to {comb_out_file}")

Saved to clg_finals_combinatorics_10000samples.npy


## Problem (b)

In [4]:
import numpy as np
import zlib

def compress_lza(data_bytes):
    """
    Compressed size, in bytes, of `data_bytes`.

    Stand-in for an LZ77-style compressor: the payload is run through
    `zlib.compress` (DEFLATE) at its default settings and only the length of
    the compressed output is returned.
    """
    packed = zlib.compress(data_bytes)
    return len(packed)

def compute_cid_stats(dataset, compressor):
    """
    Mean and standard deviation of the per-row CID of `dataset`.

    For every row of the 2-D array `dataset` (shape (C, L)), the row is cast to
    uint8, serialised to bytes and handed to `compressor`, which must return
    the compressed length.  The row's CID is that length divided by the
    uncompressed byte length.

    Returns
    -------
    (mean_cid, std_cid) : tuple
        `.mean()` and `.std()` of the C per-row CID values.
    """
    n_rows, _row_len = dataset.shape
    ratios = np.empty(n_rows, dtype=float)
    for idx, row in enumerate(dataset):
        raw = row.astype(np.uint8).tobytes()
        ratios[idx] = compressor(raw) / len(raw)
    return ratios.mean(), ratios.std()

# Input files: the dynamics ensemble (downloaded or generated in Problem (a), part 1)
# and the combinatorics ensemble (saved in Problem (a), part 2).
dyn_file  = "clg_finals_dynamics_10000samples.npy"
comb_file = "clg_finals_combinatorics_10000samples.npy"

# load datasets
dynamics      = np.load(dyn_file)
combinatorics = np.load(comb_file)

# compute mean and standard deviation of CID
# (one CID per row, both ensembles measured with the same zlib-based compressor)
mean_dyn,  std_dyn  = compute_cid_stats(dynamics,      compress_lza)
mean_comb, std_comb = compute_cid_stats(combinatorics, compress_lza)

# Report both ensembles to six decimals.
print(f"Dynamics CID:      mean = {mean_dyn:.6f}, std = {std_dyn:.6f}")
print(f"Combinatorics CID: mean = {mean_comb:.6f}, std = {std_comb:.6f}")

Dynamics CID:      mean = 0.095458, std = 0.001225
Combinatorics CID: mean = 0.106395, std = 0.000810


- **Dynamics**: $\bar x_1 = 0.095458$, $\sigma_1 = 0.001225$, $N_1 = 10000$  
- **Combinatorics**: $\bar x_2 = 0.106395$, $\sigma_2 = 0.000810$, $N_2 = 10000$  

---

1. **Standard error of each mean**

   $$
   \mathrm{SE}_1 = \frac{\sigma_1}{\sqrt{N_1}}
     = \frac{0.001225}{\sqrt{10000}}
     = 1.225\times10^{-5},
   \quad
   \mathrm{SE}_2 = \frac{\sigma_2}{\sqrt{N_2}}
     = \frac{0.000810}{\sqrt{10000}}
     = 8.10\times10^{-6}.
   $$

2. **Standard error of the difference**

   $$
   \mathrm{SE}_\Delta
     = \sqrt{\mathrm{SE}_1^2 + \mathrm{SE}_2^2}
     = \sqrt{(1.225\times10^{-5})^2 + (8.10\times10^{-6})^2}
     \approx 1.469\times10^{-5}.
   $$

3. **Difference of means**

   $$
   \Delta \mu
     = \bar x_2 - \bar x_1
     = 0.106395 - 0.095458
     = 0.010937.
   $$

4. **Z-score**

   $$
   z
     = \frac{\Delta \mu}{\mathrm{SE}_\Delta}
     = \frac{0.010937}{1.469\times10^{-5}}
     \approx 745.
   $$

5. **Two-sided $p$-value**

   $$
   p
     = 2\,(1 - \Phi(z))
     \approx 0.
   $$

---

**Conclusion:**  
The difference in average CID ($\Delta\mu\approx0.01094$) corresponds to a $z$-score of about 745, i.e. the two means are hundreds of standard errors apart, so the difference is overwhelmingly statistically significant. This shows that the dynamically reached ensemble differs from the ensemble generated by combinatorial sampling: the dynamically reached states are not drawn uniformly from all combinatorially possible states.

## Problem (c)

### 1. Define neural network

In [None]:
# todo

### 2. Train and test model