In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# ================= 0) 光谱网格：380–750 nm，步长 5 nm =================
def make_wavelengths():
    """
    返回: 380–750 nm, 步长 5 nm 的波长数组
    """
    return np.arange(380.0, 750.0 + 1e-9, 5.0)


# ================= 1) 折射率：TiO2 / SiO2 / MgF2（k=0） =================
# TiO2
lam_tab_tio2 = np.array([
    380.0, 425.0, 450.0, 475.0, 500.0, 525.0, 550.0, 575.0, 600.0,
    625.0, 650.0, 675.0, 750.0, 775.0, 800.0, 825.0, 850.0, 900.0,
    1000.0, 1060.0
])
n_tab_tio2 = np.array([
    2.55, 2.49, 2.469, 2.444, 2.422, 2.402, 2.385, 2.37, 2.351,
    2.343, 2.337, 2.331, 2.322, 2.317, 2.313, 2.311, 2.309, 2.305,
    2.300, 2.299
])


def n_tio2(lam_nm):
    return np.interp(lam_nm, lam_tab_tio2, n_tab_tio2)


# SiO2
lam_tab_sio2 = np.array([300.0, 350.0, 400.0, 450.0, 500.0,
                         550.0, 600.0, 650.0, 700.0, 900.0, 1000.0])

n_tab_sio2 = np.array([1.478, 1.472, 1.467, 1.463, 1.459,
                       1.455, 1.452, 1.450, 1.446, 1.437, 1.434])


def n_sio2(lam_nm):
    return np.interp(lam_nm, lam_tab_sio2, n_tab_sio2)


# MgF2
lam_tab_mgf2 = np.array([248.0, 550.0, 1550.0])
n_tab_mgf2 = np.array([1.40, 1.38, 1.36])


def n_mgf2(lam_nm):
    return np.interp(lam_nm, lam_tab_mgf2, n_tab_mgf2)


# 玻璃（常数折射率）
glass_n_const = 1.5163


def n_glass(lam_nm):
    return np.full_like(lam_nm, glass_n_const, dtype=float)


# ================= 2) TMM（正入射，未偏振） =================
def tmm_normal_rt(layers, lam_nm, n_env, n_sub):
    """
    正入射传输矩阵法，计算 R/T。

    参数
    ----
    layers : list of (n_arr, d_nm)
        从空气到基底（topdown）顺序的层列表。
        n_arr 是在 lam_nm 上的折射率数组，d_nm 是该层厚度（nm）。
    lam_nm : np.ndarray
        波长数组（nm）。
    n_env : float
        入射介质折射率（标量）。
    n_sub : float 或 np.ndarray
        最后基底折射率（常数或数组）。

    返回
    ----
    R, T : np.ndarray
        反射率 / 透射率
    """
    wl_c = lam_nm.astype(np.complex128)
    n0 = np.complex128(n_env)
    ns = np.complex128(n_sub)

    # 初始化总传输矩阵为单位阵
    M11 = np.ones_like(wl_c, dtype=np.complex128)
    M12 = np.zeros_like(wl_c, dtype=np.complex128)
    M21 = np.zeros_like(wl_c, dtype=np.complex128)
    M22 = np.ones_like(wl_c, dtype=np.complex128)

    for (n_layer_arr, d_nm) in layers:
        nj = n_layer_arr.astype(np.complex128)
        delta = 2.0 * np.pi * nj * (d_nm / wl_c)
        c, s = np.cos(delta), np.sin(delta)

        A = c
        B = 1j * s / nj
        C = 1j * nj * s
        D = c

        T11 = M11 * A + M12 * C
        T12 = M11 * B + M12 * D
        T21 = M21 * A + M22 * C
        T22 = M21 * B + M22 * D

        M11, M12, M21, M22 = T11, T12, T21, T22

    den = (n0 * M11 + n0 * ns * M12 + M21 + ns * M22)
    r = (n0 * M11 + n0 * ns * M12 - M21 - ns * M22) / den
    t = (2.0 * n0) / den

    R = np.abs(r) ** 2
    T = (np.real(ns) / np.real(n0)) * np.abs(t) ** 2
    return R.real, T.real


# ================= 3) 随机结构（厚度 10–300 nm，步长 10 nm） =================
MATERIALS = ["SiO2", "TiO2", "MgF2"]


def mat_to_narr(name, lam_nm):
    if name == "SiO2":
        return n_sio2(lam_nm)
    elif name == "TiO2":
        return n_tio2(lam_nm)
    elif name == "MgF2":
        return n_mgf2(lam_nm)
    else:
        raise ValueError(f"Unknown material: {name}")


def build_layers(materials, thicknesses_nm, lam_nm, input_order="topdown"):
    """
    将材料名 + 厚度转换为 TMM 需要的 layers 列表。

    参数
    ----
    materials : list[str]
    thicknesses_nm : list[float]
    lam_nm : np.ndarray
    input_order : "topdown" 或 "bottomup"
        - "topdown": materials[0] 是贴空气的顶层
        - "bottomup": materials[0] 是贴玻璃的最底层（内部会翻转）

    返回
    ----
    layers_topdown, mats_topdown, thks_topdown
    """
    if input_order not in ("topdown", "bottomup"):
        raise ValueError("input_order must be 'topdown' or 'bottomup'")

    if input_order == "bottomup":
        materials = list(reversed(materials))
        thicknesses_nm = list(reversed(thicknesses_nm))

    layers = [
        (mat_to_narr(m, lam_nm), float(d))
        for m, d in zip(materials, thicknesses_nm)
    ]
    return layers, materials, thicknesses_nm  # 全部为 topdown 顺序


def rand_stack(rng,
               min_layers=1, max_layers=20,
               tmin=10.0, tmax=300.0, thickness_step=10.0):
    """
    随机生成一组膜系结构。

    约束：
    - 层数 L ∈ [min_layers, max_layers]
    - 厚度 ∈ [10, 300]，且为 thickness_step 的整数倍
    - 相邻两层材料不能相同
    """
    # ---- 随机层数 ----
    L = rng.integers(min_layers, max_layers + 1)

    # ---- 材料序列：相邻不重复 ----
    mats = []
    for i in range(L):
        if i == 0:
            m = rng.choice(MATERIALS)
        else:
            prev = mats[-1]
            candidates = [mat for mat in MATERIALS if mat != prev]
            m = rng.choice(candidates)
        mats.append(m)

    # ---- 厚度序列：10–300，步长 10 ----
    lo = int(np.ceil(tmin / thickness_step))   # = 1
    hi = int(np.floor(tmax / thickness_step))  # = 30
    idx = rng.integers(lo, hi + 1, size=L)
    thks = (idx * thickness_step).astype(float).tolist()

    return mats, thks  # 这里返回的是“生成顺序”，通常当作 topdown 使用


def serialize_tokens(materials_topdown, thicknesses_topdown, fmt=".0f"):
    """
    将膜系序列编码成 token 字符串，如：
    SiO2:120|TiO2:230|MgF2:40|<EoS>

    这里使用的是 topdown 顺序
    """
    spec = fmt.lstrip(":")
    parts = [
        f"{m}:{format(float(t), spec)}"
        for m, t in zip(materials_topdown, thicknesses_topdown)
    ]
    return "|".join(parts) + "|<EoS>"


# ================= 4) 光谱：玻璃背板 + 非相干合成 =================
def simulate_stack_glass_noncoh(front_layers_topdown,
                                lam_nm,
                                n_air=1.0,
                                n_exit=1.0):
    """
    前向/反向 TMM + 玻璃背面菲涅耳反射 + 非相干合成。

    - 所有层均为无吸收介质（k=0）。
    - front_layers_topdown 必须为自顶向下顺序。
    """
    n_sub_arr = n_glass(lam_nm)

    # 正向：Air -> stack -> Glass
    R01, T01 = tmm_normal_rt(front_layers_topdown, lam_nm, n_air, n_sub_arr)

    # 反向：Glass -> reversed(stack) -> Air
    R10, T10 = tmm_normal_rt(list(reversed(front_layers_topdown)),
                             lam_nm, n_sub_arr, n_air)

    # 玻璃-空气 背面菲涅耳反射（正入射常数）
    R_b_scalar = ((glass_n_const - n_exit) / (glass_n_const + n_exit)) ** 2
    R_b = np.full_like(lam_nm, R_b_scalar, dtype=float)

    # 非相干多次反射合成
    R = R01 + (T01 * T10 * R_b) / (1.0 - R10 * R_b)
    T = (T01 * (1.0 - R_b)) / (1.0 - R10 * R_b)

    return R, T


# ================= 5) 数据集生成 =================
def _structure_str(materials_topdown, thicknesses_topdown):
    """
    生成示例中那种结构字段字符串：
    ['SiO2_120', 'TiO2_230', ...]
    """
    tokens = [
        f"{m}_{int(round(float(t)))}"
        for m, t in zip(materials_topdown, thicknesses_topdown)
    ]
    return "[" + ", ".join([f"'{tok}'" for tok in tokens]) + "]"


def structure_key(materials_topdown, thicknesses_topdown):
    """
    用于快速去重的 key：((mat, thickness_int), ...)
    厚度转 int 以避免浮点误差
    """
    return tuple((m, int(round(float(t)))) for m, t in zip(materials_topdown, thicknesses_topdown))


def generate_dataset(num_samples=1000,
                     thickness_step=10.0,
                     input_order="topdown",  # 或 "bottomup"
                     seed=42,
                     output_csv="opto_RT_380_750_like_lastfile.csv"):
    """
    生成数据集，输出为宽表 CSV（去重版）
    """
    print(f"开始生成数据集：{num_samples} 个【去重后】样本")
    print(f"输出文件：{output_csv}")
    wl = make_wavelengths()  # 380–750, 步长 5.0 nm
    rng = np.random.default_rng(seed)

    # 预生成列名
    R_cols = [f"R_{int(round(l))}nm" for l in wl]
    T_cols = [f"T_{int(round(l))}nm" for l in wl]
    final_cols = ["structure", "num_layers"] + R_cols + T_cols

    rows = []
    seen = set()  # 已出现的结构 key

    with tqdm(total=num_samples, desc="生成去重数据集", unit="样本") as pbar:
        while len(rows) < num_samples:
            # 1) 随机膜系结构（厚度 10–300、相邻材料不同）
            mats_in, thks_in = rand_stack(
                rng,
                min_layers=1, max_layers=20,
                tmin=20.0, tmax=300.0,
                thickness_step=thickness_step
            )

            # 2) 统一转换成 topdown 顺序
            layers, mats_topdown, thks_topdown = build_layers(
                mats_in, thks_in, wl, input_order=input_order
            )

            # 2.5) 生成去重 key
            key = structure_key(mats_topdown, thks_topdown)
            if key in seen:
                continue  # 已存在，跳过计算
            seen.add(key)

            # 3) 玻璃背板 + 非相干合成（只对新结构仿真）
            R, T = simulate_stack_glass_noncoh(layers, wl)

            # 4) 行字典
            row = {}
            row["structure"] = _structure_str(mats_topdown, thks_topdown)
            row["num_layers"] = len(mats_topdown)

            for lam, r in zip(wl, R):
                lam_i = int(round(lam))
                row[f"R_{lam_i}nm"] = float(r)

            for lam, t in zip(wl, T):
                lam_i = int(round(lam))
                row[f"T_{lam_i}nm"] = float(t)

            rows.append(row)
            pbar.update(1)

    # DataFrame 严格按目标列顺序
    print(f"\n正在创建 DataFrame（共 {len(rows)} 行，已去重）...")
    df = pd.DataFrame(rows, columns=final_cols)

    # 导出 CSV：索引列名为 Unnamed: 0
    print(f"正在保存到 {output_csv}...")
    df.to_csv(output_csv,
              index=True,
              index_label="Unnamed: 0",
              encoding="utf-8")
    print(f"✓ 保存完成！文件大小: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    return df


# ================= 6) 示例：生成 20000 条并随机预览 100 行 =================
if __name__ == "__main__":
    df = generate_dataset(
        num_samples=200000,
        thickness_step=10.0,
        input_order="bottomup",  # 或 "topdown"
        seed=42,
        output_csv="opto_RT_380_750_like_test.csv"
    )

    # 随机抽 100 行看看
    n_preview = min(100, len(df))
    print("\n=== Preview of {} Random Rows ===".format(n_preview))
    print(df.sample(n_preview, random_state=0))

开始生成数据集：200000 个【去重后】样本
输出文件：opto_RT_380_750_like_test.csv


生成去重数据集: 100%|██████████| 200000/200000 [03:05<00:00, 1075.71样本/s]



正在创建 DataFrame（共 200000 行，已去重）...
正在保存到 opto_RT_380_750_like_test.csv...
✓ 保存完成！文件大小: 266.50 MB

=== Preview of 100 Random Rows ===
                                                structure  num_layers  \
54458                  ['SiO2_50', 'MgF2_60', 'SiO2_100']           3   
118646  ['SiO2_290', 'MgF2_280', 'TiO2_240', 'SiO2_260...           6   
57401   ['TiO2_280', 'MgF2_90', 'SiO2_50', 'TiO2_210',...          19   
115324  ['TiO2_50', 'MgF2_210', 'TiO2_50', 'MgF2_80', ...          18   
71684   ['SiO2_240', 'MgF2_100', 'TiO2_150', 'SiO2_40'...           9   
...                                                   ...         ...   
191922  ['TiO2_60', 'MgF2_80', 'SiO2_180', 'MgF2_70', ...           5   
13054   ['SiO2_240', 'TiO2_230', 'MgF2_90', 'SiO2_110'...          16   
39192     ['TiO2_110', 'MgF2_80', 'TiO2_250', 'SiO2_170']           4   
161588  ['MgF2_150', 'TiO2_40', 'SiO2_250', 'TiO2_200'...           6   
152138  ['MgF2_140', 'TiO2_90', 'SiO2_170', 'TiO2_260'...       