In [None]:
import numpy as np
import pandas as pd
from scipy import stats

In [None]:
from google.colab import files

# Open the Colab upload widget and collect the uploaded file(s).
uploaded = files.upload()

# Use the first uploaded Excel workbook. A default is passed to next() so that
# a missing upload produces a clear FileNotFoundError instead of a bare
# StopIteration.
INPUT_PATH = next(
    (fn for fn in uploaded.keys() if fn.lower().endswith(('.xlsx', '.xls'))),
    None,
)
if INPUT_PATH is None:
    raise FileNotFoundError("업로드된 엑셀 파일(.xlsx/.xls)을 찾지 못했습니다.")
print(f"[INFO] 사용 파일: {INPUT_PATH}")

Saving 202501_clean2.xlsx to 202501_clean2.xlsx
[INFO] 사용 파일: 202501_clean2.xlsx


In [None]:
# Metal concentration columns that are analysed
TARGET_METALS = [
    'Cr(ng/m3)',
    'Co(ng/m3)',
    'Ni(ng/m3)',
    'As(ng/m3)',
    'Cd(ng/m3)',
    'Sb(ng/m3)',
    'Pb(ng/m3)',
]

# Distributions to fit, in the order they are attempted
DIST_LIST = [
    "로그 정규",
    "감마",
    "최대 극값",
    "와이블",
    "로지스틱",
    "삼각형",
    "스튜던트의 t",
    "정규",
    "최소 극값",
    "지수",
    "파레토",
    "균일",
    "BetaPERT",
    "베타",
]

# Name of the Excel workbook that receives the results
OUTPUT_PATH = "Tx-적합도.xlsx"

In [None]:
# 유틸/검정 함수
def get_distributions_extended():
    """Return a dict mapping each Korean display name to its scipy.stats distribution.

    BetaPERT is not included; the caller handles it separately.
    """
    named_distributions = (
        ("로그 정규", stats.lognorm),
        ("감마", stats.gamma),
        ("최대 극값", stats.gumbel_r),
        ("와이블", stats.weibull_min),
        ("로지스틱", stats.logistic),
        ("삼각형", stats.triang),
        ("스튜던트의 t", stats.t),
        ("정규", stats.norm),
        ("최소 극값", stats.gumbel_l),
        ("지수", stats.expon),
        ("파레토", stats.pareto),
        ("균일", stats.uniform),
        ("베타", stats.beta),
    )
    return dict(named_distributions)

def select_data_for_dist(x: np.ndarray, name: str) -> np.ndarray:
    """Return the observations usable for the named distribution.

    NaNs are always removed. For the distributions listed below only strictly
    positive values are kept.
    """
    positive_only = ("로그 정규", "감마", "와이블", "지수", "파레토", "베타", "BetaPERT")
    cleaned = x[~np.isnan(x)]
    if name not in positive_only:
        return cleaned
    return cleaned[cleaned > 0]

def fit_params(dist, x: np.ndarray, name: str):
    """Fit `dist` to `x` and return (params, error_message).

    For the positive-support families listed below a fit with loc fixed at 0 is
    tried first; if that raises, an unconstrained fit is used. On failure the
    result is (None, "fit_error: ...").
    """
    prefers_zero_loc = name in ("로그 정규", "감마", "와이블", "지수", "파레토")
    try:
        if prefers_zero_loc:
            try:
                return dist.fit(x, floc=0), None
            except Exception:
                # fall through to the unconstrained fit below
                pass
        return dist.fit(x), None
    except Exception as e:
        return None, f"fit_error: {e}"

def ks_test(dist, params, x: np.ndarray):
    """Run a Kolmogorov-Smirnov test of `x` against `dist` with the given params.

    Returns (D, p, None) on success and (nan, nan, "ks_error: ...") on failure.
    """
    try:
        statistic, pvalue = stats.kstest(x, dist.cdf, args=params)
        outcome = (float(statistic), float(pvalue), None)
    except Exception as e:
        outcome = (np.nan, np.nan, f"ks_error: {e}")
    return outcome

def chi_square_gof(dist, params, x: np.ndarray, min_expected=5, bins_min=5, bins_max=20):
    """Pearson chi-square goodness-of-fit test of `x` against a fitted distribution.

    The data are binned with an equal-width histogram of about sqrt(n) bins
    (clipped to [bins_min, bins_max]). Bins whose expected count is below
    `min_expected` are merged into a neighbour before the statistic is computed.

    Parameters
    ----------
    dist : object exposing ``cdf(edges, *params)`` (e.g. a scipy.stats distribution)
    params : tuple of fitted parameters passed positionally to ``dist.cdf``
    x : 1-D array of observations (NaN-free)
    min_expected : minimum expected count per bin after merging
    bins_min, bins_max : bounds on the initial number of histogram bins

    Returns
    -------
    (chi_stat, p_value, None) on success, or (nan, nan, "chi_...error...") when
    the test cannot be computed.

    Notes
    -----
    Degrees of freedom are ``bins - 1 - len(params)`` floored at 1; every entry
    of `params` is counted, including ones the caller fixed during fitting.
    """
    n = x.size
    if n < 10:
        return np.nan, np.nan, "chi_error: n<10"
    bins0 = int(np.clip(int(np.sqrt(n)), bins_min, bins_max))
    try:
        hist, edges = np.histogram(x, bins=bins0)
    except Exception as e:
        return np.nan, np.nan, f"chi_hist_error: {e}"

    def merge_bins(counts, expct):
        """Merge every bin with expected count < min_expected into a neighbour."""
        i = 0
        # Stop once a single bin is left: there is no neighbour to merge into.
        while i < len(expct) and len(expct) > 1:
            if expct[i] >= min_expected:
                i += 1
                continue
            if i == 0:
                target = 1
            elif i == len(expct) - 1:
                target = i - 1
            else:
                # interior bin: merge into the neighbour with the smaller expectation
                target = i - 1 if expct[i - 1] < expct[i + 1] else i + 1
            counts[target] += counts[i]
            expct[target] += expct[i]
            del counts[i]
            del expct[i]
            if target < i:
                # the merged bin now sits one position to the left; re-check it
                i -= 1
        return counts, expct

    try:
        cdfs = np.array(dist.cdf(edges, *params), dtype=float)
        # The outer bins absorb the tails so that expected counts sum to n,
        # matching the observed counts (which always sum to n).
        cdfs[0], cdfs[-1] = 0.0, 1.0
        expected = n * np.diff(cdfs)
        if not np.all(np.isfinite(expected)):
            return np.nan, np.nan, "chi_error: non-finite expected counts"

        counts, expct = merge_bins(hist.astype(float).tolist(),
                                   expected.astype(float).tolist())
        if any(e < min_expected for e in expct) or len(expct) < 2:
            return np.nan, np.nan, "chi_error: insufficient expected after merge"
        obs = np.array(counts, dtype=float)
        exp = np.array(expct, dtype=float)
        chi_stat = ((obs - exp) ** 2 / exp).sum()
        dof = max(len(exp) - 1 - (len(params)), 1)
        # sf() is the upper tail probability, i.e. 1 - cdf, without cancellation
        pval = stats.chi2.sf(chi_stat, dof)
        return float(chi_stat), float(pval), None
    except Exception as e:
        return np.nan, np.nan, f"chi_error: {e}"

def anderson_darling_stat(dist, params, x: np.ndarray):
    """Compute the Anderson-Darling A^2 statistic of `x` against a fitted distribution.

    Returns (A2, nan, None); no p-value is computed here. With fewer than five
    observations the result is (nan, nan, "ad_error: n<5").
    """
    ordered = np.sort(x)
    n = ordered.size
    if n < 5:
        return np.nan, np.nan, "ad_error: n<5"
    # Clip the CDF away from 0 and 1 so both logarithms stay finite.
    cdf_vals = np.clip(dist.cdf(ordered, *params), 1e-12, 1-1e-12)
    ranks = np.arange(1, n+1)
    log_terms = np.log(cdf_vals) + np.log(1 - cdf_vals[::-1])
    a_squared = -n - np.mean((2*ranks - 1) * log_terms)
    return float(a_squared), np.nan, None

# BetaPERT: (min, mode, max)을 Beta(loc=a, scale=b-a) 매핑
def betapert_alpha_beta(a, m, b, lamb=4.0):
    """Convert a PERT triple (min a, mode m, max b) into Beta shape parameters.

    Returns (alpha, beta), or (None, None) when a < m < b does not hold or a
    shape parameter would be non-positive.
    """
    if not (a < m < b):
        return None, None
    width = b - a
    alpha = 1.0 + lamb * (m - a) / width
    beta = 1.0 + lamb * (b - m) / width
    if alpha <= 0 or beta <= 0:
        return None, None
    return alpha, beta

def fit_betapert(x: np.ndarray, lamb=4.0):
    """Estimate BetaPERT parameters from data.

    Min and max come from the sample; the mode is the midpoint of the fullest
    bin of an 'auto' histogram. Returns (params, error, mode) where params is
    (alpha, beta, loc, scale) in scipy.stats.beta order, or
    (None, "betapert_error: ...", None) when the triple is invalid.
    """
    lower = float(np.min(x))
    upper = float(np.max(x))
    counts, bin_edges = np.histogram(x, bins='auto')
    peak = int(np.argmax(counts))
    mode = float(0.5 * (bin_edges[peak] + bin_edges[peak+1]))
    alpha, beta = betapert_alpha_beta(lower, mode, upper, lamb=lamb)
    if alpha is None:
        return None, "betapert_error: invalid (a,m,b)", None
    params = (alpha, beta, lower, (upper - lower))
    return params, None, mode

In [None]:
# 매개변수 라벨링
import math
def fmt(x, digits=6):
    """Format `x` with `digits` significant digits; fall back to str(x) on any error."""
    try:
        return format(float(x), f".{digits}g")
    except Exception:
        return str(x)

def describe_params_leftstyle(name: str, params: tuple, extra=None) -> str:
    """Render fitted parameters as a labelled, human-readable string.

    `params` is unpacked in scipy order for the named distribution (shape
    parameters first, then loc, scale). `extra` is accepted but not used.
    Unknown names fall back to "p1=..., p2=...".
    """
    # --- two-parameter (loc, scale) families ---
    if name in ("정규", "로지스틱", "최대 극값", "최소 극값", "지수", "균일"):
        loc, scale = params
        if name == "정규":
            return f"평균={fmt(loc)}, 표준 편 차={fmt(scale)}"
        if name == "로지스틱":
            return f"평균={fmt(loc)}, 스케일={fmt(scale)}"
        if name == "지수":
            # rate is the reciprocal of scale; undefined for scale 0 or inf
            lam = (1.0/scale) if (scale not in (0, float('inf'))) else float('nan')
            return f"비율={fmt(lam)}"
        if name == "균일":
            a = loc; b = loc + scale
            return f"최소={fmt(a)}, 최대={fmt(b)}"
        return f"최고가능성={fmt(loc)}, 스케일={fmt(scale)}"

    # --- one shape parameter + (loc, scale) ---
    if name in ("로그 정규", "와이블", "감마", "파레토", "스튜던트의 t", "삼각형"):
        shape, loc, scale = params
        if name == "로그 정규":
            s = shape  # scale = exp(mu)
            mu_log = math.log(scale) if scale > 0 else float("nan")
            # mean includes the shift; the standard deviation is shift-invariant
            mean = loc + math.exp(mu_log + s*s/2.0)
            std  = math.sqrt((math.exp(s*s)-1.0) * math.exp(2*mu_log + s*s))
            return f"평균={fmt(mean)}, 표준 편 차={fmt(std)}, 위치={fmt(loc)}"
        if name in ("와이블", "감마"):
            return f"위치={fmt(loc)}, 스케일={fmt(scale)}, 형태={fmt(shape)}"
        if name == "파레토":
            return f"위치={fmt(loc)}, 형태={fmt(shape)}"
        if name == "스튜던트의 t":
            return f"중간점={fmt(loc)}, 스케일={fmt(scale)}, 자유도={fmt(shape)}"
        # triangular: shape c is the relative position of the mode in [a, b]
        a = loc; b = loc + scale; mode = a + shape*(b-a)
        return f"최소={fmt(a)}, 최고가능성={fmt(mode)}, 최대={fmt(b)}"

    # --- two shape parameters + (loc, scale) ---
    if name in ("베타", "BetaPERT"):
        a, b, loc, scale = params
        left = loc; right = loc + scale
        if name == "베타":
            return f"최소={fmt(left)}, 최대={fmt(right)}, 알파={fmt(a)}, 베타={fmt(b)}"
        # invert alpha = 1 + lamb*(mode-left)/(right-left) with lamb = 4
        lamb = 4.0
        mode = left + (a - 1.0)/lamb * (right - left)
        return f"최소={fmt(left)}, 최고가능성={fmt(mode)}, 최대={fmt(right)}"

    return ", ".join([f"p{i}={fmt(p)}" for i,p in enumerate(params, start=1)])

In [None]:
def sort_by_goodness(res_df: pd.DataFrame, alpha_ks: float = 0.05, alpha_chi: float = 0.05) -> pd.DataFrame:
    """Return a copy of `res_df` ordered from best to worst fit.

    Rows are grouped (0 = K-S and chi-square both pass, 1 = only K-S passes,
    2 = otherwise) and then ordered by K-S p (descending), chi-square p
    (descending), A-D, K-S and chi-square statistics (ascending). Missing
    values are treated as the worst possible value for each key.
    """
    ranked = res_df.copy()
    for col in ["A-D","A-D P-값","K-S","K-S P-값","카이제곱","카이제곱 P-값"]:
        ranked[col] = pd.to_numeric(ranked[col], errors="coerce")

    # Helper sort keys with penalties for missing values
    ranked = ranked.assign(
        _ks_p=ranked["K-S P-값"].fillna(-1),
        _chi_p=ranked["카이제곱 P-값"].fillna(-1),
        _ad=ranked["A-D"].fillna(np.inf),
        _ks=ranked["K-S"].fillna(np.inf),
        _chi=ranked["카이제곱"].fillna(np.inf),
    )

    passes_ks = ranked["_ks_p"] >= alpha_ks
    passes_chi = ranked["_chi_p"] >= alpha_chi
    ranked["_group"] = np.select([passes_ks & passes_chi, passes_ks], [0, 1], default=2)

    sort_keys = ["_group","_ks_p","_chi_p","_ad","_ks","_chi"]
    directions = [True, False, False, True, True, True]
    ranked = ranked.sort_values(by=sort_keys, ascending=directions)
    return ranked.drop(columns=sort_keys)

In [None]:
# 데이터 로드 & 컬럼 체크
df = pd.read_excel(INPUT_PATH)

# Strip zero-width spaces and surrounding whitespace from the header names
df.columns = df.columns.str.replace('\u200b', '', regex=False).str.strip()

# Map the known header variants (extra space, superscript 3) to the canonical name
name_map = {}
for _symbol in ('Cr', 'Co', 'Ni', 'As', 'Cd', 'Sb', 'Pb'):
    name_map[f'{_symbol} (ng/m3)'] = f'{_symbol}(ng/m3)'
    name_map[f'{_symbol}(ng/m³)'] = f'{_symbol}(ng/m3)'

# Apply the mapping
df = df.rename(columns=name_map)

# Fail early when a required metal column is absent
missing = [m for m in TARGET_METALS if m not in df.columns]
if len(missing) > 0:
    raise ValueError(f"원본에 없는 컬럼: {missing}")

In [None]:
# 적합도 계산 및 금속별 정렬
dist_map = get_distributions_extended()
results_by_metal = {}

for metal in TARGET_METALS:
    raw = pd.to_numeric(df[metal], errors='coerce').to_numpy()
    recs = []

    for name in DIST_LIST:
        x = select_data_for_dist(raw.copy(), name)
        if x.size < 5:
            # too few observations: record a row of NaNs with the reason
            recs.append([name] + [np.nan] * 6 + ["표본수 부족(n<5)"])
            continue

        # BetaPERT is evaluated through stats.beta with derived parameters
        mode_est = None
        if name == "BetaPERT":
            dist = stats.beta
            params, fit_err, mode_est = fit_betapert(x, lamb=4.0)
        else:
            dist = dist_map[name]
            params, fit_err = fit_params(dist, x, name)

        if fit_err or params is None:
            # the parameter column carries the fit error text
            recs.append([name] + [np.nan] * 6 + [fit_err or "fit_error"])
            continue

        # The third element of each result is an error string; it is not recorded.
        ad_stat, ad_p, _ = anderson_darling_stat(dist, params, x)
        ks_D, ks_p, _ = ks_test(dist, params, x)
        chi_stat, chi_p, _ = chi_square_gof(dist, params, x)

        param_str = describe_params_leftstyle(name, params, extra={"mode_est": mode_est})
        ad_p_str = np.nan  # A-D p-value is not computed; shown as '---' later

        recs.append([name, ad_stat, ad_p_str, ks_D, ks_p, chi_stat, chi_p, param_str])

    res_df = pd.DataFrame(
        recs,
        columns=["분포","A-D","A-D P-값","K-S","K-S P-값","카이제곱","카이제곱 P-값","매개 변수"],
    )

    # Rank the distributions for this metal
    res_df_sorted = sort_by_goodness(res_df, alpha_ks=0.05, alpha_chi=0.05)
    results_by_metal[metal] = res_df_sorted

  x = np.asarray((x - loc)/scale, dtype=dtyp)
  x = np.asarray((x - loc)/scale, dtype=dtyp)
  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)


In [None]:
# 엑셀 레이아웃 생성
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.worksheet.table import Table, TableStyleInfo

wb = Workbook()
ws = wb.active
ws.title = "Tx-적합도"

# Shared cell styles
thin = Side(style="thin", color="000000")
border_all = Border(top=thin, bottom=thin, left=thin, right=thin)
header_fill = PatternFill("solid", fgColor="305496")  # navy header
header_font = Font(bold=True, color="FFFFFF")
metal_font  = Font(bold=True)
wrap = Alignment(wrapText=True, vertical="top")

# Column widths: B = distribution name, C..H = statistics,
# I = parameter text, A = metal label
for _letter, _width in (("B", 14), ("C", 9), ("D", 11), ("E", 9), ("F", 11),
                        ("G", 11), ("H", 13), ("I", 55), ("A", 9)):
    ws.column_dimensions[_letter].width = _width

r = 1          # current worksheet row
table_idx = 1  # running index used for unique table names

def metal_short(m):
    """Return the text before the first "(", stripped: "Fe(ng/m3)" -> "Fe"."""
    symbol, _, _ = m.partition("(")
    return symbol.strip()

for metal in TARGET_METALS:
    # Metal label in column A
    label_cell = ws.cell(row=r, column=1, value=metal_short(metal))
    label_cell.font = metal_font
    r += 1

    # Section header in columns B..I
    headers = ["분포","A-D","A-D P-값","K-S","K-S P-값","카이제곱","카이제곱 P-값","매개 변수"]
    for j, h in enumerate(headers, start=2):
        c = ws.cell(row=r, column=j, value=h)
        c.fill = header_fill
        c.font = header_font
        c.alignment = Alignment(horizontal="center")
        c.border = border_all

    # Data rows; NaN is shown as '---'
    df_sec = results_by_metal[metal].copy()
    df_sec = df_sec.replace({np.nan: '---'})

    start_row = r + 1
    for i in range(len(df_sec)):
        rowvals = df_sec.iloc[i].tolist()
        target_row = start_row + i
        for j, v in enumerate(rowvals, start=2):
            cell = ws.cell(row=target_row, column=j, value=v)
            cell.border = border_all
            if j == 9:
                # parameter text can be long: wrap it
                cell.alignment = wrap

    end_row = start_row + len(df_sec) - 1

    # One Excel table per section (header row + data rows)
    tbl_ref = f"B{r}:I{max(end_row, r)}"
    table = Table(displayName=f"T_{table_idx}", ref=tbl_ref)
    style = TableStyleInfo(
        name="TableStyleMedium9",
        showFirstColumn=False,
        showLastColumn=False,
        showRowStripes=True,
        showColumnStripes=False,
    )
    table.tableStyleInfo = style
    ws.add_table(table)
    table_idx += 1

    # Leave one blank row between sections
    r = end_row + 2

# Number format for the numeric statistic cells (columns C..H)
for row in ws.iter_rows(min_row=1, max_row=ws.max_row, min_col=3, max_col=8):
    for cell in row:
        if isinstance(cell.value, (float, int)):
            cell.number_format = "0.000000"

In [None]:
# 엑셀 저장 및 다운
# Write the workbook to OUTPUT_PATH, report the path, then hand the saved file
# to Colab's files.download(). The save must come first because the download
# takes the path of the file on disk.
wb.save(OUTPUT_PATH)
print("[완료] 저장:", OUTPUT_PATH)
files.download(OUTPUT_PATH)

[완료] 저장: Tx-적합도.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# 부트스트랩 루프 사용

In [2]:
from google.colab import files
# Open the Colab upload widget. The result is later passed to _pick_xlsx,
# which iterates its keys as file names.
uploaded = files.upload()

import numpy as np, pandas as pd, json, re, os
from scipy import stats
from scipy.stats import (gumbel_r, gumbel_l, lognorm, weibull_min, logistic, t,
                         norm, gamma, beta, triang, expon, pareto, uniform, chi2)
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
from openpyxl.worksheet.table import Table, TableStyleInfo

# Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
# (Colab 메시지)
# 업로드된 파일 이름 선택
def _pick_xlsx(uploaded_dict):
    for k in uploaded_dict.keys():
        if str(k).lower().endswith(('.xlsx', '.xls')):
            return k
    raise FileNotFoundError("업로드된 엑셀 파일(.xlsx/.xls)을 찾지 못했습니다.")
# Input workbook (first uploaded Excel file) and output workbook name
DATA_PATH = _pick_xlsx(uploaded)
OUTPUT_PATH = 'Tx-적합도.xlsx'

np.random.seed(42)           # seed NumPy's global random state
BOOTSTRAP_B = 300            # bootstrap replications (120-200 suggested when speed matters)
GROUP_GAP = 1                # number of blank rows between metal blocks
HEADER_BLUE = '2F5597'       # header colour
BEST_FILL = 'FFF2CC'         # highlight colour for the top-ranked row

# -----------------------------
# 유틸 함수
# -----------------------------
def detect_numeric_metal_columns(df):
    """Return the numeric columns whose lower-cased name contains no excluded token.

    Tokens are matched as substrings of the column name.
    """
    exclude_tokens = {'date','time','pump','begin','start','end','id','index','sample','unit',
                      'yyyymm','yyyy','mm','dd','hour','minute','sec'}

    def is_candidate(col):
        if not pd.api.types.is_numeric_dtype(df[col]):
            return False
        lowered = str(col).lower()
        return all(tok not in lowered for tok in exclude_tokens)

    return [col for col in df.columns if is_candidate(col)]

def to_numpy_clean(x):
    """Convert `x` to a float array with NaN and +/-inf entries removed."""
    as_float = pd.Series(x, dtype='float')
    without_inf = as_float.replace([np.inf, -np.inf], np.nan)
    return without_inf.dropna().values

def hist_mode_estimate(x):
    """Estimate the mode as the midpoint of the fullest histogram bin.

    The bin count is ceil(sqrt(n)) when the interquartile range is not
    positive; otherwise it is range / (2 * IQR * n**(-1/3)), rounded up and
    floored at 10. With fewer than two values the median is returned.
    """
    values = np.asarray(x)
    n = len(values)
    if n < 2:
        return float(np.median(values))
    q75, q25 = np.percentile(values, [75, 25])
    iqr = q75 - q25
    if iqr <= 0:
        bins = int(np.ceil(np.sqrt(n)))
    else:
        bins = max(10, int(np.ceil((values.max()-values.min())/(2*iqr*n**(-1/3)))))
    counts, edges = np.histogram(values, bins=bins)
    peak = counts.argmax()
    return float((edges[peak] + edges[peak+1]) / 2)

# -----------------------------
# 분포 래퍼
# -----------------------------
class W:
    """Base class for the distribution wrappers.

    Holds the display name, the fitted parameter dict `p`, the parameter count
    `np` and a `valid` flag that `_fail` clears.
    """

    def __init__(self, name):
        self.name = name
        self.p = {}
        self.np = None
        self.valid = True

    def _fail(self):
        """Mark this wrapper as invalid and return it."""
        self.valid = False
        return self

class GumbelR(W):
    """Gumbel (maximum extreme value) distribution wrapper."""

    def __init__(self):
        super().__init__('최대 극값')

    def fit(self, x):
        """Fit loc/scale with scipy; mark the wrapper invalid if the fit fails."""
        try:
            loc, sc = gumbel_r.fit(x)
        except Exception:
            return self._fail()
        if not (np.isfinite(loc) and sc > 0):
            return self._fail()
        self.p = dict(loc=loc, scale=sc)
        self.np = 2
        return self

    def cdf(self, z): return gumbel_r.cdf(z, **self.p)
    def ppf(self, q): return gumbel_r.ppf(q, **self.p)

    def rvs(self, n, random_state=None):
        """Draw n variates.

        With random_state=None scipy uses NumPy's global random state, so
        successive calls give different samples. A hard-coded integer seed
        would return the same sample on every call.
        """
        return gumbel_r.rvs(size=n, random_state=random_state, **self.p)

class LogNormal(W):
    """Log-normal distribution wrapper with loc fixed at 0."""

    def __init__(self):
        super().__init__('로그 정규')

    def fit(self, x):
        """Fit shape/scale on the positive values; needs at least 10 of them."""
        x = x[x > 0]
        if len(x) < 10:
            return self._fail()
        try:
            s, loc, sc = lognorm.fit(x, floc=0)
        except Exception:
            return self._fail()
        if not (s > 0 and sc > 0):
            return self._fail()
        self.p = dict(shape=s, loc=0.0, scale=sc)
        self.np = 2
        return self

    def cdf(self, z): return lognorm.cdf(z, s=self.p['shape'], loc=0, scale=self.p['scale'])
    def ppf(self, q): return lognorm.ppf(q, s=self.p['shape'], loc=0, scale=self.p['scale'])

    def rvs(self, n, random_state=None):
        """Draw n variates.

        With random_state=None scipy uses NumPy's global random state, so
        successive calls give different samples. A hard-coded integer seed
        would return the same sample on every call.
        """
        return lognorm.rvs(s=self.p['shape'], loc=0, scale=self.p['scale'],
                           size=n, random_state=random_state)

class Weibull(W):
    """Weibull (minimum) distribution wrapper with loc fixed at 0."""

    def __init__(self):
        super().__init__('와이블')

    def fit(self, x):
        """Fit shape/scale on the positive values; needs at least 10 of them."""
        x = x[x > 0]
        if len(x) < 10:
            return self._fail()
        try:
            c, loc, sc = weibull_min.fit(x, floc=0)
        except Exception:
            return self._fail()
        if not (c > 0 and sc > 0):
            return self._fail()
        self.p = dict(c=c, loc=0.0, scale=sc)
        self.np = 2
        return self

    def cdf(self, z): return weibull_min.cdf(z, c=self.p['c'], loc=0, scale=self.p['scale'])
    def ppf(self, q): return weibull_min.ppf(q, c=self.p['c'], loc=0, scale=self.p['scale'])

    def rvs(self, n, random_state=None):
        """Draw n variates.

        With random_state=None scipy uses NumPy's global random state, so
        successive calls give different samples. A hard-coded integer seed
        would return the same sample on every call.
        """
        return weibull_min.rvs(c=self.p['c'], loc=0, scale=self.p['scale'],
                               size=n, random_state=random_state)

class Logistic(W):
    """Logistic distribution wrapper."""

    def __init__(self):
        super().__init__('로지스틱')

    def fit(self, x):
        """Fit loc/scale with scipy; mark the wrapper invalid if the fit fails."""
        try:
            loc, sc = logistic.fit(x)
        except Exception:
            return self._fail()
        if not sc > 0:
            return self._fail()
        self.p = dict(loc=loc, scale=sc)
        self.np = 2
        return self

    def cdf(self, z): return logistic.cdf(z, **self.p)
    def ppf(self, q): return logistic.ppf(q, **self.p)

    def rvs(self, n, random_state=None):
        """Draw n variates.

        With random_state=None scipy uses NumPy's global random state, so
        successive calls give different samples. A hard-coded integer seed
        would return the same sample on every call.
        """
        return logistic.rvs(size=n, random_state=random_state, **self.p)

class StudentT(W):
    """Student's t distribution wrapper with progressively constrained fits.

    The distribution is referenced as ``stats.t`` rather than through a bare
    module-level name ``t``, so rebinding ``t`` elsewhere in the notebook
    cannot break this class.
    """

    def __init__(self):
        super().__init__('스튜던트의 t')

    def fit(self, x):
        """Fit (df, loc, scale).

        Attempts, in order: a free fit; a fit with loc fixed at the sample
        median; a fit with df fixed at 30. The first attempt that yields
        finite parameters with positive scale (and positive df) is kept.
        """
        x = np.asarray(x, float)
        x = x[np.isfinite(x)]
        if x.size < 5 or np.allclose(x.min(), x.max()):
            return self._fail()

        attempts = (
            lambda: stats.t.fit(x),
            lambda: stats.t.fit(x, floc=float(np.median(x))),
        )
        for attempt in attempts:
            try:
                df_, loc, sc = attempt()
            except Exception:
                continue
            if np.isfinite(df_ * loc * sc) and sc > 0 and df_ > 0:
                break
        else:
            # last resort: fix the degrees of freedom
            df_ = 30.0
            try:
                _, loc, sc = stats.t.fit(x, fdf=df_)
            except Exception:
                return self._fail()
            if not np.isfinite(loc * sc) or sc <= 0:
                return self._fail()

        self.p = dict(df=df_, loc=loc, scale=sc)
        self.np = 3
        return self

    def cdf(self, z): return stats.t.cdf(z, **self.p)
    def ppf(self, q): return stats.t.ppf(q, **self.p)

    def rvs(self, n, random_state=None):
        """Draw n variates.

        With random_state=None scipy uses NumPy's global random state, so
        successive calls give different samples. A hard-coded integer seed
        would return the same sample on every call.
        """
        return stats.t.rvs(size=n, random_state=random_state, **self.p)

class Normal(W):
    """Normal distribution wrapper."""

    def __init__(self):
        super().__init__('정규')

    def fit(self, x):
        """Fit mean/standard deviation with scipy; invalid if the fit fails."""
        try:
            mu, sig = norm.fit(x)
        except Exception:
            return self._fail()
        if not sig > 0:
            return self._fail()
        self.p = dict(loc=mu, scale=sig)
        self.np = 2
        return self

    def cdf(self, z): return norm.cdf(z, **self.p)
    def ppf(self, q): return norm.ppf(q, **self.p)

    def rvs(self, n, random_state=None):
        """Draw n variates.

        With random_state=None scipy uses NumPy's global random state, so
        successive calls give different samples. A hard-coded integer seed
        would return the same sample on every call.
        """
        return norm.rvs(size=n, random_state=random_state, **self.p)

class Gamma_(W):
    """Gamma distribution wrapper with loc fixed at 0."""

    def __init__(self):
        super().__init__('감마')

    def fit(self, x):
        """Fit shape/scale on the positive values; needs at least 10 of them."""
        x = x[x > 0]
        if len(x) < 10:
            return self._fail()
        try:
            a, loc, sc = gamma.fit(x, floc=0)
        except Exception:
            return self._fail()
        if not (a > 0 and sc > 0):
            return self._fail()
        self.p = dict(a=a, loc=0.0, scale=sc)
        self.np = 2
        return self

    def cdf(self, z): return gamma.cdf(z, a=self.p['a'], loc=0, scale=self.p['scale'])
    def ppf(self, q): return gamma.ppf(q, a=self.p['a'], loc=0, scale=self.p['scale'])

    def rvs(self, n, random_state=None):
        """Draw n variates.

        With random_state=None scipy uses NumPy's global random state, so
        successive calls give different samples. A hard-coded integer seed
        would return the same sample on every call.
        """
        return gamma.rvs(a=self.p['a'], loc=0, scale=self.p['scale'],
                         size=n, random_state=random_state)

class Beta_(W):
    """Beta distribution wrapper on the interval [min(x), max(x)].

    The data are mapped linearly onto [0, 1] using the sample minimum and
    maximum before the two shape parameters are fitted.
    """

    def __init__(self):
        super().__init__('베타')

    def fit(self, x):
        """Fit alpha/beta on the rescaled data; invalid if the range is degenerate."""
        a, b = float(np.min(x)), float(np.max(x))
        if not (np.isfinite(a) and np.isfinite(b) and b > a):
            return self._fail()
        # keep the rescaled values strictly inside (0, 1) for the MLE
        z = np.clip((x - a) / (b - a), 1e-9, 1 - 1e-9)
        try:
            al, be, _, _ = beta.fit(z, floc=0, fscale=1)
        except Exception:
            return self._fail()
        if not (al > 0 and be > 0):
            return self._fail()
        self.p = dict(alpha=al, beta=be, a=a, b=b)
        self.np = 2
        return self

    def _z(self, z): return (z - self.p['a']) / (self.p['b'] - self.p['a'])
    def cdf(self, z): return beta.cdf(self._z(z), self.p['alpha'], self.p['beta'])
    def ppf(self, q): return self.p['a'] + (self.p['b'] - self.p['a']) * beta.ppf(q, self.p['alpha'], self.p['beta'])

    def rvs(self, n, random_state=None):
        """Draw n variates on [a, b].

        With random_state=None scipy uses NumPy's global random state, so
        successive calls give different samples. A hard-coded integer seed
        would return the same sample on every call.
        """
        r = beta.rvs(self.p['alpha'], self.p['beta'], size=n, random_state=random_state)
        return self.p['a'] + (self.p['b'] - self.p['a']) * r

class Triangular_(W):
    """Triangular distribution wrapper: sample min/max plus a histogram mode."""

    def __init__(self):
        super().__init__('삼각형')

    def fit(self, x):
        """Set (a, m, b) from the sample; invalid if the range is degenerate."""
        a, b = float(np.min(x)), float(np.max(x))
        if not (np.isfinite(a) and np.isfinite(b) and b > a):
            return self._fail()
        # keep the mode strictly inside (a, b)
        m = float(np.clip(hist_mode_estimate(x), a + 1e-9, b - 1e-9))
        c = (m - a) / (b - a)
        if not (0 < c < 1):
            return self._fail()
        self.p = dict(a=a, m=m, b=b, c=c)
        self.np = 3
        return self

    def cdf(self, z):
        return triang.cdf(z, c=self.p['c'], loc=self.p['a'], scale=(self.p['b'] - self.p['a']))

    def ppf(self, q):
        return triang.ppf(q, c=self.p['c'], loc=self.p['a'], scale=(self.p['b'] - self.p['a']))

    def rvs(self, n, random_state=None):
        """Draw n variates.

        With random_state=None scipy uses NumPy's global random state, so
        successive calls give different samples. A hard-coded integer seed
        would return the same sample on every call.
        """
        return triang.rvs(self.p['c'], loc=self.p['a'], scale=(self.p['b'] - self.p['a']),
                          size=n, random_state=random_state)

class GumbelL(W):
    """Gumbel (minimum extreme value) distribution wrapper."""

    def __init__(self):
        super().__init__('최소 극값')

    def fit(self, x):
        """Fit loc/scale with scipy; mark the wrapper invalid if the fit fails."""
        try:
            loc, sc = gumbel_l.fit(x)
        except Exception:
            return self._fail()
        if not sc > 0:
            return self._fail()
        self.p = dict(loc=loc, scale=sc)
        self.np = 2
        return self

    def cdf(self, z): return gumbel_l.cdf(z, **self.p)
    def ppf(self, q): return gumbel_l.ppf(q, **self.p)

    def rvs(self, n, random_state=None):
        """Draw n variates.

        With random_state=None scipy uses NumPy's global random state, so
        successive calls give different samples. A hard-coded integer seed
        would return the same sample on every call.
        """
        return gumbel_l.rvs(size=n, random_state=random_state, **self.p)

class Exponential_(W):
    """Exponential distribution wrapper with loc fixed at 0."""

    def __init__(self):
        super().__init__('지수')

    def fit(self, x):
        """Fit the scale on the positive values; needs at least 10 of them."""
        x = x[x > 0]
        if len(x) < 10:
            return self._fail()
        try:
            loc, sc = expon.fit(x, floc=0)
        except Exception:
            return self._fail()
        if not sc > 0:
            return self._fail()
        self.p = dict(loc=0.0, scale=sc)
        self.np = 1
        return self

    def cdf(self, z): return expon.cdf(z, **self.p)
    def ppf(self, q): return expon.ppf(q, **self.p)

    def rvs(self, n, random_state=None):
        """Draw n variates.

        With random_state=None scipy uses NumPy's global random state, so
        successive calls give different samples. A hard-coded integer seed
        would return the same sample on every call.
        """
        return expon.rvs(size=n, random_state=random_state, **self.p)

class Pareto_(W):
    """Pareto distribution wrapper with loc fixed at 0."""

    def __init__(self):
        super().__init__('파레토')

    def fit(self, x):
        """Fit shape/scale on the positive values; needs at least 10 of them."""
        x = x[x > 0]
        if len(x) < 10:
            return self._fail()
        try:
            b, loc, sc = pareto.fit(x, floc=0)
        except Exception:
            return self._fail()
        if not (b > 0 and sc > 0):
            return self._fail()
        self.p = dict(b=b, loc=0.0, scale=sc)
        self.np = 2
        return self

    def cdf(self, z): return pareto.cdf(z, **self.p)
    def ppf(self, q): return pareto.ppf(q, **self.p)

    def rvs(self, n, random_state=None):
        """Draw n variates.

        With random_state=None scipy uses NumPy's global random state, so
        successive calls give different samples. A hard-coded integer seed
        would return the same sample on every call.
        """
        return pareto.rvs(self.p['b'], loc=0, scale=self.p['scale'],
                          size=n, random_state=random_state)

class BetaPERT_(W):
    """BetaPERT wrapper: a Beta on [min, max] with shapes derived from the mode."""

    def __init__(self, lam=4.0):
        super().__init__('BetaPERT')
        self.lam = lam

    def fit(self, x):
        """Derive alpha/beta from sample min, max and a histogram mode estimate."""
        a, b = float(np.min(x)), float(np.max(x))
        if not (np.isfinite(a) and np.isfinite(b) and b > a):
            return self._fail()
        # keep the mode strictly inside (a, b)
        m = float(np.clip(hist_mode_estimate(x), a + 1e-9, b - 1e-9))
        al = 1 + self.lam * (m - a) / (b - a)
        be = 1 + self.lam * (b - m) / (b - a)
        if not (al > 0 and be > 0):
            return self._fail()
        self.p = dict(alpha=al, beta=be, a=a, b=b, m=m, lam=self.lam)
        self.np = 2
        return self

    def cdf(self, z):
        z0 = (z - self.p['a']) / (self.p['b'] - self.p['a'])
        return beta.cdf(z0, self.p['alpha'], self.p['beta'])

    def ppf(self, q):
        return self.p['a'] + (self.p['b'] - self.p['a']) * beta.ppf(q, self.p['alpha'], self.p['beta'])

    def rvs(self, n, random_state=None):
        """Draw n variates on [a, b].

        With random_state=None scipy uses NumPy's global random state, so
        successive calls give different samples. A hard-coded integer seed
        would return the same sample on every call.
        """
        r = beta.rvs(self.p['alpha'], self.p['beta'], size=n, random_state=random_state)
        return self.p['a'] + (self.p['b'] - self.p['a']) * r

class Uniform_(W):
    """Uniform distribution wrapper on [min(x), max(x)]."""

    def __init__(self):
        super().__init__('균일')

    def fit(self, x):
        """Use the sample min/max as bounds; invalid if the range is degenerate."""
        a, b = float(np.min(x)), float(np.max(x))
        if not (np.isfinite(a) and np.isfinite(b) and b > a):
            return self._fail()
        self.p = dict(loc=a, scale=(b - a))
        self.np = 2
        return self

    def cdf(self, z): return uniform.cdf(z, **self.p)
    def ppf(self, q): return uniform.ppf(q, **self.p)

    def rvs(self, n, random_state=None):
        """Draw n variates.

        With random_state=None scipy uses NumPy's global random state, so
        successive calls give different samples. A hard-coded integer seed
        would return the same sample on every call.
        """
        return uniform.rvs(size=n, random_state=random_state, **self.p)

# -----------------------------
# 지표
# -----------------------------
def AD_stat(x, cdf, eps=1e-12):
    """Anderson-Darling A^2 statistic of `x` for the callable `cdf`.

    CDF values are clipped to [eps, 1 - eps] so the logarithms stay finite.
    """
    ordered = np.sort(x)
    n = len(ordered)
    u = np.clip(cdf(ordered), eps, 1-eps)
    ranks = np.arange(1, n+1)
    weighted = (2*ranks-1) * (np.log(u) + np.log(1-u[::-1]))
    return float(-n - np.sum(weighted)/n)

# 수정: 모수 재적합 부트스트랩 기반 A-D p값
def AD_p_boot_refit(x, dist_obj, B=BOOTSTRAP_B):
    """
    Parametric bootstrap p-value for Anderson-Darling with parameter refit.

    Each replicate: sample n values from the fitted distribution, refit a
    fresh wrapper of the same type on that sample, and compute its A-D
    statistic. The p-value is the share of replicates whose statistic is at
    least the observed one, with a +1 correction in numerator and denominator.

    Parameters
    ----------
    x : observations the distribution was fitted to
    dist_obj : fitted wrapper exposing ``valid``, ``cdf``, ``ppf`` and ``fit``
    B : number of bootstrap replicates

    Returns
    -------
    float p-value, or nan when n < 5, the wrapper is invalid, the observed
    statistic is not finite, or no replicate could be evaluated.

    Notes
    -----
    Samples are drawn by inverse-transform sampling (``ppf`` of uniform
    variates from NumPy's global random state) rather than through
    ``dist_obj.rvs``. The wrappers' ``rvs`` methods pass a fixed integer
    seed, which makes every replicate identical.
    """
    x = np.asarray(x, float)
    n = len(x)
    if n < 5 or not dist_obj.valid:
        return np.nan

    A2_obs = AD_stat(x, dist_obj.cdf)
    if not np.isfinite(A2_obs):
        return np.nan

    ge = 0  # replicates with statistic >= observed
    m = 0   # replicates successfully evaluated
    for _ in range(B):
        # keep u strictly inside (0, 1) so ppf stays finite for unbounded supports
        u = np.clip(np.random.uniform(size=n), 1e-12, 1 - 1e-12)
        xs = np.asarray(dist_obj.ppf(u), float)
        if not np.all(np.isfinite(xs)):
            continue
        d_bs = type(dist_obj)().fit(xs)
        if not d_bs or not d_bs.valid:
            continue
        A2_bs = AD_stat(xs, d_bs.cdf)
        if not np.isfinite(A2_bs):
            continue
        ge += int(A2_bs >= A2_obs)
        m += 1
    if m == 0:
        return np.nan
    return float((ge + 1) / (m + 1))  # small-sample correction

def KS_stat_p(x, dist):
    """Kolmogorov-Smirnov statistic and p-value of `x` against `dist.cdf`.

    Returns (nan, nan) if the test raises.
    """
    def fitted_cdf(z):
        return dist.cdf(z)

    try:
        statistic, pvalue = stats.kstest(x, fitted_cdf)
    except Exception:
        return np.nan, np.nan
    try:
        return float(statistic), float(pvalue)
    except Exception:
        return np.nan, np.nan

def Chi2_stat_p(x, dist):
    """Pearson chi-square goodness-of-fit statistic and p-value.

    The real line is split into N = max(5, min(50, n // 5)) bins at the fitted
    quantiles ppf(k / N), k = 1..N-1. The two outer bins are unbounded, so
    every observation is counted. Expected counts are n times the fitted
    probability of each bin, computed from ``dist.cdf`` at the cut points, so
    they stay aligned with the bins even if duplicate cut points are removed.

    Parameters
    ----------
    x : observations
    dist : fitted wrapper exposing ``ppf``, ``cdf`` and ``np`` (parameter count)

    Returns
    -------
    (chi, p), or (nan, nan) when fewer than two bins remain, the degrees of
    freedom are not positive, or any step raises.
    """
    try:
        x = np.asarray(x, float)
        n = len(x)
        N = max(5, min(50, n // 5))
        cuts = np.unique(np.asarray(dist.ppf(np.arange(1, N) / N), float))
        cuts = cuts[np.isfinite(cuts)]
        if len(cuts) < 1:
            return np.nan, np.nan
        # bin i is [cuts[i-1], cuts[i]); the first and last bins are unbounded
        obs = np.bincount(np.searchsorted(cuts, x, side='right'), minlength=len(cuts) + 1)
        probs = np.diff(np.concatenate(([0.0], np.asarray(dist.cdf(cuts), float), [1.0])))
        exp = np.maximum(probs * n, 1e-9)
        k = dist.np or 0
        df = len(obs) - 1 - k
        if df <= 0:
            return np.nan, np.nan
        chi = np.sum((obs - exp) ** 2 / exp)
        p = chi2.sf(chi, df)  # upper tail, i.e. 1 - cdf
        return float(chi), float(p)
    except Exception:
        return np.nan, np.nan

# -----------------------------
# 파라미터 문자열
# -----------------------------
def pstr(name,p):
    """Render the parameter dict `p` of the named distribution as labelled text.

    Unknown names, or a dict missing an expected key, fall back to the JSON
    dump of `p`.
    """
    def loc_scale():
        label = '최고가능성' if '극값' in name else '평균'
        return f"{label}={p['loc']:.5g}, 스케일={p['scale']:.5g}"

    renderers = {
        '로그 정규': lambda: f"형태={p['shape']:.5g}, 스케일={p['scale']:.5g}, 위치=0",
        '와이블': lambda: f"형태={p['c']:.5g}, 스케일={p['scale']:.5g}, 위치=0",
        '감마': lambda: f"형태={p['a']:.5g}, 스케일={p['scale']:.5g}, 위치=0",
        '지수': lambda: f"비율={1.0/p['scale']:.5g}",
        '최대 극값': loc_scale,
        '최소 극값': loc_scale,
        '정규': loc_scale,
        '로지스틱': loc_scale,
        '스튜던트의 t': lambda: f"중간점={p['loc']:.5g}, 스케일={p['scale']:.5g}, 자유도={p['df']:.5g}",
        '파레토': lambda: f"위치=0, 스케일={p['scale']:.5g}, 형태={p['b']:.5g}",
        '베타': lambda: f"최소={p['a']:.5g}, 최대={p['b']:.5g}, 알파={p['alpha']:.5g}, 베타={p['beta']:.5g}",
        'BetaPERT': lambda: f"최소={p['a']:.5g}, 최고가능성={p['m']:.5g}, 최대={p['b']:.5g}, α={p['alpha']:.5g}, β={p['beta']:.5g}",
        '삼각형': lambda: f"최소={p['a']:.5g}, 최고가능성={p['m']:.5g}, 최대={p['b']:.5g}",
        '균일': lambda: f"최소={p['loc']:.5g}, 최대={(p['loc']+p['scale']):.5g}",
    }
    render = renderers.get(name)
    if render is not None:
        try:
            return render()
        except Exception:
            # missing key or unformattable value: use the JSON dump below
            pass
    return json.dumps(p, ensure_ascii=False)

# -----------------------------
# 적합 & 정렬
# -----------------------------
# Distributions that are evaluated on the strictly positive observations only
POSITIVE = {'로그 정규','와이블','감마','지수','파레토'}

def fit_one_col(x_raw):
    """Fit every candidate distribution to one column and rank the results.

    Returns one row per distribution with the A-D, K-S and chi-square
    statistics, their p-values and a parameter string, sorted by A-D
    (ascending), K-S p (descending), chi-square p (descending) and parameter
    count (ascending). With fewer than 20 clean observations an empty frame
    with the same columns is returned.
    """
    x = to_numpy_clean(x_raw)
    if len(x) < 20:
        return pd.DataFrame(columns=['분포','A-D','A-D P-값','K-S','K-S P-값','카이제곱','카이제곱 P-값','매개 변수',
                                     '_AD','_KS_p','_Chi2_p','_n_params'])

    def unscored_row(dist_name, param_text, n_params):
        # row for a distribution that could not be evaluated; sorts last
        return {'분포':dist_name,'A-D':np.nan,'A-D P-값':np.nan,'K-S':np.nan,'K-S P-값':np.nan,
                '카이제곱':np.nan,'카이제곱 P-값':np.nan,'매개 변수':param_text,
                '_AD':np.inf,'_KS_p':-np.inf,'_Chi2_p':-np.inf,'_n_params':n_params}

    candidates = [GumbelR(), LogNormal(), Weibull(), Logistic(), StudentT(), Normal(),
                  Gamma_(), Beta_(), Triangular_(), GumbelL(), Exponential_(),
                  Pareto_(), BetaPERT_(), Uniform_()]
    rows = []
    for d in candidates:
        d = d.fit(x)
        if not d or not d.valid:
            rows.append(unscored_row(d.name, pstr(d.name, getattr(d,'p',{})), 1e9))
            continue
        xe = x[x>0] if d.name in POSITIVE else x
        if len(xe) < 5:
            rows.append(unscored_row(d.name, pstr(d.name,d.p), d.np or 1e9))
            continue
        A2 = AD_stat(xe, d.cdf)
        # p-value from the refitting parametric bootstrap
        pAD = AD_p_boot_refit(xe, d, B=BOOTSTRAP_B)
        D, p = KS_stat_p(xe, d)
        chi, pc = Chi2_stat_p(xe, d)
        rows.append({'분포':d.name,'A-D':A2,'A-D P-값':pAD,'K-S':D,'K-S P-값':p,
                     '카이제곱':chi,'카이제곱 P-값':pc,'매개 변수':pstr(d.name,d.p),
                     '_AD':A2,'_KS_p':p,'_Chi2_p':pc,'_n_params':d.np or 1e9})

    ranked = pd.DataFrame(rows)

    def ascending_key(v):
        return v if np.isfinite(v) else np.inf

    def descending_key(v):
        return -v if np.isfinite(v) else np.inf

    # sort key: A-D up, K-S p down, chi-square p down, parameter count up
    ranked['_key'] = list(zip(ranked['_AD'].apply(ascending_key),
                              ranked['_KS_p'].apply(descending_key),
                              ranked['_Chi2_p'].apply(descending_key),
                              ranked['_n_params'].apply(ascending_key)))
    ranked = ranked.sort_values('_key', kind='mergesort')
    return ranked.drop(columns=['_key']).reset_index(drop=True)

# -----------------------------
# 엑셀 작성
# -----------------------------
def set_col_widths(ws):
    """Set the widths of worksheet columns A..I."""
    letters = 'ABCDEFGHI'
    sizes = (18, 12, 10, 10, 10, 10, 12, 12, 64)
    for letter, size in zip(letters, sizes):
        ws.column_dimensions[letter].width = size

def style_header(ws, row, headers):
    """Write `headers` into `row` starting at column B and apply the header style."""
    def grey_edge():
        return Side(style='thin', color='999999')

    fill = PatternFill('solid', fgColor=HEADER_BLUE)
    font = Font(color='FFFFFF', bold=True)
    alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)
    border = Border(left=grey_edge(), right=grey_edge(), top=grey_edge(), bottom=grey_edge())

    column = 2  # headers occupy B..I
    for text in headers:
        cell = ws.cell(row=row, column=column, value=text)
        cell.fill = fill
        cell.font = font
        cell.alignment = alignment
        cell.border = border
        column += 1

def write_num(ws, r, c, v):
    """Write `v` into cell (r, c).

    Finite floats are stored with the '0.0000' number format; None, NaN and
    infinities are written as '---'; anything else is written unchanged.
    """
    cell = ws.cell(row=r, column=c)
    is_float = isinstance(v, (float, np.floating))
    if is_float and np.isfinite(v):
        cell.value = float(v)
        cell.number_format = '0.0000'
        return
    # a float reaching this point is NaN or +/-inf
    cell.value = '---' if (v is None or is_float) else v

# -----------------------------
# 실행
# -----------------------------
df = pd.read_excel(DATA_PATH)

# Detect candidate metal columns and drop a bare "Con"/"Conc"/"Concentration"
# column (optionally followed by "(ug/m3)").
EXCLUDE_RE = re.compile(r'^\s*con(c|centration)?\s*(\(ug/m3\))?\s*$', re.I)
metals = detect_numeric_metal_columns(df)
metals = [m for m in metals if not EXCLUDE_RE.match(str(m))]
if not metals:
    raise ValueError('숫자형(금속) 컬럼을 찾지 못했습니다.')

# Fit every distribution to every metal column
results = {m: fit_one_col(df[m].values) for m in metals}

# Workbook
wb = Workbook()
ws = wb.active
ws.title = '데이터 계열'
ws['A1'] = '순위 지정 기준: 앤더슨-달링'
ws['A2'] = '데이터 계열'
set_col_widths(ws)
ws.freeze_panes = 'B3'

headers = ['분포','A-D','A-D P-값','K-S','K-S P-값','카이제곱','카이제곱 P-값','매개 변수']
row = 3
table_idx = 1
last_data_row = row

for m in metals:
    # 1) Title
    ws.cell(row=row, column=1, value=m).font = Font(bold=True)
    row += 1

    # 2) Header
    for j, h in enumerate(headers, start=2):  # B..I
        ws.cell(row=row, column=j, value=h)
    start_row = row
    row += 1

    # 3) Data rows (the frame index is 0..len-1, so it doubles as the row offset)
    tbl = results[m]
    for i, r_ in tbl.iterrows():
        rr = row + i
        ws.cell(row=rr, column=2, value=r_['분포'])
        write_num(ws, rr, 3, r_['A-D'])
        write_num(ws, rr, 4, r_['A-D P-값'])
        write_num(ws, rr, 5, r_['K-S'])
        write_num(ws, rr, 6, r_['K-S P-값'])
        write_num(ws, rr, 7, r_['카이제곱'])
        write_num(ws, rr, 8, r_['카이제곱 P-값'])
        ws.cell(row=rr, column=9, value=r_['매개 변수'])

    end_row = row + len(tbl) - 1
    last_data_row = max(last_data_row, end_row)

    # 4) Table style. Skipped when there are no data rows: a table made of a
    #    header row only is not a valid Excel table. The variable is not named
    #    `t`, which would rebind the scipy.stats `t` imported at the top of
    #    this cell.
    if len(tbl) > 0:
        ref = f"B{start_row}:I{end_row}"
        excel_table = Table(displayName=f"T_{table_idx}", ref=ref)
        excel_table.tableStyleInfo = TableStyleInfo(name="TableStyleMedium9",
                                                    showFirstColumn=False,
                                                    showLastColumn=False,
                                                    showRowStripes=True,
                                                    showColumnStripes=False)
        ws.add_table(excel_table)
        table_idx += 1

    # 5) Next block: leave GROUP_GAP blank rows after the last written row
    row = end_row + 1 + GROUP_GAP

# Save and download
wb.save(OUTPUT_PATH)
print('Saved:', OUTPUT_PATH)
files.download(OUTPUT_PATH)


Saving 202501_clean2.xlsx to 202501_clean2 (1).xlsx
Saved: Tx-적합도.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>