In [72]:
import numpy as np
import pandas as pd
from scipy import stats

In [73]:
from google.colab import files

# Open the Colab upload dialog; the keys of `uploaded` are the uploaded file names.
uploaded = files.upload()

# Pick the first uploaded Excel workbook. Passing a default to next() avoids a bare
# StopIteration traceback when nothing with an .xlsx/.xls extension was uploaded.
INPUT_PATH = next(
    (fn for fn in uploaded.keys() if fn.lower().endswith(('.xlsx', '.xls'))),
    None,
)
if INPUT_PATH is None:
    raise ValueError(
        f"업로드된 파일 중 Excel(.xlsx/.xls) 파일이 없습니다: {list(uploaded.keys())}"
    )
print(f"[INFO] 사용 파일: {INPUT_PATH}")

Saving 202501_clean2.xlsx to 202501_clean2 (8).xlsx
[INFO] 사용 파일: 202501_clean2 (8).xlsx


In [74]:
# Metal concentration columns that are analysed (names must match the workbook headers)
TARGET_METALS = [
    'Cr(ng/m3)',
    'Co(ng/m3)',
    'Ni(ng/m3)',
    'As(ng/m3)',
    'Cd(ng/m3)',
    'Sb(ng/m3)',
    'Pb(ng/m3)',
]

# Order in which the candidate distributions are fitted and reported
DIST_ORDER = [
    "로그 정규",
    "감마",
    "최대 극값",
    "와이블",
    "로지스틱",
    "삼각형",
    "스튜던트의 t",
    "정규",
    "최소 극값",
    "지수",
    "파레토",
    "균일",
    "BetaPERT",
    "베타",
]

# File name of the Excel workbook that receives the summary
OUTPUT_PATH = "Tx-적합도.xlsx"

In [75]:
# 유틸/검정 함수
def get_distributions_extended():
    """Return a dict mapping each distribution display name to its scipy.stats object.

    Only distributions that map directly onto a scipy.stats object are listed;
    per the original note, some distributions are handled separately.
    """
    name_dist_pairs = [
        ("로그 정규", stats.lognorm),
        ("감마", stats.gamma),
        ("최대 극값", stats.gumbel_r),
        ("와이블", stats.weibull_min),
        ("로지스틱", stats.logistic),
        ("삼각형", stats.triang),
        ("스튜던트의 t", stats.t),
        ("정규", stats.norm),
        ("최소 극값", stats.gumbel_l),
        ("지수", stats.expon),
        ("파레토", stats.pareto),
        ("균일", stats.uniform),
        ("베타", stats.beta),
    ]
    return dict(name_dist_pairs)

def select_data_for_dist(x: np.ndarray, name: str) -> np.ndarray:
    """Return the observations of ``x`` that may be used to fit distribution ``name``.

    NaN entries are always removed. For the distributions listed below, only
    strictly positive values are kept as well. The input array is not modified.
    """
    positive_only = ("로그 정규", "감마", "와이블", "지수", "파레토", "베타", "BetaPERT")
    kept = x[np.logical_not(np.isnan(x))]
    return kept[kept > 0] if name in positive_only else kept

def fit_params(dist, x: np.ndarray, name: str):
    """Fit ``dist`` to ``x`` and return ``(params, error_message)``.

    For the distributions listed below the fit is first attempted with the
    location pinned to zero (``floc=0``); if that attempt raises, an
    unconstrained fit is tried instead. Every other distribution is fitted
    without constraints.

    Returns ``(params, None)`` on success and ``(None, "fit_error: ...")`` when
    the fit ultimately fails.
    """
    try:
        pin_loc_at_zero = name in ("로그 정규", "감마", "와이블", "지수", "파레토")
        if not pin_loc_at_zero:
            return dist.fit(x), None
        try:
            # Fixing loc=0 is used here to stabilise convergence.
            fitted = dist.fit(x, floc=0)
        except Exception:
            fitted = dist.fit(x)
        return fitted, None
    except Exception as exc:
        return None, f"fit_error: {exc}"

def ks_test(dist, params, x: np.ndarray):
    """Run a one-sample Kolmogorov-Smirnov test of ``x`` against ``dist(*params)``.

    Returns ``(D, p_value, None)`` as plain floats on success, or
    ``(nan, nan, "ks_error: ...")`` if the test raises.
    """
    try:
        statistic, p_value = stats.kstest(x, dist.cdf, args=params)
        outcome = (float(statistic), float(p_value), None)
    except Exception as exc:
        outcome = (np.nan, np.nan, f"ks_error: {exc}")
    return outcome

def chi_square_gof(dist, params, x: np.ndarray, min_expected=5, bins_min=5, bins_max=20):
    """Pearson chi-square goodness-of-fit test of ``x`` against ``dist(*params)``.

    The sample is binned with ``np.histogram`` using ``int(sqrt(n))`` equal-width
    bins, clipped to ``[bins_min, bins_max]``. Expected counts come from the
    fitted CDF, and adjacent bins are pooled until every expected count is at
    least ``min_expected``.

    Parameters
    ----------
    dist : object exposing ``cdf(q, *params)`` (e.g. a scipy.stats distribution)
    params : sequence of fitted parameters passed positionally to ``dist.cdf``
    x : 1-D sample array (the caller is expected to have removed NaNs)
    min_expected : smallest expected count tolerated in a bin after pooling
    bins_min, bins_max : bounds on the initial number of histogram bins

    Returns
    -------
    (chi_stat, p_value, error) : ``error`` is ``None`` on success; otherwise the
    two numbers are NaN and ``error`` is a ``"chi_error: ..."`` or
    ``"chi_hist_error: ..."`` message. The function never raises for a failing
    ``dist.cdf``.
    """
    n = x.size
    if n < 10:
        return np.nan, np.nan, "chi_error: n<10"
    bins0 = int(np.clip(int(np.sqrt(n)), bins_min, bins_max))
    try:
        hist, edges = np.histogram(x, bins=bins0)
    except Exception as e:
        return np.nan, np.nan, f"chi_hist_error: {e}"

    try:
        # np.array(...) copies, so the adjustments below never touch the caller's data.
        cdfs = np.array(dist.cdf(edges, *params), dtype=float)
        if not np.all(np.isfinite(cdfs)):
            return np.nan, np.nan, "chi_error: non-finite cdf"

        # The histogram only spans [min(x), max(x)]. Treat the two outer bins as
        # open-ended so the model's tail probability is counted and the expected
        # counts sum to n, as Pearson's statistic requires.
        cdfs[0] = 0.0
        cdfs[-1] = 1.0
        expected = n * np.diff(cdfs)

        counts, expct = _merge_sparse_bins(
            hist.astype(float).tolist(), expected.tolist(), min_expected
        )
        if len(expct) < 2 or any(e < min_expected for e in expct):
            return np.nan, np.nan, "chi_error: insufficient expected after merge"

        obs = np.array(counts, dtype=float)
        exp = np.array(expct, dtype=float)
        chi_stat = ((obs - exp) ** 2 / exp).sum()
        # Approximate degrees of freedom: bins - 1 - number of entries in params,
        # floored at 1. NOTE(review): len(params) counts every entry, including
        # any that were held fixed during fitting - confirm this is intended.
        dof = max(len(exp) - 1 - len(params), 1)
        # sf() keeps precision for tiny p-values where 1 - cdf() would round to 0.
        pval = stats.chi2.sf(chi_stat, dof)
        return float(chi_stat), float(pval), None
    except Exception as e:
        return np.nan, np.nan, f"chi_error: {e}"


def _merge_sparse_bins(counts, expct, min_expected):
    """Pool adjacent bins in place until each expected count is >= ``min_expected``.

    A sparse edge bin is merged into its only neighbour; a sparse interior bin
    is merged into whichever neighbour has the smaller expected count (the right
    neighbour on ties). Merging stops once a single bin remains.
    """
    i = 0
    while i < len(expct) and len(expct) > 1:
        if not (expct[i] < min_expected):
            i += 1
            continue
        if i == 0:
            target = 1
        elif i == len(expct) - 1:
            target = i - 1
        else:
            target = i - 1 if expct[i - 1] < expct[i + 1] else i + 1
        counts[target] += counts[i]
        expct[target] += expct[i]
        counts.pop(i)
        expct.pop(i)
        if target < i:
            # The merged bin now sits one slot to the left; re-examine it.
            i -= 1
    return counts, expct

def anderson_darling_stat(dist, params, x: np.ndarray):
    """Compute the Anderson-Darling statistic A^2 of ``x`` against ``dist(*params)``.

    Uses ``A^2 = -n - mean((2i - 1) * (ln F(x_(i)) + ln(1 - F(x_(n+1-i)))))`` on
    the sorted sample, with CDF values clipped away from 0 and 1 so the
    logarithms stay finite.

    Returns
    -------
    (A2, p_value, error) : ``p_value`` is always NaN (no p-value is computed
    here). ``error`` is ``None`` on success, ``"ad_error: n<5"`` for samples with
    fewer than five points, or ``"ad_error: <message>"`` when evaluating the
    statistic raises - matching the error-tuple convention of the other
    goodness-of-fit helpers in this file rather than propagating the exception.
    """
    x = np.sort(x)
    n = x.size
    if n < 5:
        return np.nan, np.nan, "ad_error: n<5"
    try:
        F = dist.cdf(x, *params)
        F = np.clip(F, 1e-12, 1 - 1e-12)  # numerical safety for the logs below
        i = np.arange(1, n + 1)
        A2 = -n - np.mean((2 * i - 1) * (np.log(F) + np.log(1 - F[::-1])))
        return float(A2), np.nan, None
    except Exception as e:
        return np.nan, np.nan, f"ad_error: {e}"

# BetaPERT: (min, mode, max)을 Beta(loc=a, scale=b-a) 매핑
def betapert_alpha_beta(a, m, b, lamb=4.0):
    """Convert PERT (min ``a``, mode ``m``, max ``b``) into Beta shape parameters.

    ``alpha = 1 + lamb * (m - a) / (b - a)`` and
    ``beta  = 1 + lamb * (b - m) / (b - a)``.

    Returns ``(alpha, beta)``, or ``(None, None)`` when ``a < m < b`` does not
    hold or either shape parameter is not strictly positive.
    """
    if not (a < m < b):
        return None, None
    span = b - a
    shapes = (
        1.0 + lamb * (m - a) / span,
        1.0 + lamb * (b - m) / span,
    )
    if any(shape <= 0 for shape in shapes):
        return None, None
    return shapes

def fit_betapert(x: np.ndarray):
    """Derive BetaPERT parameters from ``x`` in scipy.stats.beta order.

    The minimum and maximum are the sample extremes, and the mode is estimated
    as the midpoint of the fullest ``np.histogram(bins='auto')`` bin.

    Returns ``((alpha, beta, loc, scale), None)`` with ``loc = min`` and
    ``scale = max - min``, or ``(None, "betapert_error: ...")`` when
    ``betapert_alpha_beta`` rejects the (min, mode, max) triple.
    """
    lower = float(np.min(x))
    upper = float(np.max(x))

    # Histogram-based mode estimate: centre of the most populated bin.
    bin_counts, bin_edges = np.histogram(x, bins='auto')
    peak = int(np.argmax(bin_counts))
    mode = float(0.5 * (bin_edges[peak] + bin_edges[peak + 1]))

    alpha, beta = betapert_alpha_beta(lower, mode, upper, lamb=4.0)
    if alpha is None:
        return None, "betapert_error: invalid (a,m,b)"
    return (alpha, beta, lower, (upper - lower)), None

In [76]:
# 매개변수 라벨링
import math

def _fmt(x):
    """Format ``x`` with six significant digits; fall back to ``str(x)`` if it is not float-convertible."""
    try:
        return f"{float(x):.6g}"
    except Exception:
        return str(x)

def describe_params(dname: str, params: tuple) -> str:
    """Render fitted parameters as a labelled, human-readable string.

    Parameters
    ----------
    dname : distribution display name; selects the label set that is used.
    params : fitted parameters in scipy.stats order (shape parameters first,
        then ``loc``, then ``scale``). The tuple length must match what the
        chosen distribution unpacks, otherwise a ``ValueError`` is raised by
        the unpacking.

    Returns
    -------
    str : comma-separated ``label=value`` pairs. Unknown names fall back to
    generic ``p1=..., p2=...`` labels.
    """
    if dname == "정규":
        loc, scale = params
        return f"평균={_fmt(loc)}, 표준편차={_fmt(scale)}"

    if dname == "로지스틱":
        loc, scale = params
        return f"중심(loc)={_fmt(loc)}, 스케일={_fmt(scale)}"

    if dname in ("최대 극값","최소 극값"):
        loc, scale = params
        return f"위치(loc)={_fmt(loc)}, 스케일={_fmt(scale)}"

    if dname == "로그 정규":
        s, loc, scale = params   # shape = s (= log-sigma), scale = exp(mu)
        mu_log = math.log(scale) if scale > 0 else float("nan")
        sigma_log = s
        try:
            # SciPy's lognorm is X = loc + scale * exp(s * Z): the location shift
            # moves the mean but leaves the standard deviation unchanged.
            mean = loc + math.exp(mu_log + sigma_log**2 / 2.0)
            std  = math.sqrt((math.exp(sigma_log**2) - 1.0) * math.exp(2*mu_log + sigma_log**2))
        except Exception:
            # e.g. OverflowError for extreme parameters
            mean, std = float("nan"), float("nan")
        return (f"shape(s)={_fmt(s)}, loc={_fmt(loc)}, scale(exp(mu))={_fmt(scale)}, "
                f"log-μ={_fmt(mu_log)}, log-σ={_fmt(sigma_log)}, mean={_fmt(mean)}, std={_fmt(std)}")

    if dname == "와이블":
        c, loc, scale = params
        return f"형태(shape)={_fmt(c)}, 위치(loc)={_fmt(loc)}, 스케일={_fmt(scale)}"

    if dname == "감마":
        a, loc, scale = params
        return f"형태(k)={_fmt(a)}, 위치(loc)={_fmt(loc)}, 스케일(θ)={_fmt(scale)}"

    if dname == "지수":
        loc, scale = params
        # The rate is undefined for a zero or infinite scale.
        lam = (1.0/scale) if scale not in (0, float('inf')) else float('nan')
        return f"최소(loc)={_fmt(loc)}, 스케일={_fmt(scale)}, 람다(1/scale)={_fmt(lam)}"

    if dname == "파레토":
        b, loc, scale = params
        xmin = loc + scale  # SciPy: support = [loc+scale, inf)
        return f"형태(shape)={_fmt(b)}, loc={_fmt(loc)}, scale={_fmt(scale)}, 최소값(loc+scale)={_fmt(xmin)}"

    if dname == "균일":
        loc, scale = params
        a = loc; b = loc + scale
        return f"a(최소)={_fmt(a)}, b(최대)={_fmt(b)}"

    if dname == "스튜던트의 t":
        df, loc, scale = params
        return f"자유도(df)={_fmt(df)}, 위치(loc)={_fmt(loc)}, 스케일={_fmt(scale)}"

    if dname == "삼각형":
        c, loc, scale = params
        # c is the mode's relative position within [loc, loc + scale].
        a = loc; b = loc + scale; mode = a + c*(b-a)
        return f"left={_fmt(a)}, right={_fmt(b)}, mode={_fmt(mode)} (c={_fmt(c)})"

    if dname in ("베타","BetaPERT"):
        a, b, loc, scale = params
        left = loc; right = loc + scale
        suffix = " (PERT)" if dname == "BetaPERT" else ""
        return f"alpha={_fmt(a)}, beta={_fmt(b)}, 구간=[{_fmt(left)}, {_fmt(right)}]{suffix}"

    # Fallback: generic positional labels
    return ", ".join([f"p{i}={_fmt(p)}" for i, p in enumerate(params, start=1)])

In [77]:
# Load the uploaded workbook and make sure every target metal column is present
df = pd.read_excel(INPUT_PATH)
missing = [metal_col for metal_col in TARGET_METALS if metal_col not in df.columns]
if missing:
    # Stop early: the fitting loop needs every listed column.
    raise ValueError(f"원본에 없는 컬럼: {missing}")

In [78]:
# Goodness-of-fit loop: fit every candidate distribution to every metal column
# and collect one summary record per (metal, distribution) pair.
dist_map = get_distributions_extended()
records = []

for metal in TARGET_METALS:
    raw = pd.to_numeric(df[metal], errors='coerce').to_numpy()
    for dname in DIST_ORDER:
        x = select_data_for_dist(raw.copy(), dname)
        n = x.size

        # Decide whether this pair can be evaluated; if not, remember why.
        if n < 5:
            skip_reason = "표본수 부족(n<5)"
        else:
            # BetaPERT is not in dist_map: it is fitted by its own helper and
            # evaluated through scipy's beta distribution.
            if dname == "BetaPERT":
                dist = stats.beta
                params, fit_err = fit_betapert(x)
            else:
                dist = dist_map[dname]
                params, fit_err = fit_params(dist, x, dname)
            if fit_err or params is None:
                skip_reason = fit_err or "fit_error"
            else:
                skip_reason = None

        if skip_reason is not None:
            # No statistics available: emit a NaN row carrying the reason.
            records.append({"금속": metal, "분포": dname,
                            "A-D": np.nan, "A-D P-값": np.nan,
                            "K-S": np.nan, "K-S P-값": np.nan,
                            "카이제곱": np.nan, "카이제곱 P-값": np.nan,
                            "매개 변수": skip_reason})
            continue

        # Goodness-of-fit statistics
        ad_stat, ad_p, ad_err    = anderson_darling_stat(dist, params, x)
        ks_D, ks_p, ks_err       = ks_test(dist, params, x)
        chi_stat, chi_p, chi_err = chi_square_gof(dist, params, x)

        # Parameter description, with any test errors appended in brackets
        param_str = describe_params(dname, params)
        notes = [msg for msg in (ad_err, ks_err, chi_err) if msg]
        if notes:
            param_str = f"{param_str}  [{' | '.join(notes)}]"

        records.append({"금속": metal,
                        "분포": dname,
                        "A-D": ad_stat,
                        "A-D P-값": ad_p,      # NaN by default
                        "K-S": ks_D,
                        "K-S P-값": ks_p,
                        "카이제곱": chi_stat,
                        "카이제곱 P-값": chi_p,
                        "매개 변수": param_str})

  x = np.asarray((x - loc)/scale, dtype=dtyp)
  x = np.asarray((x - loc)/scale, dtype=dtyp)
  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)


In [79]:
# Build the summary table; the categorical dtype keeps the distributions in the
# requested DIST_ORDER when sorting within each metal.
out_df = (
    pd.DataFrame(
        records,
        columns=["금속", "분포", "A-D", "A-D P-값", "K-S", "K-S P-값",
                 "카이제곱", "카이제곱 P-값", "매개 변수"],
    )
    .assign(**{"분포": lambda frame: pd.Categorical(frame["분포"], categories=DIST_ORDER, ordered=True)})
    .sort_values(["금속", "분포"])
    .reset_index(drop=True)
)

# Write the summary to a single-sheet Excel workbook
with pd.ExcelWriter(OUTPUT_PATH, engine="openpyxl") as writer:
    out_df.to_excel(writer, index=False, sheet_name="적합도 요약")

print(f"[완료] 저장: {OUTPUT_PATH}")

# Hand the saved workbook to the Colab download helper
files.download(OUTPUT_PATH)

[완료] 저장: Tx-적합도.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>