In [69]:
import os, math, shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from google.colab import files

# Ask the user to upload the input workbook via google.colab's `files.upload()`
# and keep the call's result in `uploaded`.
# NOTE(review): the recorded output of this cell shows the upload being saved as
# "202501_clean2 (3).xlsx" rather than under its original name - confirm that the
# loading step reads the copy that was actually uploaded.
uploaded = files.upload()

Saving 202501_clean2.xlsx to 202501_clean2 (3).xlsx


In [70]:
# Name used when nothing was uploaded in this session (e.g. the file is already on disk).
DEFAULT_INPUT_XLSX = "202501_clean2.xlsx"

# Colab de-duplicates upload names ("202501_clean2 (3).xlsx"), so a hard-coded
# name can silently point at an older copy. Read the file that was actually
# uploaded; `uploaded` is keyed by the name the file was saved under.
INPUT_XLSX  = next(iter(uploaded), DEFAULT_INPUT_XLSX)
REPORT_XLSX = "적합도.xlsx"

df = pd.read_excel(INPUT_XLSX)
# Header cells sometimes carry stray whitespace; normalise before column lookup.
df.columns = df.columns.str.strip()

# Mass concentration plus the seven metals to be fitted.
target_vars = ["Conc(ug/m3)",
               "As(ng/m3)", "Cd(ng/m3)", "Co(ng/m3)",
               "Cr(ng/m3)", "Ni(ng/m3)", "Pb(ng/m3)", "Sb(ng/m3)"]

# Non-numeric cells become NaN and are dropped later by select_data().
df[target_vars] = df[target_vars].apply(pd.to_numeric, errors="coerce")

In [71]:
# 분포 정의
# Each entry is (report name, scipy.stats distribution or None, support tag).
# The support tag is interpreted by select_data(); the BetaPERT entry has no
# scipy object because it is fitted by fit_betaPERT() instead.
CANDIDATES = [
    ("최대 극값", stats.gumbel_r, "real_nonneg"),      # Gumbel (max extreme value)
    ("로그 정규", stats.lognorm, "positive"),          # log-normal
    ("와이블", stats.weibull_min, "positive"),         # Weibull
    ("로지스틱", stats.logistic, "real_nonneg"),       # logistic
    ("스튜던트 t", stats.t, "real_nonneg"),            # Student's t
    ("정규", stats.norm, "real_nonneg"),               # normal
    ("감마", stats.gamma, "positive"),                 # gamma
    ("베타", stats.beta, "beta01"),                    # beta on rescaled data
    ("삼각형", stats.triang, "real_nonneg"),           # triangular
    ("최소 극값", stats.gumbel_l, "real_nonneg"),      # Gumbel (min extreme value)
    ("지수", stats.expon, "positive"),                 # exponential
    ("파레토", stats.pareto, "positive"),              # Pareto
    ("BetaPERT", None, "pert"),                        # PERT, fitted separately
    ("균일", stats.uniform, "real_nonneg"),            # uniform
]

In [72]:
# 함수 정의
def select_data(series, support):
    """Return the observations of `series` usable for a given support tag.

    Parameters
    ----------
    series : array-like
        Raw column values; anything that cannot be parsed as a number is dropped.
    support : str
        "positive" / "pert" keep values > 0, "real_nonneg" keeps values >= 0,
        "beta01" rescales the data into the open interval (0, 1). Any other
        tag leaves the numeric values unfiltered.

    Returns
    -------
    numpy.ndarray
        Filtered (and for "beta01", rescaled) values. An empty array is
        returned for "beta01" when the data have no spread.
    """
    x = pd.to_numeric(series, errors="coerce").dropna()

    if support in ("positive", "pert"):
        x = x[x > 0]
    elif support == "real_nonneg":
        x = x[x >= 0]
    elif support == "beta01":
        lo, hi = x.min(), x.max()
        # Also false when x is empty (min/max are NaN).
        if not hi > lo:
            return np.array([])
        n = len(x)
        unit = (x - lo) / (hi - lo)
        # Plain min-max scaling puts the extremes exactly on 0 and 1, which a
        # beta fit with fixed loc=0/scale=1 rejects, so the beta candidate was
        # always dropped. Squeeze into the open interval instead.
        x = (unit * (n - 1) + 0.5) / n

    return x.to_numpy()

def fit_betaPERT(x):
    """Build a BetaPERT description of `x` from its min, max and median.

    The sample minimum and maximum are used as the support limits and the
    median stands in for the most-likely value in the PERT shape formulas.

    Parameters
    ----------
    x : array-like of numbers

    Returns
    -------
    (params, ll)
        `params` is (alpha, beta, loc, scale) in scipy's beta parameterisation
        with loc = min(x) and scale = max(x) - min(x); `ll` is the summed beta
        log-density of `x`. Returns (None, None) when `x` is empty or has no
        spread, so that callers can skip the candidate.
    """
    x = np.asarray(x)
    if x.size == 0:
        return None, None

    a, b, m = np.min(x), np.max(x), np.median(x)
    span = b - a
    # A zero (or non-finite) range would turn the shape formulas into 0/0.
    if not np.isfinite(span) or span <= 0:
        return None, None

    alpha = 1 + 4 * (m - a) / span
    beta = 1 + 4 * (b - m) / span
    params = (alpha, beta, a, span)
    # The sample min/max sit on the support boundary, so this sum can be -inf
    # whenever the corresponding shape parameter exceeds 1.
    ll = np.sum(stats.beta.logpdf(x, alpha, beta, loc=a, scale=span))
    return params, ll

def safe_fit(dist, x, dname):
    """Fit `dist` to `x` by calling `dist.fit`, tolerating fit failures.

    Parameters
    ----------
    dist : object exposing `fit` and `logpdf` (e.g. a scipy.stats distribution)
    x : array-like
        Sample to fit.
    dname : str
        Report name of the distribution; "베타" is fitted with loc fixed at 0
        and scale fixed at 1.

    Returns
    -------
    (params, ll)
        Fitted parameter tuple and summed log-density, or (None, None) when
        fitting or evaluating the log-density raises an ordinary exception.
    """
    fixed = {"floc": 0, "fscale": 1} if dname == "베타" else {}
    try:
        params = dist.fit(x, **fixed)
        ll = np.sum(dist.logpdf(x, *params))
    except Exception:
        # Deliberately best-effort, but KeyboardInterrupt / SystemExit must
        # still propagate so a long run can be stopped.
        return None, None
    return params, ll

def ks_test(dist, x, params):
    """Kolmogorov-Smirnov test of `x` against `dist` with fitted `params`.

    Parameters
    ----------
    dist : object exposing `cdf(t, *params)`
    x : array-like
        Sample to test.
    params : sequence
        Parameters passed positionally to `dist.cdf`.

    Returns
    -------
    (D, p)
        KS statistic and p-value, or (nan, nan) when the test raises an
        ordinary exception.
    """
    def fitted_cdf(t):
        return dist.cdf(t, *params)

    try:
        D, p = stats.kstest(x, fitted_cdf)
    except Exception:
        # Best-effort metric; interrupts are intentionally not swallowed.
        return np.nan, np.nan
    return D, p

def anderson_stat(x, dname):
    """Anderson-Darling statistic for the distributions scipy's test supports.

    Parameters
    ----------
    x : array-like
        Sample to test.
    dname : str
        Report name of the distribution.

    Returns
    -------
    (statistic, p_placeholder)
        The statistic from `stats.anderson`, or nan when `dname` has no
        mapping. The second element is always the string "---".
    """
    # Report name -> name understood by scipy.stats.anderson.
    scipy_name = {
        "정규": "norm",
        "로지스틱": "logistic",
        "최대 극값": "gumbel",
        "최소 극값": "gumbel_l",
        "지수": "expon",
    }.get(dname)

    if scipy_name is None:
        return np.nan, "---"

    outcome = stats.anderson(x, dist=scipy_name)
    return outcome.statistic, "---"

def chisq_gof(x, dist, params):
    """Chi-square goodness-of-fit of `x` against `dist` with fitted `params`.

    Observed counts come from a histogram with Doane binning; expected counts
    are the fitted CDF differences over the same bin edges times the sample
    size. The edges span only [min(x), max(x)], so probability mass the fitted
    distribution places outside that range is not counted.

    Parameters
    ----------
    x : array-like
        Sample to test.
    dist : object exposing `cdf(edges, *params)`
    params : sequence
        Fitted parameters; their count is subtracted from the degrees of freedom.

    Returns
    -------
    (chi2, p)
        Test statistic and upper-tail p-value; `p` is nan when the degrees of
        freedom (bins - 1 - len(params)) are not positive.
    """
    observed, edges = np.histogram(x, bins="doane")
    expected = np.diff(dist.cdf(edges, *params)) * len(x)
    # Floor tiny expected counts so the ratio below cannot divide by zero.
    expected = np.maximum(expected, 1e-8)

    chi2 = np.sum((observed - expected) ** 2 / expected)
    dof = len(observed) - 1 - len(params)
    # sf() keeps precision for very small p-values, where 1 - cdf() rounds to 0.
    p = stats.chi2.sf(chi2, dof) if dof > 0 else np.nan
    return chi2, p

def fmt(v):
    """Format a number with five decimals; None and non-finite values give "NaN"."""
    if v is None:
        return "NaN"
    if np.isfinite(v):
        return f"{v:.5f}"
    return "NaN"

def param_string(name, row):
    """Render the fitted parameters of one distribution as a report string.

    Parameters
    ----------
    name : str
        Report name of the distribution (as used in CANDIDATES).
    row : mapping
        Must support `.get`; read keys are "shape1", "shape2", "loc" and
        "scale". Missing keys are shown as "NaN".

    Returns
    -------
    str
        Distribution-specific label/value list; unknown names fall back to a
        plain "loc=..., scale=..." string.
    """
    s1, s2, loc, scale = (row.get(k, np.nan) for k in ("shape1", "shape2", "loc", "scale"))

    if name in ("최대 극값", "최소 극값", "지수"):
        return f"위치={fmt(loc)}, 스케일={fmt(scale)}"
    if name in ("로그 정규", "와이블", "감마"):
        return f"위치={fmt(loc)}, 스케일={fmt(scale)}, 형태={fmt(s1)}"
    if name == "로지스틱":
        return f"평균={fmt(loc)}, 스케일={fmt(scale)}"
    # The candidate list names this distribution "스튜던트 t"; the previous check
    # only matched "스튜던트의 t", so the degrees of freedom never made it into
    # the report. Accept both spellings.
    if name in ("스튜던트 t", "스튜던트의 t"):
        return f"중간값={fmt(loc)}, 스케일={fmt(scale)}, 자유도={fmt(s1)}"
    if name == "정규":
        return f"평균={fmt(loc)}, 표준편차={fmt(scale)}"
    if name == "베타":
        return f"알파={fmt(s1)}, 베타={fmt(s2)}, loc={fmt(loc)}, scale={fmt(scale)}"
    if name == "삼각형":
        return f"모양={fmt(s1)}, 최소={fmt(loc)}, 최대={fmt(loc+scale)}"
    if name == "파레토":
        return f"최소={fmt(loc)}, 형태(α)={fmt(s1)}, 스케일={fmt(scale)}"
    if name == "BetaPERT":
        return f"최소={fmt(loc)}, 최대={fmt(loc+scale)}, 알파={fmt(s1)}, 베타={fmt(s2)}"
    if name == "균일":
        return f"최소={fmt(loc)}, 최대={fmt(loc+scale)}"
    return f"loc={fmt(loc)}, scale={fmt(scale)}"

In [76]:
# 실행
# Fit every candidate distribution to every target column and collect one
# record per (column, distribution) pair.
fits_all = []

for col in target_vars:
    for dname, dist, support in CANDIDATES:
        x = select_data(df[col], support)
        # Too few observations for a meaningful fit.
        if len(x) < 5:
            continue

        # BetaPERT has its own fitter and is evaluated through scipy's beta.
        use_pert = dname == "BetaPERT"
        params, ll = fit_betaPERT(x) if use_pert else safe_fit(dist, x, dname)
        if params is None:
            continue
        dist_use = stats.beta if use_pert else dist

        # The last two entries are loc and scale; anything before them is a shape.
        *shapes, loc, scale = params
        shape_slots = (list(shapes) + [np.nan] * 3)[:3]

        rec = {"분포": dname,
               "shape1": shape_slots[0],
               "shape2": shape_slots[1],
               "shape3": shape_slots[2],
               "loc": loc,
               "scale": scale}

        # Goodness-of-fit metrics.
        A_D, A_Dp = anderson_stat(x, dname)
        K_S, K_Sp = ks_test(dist_use, x, params)
        ChiSq, ChiSq_p = chisq_gof(x, dist_use, params)

        rec["A-D"] = A_D
        rec["A-D P-값"] = A_Dp
        rec["K-S"] = K_S
        rec["K-S P-값"] = K_Sp
        rec["카이제곱"] = ChiSq
        rec["카이제곱 P-값"] = ChiSq_p
        rec["매개 변수"] = param_string(dname, rec)
        rec["variable"] = col

        fits_all.append(rec)

fits_df = pd.DataFrame(fits_all)

  tmp = (xj - a) / b
  tmp = (xj - a) / b
 improvement from the last ten iterations.
  sol = optimize.fsolve(rootfunc, sol0, args=(x, N), xtol=1e-5)
  w = (y - sol[0]) / sol[1]
  x = np.asarray((x - loc)/scale, dtype=dtyp)
  w = (y - xbar) / s
  x = np.asarray((x - loc)/scale, dtype=dtyp)
  x = np.asarray((x - loc)/scale, dtype=dtyp)


In [78]:
# 엑셀 저장
def safe_sheetname(name: str) -> str:
    """Return `name` with \\ / ? * [ ] : replaced by "_" and cut to 28 characters."""
    forbidden = '\\/?*[]:'
    table = str.maketrans(dict.fromkeys(forbidden, "_"))
    cleaned = name.translate(table)
    return cleaned[:28]

# Write one sheet per target variable, keeping only the report columns.
with pd.ExcelWriter(REPORT_XLSX, engine="openpyxl") as writer:
    for v in target_vars:
        rows_for_v = fits_df["variable"] == v
        sub = fits_df[rows_for_v][[
            "분포",
            "A-D", "A-D P-값",
            "K-S", "K-S P-값",
            "카이제곱", "카이제곱 P-값",
            "매개 변수",
        ]]
        sub.to_excel(writer, sheet_name=safe_sheetname(v), index=False)

print(f"[완료] {REPORT_XLSX} 저장됨")


# Hand the finished workbook to the browser for download.

files.download(REPORT_XLSX)

[완료] 적합도.xlsx 저장됨


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>