In [None]:
# 전처리 시작

In [11]:
# ==============================================
# Step 0. Colab 업로드
# ==============================================
from google.colab import files
import os
import pandas as pd
import numpy as np

print("'202501_clean2.xlsx' 업로드 해주세요")
uploaded = files.upload()
print("현재 디렉토리:", os.listdir())

# ==============================================
# Step 1. 데이터 로드
# ==============================================
INPUT_XLSX = "202501_clean2.xlsx"  # 업로드된 파일명 맞춰주세요
raw = pd.read_excel(INPUT_XLSX)

print("원본데이터 shape:", raw.shape)

# ==============================================
# Step 2. 금속 컬럼 탐색
# ==============================================
import re

patterns = {
    "Cr": r"(?i)\bAs\b|Arsenic",
    "Co": r"(?i)\bCd\b|Cadmium",
    "Ni": r"(?i)\bCr\b(?!\(VI\))",
    "As": r"(?i)\bNi\b|Nickel",
    "Cd": r"(?i)\bPb\b|Lead",
    "Sb": r"(?i)\bSb\b|Antimony",
    "Pb": r"(?i)\bCo\b|Cobalt"}

metal_map = {}
for metal, pat in patterns.items():
    cand = [c for c in raw.columns if re.search(pat, str(c))]
    if cand:
        metal_map[metal] = cand[0]

print("인식된 금속 컬럼:", metal_map)

# ==============================================
# Step 3. 단위 변환 (ng/m³ → µg/m³)
# ==============================================
for m, col in metal_map.items():
    raw[col] = pd.to_numeric(raw[col], errors="coerce") / 1000.0

print("단위 변환 완료 (ng → µg)")

# ==============================================
# Step 4. Cr(VI) 생성
# ==============================================
if "Cr" in metal_map:
    raw["Cr(VI)"] = raw[metal_map["Cr"]] / 7.0
    metal_map["Cr(VI)"] = "Cr(VI)"
    print("Cr(VI) 컬럼 생성 완료")

# ==============================================
# Step 5. 결측치 처리
# ==============================================
raw = raw.dropna(how="all")  # 행 전체가 NaN이면 제거
print("결측치 제거 후 shape:", raw.shape)

# ==============================================
# Step 6. 극단값 제거 (5–95% 트리밍)
# ==============================================
trimmed = {}
for m, col in metal_map.items():
    series = raw[col].dropna()
    ql, qh = series.quantile(0.05), series.quantile(0.95)
    trimmed[m] = series[(series >= ql) & (series <= qh)]

# ==============================================
# Step 7. 트리밍 결과 저장
# ==============================================
OUTDIR = "preprocessed"; os.makedirs(OUTDIR, exist_ok=True)

with pd.ExcelWriter(os.path.join(OUTDIR, "preprocessed_data.xlsx")) as writer:
    for m, s in trimmed.items():
        s.to_excel(writer, sheet_name=m, index=False)

print("전처리 완료: preprocessed/preprocessed_data.xlsx 저장됨")


📂 '202501_clean2.xlsx' 업로드 해주세요


Saving 202501_clean2.xlsx to 202501_clean2 (3).xlsx
현재 디렉토리: ['.config', 'preprocessed', '202501_clean2 (2).xlsx', 'mc_outputs', '202501_clean2 (3).xlsx', '202501_clean2 (1).xlsx', '202501_clean2.xlsx', 'sample_data']
✅ 원자료 shape: (461, 11)
✅ 인식된 금속 컬럼: {'As': 'As(ng/m3)', 'Cd': 'Cd(ng/m3)', 'Cr': 'Cr(ng/m3)', 'Ni': 'Ni(ng/m3)', 'Pb': 'Pb(ng/m3)', 'Sb': 'Sb(ng/m3)', 'Co': 'Co(ng/m3)'}
✅ 단위 변환 완료 (ng → µg)
✅ Cr(VI) 컬럼 생성 완료
✅ 결측치 제거 후 shape: (461, 12)
✅ 전처리 완료: preprocessed/preprocessed_data.xlsx 저장됨


In [None]:
# 최적 분포 피팅

In [15]:
import pandas as pd
import numpy as np
from scipy import stats
import os

# ============================================
# 후보 분포 정의 (확장판)
# ============================================
CANDIDATES = {
    "정규": stats.norm,
    "로그정규": stats.lognorm,
    "스튜던트t": stats.t,
    "로지스틱": stats.logistic,
    "지수": stats.expon,
    "감마": stats.gamma,
    "와이블": stats.weibull_min,
    "최대극값": stats.gumbel_r,
    "최소극값": stats.gumbel_l,
    "균일": stats.uniform,
    "삼각형": stats.triang,
    "베타": stats.beta,
    "파레토": stats.pareto,
}

# ============================================
# Anderson-Darling 통계량 & p-값 근사
# ============================================
def anderson_darling_stat(data, dist, params):
    """일반 분포용 Anderson-Darling 통계량"""
    data = np.sort(np.asarray(data))
    n = len(data)
    cdf_vals = dist.cdf(data, *params)
    cdf_vals = np.clip(cdf_vals, 1e-10, 1-1e-10)  # 안정화
    i = np.arange(1, n+1)
    s = np.sum((2*i-1) * (np.log(cdf_vals) + np.log(1-cdf_vals[::-1])))
    A2 = -n - s/n
    return A2

def anderson_darling_pvalue(A2):
    """A-D p-값 근사 (정규 기준, 다른 분포는 참고용)"""
    try:
        if A2 < 0.2:
            p = 1 - np.exp(-13.436 + 101.14*A2 - 223.73*A2**2)
        elif A2 < 0.34:
            p = 1 - np.exp(-8.318 + 42.796*A2 - 59.938*A2**2)
        elif A2 < 0.6:
            p = np.exp(0.9177 - 4.279*A2 - 1.38*A2**2)
        else:
            p = np.exp(1.2937 - 5.709*A2 + 0.0186*A2**2)
        return min(max(p,0),1)  # 0~1 범위 제한
    except Exception:
        return None

# ============================================
# 분포 피팅 & 지표 계산
# ============================================
def fit_and_test(series, dist, name):
    data = np.asarray(series.dropna())
    data = data[data > 0]
    try:
        params = dist.fit(data)

        # AIC
        pdf = dist.pdf(data, *params)
        pdf = np.clip(pdf, 1e-12, None)
        ll = np.sum(np.log(pdf))
        k = len(params)
        aic = 2*k - 2*ll

        # KS
        cdf = lambda x: dist.cdf(x, *params)
        ks_stat, ks_p = stats.kstest(data, cdf)

        # Chi-square
        bins = np.histogram_bin_edges(data, bins="auto")
        obs, _ = np.histogram(data, bins)
        exp = np.diff(dist.cdf(bins, *params)) * len(data)
        chi2 = np.sum((obs-exp)**2 / (exp+1e-9))
        chi2_p = 1 - stats.chi2.cdf(chi2, df=len(obs)-len(params)-1)

        # Anderson-Darling
        ad_stat = anderson_darling_stat(data, dist, params)
        ad_p = anderson_darling_pvalue(ad_stat)
        if ad_p is None:
            ad_p = "--"

        return {
            "dist": name,
            "aic": aic,
            "ks": ks_stat, "ks_p": ks_p,
            "chi2": chi2, "chi2_p": chi2_p,
            "ad": ad_stat, "ad_p": ad_p,
            "params": params
        }
    except Exception:
        return None

def best_fit(series):
    results = []
    for name, dist in CANDIDATES.items():
        res = fit_and_test(series, dist, name)
        if res:
            results.append(res)
    df = pd.DataFrame(results)
    if df.empty:
        return None

    # A-D P-값 숫자로 변환 ("--" → NaN)
    def parse_adp(v):
        try:
            return float(v)
        except:
            return np.nan
    df["ad_p_num"] = df["ad_p"].apply(parse_adp)

    # 1) A-D p값 >= 0.05 후보 우선
    valid = df[df["ad_p_num"] >= 0.05].copy()
    if valid.empty:
        valid = df.copy()  # 없으면 전체 후보 사용

    # 2) 정렬 기준: A-D 최소 → KS p 최대 → χ² p 최대 → 파라미터 수 최소
    valid["_key"] = list(zip(
        valid["ad"].apply(lambda v: v if np.isfinite(v) else np.inf),
        valid["ks_p"].apply(lambda v: -v if np.isfinite(v) else np.inf),
        valid["chi2_p"].apply(lambda v: -v if np.isfinite(v) else np.inf),
        valid["params"].apply(lambda p: len(p) if p is not None else np.inf)
    ))

    best = valid.sort_values("_key", kind="mergesort").iloc[0]
    return best

# ============================================
# 금속별 데이터에서 피팅 수행
# ============================================
summary_rows = {
    "분포": {}, "최선 적합(AIC)": {}, "앤더슨-달링": {}, "A-D P-값": {},
    "K-S": {}, "K-S P-값": {}, "카이제곱": {}, "카이제곱 P-값": {}, "매개변수": {}
}

for metal, series in trimmed.items():  # ← 전처리 단계에서 만든 trimmed 데이터 사용
    best = best_fit(series)
    if best is None:
        continue
    summary_rows["분포"][metal] = best["dist"]
    summary_rows["최선 적합(AIC)"][metal] = round(best["aic"],4)
    summary_rows["앤더슨-달링"][metal] = round(best["ad"],4)
    summary_rows["A-D P-값"][metal] = best["ad_p"]
    summary_rows["K-S"][metal] = round(best["ks"],4)
    summary_rows["K-S P-값"][metal] = round(best["ks_p"],3)
    summary_rows["카이제곱"][metal] = round(best["chi2"],4)
    summary_rows["카이제곱 P-값"][metal] = round(best["chi2_p"],3)
    summary_rows["매개변수"][metal] = ", ".join([f"{p:.3f}" for p in best["params"]])

# ============================================
# 엑셀 저장 (행=지표, 열=금속)
# ============================================
report_df = pd.DataFrame(summary_rows).T.reset_index()
report_df.rename(columns={"index": "데이터 계열"}, inplace=True)

os.makedirs("fit_outputs", exist_ok=True)
report_df.to_excel("fit_outputs/Fit_Report_wide.xlsx", index=False)

print("Fit_Report_wide.xlsx 생성 완료 (금속별 열, 지표별 행 포맷)")


✅ Fit_Report_wide.xlsx 생성 완료 (금속별 열, 지표별 행 포맷)


In [16]:
# 분포 선택 결과만 정리해서 미리 출력
check_df = pd.DataFrame([
    {
        "금속": metal,
        "선택된 분포": best["dist"],
        "A-D": round(best["ad"], 4),
        "A-D p값": best["ad_p"],
        "K-S": round(best["ks"], 4),
        "K-S p값": round(best["ks_p"], 4),
        "카이제곱": round(best["chi2"], 4),
        "카이제곱 p값": round(best["chi2_p"], 4),
        "매개변수": ", ".join([f"{p:.3f}" for p in best["params"]])
    }
    for metal, best in {
        m: best_fit(series) for m, series in trimmed.items()
    }.items() if best is not None
])

# 코랩에서 바로 확인
check_df

Unnamed: 0,금속,선택된 분포,A-D,A-D p값,K-S,K-S p값,카이제곱,카이제곱 p값,매개변수
0,As,정규,,,,,,,"nan, nan"
1,Cd,베타,0.5534,0.153692,0.0364,0.6445,7.5158,0.185,"0.952, 1.219, 0.144, 0.158"
2,Cr,베타,0.7968,0.039039,0.0451,0.5243,7.0145,0.2196,"1.300, 1.280, 0.000, 0.005"
3,Ni,베타,0.6715,0.079525,0.0398,0.7296,8.7574,0.1191,"0.983, 1.348, 0.000, 0.006"
4,Pb,와이블,0.7857,0.041568,0.037,0.7,62.7114,0.0,"1.114, 0.000, 0.008"
5,Sb,베타,1.0263,0.010613,0.0517,0.6157,6.3121,0.177,"0.768, 1.266, 0.001, 0.041"
6,Co,베타,2.6095,1e-06,0.0742,0.0225,22.0302,0.0005,"0.886, 1.302, 0.002, 0.012"
7,Cr(VI),베타,0.7968,0.039039,0.0451,0.5243,6.5899,0.253,"1.300, 1.280, 0.000, 0.001"


In [None]:
# 난수 만개 생성

In [None]:
# LADD/LECR 계산

In [None]:
# 최종 Cumulative LECR 계산