In [27]:
# Colab-only: open the browser upload dialog. The returned object is used below
# like a dict (its keys are iterated to locate the Excel file).
from google.colab import files
uploaded = files.upload()

import numpy as np, pandas as pd, json, re, os
from scipy import stats
from scipy.stats import (gumbel_r, gumbel_l, lognorm, weibull_min, logistic, t,
                         norm, gamma, beta, triang, expon, pareto, uniform, chi2)
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
from openpyxl.worksheet.table import Table, TableStyleInfo
from openpyxl.worksheet.table import Table, TableStyleInfo
from openpyxl.styles import Font, PatternFill

Saving 202501_clean2.xlsx to 202501_clean2 (2).xlsx


In [28]:
# -------- 경로 설정 --------
def _pick_xlsx(uploaded_dict):
    for k in uploaded_dict.keys():
        if str(k).lower().endswith(('.xlsx', '.xls')):
            return k
    raise FileNotFoundError("업로드된 엑셀 파일(.xlsx/.xls)을 찾지 못했습니다.")
DATA_PATH = _pick_xlsx(uploaded)   # first uploaded key with an Excel extension
OUTPUT_PATH = 'Tx-적합도.xlsx'       # workbook written by the export cell

np.random.seed(42)           # seed NumPy's global random state
BOOTSTRAP_B = 300            # bootstrap replicates (120-200 suggested when speed matters)
GROUP_GAP = 1                # number of blank rows between metal blocks
HEADER_BLUE = '2F5597'       # header colour
BEST_FILL = 'FFF2CC'         # highlight for the top-ranked row

In [29]:
# ---------- 유틸 ----------
def detect_numeric_metal_columns(df):
    """Return the numeric columns of ``df`` whose names look like measurements.

    A column is kept when its dtype is numeric and its lower-cased name does
    not contain any of the bookkeeping tokens below. Column order is preserved.

    NOTE(review): matching is by substring, so short tokens such as ``'id'``
    or ``'end'`` also exclude names that merely contain them
    (e.g. ``'Iridium'``). Confirm this is acceptable for the input sheets.
    """
    exclude_tokens = {'date','time','pump','begin','start','end','id','index','sample','unit',
                      'yyyymm','yyyy','mm','dd','hour','minute','sec'}

    def _is_bookkeeping(column):
        lowered = str(column).lower()
        return any(tok in lowered for tok in exclude_tokens)

    return [
        column for column in df.columns
        if pd.api.types.is_numeric_dtype(df[column]) and not _is_bookkeeping(column)
    ]

def to_numpy_clean(x):
    """Convert ``x`` to a float array with NaN and +/-inf entries removed."""
    as_float = pd.Series(x, dtype='float').to_numpy()
    # Keeping only finite entries is the same as mapping +/-inf to NaN and dropping NaN.
    return as_float[np.isfinite(as_float)]

def hist_mode_estimate(x, max_bins=10_000):
    """Estimate the mode of ``x`` as the midpoint of the fullest histogram bin.

    The bin count follows the Freedman-Diaconis rule (with a floor of 10 bins),
    or ``ceil(sqrt(n))`` when the inter-quartile range is zero.

    Args:
        x: 1-D array-like of finite numbers.
        max_bins: upper bound on the Freedman-Diaconis bin count. Without a
            bound, a tiny IQR combined with a large range (a few outliers)
            requests an astronomically large histogram and exhausts memory.

    Returns:
        The bin midpoint as ``float``; for fewer than two values, the median.
    """
    x = np.asarray(x)
    n = len(x)
    if n < 2:
        return float(np.median(x))

    q75, q25 = np.percentile(x, [75, 25])
    iqr = q75 - q25
    if iqr <= 0:
        bins = int(np.ceil(np.sqrt(n)))
    else:
        fd_width = 2 * iqr * n ** (-1 / 3)
        fd_bins = np.ceil((x.max() - x.min()) / fd_width)
        # Cap before converting to int so huge or infinite counts cannot blow up.
        bins = max(10, int(min(fd_bins, max_bins)))

    cnt, edges = np.histogram(x, bins=bins)
    i = cnt.argmax()
    return float((edges[i] + edges[i + 1]) / 2)

# ---------- 분포 래퍼 ----------
class W:
    """Base wrapper holding the state shared by all fitted distributions.

    Attributes set here:
        name:  display name of the distribution.
        p:     dict of fitted parameters (empty until a fit succeeds).
        np:    number of fitted parameters, or ``None`` before fitting.
        valid: ``False`` once a fit has been marked as failed.
    """

    def __init__(self, name):
        self.name = name
        self.p = {}
        self.np = None
        self.valid = True

    def _fail(self):
        """Flag this wrapper as invalid and return it, so callers can ``return self._fail()``."""
        self.valid = False
        return self

class GumbelR(W):
    """Maximum extreme value distribution (``scipy.stats.gumbel_r``)."""

    def __init__(self):
        super().__init__('최대 극값')

    def fit(self, x):
        """Fit location and scale to ``x``; returns ``self`` (invalid on failure)."""
        try:
            loc, sc = gumbel_r.fit(x)
        except Exception:
            return self._fail()
        # Explicit check rather than ``assert`` so validation survives ``python -O``.
        if not (np.isfinite(loc) and sc > 0):
            return self._fail()
        self.p = dict(loc=loc, scale=sc)
        self.np = 2
        return self

    def cdf(self, z):
        return gumbel_r.cdf(z, **self.p)

    def ppf(self, q):
        return gumbel_r.ppf(q, **self.p)

    def rvs(self, n, random_state=None):
        """Draw ``n`` variates.

        ``random_state`` is forwarded to SciPy. It used to be hard-coded to 42,
        which made every call return the same sample and reduced the bootstrap
        to a single replicate.
        """
        return gumbel_r.rvs(size=n, random_state=random_state, **self.p)

class LogNormal(W):
    """Log-normal distribution with location fixed at 0 (``scipy.stats.lognorm``)."""

    def __init__(self):
        super().__init__('로그 정규')

    def fit(self, x):
        """Fit shape and scale to the strictly positive values of ``x``.

        Marks the wrapper invalid when fewer than 10 positive values remain or
        when the fit raises or yields a non-positive shape/scale.
        """
        x = x[x > 0]
        if len(x) < 10:
            return self._fail()
        try:
            s, _, sc = lognorm.fit(x, floc=0)
        except Exception:
            return self._fail()
        # Explicit check rather than ``assert`` so validation survives ``python -O``.
        if not (s > 0 and sc > 0):
            return self._fail()
        self.p = dict(shape=s, loc=0.0, scale=sc)
        self.np = 2
        return self

    def cdf(self, z):
        return lognorm.cdf(z, s=self.p['shape'], loc=0, scale=self.p['scale'])

    def ppf(self, q):
        return lognorm.ppf(q, s=self.p['shape'], loc=0, scale=self.p['scale'])

    def rvs(self, n, random_state=None):
        """Draw ``n`` variates; ``random_state`` is forwarded to SciPy.

        The former fixed seed (42) made every call return the same sample.
        """
        return lognorm.rvs(s=self.p['shape'], loc=0, scale=self.p['scale'],
                           size=n, random_state=random_state)

class Weibull(W):
    """Weibull distribution with location fixed at 0 (``scipy.stats.weibull_min``)."""

    def __init__(self):
        super().__init__('와이블')

    def fit(self, x):
        """Fit shape ``c`` and scale to the strictly positive values of ``x``.

        Marks the wrapper invalid when fewer than 10 positive values remain or
        when the fit raises or yields a non-positive shape/scale.
        """
        x = x[x > 0]
        if len(x) < 10:
            return self._fail()
        try:
            c, _, sc = weibull_min.fit(x, floc=0)
        except Exception:
            return self._fail()
        # Explicit check rather than ``assert`` so validation survives ``python -O``.
        if not (c > 0 and sc > 0):
            return self._fail()
        self.p = dict(c=c, loc=0.0, scale=sc)
        self.np = 2
        return self

    def cdf(self, z):
        return weibull_min.cdf(z, c=self.p['c'], loc=0, scale=self.p['scale'])

    def ppf(self, q):
        return weibull_min.ppf(q, c=self.p['c'], loc=0, scale=self.p['scale'])

    def rvs(self, n, random_state=None):
        """Draw ``n`` variates; ``random_state`` is forwarded to SciPy.

        The former fixed seed (42) made every call return the same sample.
        """
        return weibull_min.rvs(c=self.p['c'], loc=0, scale=self.p['scale'],
                               size=n, random_state=random_state)

class Logistic(W):
    """Logistic distribution (``scipy.stats.logistic``)."""

    def __init__(self):
        super().__init__('로지스틱')

    def fit(self, x):
        """Fit location and scale to ``x``; returns ``self`` (invalid on failure)."""
        try:
            loc, sc = logistic.fit(x)
        except Exception:
            return self._fail()
        # Explicit check rather than ``assert`` so validation survives ``python -O``.
        if not sc > 0:
            return self._fail()
        self.p = dict(loc=loc, scale=sc)
        self.np = 2
        return self

    def cdf(self, z):
        return logistic.cdf(z, **self.p)

    def ppf(self, q):
        return logistic.ppf(q, **self.p)

    def rvs(self, n, random_state=None):
        """Draw ``n`` variates; ``random_state`` is forwarded to SciPy.

        The former fixed seed (42) made every call return the same sample.
        """
        return logistic.rvs(size=n, random_state=random_state, **self.p)

class StudentT(W):
    """Location-scale Student's t distribution (``scipy.stats.t``)."""

    def __init__(self):
        super().__init__('스튜던트의 t')

    @staticmethod
    def _attempt(x, **fixed):
        """Run one ``t.fit`` with the given fixed parameters.

        Returns ``(df, loc, scale)`` when the estimates are finite and positive
        where required, otherwise ``None``. When ``fdf`` is fixed, that value is
        reported as the degrees of freedom.
        """
        try:
            df_, loc, sc = t.fit(x, **fixed)
        except Exception:
            return None
        df_ = fixed.get('fdf', df_)
        if not np.isfinite(df_ * loc * sc) or sc <= 0 or df_ <= 0:
            return None
        return df_, loc, sc

    def fit(self, x):
        """Fit df, location and scale, falling back to constrained fits.

        Strategies are tried in order until one yields usable estimates:
        1) unconstrained fit, 2) location fixed at the sample median,
        3) degrees of freedom fixed at 30. Fewer than five finite values, or a
        constant sample, fail immediately.
        """
        x = np.asarray(x, float)
        x = x[np.isfinite(x)]
        if x.size < 5 or np.allclose(x.min(), x.max()):
            return self._fail()

        fitted = self._attempt(x)
        if fitted is None:
            fitted = self._attempt(x, floc=float(np.median(x)))
        if fitted is None:
            fitted = self._attempt(x, fdf=30.0)
        if fitted is None:
            return self._fail()

        df_, loc, sc = fitted
        self.p = dict(df=df_, loc=loc, scale=sc)
        self.np = 3
        return self

    def cdf(self, z):
        return t.cdf(z, **self.p)

    def ppf(self, q):
        return t.ppf(q, **self.p)

    def rvs(self, n, random_state=None):
        """Draw ``n`` variates; ``random_state`` is forwarded to SciPy.

        The former fixed seed (42) made every call return the same sample.
        """
        return t.rvs(size=n, random_state=random_state, **self.p)

class Normal(W):
    """Normal distribution (``scipy.stats.norm``)."""

    def __init__(self):
        super().__init__('정규')

    def fit(self, x):
        """Fit mean and standard deviation to ``x``; returns ``self`` (invalid on failure)."""
        try:
            mu, sig = norm.fit(x)
        except Exception:
            return self._fail()
        # Explicit check rather than ``assert`` so validation survives ``python -O``.
        if not sig > 0:
            return self._fail()
        self.p = dict(loc=mu, scale=sig)
        self.np = 2
        return self

    def cdf(self, z):
        return norm.cdf(z, **self.p)

    def ppf(self, q):
        return norm.ppf(q, **self.p)

    def rvs(self, n, random_state=None):
        """Draw ``n`` variates; ``random_state`` is forwarded to SciPy.

        The former fixed seed (42) made every call return the same sample.
        """
        return norm.rvs(size=n, random_state=random_state, **self.p)

class Gamma_(W):
    """Gamma distribution with location fixed at 0 (``scipy.stats.gamma``)."""

    def __init__(self):
        super().__init__('감마')

    def fit(self, x):
        """Fit shape ``a`` and scale to the strictly positive values of ``x``.

        Marks the wrapper invalid when fewer than 10 positive values remain or
        when the fit raises or yields a non-positive shape/scale.
        """
        x = x[x > 0]
        if len(x) < 10:
            return self._fail()
        try:
            a, _, sc = gamma.fit(x, floc=0)
        except Exception:
            return self._fail()
        # Explicit check rather than ``assert`` so validation survives ``python -O``.
        if not (a > 0 and sc > 0):
            return self._fail()
        self.p = dict(a=a, loc=0.0, scale=sc)
        self.np = 2
        return self

    def cdf(self, z):
        return gamma.cdf(z, a=self.p['a'], loc=0, scale=self.p['scale'])

    def ppf(self, q):
        return gamma.ppf(q, a=self.p['a'], loc=0, scale=self.p['scale'])

    def rvs(self, n, random_state=None):
        """Draw ``n`` variates; ``random_state`` is forwarded to SciPy.

        The former fixed seed (42) made every call return the same sample.
        """
        return gamma.rvs(a=self.p['a'], loc=0, scale=self.p['scale'],
                         size=n, random_state=random_state)

class Beta_(W):
    """Beta distribution rescaled to the sample range ``[min(x), max(x)]``."""

    def __init__(self):
        super().__init__('베타')

    def fit(self, x):
        """Fit alpha and beta after mapping ``x`` linearly onto (0, 1).

        The bounds ``a``/``b`` are the sample minimum and maximum; rescaled
        values are clipped to ``[1e-9, 1 - 1e-9]`` before fitting.
        """
        a, b = float(np.min(x)), float(np.max(x))
        if not (np.isfinite(a) and np.isfinite(b) and b > a):
            return self._fail()
        z = np.clip((x - a) / (b - a), 1e-9, 1 - 1e-9)
        try:
            al, be, _, _ = beta.fit(z, floc=0, fscale=1)
        except Exception:
            return self._fail()
        # Explicit check rather than ``assert`` so validation survives ``python -O``.
        if not (al > 0 and be > 0):
            return self._fail()
        self.p = dict(alpha=al, beta=be, a=a, b=b)
        self.np = 2
        return self

    def _z(self, z):
        """Map values from ``[a, b]`` onto the unit interval."""
        return (z - self.p['a']) / (self.p['b'] - self.p['a'])

    def cdf(self, z):
        return beta.cdf(self._z(z), self.p['alpha'], self.p['beta'])

    def ppf(self, q):
        return self.p['a'] + (self.p['b'] - self.p['a']) * beta.ppf(q, self.p['alpha'], self.p['beta'])

    def rvs(self, n, random_state=None):
        """Draw ``n`` variates on ``[a, b]``; ``random_state`` is forwarded to SciPy.

        The former fixed seed (42) made every call return the same sample.
        """
        r = beta.rvs(self.p['alpha'], self.p['beta'], size=n, random_state=random_state)
        return self.p['a'] + (self.p['b'] - self.p['a']) * r

class Triangular_(W):
    """Triangular distribution on the sample range with a histogram-based mode."""

    def __init__(self):
        super().__init__('삼각형')

    def fit(self, x):
        """Set min/max from the sample and the mode from ``hist_mode_estimate``.

        The mode is clipped strictly inside ``(a, b)``; ``c`` is its relative
        position, as expected by ``scipy.stats.triang``.
        """
        a, b = float(np.min(x)), float(np.max(x))
        if not (np.isfinite(a) and np.isfinite(b) and b > a):
            return self._fail()
        m = float(np.clip(hist_mode_estimate(x), a + 1e-9, b - 1e-9))
        c = (m - a) / (b - a)
        if not (0 < c < 1):
            return self._fail()
        self.p = dict(a=a, m=m, b=b, c=c)
        self.np = 3
        return self

    def cdf(self, z):
        return triang.cdf(z, c=self.p['c'], loc=self.p['a'], scale=(self.p['b'] - self.p['a']))

    def ppf(self, q):
        return triang.ppf(q, c=self.p['c'], loc=self.p['a'], scale=(self.p['b'] - self.p['a']))

    def rvs(self, n, random_state=None):
        """Draw ``n`` variates; ``random_state`` is forwarded to SciPy.

        The former fixed seed (42) made every call return the same sample.
        """
        return triang.rvs(c=self.p['c'], loc=self.p['a'], scale=(self.p['b'] - self.p['a']),
                          size=n, random_state=random_state)

class GumbelL(W):
    """Minimum extreme value distribution (``scipy.stats.gumbel_l``)."""

    def __init__(self):
        super().__init__('최소 극값')

    def fit(self, x):
        """Fit location and scale to ``x``; returns ``self`` (invalid on failure)."""
        try:
            loc, sc = gumbel_l.fit(x)
        except Exception:
            return self._fail()
        # Explicit check rather than ``assert`` so validation survives ``python -O``.
        if not sc > 0:
            return self._fail()
        self.p = dict(loc=loc, scale=sc)
        self.np = 2
        return self

    def cdf(self, z):
        return gumbel_l.cdf(z, **self.p)

    def ppf(self, q):
        return gumbel_l.ppf(q, **self.p)

    def rvs(self, n, random_state=None):
        """Draw ``n`` variates; ``random_state`` is forwarded to SciPy.

        The former fixed seed (42) made every call return the same sample.
        """
        return gumbel_l.rvs(size=n, random_state=random_state, **self.p)

class Exponential_(W):
    """Exponential distribution with location fixed at 0 (``scipy.stats.expon``)."""

    def __init__(self):
        super().__init__('지수')

    def fit(self, x):
        """Fit the scale to the strictly positive values of ``x``.

        Marks the wrapper invalid when fewer than 10 positive values remain or
        when the fit raises or yields a non-positive scale.
        """
        x = x[x > 0]
        if len(x) < 10:
            return self._fail()
        try:
            _, sc = expon.fit(x, floc=0)
        except Exception:
            return self._fail()
        # Explicit check rather than ``assert`` so validation survives ``python -O``.
        if not sc > 0:
            return self._fail()
        self.p = dict(loc=0.0, scale=sc)
        self.np = 1
        return self

    def cdf(self, z):
        return expon.cdf(z, **self.p)

    def ppf(self, q):
        return expon.ppf(q, **self.p)

    def rvs(self, n, random_state=None):
        """Draw ``n`` variates; ``random_state`` is forwarded to SciPy.

        The former fixed seed (42) made every call return the same sample.
        """
        return expon.rvs(size=n, random_state=random_state, **self.p)

class Pareto_(W):
    """Pareto distribution with location fixed at 0 (``scipy.stats.pareto``)."""

    def __init__(self):
        super().__init__('파레토')

    def fit(self, x):
        """Fit shape ``b`` and scale to the strictly positive values of ``x``.

        Marks the wrapper invalid when fewer than 10 positive values remain or
        when the fit raises or yields a non-positive shape/scale.
        """
        x = x[x > 0]
        if len(x) < 10:
            return self._fail()
        try:
            b, _, sc = pareto.fit(x, floc=0)
        except Exception:
            return self._fail()
        # Explicit check rather than ``assert`` so validation survives ``python -O``.
        if not (b > 0 and sc > 0):
            return self._fail()
        self.p = dict(b=b, loc=0.0, scale=sc)
        self.np = 2
        return self

    def cdf(self, z):
        return pareto.cdf(z, **self.p)

    def ppf(self, q):
        return pareto.ppf(q, **self.p)

    def rvs(self, n, random_state=None):
        """Draw ``n`` variates; ``random_state`` is forwarded to SciPy.

        The former fixed seed (42) made every call return the same sample.
        """
        return pareto.rvs(self.p['b'], loc=0, scale=self.p['scale'],
                          size=n, random_state=random_state)

class BetaPERT_(W):
    """Beta-PERT distribution built from sample min, max and a histogram mode."""

    def __init__(self, lam=4.0):
        super().__init__('BetaPERT')
        self.lam = lam  # PERT weight applied to the mode in the alpha/beta formulas

    def fit(self, x):
        """Derive alpha/beta from min ``a``, max ``b`` and mode ``m``.

        ``alpha = 1 + lam*(m-a)/(b-a)`` and ``beta = 1 + lam*(b-m)/(b-a)``,
        where the mode comes from ``hist_mode_estimate`` clipped inside (a, b).
        """
        a, b = float(np.min(x)), float(np.max(x))
        if not (np.isfinite(a) and np.isfinite(b) and b > a):
            return self._fail()
        m = float(np.clip(hist_mode_estimate(x), a + 1e-9, b - 1e-9))
        al = 1 + self.lam * (m - a) / (b - a)
        be = 1 + self.lam * (b - m) / (b - a)
        if not (al > 0 and be > 0):
            return self._fail()
        self.p = dict(alpha=al, beta=be, a=a, b=b, m=m, lam=self.lam)
        self.np = 2
        return self

    def cdf(self, z):
        z0 = (z - self.p['a']) / (self.p['b'] - self.p['a'])
        return beta.cdf(z0, self.p['alpha'], self.p['beta'])

    def ppf(self, q):
        return self.p['a'] + (self.p['b'] - self.p['a']) * beta.ppf(q, self.p['alpha'], self.p['beta'])

    def rvs(self, n, random_state=None):
        """Draw ``n`` variates on ``[a, b]``; ``random_state`` is forwarded to SciPy.

        The former fixed seed (42) made every call return the same sample.
        """
        r = beta.rvs(self.p['alpha'], self.p['beta'], size=n, random_state=random_state)
        return self.p['a'] + (self.p['b'] - self.p['a']) * r

class Uniform_(W):
    """Uniform distribution on the sample range (``scipy.stats.uniform``)."""

    def __init__(self):
        super().__init__('균일')

    def fit(self, x):
        """Use the sample minimum and maximum as the distribution bounds."""
        a, b = float(np.min(x)), float(np.max(x))
        if not (np.isfinite(a) and np.isfinite(b) and b > a):
            return self._fail()
        self.p = dict(loc=a, scale=(b - a))
        self.np = 2
        return self

    def cdf(self, z):
        return uniform.cdf(z, **self.p)

    def ppf(self, q):
        return uniform.ppf(q, **self.p)

    def rvs(self, n, random_state=None):
        """Draw ``n`` variates; ``random_state`` is forwarded to SciPy.

        The former fixed seed (42) made every call return the same sample.
        """
        return uniform.rvs(size=n, random_state=random_state, **self.p)


In [30]:
# ---------- 지표 ----------
def AD_stat(x, cdf, eps=1e-12):
    """Anderson-Darling statistic A^2 of sample ``x`` against ``cdf``.

    CDF values are clipped to ``[eps, 1 - eps]`` so the logarithms stay finite.
    """
    ordered = np.sort(x)
    n = len(ordered)
    probs = np.clip(cdf(ordered), eps, 1 - eps)
    ranks = np.arange(1, n + 1)
    # Pair the i-th smallest probability with the i-th largest survival value.
    log_terms = np.log(probs) + np.log(1 - probs[::-1])
    weighted_sum = np.sum((2 * ranks - 1) * log_terms)
    return float(-n - weighted_sum / n)

def AD_p_boot(x, dist, A2, B=BOOTSTRAP_B, seed=42):
    """Monte-Carlo p-value for the Anderson-Darling statistic ``A2``.

    Draws ``B`` samples of size ``len(x)`` from the fitted ``dist`` and returns
    the fraction whose A^2 is at least ``A2``.

    Samples are generated by inverse-transform sampling (``dist.ppf`` applied
    to uniforms from a local seeded generator). Calling ``dist.rvs(n)`` is
    avoided because the wrappers pin ``random_state=42``: every replicate was
    the same sample, so the result could only be 0.0 or 1.0.

    NOTE(review): parameters are not re-estimated on each replicate, so the
    p-value ignores estimation uncertainty. Confirm this is intended.

    Args:
        x: observed sample (only its length is used).
        dist: fitted wrapper exposing ``valid``, ``ppf`` and ``cdf``.
        A2: observed Anderson-Darling statistic.
        B: number of replicates.
        seed: seed of the local generator, for reproducible p-values.

    Returns:
        The p-value as ``float``, or NaN when ``len(x) < 5``, the fit is
        invalid, or ``B`` is not positive.
    """
    n = len(x)
    if n < 5 or not dist.valid or B <= 0:
        return np.nan
    rng = np.random.default_rng(seed)
    exceed = 0
    for _ in range(B):
        xs = dist.ppf(rng.uniform(size=n))
        if AD_stat(xs, dist.cdf) >= A2:
            exceed += 1
    return float(exceed / B)

def KS_stat_p(x, dist):
    """Kolmogorov-Smirnov statistic and p-value of ``x`` against ``dist.cdf``.

    Returns ``(D, p)`` as floats, or ``(nan, nan)`` if the test raises.
    """
    def _fitted_cdf(z):
        return dist.cdf(z)

    try:
        outcome = stats.kstest(x, _fitted_cdf)
        statistic, p_value = outcome
        return float(statistic), float(p_value)
    except Exception:
        return np.nan, np.nan

def Chi2_stat_p(x, dist):
    """Chi-square goodness-of-fit statistic and p-value of ``x`` against ``dist``.

    Bin edges are quantiles of the fitted distribution (between 5 and 50 bins,
    roughly five observations per bin). Compared with the earlier version:

    * the outermost edges are widened to the sample range, so observations in
      the extreme tails (or exactly on a bounded distribution's limits) are
      counted instead of being dropped by ``np.histogram``;
    * expected counts come from the fitted CDF at the edges actually used, so
      they stay aligned with the observed counts even when ``np.unique``
      merges duplicate edges.

    Degrees of freedom are ``bins - 1 - dist.np``.

    Returns:
        ``(chi2, p)`` as floats, or ``(nan, nan)`` when fewer than two bins
        remain, the degrees of freedom are not positive, or anything raises.
    """
    try:
        x = np.asarray(x, dtype=float)
        n = len(x)
        N = max(5, min(50, n // 5))
        eps = 1e-6
        qs = np.linspace(eps, 1 - eps, N + 1)
        edges = np.unique(dist.ppf(qs))
        if len(edges) < 3:
            return np.nan, np.nan

        # Outer bins absorb the tails so every observation is counted.
        edges[0] = min(edges[0], x.min())
        edges[-1] = max(edges[-1], x.max())
        obs, _ = np.histogram(x, bins=edges)

        k = dist.np or 0
        dof = len(obs) - 1 - k
        if dof <= 0:
            return np.nan, np.nan

        # Bin probabilities from the CDF at the interior edges; outer bins get the tails.
        cum = np.concatenate(([0.0], np.asarray(dist.cdf(edges[1:-1]), dtype=float), [1.0]))
        exp = np.maximum(np.diff(cum) * n, 1e-9)

        chi = np.sum((obs - exp) ** 2 / exp)
        p = chi2.sf(chi, dof)  # survival function: accurate for very small p-values
        return float(chi), float(p)
    except Exception:
        return np.nan, np.nan

# ---------- 파라미터 문자열 ----------
def pstr(name, p):
    """Render the fitted parameters ``p`` of distribution ``name`` as display text.

    Known distribution names get a fixed label layout with five significant
    digits. Unknown names, or parameter dicts missing an expected key, fall
    back to the JSON dump of ``p``.
    """
    def _loc_scale():
        # Extreme-value distributions label the location as "most likely"; others as "mean".
        lab = '최고가능성' if '극값' in name else '평균'
        return f"{lab}={p['loc']:.5g}, 스케일={p['scale']:.5g}"

    formatters = {
        '로그 정규': lambda: f"형태={p['shape']:.5g}, 스케일={p['scale']:.5g}, 위치=0",
        '와이블': lambda: f"형태={p['c']:.5g}, 스케일={p['scale']:.5g}, 위치=0",
        '감마': lambda: f"형태={p['a']:.5g}, 스케일={p['scale']:.5g}, 위치=0",
        '지수': lambda: f"비율={1.0/p['scale']:.5g}",
        '최대 극값': _loc_scale,
        '최소 극값': _loc_scale,
        '정규': _loc_scale,
        '로지스틱': _loc_scale,
        '스튜던트의 t': lambda: f"중간점={p['loc']:.5g}, 스케일={p['scale']:.5g}, 자유도={p['df']:.5g}",
        '파레토': lambda: f"위치=0, 스케일={p['scale']:.5g}, 형태={p['b']:.5g}",
        '베타': lambda: f"최소={p['a']:.5g}, 최대={p['b']:.5g}, 알파={p['alpha']:.5g}, 베타={p['beta']:.5g}",
        'BetaPERT': lambda: f"최소={p['a']:.5g}, 최고가능성={p['m']:.5g}, 최대={p['b']:.5g}, α={p['alpha']:.5g}, β={p['beta']:.5g}",
        '삼각형': lambda: f"최소={p['a']:.5g}, 최고가능성={p['m']:.5g}, 최대={p['b']:.5g}",
        '균일': lambda: f"최소={p['loc']:.5g}, 최대={(p['loc']+p['scale']):.5g}",
    }
    try:
        render = formatters.get(name)
        if render is not None:
            return render()
    except Exception:
        pass
    return json.dumps(p, ensure_ascii=False)

In [31]:
# ---------- 적합 & 정렬 ----------
# Display names of the distributions that fit_one_col evaluates on the x > 0 subset only.
POSITIVE = {'로그 정규','와이블','감마','지수','파레토'}

def fit_one_col(x_raw):
    """Fit every candidate distribution to one column and rank the fits.

    Returns a DataFrame with one row per distribution: the display columns
    (name, A-D, K-S and chi-square statistics with p-values, parameter text)
    plus the helper columns ``_AD``, ``_KS_p``, ``_Chi2_p`` and ``_n_params``.
    Rows are ordered by A-D ascending, then K-S p-value descending, chi-square
    p-value descending, and parameter count ascending. Fits that failed sort
    last. Fewer than 20 clean observations yields an empty frame.
    """
    x = to_numpy_clean(x_raw)
    if len(x) < 20:
        return pd.DataFrame(columns=['분포','A-D','A-D P-값','K-S','K-S P-값','카이제곱','카이제곱 P-값','매개 변수',
                                     '_AD','_KS_p','_Chi2_p','_n_params'])

    def _unscored(label, params, n_params):
        # Row for a distribution that could not be evaluated; the sort keys push it to the bottom.
        return {'분포':label,'A-D':np.nan,'A-D P-값':np.nan,'K-S':np.nan,'K-S P-값':np.nan,
                '카이제곱':np.nan,'카이제곱 P-값':np.nan,'매개 변수':pstr(label, params),
                '_AD':np.inf,'_KS_p':-np.inf,'_Chi2_p':-np.inf,'_n_params':n_params}

    candidates = [GumbelR(), LogNormal(), Weibull(), Logistic(), StudentT(), Normal(),
                  Gamma_(), Beta_(), Triangular_(), GumbelL(), Exponential_(),
                  Pareto_(), BetaPERT_(), Uniform_()]
    rows = []
    for d in candidates:
        d = d.fit(x)
        if not d or not d.valid:
            rows.append(_unscored(d.name, getattr(d, 'p', {}), 1e9))
            continue

        # Positive-support distributions are scored on the positive observations only.
        xe = x[x > 0] if d.name in POSITIVE else x
        if len(xe) < 5:
            rows.append(_unscored(d.name, d.p, d.np or 1e9))
            continue

        A2 = AD_stat(xe, d.cdf)
        pAD = AD_p_boot(xe, d, A2)
        D, p = KS_stat_p(xe, d)
        chi, pc = Chi2_stat_p(xe, d)
        rows.append({'분포':d.name,'A-D':A2,'A-D P-값':pAD,'K-S':D,'K-S P-값':p,
                     '카이제곱':chi,'카이제곱 P-값':pc,'매개 변수':pstr(d.name,d.p),
                     '_AD':A2,'_KS_p':p,'_Chi2_p':pc,'_n_params':d.np or 1e9})

    df = pd.DataFrame(rows)

    def _ascending(v):
        return v if np.isfinite(v) else np.inf

    def _descending(v):
        return -v if np.isfinite(v) else np.inf

    # Sort key: A-D ascending, KS p descending, chi-square p descending, #params ascending.
    df['_key'] = [
        (_ascending(ad), _descending(ks_p), _descending(chi_p), _ascending(n_par))
        for ad, ks_p, chi_p, n_par in zip(df['_AD'], df['_KS_p'], df['_Chi2_p'], df['_n_params'])
    ]
    ranked = df.sort_values('_key', kind='mergesort')
    return ranked.drop(columns=['_key']).reset_index(drop=True)

In [32]:
# ---------- 엑셀 작성 ----------
def set_col_widths(ws):
    """Apply the fixed column widths for columns A through I of the results sheet."""
    letters = 'ABCDEFGHI'
    sizes = (18, 12, 10, 10, 10, 10, 12, 12, 64)
    for letter, size in zip(letters, sizes):
        ws.column_dimensions[letter].width = size

def style_header(ws, row, headers):
    """Write ``headers`` into ``row`` starting at column B and style them.

    Each header cell gets the ``HEADER_BLUE`` fill, bold white text, centred
    wrapped alignment and a thin grey border on all four sides.
    """
    blue = PatternFill('solid', fgColor=HEADER_BLUE)
    white = Font(color='FFFFFF', bold=True)
    center = Alignment(horizontal='center', vertical='center', wrap_text=True)
    # Single keyword-based border; the earlier positional construction was
    # dead code that this assignment immediately overwrote.
    edge = Side(style='thin', color='999999')
    thin = Border(left=edge, right=edge, top=edge, bottom=edge)
    for j, h in enumerate(headers, start=2):  # B..I
        c = ws.cell(row=row, column=j, value=h)
        c.fill = blue
        c.font = white
        c.alignment = center
        c.border = thin

def write_num(ws, r, c, v):
    """Write ``v`` into cell (``r``, ``c``) of ``ws`` with numeric formatting.

    Finite floats are stored as ``float`` with four decimals. ``None``, NaN and
    infinite floats become the placeholder ``'---'``. Any other value is
    stored unchanged.
    """
    target = ws.cell(row=r, column=c)
    if v is None:
        target.value = '---'
        return
    if not isinstance(v, (float, np.floating)):
        target.value = v
        return
    if np.isfinite(v):
        target.value = float(v)
        target.number_format = '0.0000'
    else:
        target.value = '---'

# ----- Run -----
df = pd.read_excel(DATA_PATH)

# Detect candidate metal columns, then drop concentration-style names
# ("Con", "Conc", "Concentration", optionally followed by "(...)").
metals = detect_numeric_metal_columns(df)
EXCLUDE_RE = re.compile(r'^\s*con(c|centration)?(\s*\(.*\))?\s*$', re.I)
metals = [m for m in metals if not EXCLUDE_RE.match(str(m))]
if not metals:
    raise ValueError('숫자형(금속) 컬럼을 찾지 못했습니다.')

# Fit every candidate distribution to every metal column.
results = {m: fit_one_col(df[m].values) for m in metals}

# Workbook / sheet scaffolding
wb = Workbook()
ws = wb.active
ws.title = '데이터 계열'
ws['A1'] = '순위 지정 기준: 앤더슨-달링'
ws['A2'] = '데이터 계열'
set_col_widths(ws)
ws.freeze_panes = 'B3'

headers = ['분포','A-D','A-D P-값','K-S','K-S P-값','카이제곱','카이제곱 P-값','매개 변수']
row = 3
table_idx = 1
last_data_row = row

for m in metals:
    # 1) Group title (bold only)
    ws.cell(row=row, column=1, value=m).font = Font(bold=True)
    row += 1

    # 2) Header text only; colour, filter buttons and stripes come from the table style
    for j, h in enumerate(headers, start=2):  # B..I
        ws.cell(row=row, column=j, value=h)
    start_row = row
    row += 1

    tbl = results[m]
    if len(tbl) == 0:
        # fit_one_col returns an empty frame for columns with too few observations.
        # A table range covering only the header row has no data rows, which
        # spreadsheet applications may reject, so write a placeholder and
        # skip the table for this block.
        ws.cell(row=row, column=2, value='---')
        last_data_row = max(last_data_row, row)
        row += 1 + GROUP_GAP
        continue

    # 3) Data rows, positioned by enumeration rather than by the frame's index labels
    for offset, (_, r_) in enumerate(tbl.iterrows()):
        rr = row + offset
        ws.cell(row=rr, column=2, value=r_['분포'])
        write_num(ws, rr, 3, r_['A-D'])
        write_num(ws, rr, 4, r_['A-D P-값'])
        write_num(ws, rr, 5, r_['K-S'])
        write_num(ws, rr, 6, r_['K-S P-값'])
        write_num(ws, rr, 7, r_['카이제곱'])
        write_num(ws, rr, 8, r_['카이제곱 P-값'])
        ws.cell(row=rr, column=9, value=r_['매개 변수'])

    end_row = row + len(tbl) - 1
    last_data_row = max(last_data_row, end_row)

    # 4) Table style -> blue header, filters and row stripes.
    # The variable is named `table`, not `t`: `t` is the Student-t distribution
    # imported from scipy.stats, and rebinding it here made StudentT.fit fail
    # silently whenever this cell was re-run in the same kernel.
    ref = f"B{start_row}:I{end_row}"
    table = Table(displayName=f"T_{table_idx}", ref=ref)
    table.tableStyleInfo = TableStyleInfo(
        name="TableStyleMedium9",     # dark-blue header style
        showFirstColumn=False,
        showLastColumn=False,
        showRowStripes=True,
        showColumnStripes=False
    )
    ws.add_table(table)
    table_idx += 1

    # 5) Leave GROUP_GAP blank rows before the next block
    row = end_row + 1 + GROUP_GAP

# Save & download
wb.save(OUTPUT_PATH)
print('Saved:', OUTPUT_PATH)
files.download(OUTPUT_PATH)

Saved: Tx-적합도.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>