In [1]:
import pandas as pd
import numpy as np

# Colab-only step: ask the user to upload the input workbook from the browser.
# The "Saving ... to ..." output below shows the file being stored in the
# runtime under its original name.
from google.colab import files
uploaded = files.upload()  # return value of the upload call, kept for later inspection

Saving 202501_기준금속.xlsx to 202501_기준금속.xlsx


In [16]:
# File name of the workbook to analyse; it matches the file saved by the
# upload cell and is resolved relative to the current working directory.
excel_path = '202501_기준금속.xlsx'
# Read the workbook into a DataFrame using pandas' default read_excel options.
df = pd.read_excel(excel_path)

In [17]:
# Select the columns to analyse: every column whose header contains one of
# the unit markers below.
column_pattern = ['(ng/m3)', '(ug/m3)']
# Headers read from Excel are not guaranteed to be strings (numeric or date
# headers come back as int/float/datetime) and `p in c` raises TypeError on
# those, so only string headers are tested against the patterns.
columns_to_analyze = [
    c for c in df.columns
    if isinstance(c, str) and any(p in c for p in column_pattern)
]

# Fall back to every numeric column when no header matches a unit marker.
if not columns_to_analyze:
    columns_to_analyze = df.select_dtypes(include=[np.number]).columns.tolist()

# Report how many columns were picked and preview (at most) the first 12.
print("Columns selected:", len(columns_to_analyze))
print(columns_to_analyze[:12])

Columns selected: 8
['Conc(ug/m3)', 'Cr(ng/m3)', 'Co(ng/m3)', 'Ni(ng/m3)', 'As(ng/m3)', 'Cd(ng/m3)', 'Sb(ng/m3)', 'Pb(ng/m3)']


In [18]:
# Replace every negative value in the selected columns with NaN; all other
# values (including existing NaN) are written back unchanged.
selected_values = df[columns_to_analyze]
is_negative = selected_values < 0
df[columns_to_analyze] = selected_values.mask(is_negative)

In [19]:
def safe_mode(series):
    """Return the most frequent non-missing value of ``series``.

    When several values are tied for most frequent, the smallest of them is
    returned. NaN is returned when the series holds no non-missing values.
    """
    valid = series.dropna()  # ignore missing entries
    if len(valid) == 0:
        return np.nan

    candidates = valid.mode()  # all values tied for the highest count
    if candidates.empty:
        return np.nan
    return candidates.min()

def nanpercentile(series, q):
    """Return the ``q``-th percentile of the non-missing values of ``series``.

    ``q`` is passed to ``np.percentile`` (a value between 0 and 100), using
    linear interpolation. NaN is returned when no non-missing values remain.
    """
    values = series.dropna().to_numpy()  # missing entries are excluded
    if values.size > 0:
        return float(np.percentile(values, q, method='linear'))
    return np.nan

In [20]:
def compute_stats(s):
    """Compute descriptive statistics for one column.

    Missing values are dropped first. The result is a dict keyed by the row
    labels used in the statistics table:

    * 'Mean (N)' is a formatted string, ``"<mean to 3 decimals> (N=<count>)"``.
    * 'Std. Deviation' is the sample standard deviation (``ddof=1``) and
      'Std. Error of Mean' is that value divided by ``sqrt(count)``.
    * 'Mode' comes from ``safe_mode``; 'P5' ... 'P100' come from
      ``nanpercentile``.

    When no non-missing values remain, every entry (including 'Mean (N)')
    is NaN.
    """
    values = s.dropna()
    count = len(values)  # number of valid observations

    if count == 0:
        empty_labels = (
            'Mean (N)', 'Std. Error of Mean', 'Mode', 'Std. Deviation',
            'Range', 'Maximum', 'Median', 'Minimum', 'Sum',
            'P5', 'P25', 'P50', 'P75', 'P95', 'P100',
        )
        return {label: np.nan for label in empty_labels}

    lowest = values.min()
    highest = values.max()
    average = values.mean()
    spread = values.std(ddof=1)  # sample standard deviation

    summary = {
        'Mean (N)': f"{average:.3f} (N={count})",
        'Std. Error of Mean': spread / np.sqrt(count),
        'Mode': safe_mode(values),
        'Std. Deviation': spread,
        'Range': highest - lowest,
        'Maximum': highest,
        'Median': values.median(),
        'Minimum': lowest,
        'Sum': values.sum(),
    }
    # Percentile rows are appended in ascending order after the fixed rows.
    for q in (5, 25, 50, 75, 95, 100):
        summary[f'P{q}'] = nanpercentile(values, q)
    return summary

In [22]:
from IPython.display import display

# Compute the statistics for every selected column.
stats_dict = {col: compute_stats(df[col]) for col in columns_to_analyze}

# Turn the per-column dicts into a table: one column per analysed column,
# one row per statistic.
stats_table = pd.DataFrame(stats_dict)

# Pin the row order of the table.
row_order = ['Mean (N)','Std. Error of Mean','Mode','Std. Deviation','Range',
             'Maximum','Median','Minimum','Sum','P5','P25','P50','P75','P95','P100']
stats_table = stats_table.reindex(row_order)

# Round for display. The 'Mean (N)' row holds strings, which makes the
# columns object dtype; DataFrame.round() skips non-numeric columns, so a
# frame-level round(3) would silently do nothing. Work on an explicit copy
# and round each numeric row after casting it to float instead.
stats_table_rounded = stats_table.copy()
for row in stats_table_rounded.index:
    if row != 'Mean (N)':
        stats_table_rounded.loc[row] = stats_table_rounded.loc[row].astype(float).round(3)

# Show the table.
display(stats_table_rounded)

Unnamed: 0,Conc(ug/m3),Cr(ng/m3),Co(ng/m3),Ni(ng/m3),As(ng/m3),Cd(ng/m3),Sb(ng/m3),Pb(ng/m3)
Mean (N),17.064 (N=515),3.065 (N=405),8.853 (N=504),2.986 (N=378),0.026 (N=504),215.833 (N=504),17.861 (N=267),10.785 (N=450)
Std. Error of Mean,0.464,0.082,0.243,0.094,0.015,2.284,0.852,0.489
Mode,13.72,3.29,2.51,1.4,0.0,135.75,0.04,1.39
Std. Deviation,10.524,1.643,5.46,1.831,0.333,51.286,13.925,10.383
Range,44.7,8.64,36.86,8.66,6.35,267.44,66.58,56.89
Maximum,47.62,8.67,38.57,8.66,6.35,378.53,66.62,56.89
Median,13.72,3.12,8.32,2.655,0.0,211.845,16.13,7.03
Minimum,2.92,0.03,1.71,0.0,0.0,111.09,0.04,0.0
Sum,8787.87,1241.49,4461.99,1128.63,13.25,108779.87,4768.96,4853.05
P5,5.054,0.274,2.51,0.433,0.0,144.451,1.26,0.564


In [23]:
# Save the rounded statistics table to an Excel workbook in the working
# directory, keeping the statistic names (the index) as the first column.
stats_table_rounded.to_excel("Statistics_Table_.xlsx", index=True)

# Colab-only: hand the saved workbook to the browser as a download.
from google.colab import files
files.download("Statistics_Table_.xlsx")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>