In [1]:
import pandas as pd
import numpy as np

# Upload the input workbook into the Colab runtime. The recorded output of this
# cell shows the file being saved as 202501.xlsx in the working directory.
# NOTE(review): google.colab is only importable inside Colab — this cell will
# not run in a plain Jupyter environment.
from google.colab import files
uploaded = files.upload()

Saving 202501.xlsx to 202501.xlsx


In [2]:
import io

# Name of the workbook expected from the upload cell.
excel_path = '202501.xlsx'

# files.upload() returns a mapping of {uploaded filename: file bytes}. When the
# same file name is uploaded again, Colab stores it on disk under a
# de-duplicated name (e.g. "202501 (1).xlsx"), so reading the fixed path could
# silently load an older copy. Prefer the bytes from this session's upload and
# only fall back to the on-disk path when no such upload is available.
_session_uploads = globals().get('uploaded') or {}
if excel_path in _session_uploads:
    df = pd.read_excel(io.BytesIO(_session_uploads[excel_path]))
else:
    df = pd.read_excel(excel_path)

In [3]:
# Pick the measurement columns: headers that contain a concentration unit tag.
column_pattern = ['(ng/m3)', '(ug/m3)']

# Headers read from a spreadsheet are not guaranteed to be strings (numeric or
# date headers come back as int/float/datetime). A substring test on those
# raises TypeError, so only string labels are matched against the patterns.
columns_to_analyze = [
    c for c in df.columns
    if isinstance(c, str) and any(p in c for p in column_pattern)
]

# If no header matches, fall back to every numeric column in the frame.
if not columns_to_analyze:
    columns_to_analyze = df.select_dtypes(include=[np.number]).columns.tolist()

print("Columns selected:", len(columns_to_analyze))
print(columns_to_analyze[:12])

Columns selected: 36
['Conc(ug/m3)', 'Al(ng/m3)', 'Si(ng/m3)', 'S(ng/m3)', 'K(ng/m3)', 'Ca(ng/m3)', 'Ti(ng/m3)', 'V(ng/m3)', 'Cr(ng/m3)', 'Mn(ng/m3)', 'Fe(ng/m3)', 'Co(ng/m3)']


In [4]:
# Replace every negative value in the selected columns with NaN so that it is
# excluded from the statistics computed later.
df[columns_to_analyze] = df[columns_to_analyze].mask(
    df[columns_to_analyze].lt(0)
)

In [5]:
def safe_mode(series):
    """Return the most frequent non-NaN value of ``series``.

    When several values tie for the highest frequency, the smallest of them
    is returned. NaN is returned when ``series`` holds no non-NaN values.
    """
    valid = series.dropna()  # ignore missing entries
    if valid.empty:
        return np.nan

    candidates = valid.mode()  # all values sharing the top frequency
    if candidates.empty:
        return np.nan
    return candidates.min()

def nanpercentile(series, q):
    """Return the ``q``-th percentile of the non-NaN values in ``series``.

    Linear interpolation between neighbouring data points is used. The result
    is a Python float, or NaN when there are no non-NaN values.
    """
    values = series.dropna().to_numpy()  # missing entries are ignored
    if values.size:
        return float(np.percentile(values, q, method='linear'))
    return np.nan

In [10]:
def compute_stats(s):
    """Compute descriptive statistics for one column.

    NaN entries of ``s`` are dropped first. Returns a dict keyed by statistic
    name: mean, standard error of the mean, mode, sample standard deviation,
    range, minimum, median, maximum, sum, and the 5th/25th/50th/75th/95th/100th
    percentiles. Every value is NaN when ``s`` has no non-NaN entries.
    """
    percentile_levels = (5, 25, 50, 75, 95, 100)
    stat_names = ['Mean', 'Std. Error of Mean', 'Mode', 'Std. Deviation',
                  'Range', 'Minimum', 'Median', 'Maximum', 'Sum',
                  'P5', 'P25', 'P50', 'P75', 'P95', 'P100']

    valid = s.dropna()
    count = len(valid)  # number of usable observations
    if count == 0:
        return dict.fromkeys(stat_names, np.nan)

    lowest = valid.min()
    highest = valid.max()
    spread = valid.std(ddof=1)  # sample standard deviation (n - 1 denominator)

    summary = {
        'Mean': valid.mean(),
        'Std. Error of Mean': spread / np.sqrt(count),
        'Mode': safe_mode(valid),
        'Std. Deviation': spread,
        'Range': highest - lowest,
        'Minimum': lowest,
        'Median': valid.median(),
        'Maximum': highest,
        'Sum': valid.sum(),
    }
    # Percentile entries are appended in ascending order: P5 ... P100.
    for level in percentile_levels:
        summary[f'P{level}'] = nanpercentile(valid, level)
    return summary

In [12]:
# Fixed presentation order for the statistic rows.
row_order = [
    'Mean', 'Std. Error of Mean', 'Mode', 'Std. Deviation', 'Range',
    'Minimum', 'Median', 'Maximum', 'Sum',
    'P5', 'P25', 'P50', 'P75', 'P95', 'P100',
]

# Run the statistics for every selected column; each column's result dict
# becomes one column of the table, with statistic names as the row labels.
stats_dict = {
    name: compute_stats(df[name])
    for name in columns_to_analyze
}
stats_table = pd.DataFrame(stats_dict).reindex(row_order)

# Round to three decimals for display.
stats_table_rounded = stats_table.round(3)

# Render the rounded table as rich output.
from IPython.display import display
display(stats_table_rounded)

Unnamed: 0,Conc(ug/m3),Al(ng/m3),Si(ng/m3),S(ng/m3),K(ng/m3),Ca(ng/m3),Ti(ng/m3),V(ng/m3),Cr(ng/m3),Mn(ng/m3),...,Cd(ng/m3),In(ng/m3),Sn(ng/m3),Sb(ng/m3),Te(ng/m3),Cs(ng/m3),Ba(ng/m3),Ce(ng/m3),Pb(ng/m3),Bi(ng/m3)
Mean,17.064,535.975,1165.064,4073.718,311.863,143.559,2.128,3.046,3.065,6.715,...,215.833,27.986,13.262,17.861,16.028,5.035,2.222,5.792,10.785,3.746
Std. Error of Mean,0.464,9.582,40.235,71.223,6.22,2.899,0.185,0.078,0.082,0.169,...,2.284,0.817,0.541,0.852,0.816,0.446,0.315,0.406,0.489,0.109
Mode,13.72,166.26,903.08,4337.73,206.35,96.0,0.88,3.14,3.29,0.23,...,135.75,0.11,6.94,0.04,0.74,0.05,0.0,0.29,1.39,3.78
Std. Deviation,10.524,215.108,903.284,1598.947,139.649,65.078,3.311,1.744,1.643,3.555,...,51.286,17.897,9.884,13.925,13.006,6.465,7.065,7.818,10.383,2.224
Range,44.7,1756.45,8911.15,9116.3,943.61,637.76,27.96,10.88,8.64,16.7,...,267.44,108.47,49.24,66.58,64.07,32.13,63.89,51.52,56.89,12.22
Minimum,2.92,166.26,594.85,1094.29,122.71,78.88,0.0,0.01,0.03,0.0,...,111.09,0.11,0.0,0.04,0.12,0.0,0.0,0.01,0.0,0.01
Median,13.72,510.645,943.235,3633.105,285.53,132.65,1.2,2.86,3.12,7.355,...,211.845,26.15,11.795,16.13,12.445,2.425,0.0,2.935,7.03,3.63
Maximum,47.62,1922.71,9506.0,10210.59,1066.32,716.64,27.96,10.89,8.67,16.7,...,378.53,108.58,49.24,66.62,64.19,32.13,63.89,51.53,56.89,12.23
Sum,8787.87,270131.32,587192.15,2053153.73,157178.79,72353.61,683.04,1526.07,1241.49,2954.51,...,108779.87,13433.42,4429.59,4768.96,4071.01,1057.41,1119.69,2143.14,4853.05,1565.8
P5,5.054,275.793,656.102,2212.358,146.0,94.458,0.08,0.54,0.274,0.429,...,144.451,4.12,1.236,1.26,0.927,0.149,0.0,0.204,0.564,0.498


In [8]:
# Write the rounded statistics table to an Excel workbook, keeping the
# statistic names (the index) as the first column.
export_name = "Statistics_Table.xlsx"
stats_table_rounded.to_excel(export_name, index=True)

# Send the workbook from the Colab runtime to the browser as a download.
from google.colab import files
files.download(export_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>