In [3]:
# IPython analysis: inspect large NPY and CSV files safely
# Prints are in English. Comments are in Korean.
# 주의: 대용량 파일은 전체 로딩하지 않고 앞부분 샘플만 사용하여 통계를 계산합니다.

import numpy as np
import pandas as pd
from pathlib import Path

# 설정
SAMPLE_N = 1000  # 각 NPY 앞부분 샘플 크기
BASE = Path("..")

# 점검 대상 파일 목록
targets_npy = [
    ("X_lstm_gas", BASE / "data" / "processed" / "X_lstm_gas.npy"),
    ("y_lstm_gas", BASE / "data" / "processed" / "y_lstm_gas.npy"),
    ("X_lstm_hotwater", BASE / "data" / "processed" / "X_lstm_hotwater.npy"),
    ("y_lstm_hotwater", BASE / "data" / "processed" / "y_lstm_hotwater.npy"),
    ("X_lstm_steam", BASE / "data" / "processed" / "X_lstm_steam.npy"),
    ("y_lstm_steam", BASE / "data" / "processed" / "y_lstm_steam.npy"),
    ("x_lstm_chilledwater", BASE / "data" / "processed" / "x_lstm_chilledwater.npy"),
    ("y_lstm_chilledwater", BASE / "data" / "processed" / "y_lstm_chilledwater.npy"),
    ("x_lstm_electricity", BASE / "data" / "processed" / "y_lstm_electricity.npy"),
    ("y_lstm_electricity", BASE / "data" / "processed" / "y_lstm_electricity.npy"),
]

target_csv = ("preds_all", BASE / "results" / "preds_all.csv")

def sample_stats_from_path(path, sample_n=1000):
    """파일 경로에서 mmap으로 shape만 읽고, 앞부분만 샘플로 통계 계산"""
    mm = np.load(path, mmap_mode="r")  # 전체 메모리 로딩 회피
    shape = mm.shape
    n = shape[0] if shape else 0
    k = min(n, sample_n)
    sample = mm[:k]

    stats = {
        "shape": shape,
        "sample_n": int(k),
        "min": float(np.nanmin(sample)),
        "max": float(np.nanmax(sample)),
        "mean": float(np.nanmean(sample)),
        "std": float(np.nanstd(sample)),
    }

    # 2차원 배열이면 열별 통계도 간단히 제공
    if sample.ndim == 2 and sample.shape[1] <= 8:
        stats["col_means"] = [float(x) for x in np.nanmean(sample, axis=0).tolist()]
        stats["col_stds"] = [float(x) for x in np.nanstd(sample, axis=0).tolist()]

    # 3차원 배열이면 첫 시퀀스의 앞 타임스텝과 일부 피처를 미리보기로 제공
    if sample.ndim == 3:
        preview = sample[0, : min(3, sample.shape[1]), : min(5, sample.shape[2])]
        stats["preview_first_seq"] = preview.tolist()
    return stats

# NPY 요약 수집
rows = []
for name, path in targets_npy:
    info = {"name": name, "exists": path.exists(), "path": str(path)}
    if not path.exists():
        info["error"] = "File not found"
    else:
        try:
            s = sample_stats_from_path(path, SAMPLE_N)
            info.update(s)
        except Exception as e:
            info["error"] = f"{type(e).__name__}: {e}"
    rows.append(info)

npy_df = pd.DataFrame(rows)
print("NPY summary")
display(npy_df)

# CSV 요약
if target_csv[1].exists():
    df = pd.read_csv(target_csv[1])
    print("preds_all.csv head")
    display(df.head(20))

    # 숫자 열 기본 통계
    num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
    if num_cols:
        print("preds_all.csv numeric describe")
        display(df[num_cols].describe().T)

    # y_true, y_pred 집중 점검
    focus = ["y_true", "y_pred"]
    if set(focus).issubset(df.columns):
        print("y_true and y_pred describe")
        display(df[focus].describe().T)
else:
    print("preds_all.csv not found")


NPY summary


  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Unnamed: 0,name,exists,path,shape,sample_n,min,max,mean,std,preview_first_seq,col_means,col_stds,error
0,X_lstm_gas,True,../data/processed/X_lstm_gas.npy,"(14821, 48, 13)",1000.0,-1.967773,3.564453,inf,inf,"[[0.5029296875, 1.3779296875, 0.34619140625, -...",,,
1,y_lstm_gas,True,../data/processed/y_lstm_gas.npy,"(14821, 2)",1000.0,0.0,2879.182861,13.50569,109.8699,,"[12.88477611541748, 14.126591682434082]","[155.28045654296875, 5.473419666290283]",
2,X_lstm_hotwater,True,../data/processed/X_lstm_hotwater.npy,"(59828, 48, 13)",1000.0,-5.742188,4.765625,inf,inf,"[[-2.529296875, -2.11328125, 2.71484375, 0.883...",,,
3,y_lstm_hotwater,True,../data/processed/y_lstm_hotwater.npy,"(59828, 2)",1000.0,-12.8,5791.677246,99.92674,448.2293,,"[190.7774200439453, 9.07610034942627]","[620.6821899414062, 7.9859795570373535]",
4,X_lstm_steam,True,../data/processed/X_lstm_steam.npy,"(160805, 48, 13)",1000.0,-5.675781,16.890625,0.07250977,inf,"[[-1.1376953125, -0.68212890625, -0.0867309570...",,,
5,y_lstm_steam,True,../data/processed/y_lstm_steam.npy,"(160805, 2)",1000.0,-15.0,584647.4375,22984.08,71023.23,,"[45963.66796875, 4.4395036697387695]","[95039.375, 6.509329319000244]",
6,x_lstm_chilledwater,False,../data/processed/x_lstm_chilledwater.npy,,,,,,,,,,File not found
7,y_lstm_chilledwater,True,../data/processed/y_lstm_chilledwater.npy,"(260762, 2)",1000.0,-10.0,65250.0,1345.322,6080.553,,"[2693.629638671875, -2.9834072589874268]","[8385.1259765625, 4.557618141174316]",
8,x_lstm_electricity,True,../data/processed/y_lstm_electricity.npy,"(451099, 2)",1000.0,-16.700001,298654.28125,2684.852,18062.33,,"[5369.41015625, 0.2920995354652405]","[25260.306640625, 7.404330730438232]",
9,y_lstm_electricity,True,../data/processed/y_lstm_electricity.npy,"(451099, 2)",1000.0,-16.700001,298654.28125,2684.852,18062.33,,"[5369.41015625, 0.2920995354652405]","[25260.306640625, 7.404330730438232]",


preds_all.csv head


Unnamed: 0,meter_type,seq_index,y_true,y_pred
0,electricity,0,0.0,-503.166016
1,electricity,1,0.0,-277.533203
2,electricity,2,0.0,-567.925781
3,electricity,3,0.0,273.46875
4,electricity,4,162.642395,-530.744141
5,electricity,5,0.0,271.255859
6,electricity,6,0.0,448.353516
7,electricity,7,239.636002,1572.355469
8,electricity,8,158.194901,1022.667969
9,electricity,9,12.89,1630.423828


preds_all.csv numeric describe


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
seq_index,947315.0,158945.799855,121715.064235,0.0,55501.5,133002.0,237515.5,451098.0
y_true,947315.0,19341.570089,89973.859311,0.0,2.8448,117.949203,757.08197,2504613.5
y_pred,947315.0,18152.008727,36686.243752,-96910.085938,987.16095,4069.396484,25371.802734,2501333.25


y_true and y_pred describe


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
y_true,947315.0,19341.570089,89973.859311,0.0,2.8448,117.949203,757.08197,2504613.5
y_pred,947315.0,18152.008727,36686.243752,-96910.085938,987.16095,4069.396484,25371.802734,2501333.25
