# 1. 학습 파이프라인 검증
## 1.1 아티팩트 생성 검증

In [2]:
# Run the training script via an IPython shell escape: it is given the public
# feature parquet, the silver ETL prefix, a 30-day lookback, and the S3 URI
# passed as --s3-model-uri for the model artifact.
!python src/runtime/train.py \
  --s3-public-feature-uri s3://nyang-ml-apne2-dev/ml/inputs/public-dataset/daily_feature_public.parquet \
  --s3-etl-uri s3://silver-dummy/silver_events/ \
  --lookback-days 30 \
  --s3-model-uri s3://nyang-ml-apne2-dev/ml/artifacts/models/isolation_forest.pkl

[TRAIN] 학습 시작
  공개 feature : s3://nyang-ml-apne2-dev/ml/inputs/public-dataset/daily_feature_public.parquet
  사용자 ETL   : s3://silver-dummy/silver_events/
  lookback     : 30일
[TRAIN] 공개 feature 로드: (9057, 55), uuid: 342명
[TRAIN] 사용자 ETL 30개 파티션 로드 중...
[TRAIN] ETL raw: (185429, 13), uuid: 50명 → FE 실행 중...
[FE] input: (185429, 13), users: 50
[FE] output: (1500, 71)
[TRAIN] ETL feature: (1500, 71)
[TRAIN] 학습 데이터: 공개 342명 + 사용자 50명 = 392명
[TRAIN] IsolationForest 학습 중...
[MODEL] mode=train, input: (10557, 71)
[MODEL] output: (10557, 40)
[TRAIN] S3 저장 완료: s3://nyang-ml-apne2-dev/ml/artifacts/models/isolation_forest.pkl
[TRAIN] 완료 ✅


In [3]:
# List the models/ prefix and keep only lines matching the model file name,
# to confirm the artifact exists in S3.
!aws s3 ls s3://nyang-ml-apne2-dev/ml/artifacts/models/ | grep isolation_forest.pkl

2026-02-20 05:26:38    3326677 isolation_forest.pkl


In [6]:
# Run the batch script for target date 2026-02-19, pointing it at the model
# artifact URI in S3. Other locations (silver, daily-feature, baseline, output)
# are not passed here, so presumably they come from the script's defaults.
!python src/runtime/batch_runner.py \
  --target-date 2026-02-19 \
  --model-uri s3://nyang-ml-apne2-dev/ml/artifacts/models/isolation_forest.pkl

[BATCH] 실행: 2026-02-19
  silver       : s3://silver-dummy/silver_events/
  daily-feature: s3://nyang-ml-apne2-dev/ml/daily-feature/
  baseline     : s3://nyang-ml-apne2-dev/ml/baseline/
  model        : s3://nyang-ml-apne2-dev/ml/artifacts/models/isolation_forest.pkl
  output       : s3://nyang-ml-apne2-dev/ml/outputs/
[BATCH] silver 로드: (6165, 13), uuid: 50명 (dt=2026-02-19)
[FE] input: (6165, 13), users: 50
[FE] output: (50, 71)
[BATCH] daily-feature 저장: s3://nyang-ml-apne2-dev/ml/daily-feature/dt=2026-02-19/daily-feature.parquet (50행)
[BATCH] daily-feature 로드: (1605, 71), uuid: 50명 (31일치)
[INFER] 모델 로드 완료: s3://nyang-ml-apne2-dev/ml/artifacts/models/isolation_forest.pkl
[INFER] feat_df: (1605, 71), uuid: 50
[MODEL] mode=infer, input: (1605, 71)
[MODEL] output: (1605, 40)
[DECODER] input: (1605, 40)
[DECODER] output: (1605, 12)
[DECODER] cat_state dist:
{'NO_DATA': 593, 'STABLE': 591, 'CHAOS': 157, 'LETHARGY': 114, 'SLEEP': 113, 'TRAVEL': 37}
[INFER] 완료: (1605, 12)
[BATCH] baseline 저장

In [4]:
import io

import boto3
import pandas as pd

# Fetch the batch output CSV for dt=2026-02-19 directly from S3 with boto3.
s3 = boto3.client("s3")
obj = s3.get_object(Bucket="nyang-ml-apne2-dev", Key="ml/outputs/dt=2026-02-19/state_out.csv")

# Pull the full object body into memory, then parse it as CSV.
raw_bytes = obj["Body"].read()
df = pd.read_csv(io.BytesIO(raw_bytes))
df.head()

Unnamed: 0,uuid,date,cat_state,notify_final,notify_level,decoder_quality,risk_used,risk_score,final_risk,risk_band,top_z_feature,top_z_value
0,00fef27e-ca18-4b55-aba3-d91d4b59e05b,2026-02-19,STABLE,NONE,NONE,OK,0.356939,0.356939,0.356939,SAFE,Screen_z,1.979805
1,02b1ebd0-a03d-4a87-ab53-1c1dbecb3872,2026-02-19,CHAOS,NONE,NONE,OK,0.324701,0.324701,0.324701,SAFE,gap_max_z,-1.613382
2,04d38ed5-675a-4115-b3c7-49e2ba6e952a,2026-02-19,SLEEP,NONE,NONE,OK,0.36833,0.36833,0.36833,SAFE,hour_entropy_z,-1.686945
3,0dcbc944-2308-42b5-b4bc-bcc465888f87,2026-02-19,CHAOS,NONE,NONE,OK,0.381067,0.381067,0.381067,SAFE,gap_long_ratio_z,1.740926
4,128bea7a-f734-4ab3-aea1-805603f63d58,2026-02-19,STABLE,NONE,NONE,OK,0.400683,0.400683,0.400683,SAFE,Screen_z,-1.382151


In [5]:
# Count how many rows of the loaded output fall into each cat_state label.
state_counts = df["cat_state"].value_counts()
state_counts

cat_state
STABLE      28
NO_DATA     10
CHAOS        6
SLEEP        3
LETHARGY     3
Name: count, dtype: int64

In [7]:
import pandas as pd

# Baseline parquet for dt=2026-02-19, read straight from S3.
bpath = "s3://nyang-ml-apne2-dev/ml/baseline/dt=2026-02-19/baseline.parquet"
b = pd.read_parquet(bpath)

print("baseline shape:", b.shape)
print("columns:", b.columns.tolist())

# 1) READY distribution (inspect baseline_ready / early_ready / cold_stage).
# Columns are filtered by presence so a missing column does not raise.
show_cols = [c for c in ["uuid","baseline_ready","early_ready","cold_stage"] if c in b.columns]
print("\n[READY CHECK]")
print(b[show_cols].head(10))
for c in ["baseline_ready","early_ready","cold_stage"]:
    if c in b.columns:
        # dropna=False so missing flags show up as their own bucket.
        print("\n", c, "\n", b[c].value_counts(dropna=False))

# 2) Look for warning signs such as missing values or zero variance (std == 0).
# describe() reports std per column, so zero-variance columns can be spotted there.
num_cols = b.select_dtypes(include="number").columns.tolist()
if num_cols:
    print("\n[NUM NULL CHECK]")
    print(b[num_cols].isna().sum().sort_values(ascending=False).head(20))

    print("\n[NUM DESCRIBE]")
    print(b[num_cols].describe().T.head(20))

# 3) (Key check) cross-check whether users with baseline_ready=True actually used risk.
out = pd.read_csv("s3://nyang-ml-apne2-dev/ml/outputs/dt=2026-02-19/state_out.csv")
# Left merge keeps every output row; columns that exist in both frames get a
# "_b" suffix on the baseline side.
m = out.merge(b, on="uuid", how="left", suffixes=("","_b"))

cols = [c for c in ["uuid","cat_state","risk_used","baseline_ready","early_ready","cold_stage"] if c in m.columns]
print("\n[CROSS CHECK: risk_used vs baseline_ready]")
print(m[cols].head(15))
if "baseline_ready" in m.columns:
    print("\nmean(risk_used) by baseline_ready")
    # dropna=False: output rows with no matching baseline row have NaN
    # baseline_ready after the left merge; by default groupby would drop them
    # silently, hiding exactly the users this cross-check should surface.
    print(m.groupby("baseline_ready", dropna=False)["risk_used"].mean())

baseline shape: (50, 6)
columns: ['uuid', 'date', 'baseline_ready', 'early_ready', 'cold_stage', 'dt']

[READY CHECK]
                                   uuid  baseline_ready  early_ready  \
0  00fef27e-ca18-4b55-aba3-d91d4b59e05b            True         True   
1  02b1ebd0-a03d-4a87-ab53-1c1dbecb3872            True         True   
2  04d38ed5-675a-4115-b3c7-49e2ba6e952a            True         True   
3  0dcbc944-2308-42b5-b4bc-bcc465888f87            True         True   
4  128bea7a-f734-4ab3-aea1-805603f63d58            True         True   
5  26c4539a-82f2-4ad3-b9fe-210eb62169ee            True         True   
6  293dba26-ce5a-411f-9a4d-aa48c20d130f            True         True   
7  2c28dbdb-c8fe-4ea6-8ac5-c2595c1212a1            True         True   
8  404c3930-7d7d-47b4-bf70-b15b2666d088            True         True   
9  44082dbc-b071-70c4-4794-81b840c61c4e            True         True   

  cold_stage  
0      READY  
1      READY  
2      READY  
3      READY  
4      READY  