In [1]:
# S3 locations used by this notebook: the bucket, the prefix that is listed
# for input CSV files, and the object key the combined parquet is written to.
BUCKET = "nyang-ml-apne2-dev"
CSV_PREFIX = "ml/inputs/public-dataset/"
OUT_KEY = CSV_PREFIX + "daily_feature_public.parquet"

In [2]:
import sys, io, boto3, pandas as pd
sys.path.insert(0, ".")
from src.steps.features import run

# Column names found in the public CSVs mapped to the names they are renamed to
# before the frame is handed to the feature step.
PUBLIC_COL_RENAME = dict(
    ts_raw="timestamp",
    CellTower_LAC="cell_lac",
    Wifi_SSID="wifi_ssid",
    Acc_Avg="step_count",
)


def normalize(df):
    """Return a renamed, lightly cleaned copy of a raw public-dataset frame.

    - Columns listed in ``PUBLIC_COL_RENAME`` are renamed; mapping entries
      whose source column is absent are ignored.
    - When a ``sensor_id`` column exists, ``event_name`` is set to its
      upper-cased string form.
    - ``uuid`` is cast to ``str``.

    Args:
        df: Input DataFrame. Must contain a ``uuid`` column.

    Returns:
        A new DataFrame; the frame passed in is left untouched.

    Raises:
        KeyError: If ``uuid`` is not among the columns.
    """
    renamed = df.rename(columns=PUBLIC_COL_RENAME, errors="ignore")

    derived = {}
    if "sensor_id" in renamed.columns:
        derived["event_name"] = renamed["sensor_id"].astype(str).str.upper()
    derived["uuid"] = renamed["uuid"].astype(str)

    # assign() overwrites existing columns in place and appends new ones at the
    # end, which matches plain column assignment on the renamed frame.
    return renamed.assign(**derived)

s3 = boto3.client("s3")

# A single list_objects_v2 response holds at most 1,000 keys, so iterate over
# every page with the paginator instead of reading only the first response.
# A response with no matching objects has no "Contents" entry, hence .get().
keys = [
    obj["Key"]
    for page in s3.get_paginator("list_objects_v2").paginate(
        Bucket=BUCKET, Prefix=CSV_PREFIX
    )
    for obj in page.get("Contents", [])
    if obj["Key"].endswith(".csv")
]
print(f"CSV 파일 수: {len(keys)}")

# Build one feature frame per CSV object and stack them into `result`.
# Processing is best-effort: a file that fails is reported and skipped, but the
# failed keys are collected so the final summary shows whether `result` is partial.
feat_frames = []
failed_keys = []
total = len(keys)  # loop-invariant; used for the progress messages below
for i, key in enumerate(sorted(keys), 1):
    try:
        obj  = s3.get_object(Bucket=BUCKET, Key=key)
        raw  = pd.read_csv(io.BytesIO(obj["Body"].read()), low_memory=False)
        feat = run(normalize(raw))
        feat_frames.append(feat)
        # Report progress on every 50th file and on the last one.
        if i % 50 == 0 or i == total:
            print(f"[{i}/{total}] {key.split('/')[-1]} → {feat.shape}")
    except Exception as e:
        failed_keys.append(key)
        print(f"⚠️ [{i}] {key}: {e}")

if failed_keys:
    # Repeat the failures after the loop; the inline warnings are easy to
    # miss among hundreds of log lines.
    print(f"\n⚠️ 실패 {len(failed_keys)}/{total}건: {failed_keys}")

if not feat_frames:
    # pd.concat would raise a generic "No objects to concatenate" ValueError;
    # raise the same exception type with an actionable message instead.
    raise ValueError(
        f"처리에 성공한 CSV가 없습니다 (s3://{BUCKET}/{CSV_PREFIX}, 대상 {total}건)"
    )

result = pd.concat(feat_frames, ignore_index=True)
print(f"\n완료: {result.shape}, uuid: {result['uuid'].nunique()}명")

CSV 파일 수: 342
[FE] input: (48383, 81), users: 1
[FE] output: (23, 55)
[FE] input: (60511, 81), users: 1
[FE] output: (22, 55)
[FE] input: (101837, 81), users: 1
[FE] output: (33, 55)
[FE] input: (50625, 81), users: 1
[FE] output: (28, 55)
[FE] input: (149679, 81), users: 1
[FE] output: (32, 55)
[FE] input: (194742, 81), users: 1
[FE] output: (33, 55)
[FE] input: (65032, 81), users: 1
[FE] output: (27, 55)
[FE] input: (136214, 81), users: 1
[FE] output: (14, 55)
[FE] input: (72697, 81), users: 1
[FE] output: (23, 55)
[FE] input: (86403, 81), users: 1
[FE] output: (25, 55)
[FE] input: (72078, 81), users: 1
[FE] output: (28, 55)
[FE] input: (101909, 81), users: 1
[FE] output: (22, 55)
[FE] input: (23520, 81), users: 1
[FE] output: (16, 55)
[FE] input: (120522, 81), users: 1
[FE] output: (20, 55)
[FE] input: (95929, 81), users: 1
[FE] output: (21, 55)
[FE] input: (78859, 81), users: 1
[FE] output: (33, 55)
[FE] input: (36807, 81), users: 1
[FE] output: (20, 55)
[FE] input: (61497, 81), use

  df[col] = df[col].fillna(True).astype(bool)
  df[col] = df[col].fillna(True).astype(bool)


[FE] output: (34, 55)
[FE] input: (149460, 81), users: 1
[FE] output: (32, 55)
[FE] input: (89315, 81), users: 1
[FE] output: (22, 55)
[FE] input: (57326, 81), users: 1
[FE] output: (31, 55)
[FE] input: (105006, 81), users: 1
[FE] output: (26, 55)
[FE] input: (140331, 81), users: 1
[FE] output: (22, 55)
[FE] input: (109678, 81), users: 1
[FE] output: (29, 55)
[FE] input: (83637, 81), users: 1
[FE] output: (32, 55)
[FE] input: (135011, 81), users: 1
[FE] output: (29, 55)
[FE] input: (94791, 81), users: 1
[FE] output: (23, 55)
[FE] input: (84241, 81), users: 1
[FE] output: (25, 55)
[100/342] u4b9145.csv → (25, 55)
[FE] input: (75366, 81), users: 1
[FE] output: (30, 55)
[FE] input: (98229, 81), users: 1
[FE] output: (25, 55)
[FE] input: (174926, 81), users: 1
[FE] output: (33, 55)
[FE] input: (52419, 81), users: 1
[FE] output: (23, 55)
[FE] input: (131168, 81), users: 1
[FE] output: (33, 55)
[FE] input: (67210, 81), users: 1
[FE] output: (33, 55)
[FE] input: (12601, 81), users: 1
[FE] out

  df[col] = df[col].fillna(True).astype(bool)
  df[col] = df[col].fillna(True).astype(bool)


[FE] output: (25, 55)
[FE] input: (101787, 81), users: 1
[FE] output: (29, 55)
[FE] input: (200438, 81), users: 1
[FE] output: (23, 55)
[FE] input: (109044, 81), users: 1
[FE] output: (26, 55)
[FE] input: (116513, 81), users: 1
[FE] output: (21, 55)
[FE] input: (142318, 81), users: 1
[FE] output: (25, 55)
[FE] input: (81679, 81), users: 1
[FE] output: (27, 55)
[FE] input: (72673, 81), users: 1
[FE] output: (24, 55)
[FE] input: (54153, 81), users: 1
[FE] output: (22, 55)
[FE] input: (92207, 81), users: 1
[FE] output: (29, 55)
[FE] input: (60200, 81), users: 1
[FE] output: (23, 55)
[FE] input: (63035, 81), users: 1
[FE] output: (32, 55)
[FE] input: (71222, 81), users: 1
[FE] output: (23, 55)
[FE] input: (83905, 81), users: 1
[FE] output: (33, 55)
[FE] input: (7523, 81), users: 1
[FE] output: (19, 55)
[FE] input: (204349, 81), users: 1
[FE] output: (30, 55)
[FE] input: (39380, 81), users: 1
[FE] output: (22, 55)
[FE] input: (145531, 81), users: 1
[FE] output: (30, 55)
[FE] input: (135632,

  df[col] = df[col].fillna(True).astype(bool)
  df[col] = df[col].fillna(True).astype(bool)


[FE] output: (66, 55)
[FE] input: (138707, 81), users: 1
[FE] output: (26, 55)
[FE] input: (169417, 81), users: 1
[FE] output: (29, 55)
[FE] input: (135462, 81), users: 1
[FE] output: (26, 55)
[FE] input: (64794, 81), users: 1
[FE] output: (21, 55)
[FE] input: (124978, 81), users: 1
[FE] output: (31, 55)
[FE] input: (90987, 81), users: 1


  df[col] = df[col].fillna(True).astype(bool)
  df[col] = df[col].fillna(True).astype(bool)


[FE] output: (20, 55)
[FE] input: (132339, 81), users: 1
[FE] output: (33, 55)
[FE] input: (104120, 81), users: 1
[FE] output: (30, 55)
[FE] input: (95906, 81), users: 1
[FE] output: (33, 55)
[FE] input: (44879, 81), users: 1


  df[col] = df[col].fillna(True).astype(bool)
  df[col] = df[col].fillna(True).astype(bool)


[FE] output: (19, 55)
[FE] input: (67499, 81), users: 1
[FE] output: (34, 55)
[FE] input: (95686, 81), users: 1
[FE] output: (29, 55)
[FE] input: (72418, 81), users: 1
[FE] output: (33, 55)
[FE] input: (149312, 81), users: 1
[FE] output: (24, 55)
[250/342] ubb90d4.csv → (24, 55)
[FE] input: (101975, 81), users: 1
[FE] output: (30, 55)
[FE] input: (57034, 81), users: 1
[FE] output: (22, 55)
[FE] input: (53769, 81), users: 1
[FE] output: (23, 55)
[FE] input: (96925, 81), users: 1
[FE] output: (22, 55)
[FE] input: (103305, 81), users: 1
[FE] output: (25, 55)
[FE] input: (115146, 81), users: 1
[FE] output: (30, 55)
[FE] input: (139269, 81), users: 1
[FE] output: (28, 55)
[FE] input: (68197, 81), users: 1
[FE] output: (27, 55)
[FE] input: (87130, 81), users: 1
[FE] output: (32, 55)
[FE] input: (182604, 81), users: 1
[FE] output: (28, 55)
[FE] input: (97623, 81), users: 1
[FE] output: (32, 55)
[FE] input: (133088, 81), users: 1
[FE] output: (25, 55)
[FE] input: (175018, 81), users: 1
[FE] ou

  df[col] = df[col].fillna(True).astype(bool)
  df[col] = df[col].fillna(True).astype(bool)


[FE] output: (26, 55)
[FE] input: (21707, 81), users: 1
[FE] output: (23, 55)
[FE] input: (25120, 81), users: 1
[FE] output: (31, 55)
[FE] input: (145147, 81), users: 1
[FE] output: (30, 55)
[FE] input: (72383, 81), users: 1


  df[col] = df[col].fillna(True).astype(bool)
  df[col] = df[col].fillna(True).astype(bool)


[FE] output: (31, 55)
[FE] input: (101450, 81), users: 1
[FE] output: (36, 55)
[FE] input: (146223, 81), users: 1
[FE] output: (26, 55)
[FE] input: (121962, 81), users: 1
[FE] output: (24, 55)
[FE] input: (61555, 81), users: 1
[FE] output: (21, 55)
[FE] input: (63290, 81), users: 1
[FE] output: (25, 55)
[FE] input: (79476, 81), users: 1
[FE] output: (31, 55)
[FE] input: (32319, 81), users: 1
[FE] output: (33, 55)
[FE] input: (131358, 81), users: 1
[FE] output: (29, 55)
[FE] input: (41609, 81), users: 1
[FE] output: (20, 55)
[FE] input: (98915, 81), users: 1
[FE] output: (72, 55)
[FE] input: (123108, 81), users: 1
[FE] output: (25, 55)
[FE] input: (53759, 81), users: 1
[FE] output: (20, 55)
[FE] input: (50223, 81), users: 1


  df[col] = df[col].fillna(True).astype(bool)
  df[col] = df[col].fillna(True).astype(bool)


[FE] output: (23, 55)
[FE] input: (127974, 81), users: 1
[FE] output: (30, 55)
[FE] input: (160154, 81), users: 1
[FE] output: (54, 55)
[FE] input: (87868, 81), users: 1
[FE] output: (33, 55)
[FE] input: (220722, 81), users: 1
[FE] output: (54, 55)
[342/342] ufff695.csv → (54, 55)

완료: (9057, 55), uuid: 342명


In [3]:
# Serialize `result` to parquet in memory, then upload the bytes to
# s3://BUCKET/OUT_KEY in a single put_object call.
with io.BytesIO() as parquet_buffer:
    result.to_parquet(parquet_buffer, index=False)
    # getvalue() returns the full buffer contents regardless of the stream
    # position, so no rewind is needed before reading it.
    parquet_bytes = parquet_buffer.getvalue()

s3.put_object(Bucket=BUCKET, Key=OUT_KEY, Body=parquet_bytes)
print(f"✅ 업로드 완료: s3://{BUCKET}/{OUT_KEY}")

✅ 업로드 완료: s3://nyang-ml-apne2-dev/ml/inputs/public-dataset/daily_feature_public.parquet
