# User Behavior 데이터 전처리

이 노트북은 Object Storage(S3)에서 Parquet 파일을 읽어와 전처리를 수행하고 결과를 저장합니다.
*단계: S3 설정 → 파일 로딩 → 전처리 → 로컬 CSV 저장 → S3에 Parquet 업로드*

In [None]:
import os
import io
import boto3
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# 1) Configure the S3 client.
# Both credentials are read from the environment, access key first; a missing
# variable raises KeyError before any client is created.
ACCESS_KEY, SECRET_KEY = (
    os.environ[var] for var in ('S3_ACCESS_KEY_ID', 'S3_SECRET_ACCESS_KEY')
)

# Client bound to the explicit region and endpoint URL given below.
s3 = boto3.client(
    's3',
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
    region_name='kr-central-2',
    endpoint_url='https://objectstorage.kr-central-2.kakaocloud.com',
)

# 2) Read every Parquet file under the prefix from S3.
bucket_name = 'data-catalog-bucket'
prefix = 'data-catalog-dir/user_behavior_prediction/'

# A single list_objects_v2 call returns at most one page of keys, so walk all
# pages to avoid silently ignoring files beyond the first page.
paginator = s3.get_paginator('list_objects_v2')
keys = [
    o['Key']
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
    for o in page.get('Contents', [])
    if o['Key'].endswith('.parquet')
]
if not keys:
    raise ValueError(f"No parquet files under prefix {prefix!r}")

dfs = []
for key in keys:
    obj = s3.get_object(Bucket=bucket_name, Key=key)
    buf = io.BytesIO(obj['Body'].read())
    dfs.append(pq.read_table(buf).to_pandas())

# Concatenate all files into one frame.
df_all = pd.concat(dfs, ignore_index=True)
print(f"Loaded {len(keys)} files, total rows: {len(df_all)}")

# 3) Basic preprocessing
# 3-1) Login flag: 1 when user_id is present and is neither '' nor 'unknown'.
if 'user_id' in df_all.columns:
    user_ids = df_all['user_id']
    # Missing ids (NaN/None) count as anonymous, same as '' and 'unknown'.
    is_anonymous = user_ids.isna() | user_ids.isin(['', 'unknown'])
    df_all['is_logged_in'] = (~is_anonymous).astype(int)
else:
    # Without a user_id column nobody can be identified as logged in.
    df_all['is_logged_in'] = 0

# 3-2) Age segment
YOUNG_THRESHOLD = 25
MIDDLE_THRESHOLD = 50


def age_segment(val):
    """Map a raw age value to 'young', 'middle', 'old' or 'unknown'.

    Args:
        val: Anything convertible via ``int(float(val))``.

    Returns:
        'young' below YOUNG_THRESHOLD, 'middle' below MIDDLE_THRESHOLD,
        'old' otherwise, and 'unknown' when the value cannot be converted
        to an integer (None, NaN, infinity, non-numeric text, ...).
    """
    try:
        age_int = int(float(val))
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers +/-inf, which int() cannot represent.
        return 'unknown'
    if age_int < YOUNG_THRESHOLD:
        return 'young'
    if age_int < MIDDLE_THRESHOLD:
        return 'middle'
    return 'old'


if 'age' in df_all.columns:
    df_all['age_segment'] = df_all['age'].apply(age_segment)
    # .copy() so that later column assignments do not write into a slice.
    df_all = df_all[df_all['age_segment'] != 'unknown'].copy()
else:
    # No age column: drop every row but keep the schema.
    df_all = df_all.iloc[0:0].copy()

# 3-3) One-hot encode gender
if 'gender' in df_all.columns:
    df_all['gender'] = df_all['gender'].fillna('unknown')
    df_all = pd.get_dummies(df_all, columns=['gender'], prefix='gender')
else:
    df_all['gender_unknown'] = 1

# 4) Drop raw columns that are no longer needed (absent ones are ignored).
drop_cols = [
    'session_id', 'timestamp', 'query_params', 'current_product_id',
    'user_id', 'search_keyword', 'product_id', 'next_state', 'age',
]
df_all = df_all.drop(columns=drop_cols, errors='ignore')

# 5) One-hot encode current_state and drop low-frequency states
if 'current_state' in df_all.columns:
    cols_before = set(df_all.columns)
    df_all = pd.get_dummies(df_all, columns=['current_state'], prefix='state')
    # Only the freshly created dummy columns, not any pre-existing column that
    # merely happens to start with 'state_'.
    state_cols = [c for c in df_all.columns if c not in cols_before]
    freq = df_all[state_cols].sum()
    # A state is dropped when it occurs in at most 1% of the rows.
    low = freq[freq <= 0.01 * len(df_all)].index.tolist()
    df_all = df_all.drop(columns=low)
else:
    print('[Warn] current_state 컬럼이 없습니다.')

# 6) Numeric handling: missing values, log transform, scaling
num_cols = df_all.select_dtypes(include=[np.number]).columns.tolist()
if num_cols:
    df_all[num_cols] = df_all[num_cols].fillna(0)

log_cols = [
    c for c in ['last_action_elapsed', 'avg_response_time', 'session_duration']
    if c in df_all.columns
]
# NOTE(review): log1p yields NaN/-inf for values <= -1; this assumes these
# columns are non-negative -- confirm against the upstream data.
for col in log_cols:
    df_all[col] = np.log1p(df_all[col])

scale_cols = [
    c for c in ['search_count', 'cart_item_count', 'page_depth']
    if c in df_all.columns
] + log_cols
# StandardScaler cannot be fitted on zero rows, so skip scaling when the
# filters above removed everything.
if scale_cols and not df_all.empty:
    scaler = StandardScaler()
    df_all[scale_cols] = scaler.fit_transform(df_all[scale_cols])

# 7) Save the result locally as CSV
output_csv = 'processed_user_behavior.csv'
df_all.to_csv(output_csv, index=False)
print(f'[Success] 로컬에 저장: {output_csv}')

# 8) Upload the result to S3 as Parquet
output_prefix = 'data-catalog-dir/preprocessed/'
output_filename = 'preprocessed_user_behavior_prediction.parquet'
buf = io.BytesIO()
df_all.to_parquet(buf, index=False)
# getvalue() returns the whole buffer regardless of the current position.
s3.put_object(
    Bucket=bucket_name,
    Key=f'{output_prefix}{output_filename}',
    Body=buf.getvalue(),
)
print(f'[Success] S3에 업로드 완료: s3://{bucket_name}/{output_prefix}{output_filename}')