## Import

In [2]:
# Mount Google Drive at /content/drive; later cells read and write under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from pathlib import Path
import json
import numpy as np

In [31]:
# Folder on the mounted Drive that holds the source train/test parquet files.
data_dir = Path('/content/drive/MyDrive/toss/data')
# Identifier of the dataset being produced; also used as the output folder name
# and stored in the feature map.
dataset_id = 'toss_ctr_v1'
out_dir = data_dir / dataset_id
# Create the output folder (and any missing parents); no error if it already exists.
out_dir.mkdir(parents=True, exist_ok=True)

In [8]:
# Read the full training table as-is.
all_train = pd.read_parquet(data_dir / 'train.parquet', engine='pyarrow')

# Read the test table, then discard its 'ID' column when it is present.
test_df = pd.read_parquet(data_dir / 'test.parquet', engine='pyarrow')
test_df = test_df.drop(columns=['ID'], errors='ignore')

# Report the (rows, columns) of both tables.
for split_label, frame in (("Train", all_train), ("Test", test_df)):
    print(f"{split_label} shape:", frame.shape)

Train shape: (10704179, 119)
Test shape: (1527298, 118)


In [23]:
label_col = 'clicked'

# Keep every positive row and draw twice as many negative rows at random
# (fixed seed), giving a 1:2 positive-to-negative ratio.
labels = all_train[label_col]
pos = all_train[labels == 1]
neg = all_train[labels == 0].sample(n=2 * len(pos), random_state=42)

# Stack both groups, shuffle the rows with a fixed seed and renumber the index.
train_bal = (
    pd.concat([pos, neg], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print("Balanced train shape:", train_bal.shape)

Balanced train shape: (612537, 119)


In [24]:
# Hold out 10% of the balanced data for validation, stratified on the label
# so both parts keep the same class ratio; the seed makes the split repeatable.
split_options = dict(
    test_size=0.1,
    random_state=42,
    stratify=train_bal[label_col],
)
train_df, valid_df = train_test_split(train_bal, **split_options)

for split_label, frame in (("Train split", train_df), ("Valid split", valid_df)):
    print(f"{split_label} shape:", frame.shape)

Train split shape: (551283, 119)
Valid split shape: (61254, 119)


In [25]:
# Columns that must not be used as model features.
exclude_cols = {'ID', 'seq'}

# Remove the excluded columns, in place, from every split that still has them.
for df in (train_df, valid_df, test_df):
    drop_exist = sorted(exclude_cols.intersection(df.columns))
    if not drop_exist:
        continue
    df.drop(columns=drop_exist, inplace=True, errors='ignore')

In [26]:
def current_feature_cols(df_list, label):
    """Return the sorted union of column names across frames, minus the label.

    Args:
        df_list: Iterable of objects exposing a ``columns`` attribute.
        label: Column name to leave out of the result.

    Returns:
        A list of column names in ascending sort order, without ``label``.
    """
    seen = set().union(*(frame.columns for frame in df_list))
    return [name for name in sorted(seen) if name != label]

# Feature columns = every column found in any of the three splits, except the label, in sorted order.
feature_cols = current_feature_cols([train_df, valid_df, test_df], label_col)

In [27]:
def is_numeric(series: pd.Series) -> bool:
    """Tell whether ``series`` has a numeric dtype according to pandas.

    Thin wrapper around ``pd.api.types.is_numeric_dtype``.
    """
    numeric_check = pd.api.types.is_numeric_dtype
    return numeric_check(series)

def infer_dtype(col: str) -> str:
    """Classify a column as ``'numeric'`` or ``'categorical'``.

    The dtype is taken from the first of ``train_df``, ``valid_df`` and
    ``test_df`` (checked in that order) that contains ``col``. A column found
    in none of them is reported as ``'categorical'``.
    """
    owner = next(
        (frame for frame in (train_df, valid_df, test_df) if col in frame.columns),
        None,
    )
    if owner is None or not is_numeric(owner[col]):
        return 'categorical'
    return 'numeric'

# Map each feature column name to 'numeric' or 'categorical'.
feat_types = {c: infer_dtype(c) for c in feature_cols}

In [28]:
def build_vocab(col: str):
    """Build a value -> integer index mapping for one column.

    Values are gathered from ``train_df``, ``valid_df`` and ``test_df``
    (whichever contain ``col``), converted to strings, with missing values
    represented by the token ``'<NA>'``.

    Returns:
        A dict mapping each distinct string value, in sorted order, to
        consecutive indices starting at 1 (0 is reserved for padding).
    """
    tokens = set()
    for frame in (train_df, valid_df, test_df):
        if col not in frame.columns:
            continue
        as_text = frame[col].astype('string').fillna('<NA>')
        tokens.update(as_text.unique().tolist())
    # Indices start at 1; 0 is reserved for padding.
    return {token: idx for idx, token in enumerate(sorted(tokens), start=1)}

# One vocabulary (string value -> integer index) per categorical feature column.
cat_mappings = {c: build_vocab(c) for c, t in feat_types.items() if t == 'categorical'}

In [29]:
def encode_inplace(df: pd.DataFrame, has_label: bool):
    """Encode the label and every feature column of ``df`` to model-ready dtypes.

    ``df`` itself is modified (columns are overwritten or added); the return
    value is a column selection of it in a fixed order.

    * Label (only when ``has_label`` and the column exists): coerced to
      numeric, unparseable/missing values become 0, stored as float32.
    * Categorical features: converted to strings (missing -> ``'<NA>'``),
      looked up in ``cat_mappings``; values without an entry become 0; int32.
    * Numeric features: coerced to numeric, unparseable/missing values
      become 0, stored as float32.
    * Feature columns absent from ``df`` are created filled with 0.

    Args:
        df: Frame to encode.
        has_label: Whether the label column is encoded and placed first in
            the result.

    Returns:
        ``df`` restricted to the label (if ``has_label``) followed by
        ``feature_cols``.
    """
    if has_label and label_col in df.columns:
        parsed_label = pd.to_numeric(df[label_col], errors='coerce')
        df[label_col] = parsed_label.fillna(0).astype('float32')

    for name in feature_cols:
        present = name in df.columns
        if feat_types[name] == 'categorical':
            if not present:
                df[name] = np.int32(0)
                continue
            tokens = df[name].astype('string').fillna('<NA>')
            codes = tokens.map(cat_mappings[name])
            df[name] = codes.fillna(0).astype('int32')
        elif present:
            parsed = pd.to_numeric(df[name], errors='coerce')
            df[name] = parsed.fillna(0).astype('float32')
        else:
            df[name] = np.float32(0.0)

    # Fix the column order: label first (when requested), then the features.
    if has_label:
        ordered = [label_col] + feature_cols
    else:
        ordered = feature_cols
    return df[ordered]

# Encode copies of each split so train_df / valid_df / test_df themselves are not modified.
# The test split is encoded without a label column.
train_enc = encode_inplace(train_df.copy(), has_label=True)
valid_enc = encode_inplace(valid_df.copy(), has_label=True)
test_enc  = encode_inplace(test_df.copy(),  has_label=False)

In [32]:
# Write each encoded split to <out_dir>/<split>.parquet without the index.
encoded_splits = (
    ('train', train_enc),
    ('valid', valid_enc),
    ('test', test_enc),
)
for split_name, frame in encoded_splits:
    frame.to_parquet(out_dir / f'{split_name}.parquet', index=False)
print("✔ 인코딩된 parquet 저장 완료:", out_dir)

✔ 인코딩된 parquet 저장 완료: /content/drive/MyDrive/toss/data/toss_ctr_v1


In [11]:
# Persist the un-encoded splits under their own file names.
# This cell sits after the cell that writes the encoded train/valid/test
# parquet files into the same folder; writing the raw frames to the same
# paths would overwrite the encoded data on a top-to-bottom run and leave
# the files out of sync with feature_map.json.
train_df.to_parquet(out_dir / 'train_raw.parquet', index=False)
valid_df.to_parquet(out_dir / 'valid_raw.parquet', index=False)
test_df.to_parquet(out_dir / 'test_raw.parquet', index=False)

print("저장 완료:", out_dir)

저장 완료: /content/drive/MyDrive/toss/data/toss_ctr_v1


In [33]:
# Build one single-key entry per feature column and accumulate the total
# feature count alongside.
features_list = []
total_features = 0
for c in feature_cols:
    if feat_types[c] != 'categorical':
        spec = {
            "source": "",
            "type": "numeric"
        }
        # Implementation convention: each numeric feature counts as +1 in
        # total_features (delete this line if that is not wanted).
        total_features += 1
    else:
        vocab_size = len(cat_mappings[c]) + 1  # including the padding index
        spec = {
            "source": "",
            "type": "categorical",
            "padding_idx": 0,
            "vocab_size": int(vocab_size)
        }
        total_features += vocab_size
    features_list.append({c: spec})

# Top-level description of the dataset that is serialized to JSON later.
feature_map = dict(
    dataset_id=dataset_id,
    num_fields=len(feature_cols),
    total_features=int(total_features),
    input_length=len(feature_cols),
    labels=[label_col],
    features=features_list,
)


In [34]:
# Serialize the feature map as UTF-8 JSON (non-ASCII kept as-is, 4-space indent).
feature_map_text = json.dumps(feature_map, ensure_ascii=False, indent=4)
(out_dir / 'feature_map.json').write_text(feature_map_text, encoding='utf-8')

print("✔ feature_map.json 저장 완료:", out_dir / 'feature_map.json')
print("   num_fields =", feature_map['num_fields'],
      " total_features =", feature_map['total_features'])

✔ feature_map.json 저장 완료: /content/drive/MyDrive/toss/data/toss_ctr_v1/feature_map.json
   num_fields = 117  total_features = 178
