In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm


In [None]:
# Read both input tables, then display the training frame as the cell output.
train, test = (pd.read_parquet(_path) for _path in ('train.parquet', 'test.parquet'))
train

# Drop

In [None]:
# Remove the same two feature columns from both frames.
_dropped_features = ['l_feat_16', 'l_feat_17']
train = train.drop(columns=_dropped_features)
test = test.drop(columns=_dropped_features)



# Target Encoding

In [None]:
def add_target_encoding(train, test, target_col, feature_combinations):
    """Add mean-target-encoding columns for several feature combinations.

    For each entry of ``feature_combinations`` the per-group mean of
    ``target_col`` is computed on ``train`` and left-merged onto both frames
    as a new column named ``TE_<col1>+<col2>+...``.

    Rows without a matching group -- keys that do not occur in ``train``, or
    keys containing a missing value (``groupby`` drops those groups by
    default) -- receive the overall mean of ``train[target_col]``.  The
    fallback is applied to ``train`` as well as ``test`` so that both frames
    encode "no statistics available" the same way.

    Note: the group means are computed on all of ``train`` and attached to the
    same rows, so every train value includes that row's own target (no
    out-of-fold scheme is used here).

    Parameters
    ----------
    train : pandas.DataFrame
        Frame holding ``target_col`` and every column named in
        ``feature_combinations``.
    test : pandas.DataFrame
        Frame holding every column named in ``feature_combinations``.
    target_col : str
        Name of the target column in ``train``.
    feature_combinations : iterable of tuple of str
        Each tuple lists the columns that jointly define one grouping.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The ``train`` and ``test`` frames produced by the merges, with one
        ``TE_*`` column added per combination.
    """
    global_mean = train[target_col].mean()

    for combo in feature_combinations:
        keys = list(combo)
        combo_name = '+'.join(keys)
        te_col = f'TE_{combo_name}'

        group_mean = (
            train
            .groupby(keys)[target_col]
            .mean()
            .reset_index()
            .rename(columns={target_col: te_col})
        )

        train = train.merge(group_mean, on=keys, how='left')
        test = test.merge(group_mean, on=keys, how='left')

        # Same fallback for both frames; previously only test was filled.
        train[te_col] = train[te_col].fillna(global_mean)
        test[te_col] = test[te_col].fillna(global_mean)

    return train, test

# Column groups to target-encode; each tuple yields one TE_<col1>+<col2>... column.
feature_combinations = [
    ('age_group', 'gender'),
    ('age_group', 'hour'),
    ('hour',),
    ('inventory_id',),
]

# Rebind both frames to the merged results returned by add_target_encoding.
train, test = add_target_encoding(train, test, target_col='clicked', feature_combinations=feature_combinations)

# seq

In [None]:

####################################
### Sequence length itself -> stands for how "active" the user is
####################################
# Counts the commas in `seq`, i.e. one fewer than the number of
# comma-separated tokens in the string.
for _frame in (train, test):
    _frame['user_activity'] = _frame['seq'].str.count(',')

####################################
### Emphasise the most recent actions + drop the noise of a long past history
####################################
def extract_last_seq_features(df, col='seq'):
    """
    Take the last three numbers of each comma-separated string in df[col]
    and attach them to df as the columns seq_last, seq_last_1 and seq_last_2.

    A missing or blank sequence, or one with too few tokens, yields np.nan in
    the corresponding column(s). df is modified in place and also returned.
    """
    # Output column -> collected values; key order is also the column order.
    collected = {'seq_last': [], 'seq_last_1': [], 'seq_last_2': []}

    for raw in tqdm(df[col], desc="Extracting last seq tokens"):
        is_empty = pd.isna(raw) or raw.strip() == ""
        tokens = [] if is_empty else raw.strip().split(',')

        # depth 1 is the last token, depth 2 the one before it, and so on.
        for depth, name in enumerate(collected, start=1):
            value = int(tokens[-depth]) if len(tokens) >= depth else np.nan
            collected[name].append(value)

    for name, values in collected.items():
        df[name] = values
    return df

# Add the last-token columns to the training frame first, then to the test frame.
train, test = map(extract_last_seq_features, (train, test))

# inventory_id

# hour

In [None]:
def add_hour_sin_cos(df, hour_col='hour', period=24):
    """Add sine/cosine encodings of a cyclic hour column.

    Writes ``<hour_col>_sin`` and ``<hour_col>_cos`` into ``df`` (in place),
    computed from the angle ``2 * pi * df[hour_col] / period``, and returns
    the same ``df``.
    """
    angle = 2 * np.pi * df[hour_col] / period
    df[f'{hour_col}_sin'] = np.sin(angle)
    df[f'{hour_col}_cos'] = np.cos(angle)
    return df

# Convert `hour` to int via its zero-padded string form in both frames ...
for _frame in (train, test):
    _frame['hour'] = _frame['hour'].astype(str).str.zfill(2).astype(int)

# ... then attach the sine/cosine encoding to each frame.
train, test = (add_hour_sin_cos(_frame, hour_col='hour') for _frame in (train, test))


# day_of_week

In [None]:
def add_day_sin_cos(df, day_col='day_of_week', period=7):
    """Add sine/cosine encodings of a cyclic day column.

    Writes ``<day_col>_sin`` and ``<day_col>_cos`` into ``df`` (in place),
    computed from the angle ``2 * pi * df[day_col] / period``, and returns
    the same ``df``.
    """
    angle = 2 * np.pi * df[day_col] / period
    df[f'{day_col}_sin'] = np.sin(angle)
    df[f'{day_col}_cos'] = np.cos(angle)
    return df

# Convert `day_of_week` to int via its zero-padded string form in both frames ...
for _frame in (train, test):
    _frame['day_of_week'] = _frame['day_of_week'].astype(str).str.zfill(2).astype(int)

# ... then attach the sine/cosine encoding to each frame.
train, test = (add_day_sin_cos(_frame, day_col='day_of_week') for _frame in (train, test))

# Combination

In [None]:
####################################
### gender + age_group
####################################
# Concatenate the two columns into a single "<age_group>_<gender>" key.
for _frame in (train, test):
    _frame['age_gender'] = _frame['age_group'] + "_" + _frame['gender']


####################################
### The plain gender + age_group key cannot express "smooth" information such as
### "two neighbouring segments behave alike", hence this hand-made grouping feature.
####################################
manual_groups = {
    ('1.0_1.0', '1.0_2.0'): 'grp_A',
    ('8.0_1.0', '8.0_2.0'): 'grp_B',
    ('3.0_1.0', '4.0_1.0', '5.0_1.0', '5.0_2.0'): 'grp_C',
    ('3.0_2.0', '4.0_2.0'): 'grp_D'
}
# Flatten {tuple of keys: label} into {key: label}.
age_gender_custom_map = {
    member: label
    for members, label in manual_groups.items()
    for member in members
}


def map_age_gender_group(val):
    """Return the manual group label for ``val``; unlisted values are returned unchanged."""
    return age_gender_custom_map.get(val, val)


# New feature: age_gender_smiliarity
for _frame in (train, test):
    _frame['age_gender_smiliarity'] = _frame['age_gender'].apply(map_age_gender_group)

import numpy as np
import pandas as pd
from scipy.special import betaln, digamma

import numpy as np
import pandas as pd
from scipy.special import betaln, digamma

RARE_THRESHOLD = 1500  # NOTE(review): not referenced in the visible cells -- confirm before removing.

# --- Settings ---
# Each entry is a list of key columns; one family of unc_* features is built per entry.
GROUP_SETS = [
    ["inventory_id"],
]

# Beta prior whose mean is the overall train click rate and whose total
# pseudo-count (a + b) is PRIOR_N.
PRIOR_CTR = train["clicked"].mean()
PRIOR_N = 200
PRIOR_A = PRIOR_CTR * PRIOR_N
PRIOR_B = (1 - PRIOR_CTR) * PRIOR_N


def _beta_posterior_features(clicks, imps, prior_a, prior_b, suffix):
    """Summarise the Beta(prior_a + clicks, prior_b + imps - clicks) posterior.

    ``clicks`` and ``imps`` may be scalars, arrays or Series; every result has
    the same shape. Passing ``clicks=0, imps=0`` yields the statistics of the
    prior itself.

    Returns a dict keyed by output column name, in this order:
    ``unc_<suffix>_post_mean``    a / (a + b)
    ``unc_<suffix>_post_var``     a * b / ((a + b)^2 * (a + b + 1))
    ``unc_<suffix>_post_entropy`` ln B(a, b) - (a-1) psi(a) - (b-1) psi(b) + (a+b-2) psi(a+b)
    ``unc_<suffix>_eff_n``        a + b
    The 1e-12 terms only guard the divisions against a zero denominator.
    """
    a = prior_a + clicks
    b = prior_b + (imps - clicks)
    ab = a + b
    return {
        f"unc_{suffix}_post_mean": a / (ab + 1e-12),
        f"unc_{suffix}_post_var": (a * b) / ((ab**2) * (ab + 1.0) + 1e-12),
        f"unc_{suffix}_post_entropy": (
            betaln(a, b)
            - (a - 1.0) * digamma(a)
            - (b - 1.0) * digamma(b)
            + (ab - 2.0) * digamma(ab)
        ),
        f"unc_{suffix}_eff_n": ab,
    }


# --- Bayesian feature generation ---
for _cols in GROUP_SETS:
    _suf = "x".join(_cols)

    # Impressions and clicks per key, counted on train only.
    agg_tr = (
        train.groupby(_cols)["clicked"]
        .agg(imps="count", clicks="sum")
        .reset_index()
    )

    # The posterior depends only on the key, so compute it once per key and
    # merge the same lookup table onto both frames.
    agg_tr_features = agg_tr.copy()
    _posterior = _beta_posterior_features(
        agg_tr["clicks"].astype(float),
        agg_tr["imps"].astype(float),
        PRIOR_A,
        PRIOR_B,
        _suf,
    )
    for _name, _values in _posterior.items():
        agg_tr_features[_name] = _values
    _feature_cols = list(_posterior)
    _lookup = agg_tr_features[_cols + _feature_cols]

    # Drop stale copies first so that re-running this cell does not produce
    # suffixed (_x/_y) duplicates of the feature columns.
    train = train.drop(columns=_feature_cols, errors="ignore").merge(_lookup, on=_cols, how="left")
    test = test.drop(columns=_feature_cols, errors="ignore").merge(_lookup, on=_cols, how="left")

    # Rows that found no key in the lookup fall back to the prior alone
    # (zero observations), so no target information is used for them.
    fallback_val = _beta_posterior_features(0.0, 0.0, PRIOR_A, PRIOR_B, _suf)

    # Fill the unmatched rows. The same fallback is applied to train so that
    # both frames encode "no statistics" identically.
    for key, _value in fallback_val.items():
        train[key] = train[key].fillna(_value)
        test[key] = test[key].fillna(_value)


In [None]:
# Remove the raw `seq` column from both frames.
train = train.drop(columns=['seq'])
test = test.drop(columns=['seq'])

In [None]:
# Remove the `ID` column from the test frame.
# NOTE(review): only test is touched here -- confirm train has no `ID` column.
test = test.drop(columns=['ID'])

In [None]:
# Narrow 64-bit numeric columns to their 32-bit counterparts in both frames.
_narrow_dtypes = {"int64": np.int32, "float64": np.float32}
for _frame in (train, test):
    for _wide, _narrow in _narrow_dtypes.items():
        for col in _frame.select_dtypes(include=[_wide]).columns:
            _frame[col] = _frame[col].astype(_narrow)

In [None]:
def _fill_missing(frame, col, text_fill='Null', numeric_fill=-99999):
    """Fill the missing values of ``frame[col]`` in place, by dtype.

    Categorical and object columns get ``text_fill`` (added to the categories
    of this frame first when it is not one yet); numeric columns get
    ``numeric_fill``. Columns of any other dtype are left untouched.
    """
    column = frame[col]
    if isinstance(column.dtype, pd.CategoricalDtype):
        # Decide per frame: train and test may not share the same categories.
        if text_fill not in column.cat.categories:
            column = column.cat.add_categories(text_fill)
        frame[col] = column.fillna(text_fill)
    elif column.dtype == 'object':
        frame[col] = column.fillna(text_fill)
    elif np.issubdtype(column.dtype, np.number):
        frame[col] = column.fillna(numeric_fill)


for col in train.columns:
    if col == 'clicked':
        continue
    _in_test = col in test.columns
    # Fill when either frame has gaps; checking train alone would leave
    # test-only NaNs in place.
    _has_missing = train[col].isnull().any() or (_in_test and test[col].isnull().any())
    if not _has_missing:
        continue
    _fill_missing(train, col)
    if _in_test:
        _fill_missing(test, col)

# Convert text columns to category. Columns present as text in both frames
# share one CategoricalDtype, so a given value gets the same category code in
# train and in test.
for col in train.select_dtypes(include=["object"]).columns:
    if col in test.columns and test[col].dtype == 'object':
        _levels = pd.Index(train[col].dropna().unique()).union(
            pd.Index(test[col].dropna().unique())
        )
        _shared_dtype = pd.CategoricalDtype(categories=_levels)
        train[col] = train[col].astype(_shared_dtype)
        test[col] = test[col].astype(_shared_dtype)
    else:
        train[col] = train[col].astype("category")
# Any text column that exists only in test is converted on its own.
for col in test.select_dtypes(include=["object"]).columns:
    test[col] = test[col].astype("category")

In [None]:
# Write the preprocessed frames to parquet, train first.
_outputs = (
    (train, 'train_preprocess_Final.parquet'),
    (test, 'test_preprocess_Final.parquet'),
)
for _frame, _path in _outputs:
    _frame.to_parquet(_path)