In [1]:
import random
import os
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PowerTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
# Notebook-wide random seed; presumably the value passed to seed_everything() — confirm at the call site.
SEED = 42

In [4]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Sets PYTHONHASHSEED in the environment, seeds Python's ``random``,
    NumPy's legacy global RNG and PyTorch (CPU and the current CUDA device),
    then switches cuDNN to deterministic mode with autotuning disabled.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # The same seed goes to each library's seeding entry point.
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

    # Deterministic kernels on, benchmark-driven kernel selection off.
    cudnn_backend = torch.backends.cudnn
    cudnn_backend.deterministic = True
    cudnn_backend.benchmark = False

# PCA 재구성오차를 이용한 피쳐엔지니어링
- 전체 데이터를 축소했다가 복원했을 때의 잔차를 변수로 추가하는 기법
- 도메인지식이 없을 때 유용

## 사용법
- 클래스 개수에 따라 반복문 범위를 지정(예: 이진 분류면 range(2))
- 어느 정도의 분산을 보존할건지 결정(n_components)

In [None]:
from sklearn.decomposition import PCA
def pca_features(train, test, n_classes=21, n_components=0.9):
    """Build per-class PCA reconstruction-residual features for train and test.

    Columns whose names start with "X_" are standardized with a scaler fitted
    on ``train`` only. For every class label ``i`` in ``range(n_classes)`` a
    PCA is fitted on the standardized train rows where ``target == i``. Each
    row of both frames is then projected and reconstructed with that PCA, and
    the difference between the standardized input and its reconstruction is
    kept as features named ``pca_res_{i}_<original column>``.

    The input frames are copied first, so the caller's objects are not modified.

    Args:
        train: DataFrame holding the "X_"-prefixed feature columns and a
            "target" column with the class labels.
        test: DataFrame holding the same "X_"-prefixed feature columns.
        n_classes: Number of classes; labels are taken to be the integers
            ``0 .. n_classes - 1``. Defaults to 21, the value that used to be
            hard-coded in the loop.
        n_components: Forwarded unchanged to ``PCA(n_components=...)``.
            Defaults to 0.9, the value that used to be hard-coded.

    Returns:
        Tuple ``(train_features, test_features)`` of DataFrames, each with
        ``n_classes * <number of "X_" columns>`` residual columns.

    Raises:
        ValueError: If some class in ``range(n_classes)`` has no rows in
            ``train``, since a PCA cannot be fitted for it.
    """
    train, test = train.copy(), test.copy()
    cols = [col for col in train.columns if col.startswith("X_")]

    # Fit the scaler on train only, then reuse its statistics for test.
    scaler = StandardScaler()
    train[cols] = scaler.fit_transform(train[cols])
    test[cols] = scaler.transform(test[cols])

    train_list = []
    test_list = []
    for i in range(n_classes):
        mask = train["target"] == i
        if not mask.any():
            # Fail early with a readable message instead of an error from inside PCA.fit.
            raise ValueError(
                f"No training rows with target == {i}; "
                f"check n_classes={n_classes} against the labels in train['target']."
            )

        decomposition = PCA(n_components=n_components, svd_solver='full', random_state=42)
        decomposition.fit(train.loc[mask, cols])

        prefix = f"pca_res_{i}_"
        # The PCA fitted on class i is applied to ALL rows of both frames.
        for frame, collected in ((train, train_list), (test, test_list)):
            reconstructed = decomposition.inverse_transform(
                decomposition.transform(frame[cols])
            )
            collected.append((frame[cols] - reconstructed).add_prefix(prefix))

    return pd.concat(train_list, axis=1), pd.concat(test_list, axis=1)

In [9]:
# Per-class PCA residual features for both splits, followed by a shape check.
(train_features, test_features) = pca_features(train, test)
tuple(features.shape for features in (train_features, test_features))

((21693, 1092), (15004, 1092))

In [10]:
# Append the residual features column-wise to each original frame.
# NOTE: this rebinds `train`/`test`, so re-running the cell appends the columns again.
train, test = (
    pd.concat([base, extra], axis=1)
    for base, extra in ((train, train_features), (test, test_features))
)
train.shape, test.shape

((21693, 1146), (15004, 1145))

In [11]:
# Pull the labels out as an array before the label column is removed.
target = train["target"].to_numpy()

# Identifier and label columns are not model inputs.
drop_cols = ["ID","target"]
train = train.drop(drop_cols, axis=1)
# errors="ignore": columns absent from the test frame are skipped instead of raising.
test = test.drop(drop_cols, axis=1, errors="ignore")
train.shape, test.shape, target.shape

((21693, 1144), (15004, 1144), (21693,))