#### 【 HIGH CARDINALITY 】
- 범주형 컬럼/피쳐에 고유한 값 종류가 많고 중복은 적은 경우를 이르는 말
- ML에서 발생하는 문제
    * 차원의 저주 => 학습이 불안정/성능 저하  ★ 차원은 피쳐 개수
    * 과대적합 가능성 높아짐
- 범주형 => 수치형 변환 과정에서 OneHot 인코딩을 적용하면 고유값 개수만큼 피쳐(차원)가 늘어나 발생하기도 함

[1] 모듈 로딩 및 데이터 준비<hr>

In [77]:
## ========================================================================
## 모듈 로딩
## ========================================================================
## 일반 모듈
import pandas as pd
import numpy as np

## ML관련 모듈들
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, TargetEncoder, LabelEncoder


In [78]:
## ========================================================================
## 0. Shared dataset & train/test split
## ========================================================================
# Toy dataset: two categorical features (job, city), one numeric feature
# (age) and a binary target (high_income).
raw_records = {
    "job": ["office"] * 4 + ["engineer"] * 4 + ["teacher"] * 4,
    "city": [
        "Seoul", "Seoul", "Busan", "Busan",
        "Seoul", "Busan", "Seoul", "Busan",
        "Incheon", "Incheon", "Seoul", "Busan",
    ],
    "age": [25, 30, 45, 38, 40, 42, 36, 33, 28, 39, 41, 35],
    "high_income": [0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0],
}
df = pd.DataFrame(raw_records)

# Basic info: dtypes and non-null counts
df.info()

# The last column is the target; every column before it is a feature.
*feature_names, target_name = df.columns
X = df[feature_names]   ## df[["job", "city", "age"]]
y = df[target_name]     ## df["high_income"]

# Column names grouped by kind (categorical vs. numeric)
cat_cols = X.select_dtypes(include='object').columns   ## ["job", "city"]
num_cols = X.select_dtypes(include='number').columns   ## ["age"]
print(f'cat_cols => {cat_cols},   num_cols => {num_cols}')

# Stratified 75/25 train/test split with a fixed seed for reproducibility
split_options = dict(test_size=0.25, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Independent copies of the categorical / numeric parts of each split,
# so the encoding cells can modify them without touching X_train / X_test.
X_train_cat, X_test_cat = (part[cat_cols].copy() for part in (X_train, X_test))
X_train_num, X_test_num = (part[num_cols].copy() for part in (X_train, X_test))

print("=== 원본 Train 데이터 ===")
print(X_train, "\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   job          12 non-null     object
 1   city         12 non-null     object
 2   age          12 non-null     int64 
 3   high_income  12 non-null     int64 
dtypes: int64(2), object(2)
memory usage: 516.0+ bytes
cat_cols => Index(['job', 'city'], dtype='object'),   num_cols => Index(['age'], dtype='object')
=== 원본 Train 데이터 ===
         job     city  age
1     office    Seoul   30
5   engineer    Busan   42
10   teacher    Seoul   41
11   teacher    Busan   35
9    teacher  Incheon   39
0     office    Seoul   25
7   engineer    Busan   33
2     office    Busan   45
3     office    Busan   38 



In [79]:
# ==================================================
# 1. One-Hot Encoding
# ==================================================
# Encoder returns a dense array; categories unseen during fit are ignored
# at transform time instead of raising.
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Learn the categories from the training split only, then encode both splits.
train_dummies = ohe.fit_transform(X_train_cat)
test_dummies = ohe.transform(X_test_cat)

# Column names generated by the fitted encoder
ohe_feature_names = ohe.get_feature_names_out()

# Wrap the encoded arrays in DataFrames that keep the original row index,
# so they line up with the numeric features when concatenated.
X_train_cat_ohe = pd.DataFrame(
    train_dummies, columns=ohe_feature_names, index=X_train.index
)
X_test_cat_ohe = pd.DataFrame(
    test_dummies, columns=ohe_feature_names, index=X_test.index
)

# Encoded categorical features + untouched numeric features
X_train_ohe = pd.concat([X_train_cat_ohe, X_train_num], axis=1)
X_test_ohe = pd.concat([X_test_cat_ohe, X_test_num], axis=1)

print("=== [1] One-Hot Encoding ===")
print("Original shape:", X.shape, end=' ====> 인코딩 후 ')
print("Train shape:", X_train_ohe.shape, end='  ')
print("Test  shape:", X_test_ohe.shape)

display(X.head(2), X_train_ohe.head(2))

=== [1] One-Hot Encoding ===
Original shape: (12, 3) ====> 인코딩 후 Train shape: (9, 7)  Test  shape: (3, 7)


Unnamed: 0,job,city,age
0,office,Seoul,25
1,office,Seoul,30


Unnamed: 0,job_engineer,job_office,job_teacher,city_Busan,city_Incheon,city_Seoul,age
1,0.0,1.0,0.0,0.0,0.0,1.0,30
5,1.0,0.0,0.0,1.0,0.0,0.0,42


In [80]:
# ==================================================
# 2. Target Encoding (scikit-learn TargetEncoder)
# -  The number of cross-fitting folds is chosen from the class
#    distribution of y_train (here the rarest class has 3 rows -> cv=3)
# -  e.g. "high_income": [0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0]
# ==================================================
## -> Inspect the target distribution to pick the fold count
counts    = y_train.value_counts()
min_count = counts.min()
# Use at most 5 folds and no more than the rarest class size, but never
# fewer than 2: TargetEncoder requires cv >= 2, so a class with a single
# training row would otherwise make the encoder fail.
cv_te     = max(2, min(5, int(min_count)))
print("TargetEncoder용 y_train 분포:\n", counts)
print("선택된 cv:", cv_te, "\n")


## -> Create the target encoder
te = TargetEncoder(cv=cv_te, random_state=42)

## -> Fit on the training split and encode it
## -> fit_transform(2D categorical features, 1D target)
X_train_cat_te_np = te.fit_transform(X_train_cat, y_train)

## -> Encode the test split with the fitted encoder
X_test_cat_te_np  = te.transform(X_test_cat)


## Combine the encoded categorical features with the remaining features
te_col_names = [f"{c}_te" for c in cat_cols]

X_train_cat_te = pd.DataFrame(  X_train_cat_te_np, 
                                index=X_train.index, 
                                columns=te_col_names )

X_test_cat_te = pd.DataFrame(   X_test_cat_te_np, 
                                index=X_test.index, 
                                columns=te_col_names )

X_train_te = pd.concat([X_train_cat_te, X_train_num], axis=1)
X_test_te  = pd.concat([X_test_cat_te,  X_test_num],  axis=1)


print("=== [2] Target Encoding ===")
print("Original shape:", X.shape, end=' ====> 인코딩 후 ')
print("Train shape:", X_train_te.shape, end='  ')
print("Test  shape:", X_test_te.shape)

display(X.head(2), X_train_te.head(2))


TargetEncoder용 y_train 분포:
 high_income
1    6
0    3
Name: count, dtype: int64
선택된 cv: 3 

=== [2] Target Encoding ===
Original shape: (12, 3) ====> 인코딩 후 Train shape: (9, 3)  Test  shape: (3, 3)


Unnamed: 0,job,city,age
0,office,Seoul,25
1,office,Seoul,30


Unnamed: 0,job_te,city_te,age
1,0.666667,0.56,30
5,1.0,0.735484,42


In [81]:
# ==================================================
# 3. Frequency Encoding
#    - Replace each category with its relative frequency in the
#      training split
# ==================================================
# Per-column lookup: category -> share of training rows with that category
freq_maps = {
    col: X_train_cat[col].value_counts(normalize=True) for col in cat_cols
}


def _to_frequency(frame):
    """Return a copy of *frame* with each categorical column replaced by its
    training-set frequency; categories missing from the lookup become 0."""
    encoded = frame.copy()
    for name, ratios in freq_maps.items():
        encoded[name] = frame[name].map(ratios).fillna(0)
    return encoded


X_train_cat_freq = _to_frequency(X_train_cat)
X_test_cat_freq = _to_frequency(X_test_cat)

## Combine the encoded categorical features with the remaining features
X_train_freq = pd.concat([X_train_cat_freq, X_train_num], axis=1)
X_test_freq = pd.concat([X_test_cat_freq, X_test_num], axis=1)


print("=== [3] Frequency Encoding ===")
print("Original shape:", X.shape, end=' ====> 인코딩 후 ')
print("Train shape:", X_train_freq.shape, end='  ')
print("Test  shape:", X_test_freq.shape)

display(X.head(2), X_test_freq.head(2))

=== [3] Frequency Encoding ===
Original shape: (12, 3) ====> 인코딩 후 Train shape: (9, 3)  Test  shape: (3, 3)


Unnamed: 0,job,city,age
0,office,Seoul,25
1,office,Seoul,30


Unnamed: 0,job,city,age
8,0.333333,0.111111,28
4,0.222222,0.333333,40


In [82]:
# ==================================================
# 4. Binary Encoding (LabelEncoder + bit decomposition)
#     category -> integer code (LabelEncoder) -> binary digits
#     -> one 0/1 column per bit
#     Categories that appear only in the test split receive a reserved
#     "unknown" code instead of raising, in line with the unknown handling
#     of the One-Hot (handle_unknown="ignore") and frequency (fillna(0))
#     encodings in this notebook.
# ==================================================
X_train_cat_bin = X_train_cat.copy()
X_test_cat_bin  = X_test_cat.copy()

for col in cat_cols:
    # Step 1: integer codes 0..n_classes-1, learned from the training split
    le = LabelEncoder()
    train_codes = le.fit_transform(X_train_cat_bin[col])

    n_classes    = len(le.classes_)
    unknown_code = n_classes  # one past the last known class

    # LabelEncoder.transform raises ValueError on labels it has not seen,
    # so the test split is mapped through an explicit lookup; anything not
    # in the lookup falls back to the reserved unknown code.
    code_of = {label: code for code, label in enumerate(le.classes_)}
    test_codes = (
        X_test_cat_bin[col]
        .map(code_of)
        .fillna(unknown_code)
        .astype(np.int64)
        .to_numpy()
    )

    # Step 2: number of bits needed to represent codes 0..unknown_code.
    # Integer bit_length avoids float log2 rounding; at least 1 bit.
    # NOTE: for a power-of-two number of classes this is one bit more than
    # ceil(log2(n_classes)), because of the reserved unknown code.
    n_bits = max(int(n_classes).bit_length(), 1)

    # Extract each bit (least significant first) into its own column
    for bit in range(n_bits):
        bit_col = f"{col}_bit{bit}"
        X_train_cat_bin[bit_col] = (train_codes >> bit) & 1
        X_test_cat_bin[bit_col]  = (test_codes  >> bit) & 1

    # Drop the original categorical column now that it is encoded
    X_train_cat_bin = X_train_cat_bin.drop(columns=[col])
    X_test_cat_bin  = X_test_cat_bin.drop(columns=[col])

# Combine the encoded categorical features with the remaining features
X_train_bin = pd.concat([X_train_cat_bin, X_train_num], axis=1)
X_test_bin  = pd.concat([X_test_cat_bin,  X_test_num],  axis=1)


print("=== [4] Binary Encoding ===")
print("Original shape:", X.shape, end=' ====> 인코딩 후 ')
print("Train shape:", X_train_bin.shape, end='  ')
print("Test  shape:", X_test_bin.shape)

display(X.head(2), X_train_bin.head(2))



=== [4] Binary Encoding ===
Original shape: (12, 3) ====> 인코딩 후 Train shape: (9, 5)  Test  shape: (3, 5)


Unnamed: 0,job,city,age
0,office,Seoul,25
1,office,Seoul,30


Unnamed: 0,job_bit0,job_bit1,city_bit0,city_bit1,age
1,1,0,0,1,30
5,0,0,0,0,42


In [89]:
## ======================================================================
## Side-by-side preview of the original features and the four encodings
## ======================================================================
print("========== [범주형 원  본] ==========")
display(X.head(2))
print("========== [범주형 수치화] ==========")

# Label -> encoded training frame, shown in the order the encodings
# were built above.
encoded_previews = {
    "원핫_인코딩": X_train_ohe,
    "타겟_인코딩": X_train_te,
    "빈도_인코딩": X_train_freq,
    "이진_인코딩": X_train_bin,
}
for label, frame in encoded_previews.items():
    display(label, frame.head(2))



Unnamed: 0,job,city,age
0,office,Seoul,25
1,office,Seoul,30




'원핫_인코딩'

Unnamed: 0,job_engineer,job_office,job_teacher,city_Busan,city_Incheon,city_Seoul,age
1,0.0,1.0,0.0,0.0,0.0,1.0,30
5,1.0,0.0,0.0,1.0,0.0,0.0,42


'타겟_인코딩'

Unnamed: 0,job_te,city_te,age
1,0.666667,0.56,30
5,1.0,0.735484,42


'빈도_인코딩'

Unnamed: 0,job,city,age
1,0.444444,0.333333,30
5,0.222222,0.555556,42


'이진_인코딩'

Unnamed: 0,job_bit0,job_bit1,city_bit0,city_bit1,age
1,1,0,0,1,30
5,0,0,0,0,42
