In [71]:
# from google.colab import drive
# drive.mount('/content/drive')

# v3.2

## 데이터 불러오기

In [72]:
# %pip install category_encoders

In [73]:
# %pip install optuna

In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
import category_encoders as ce
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest, SelectPercentile, SelectFromModel

In [75]:
DATA_PATH = "/content/drive/MyDrive/멋쟁이사자처럼/Final_Project/dataset/"
SEED = 42

In [76]:
train_ft = pd.read_csv(f"{DATA_PATH}train_common_v3.2_1107.csv")
test_ft = pd.read_csv(f"{DATA_PATH}test_common_v3.2_1107.csv")
train_target = pd.read_csv(f"{DATA_PATH}store_train.csv")
submit = pd.read_csv(f"{DATA_PATH}store_submission.csv")

train_ft.shape, test_ft.shape, train_target.shape, submit.shape

((14940, 1455), (12225, 1455), (14940, 2), (12225, 2))

## 결측치 처리

In [77]:
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 0)

## 피처 공학 및 스케일링

In [78]:
# ID 제거
train_ft = train_ft.iloc[:,1:]
test_ft = test_ft.iloc[:,1:]
train_ft.shape, test_ft.shape

((14940, 1454), (12225, 1454))

In [79]:
cols = train_ft.select_dtypes("object").columns.tolist()
train_ft[cols].nunique()

Unnamed: 0,0
주구매지점,4
주구매_대분류,22
주구매_중분류,238


In [80]:
enc = ce.one_hot.OneHotEncoder()
tmp = enc.fit_transform(train_ft["주구매지점"])
train_ft = pd.concat([train_ft,tmp],axis =1)

tmp = enc.transform(test_ft["주구매지점"])
test_ft = pd.concat([test_ft,tmp],axis =1)

train_ft.shape, test_ft.shape

((14940, 1458), (12225, 1458))

In [81]:
enc = ce.count.CountEncoder()
train_ft["주구매_중분류_cnt"] = enc.fit_transform(train_ft[["주구매_중분류"]])
test_ft["주구매_중분류_cnt"] = enc.transform(test_ft[["주구매_중분류"]])

train_ft["주구매_대분류_cnt"] = enc.fit_transform(train_ft[["주구매_대분류"]])
test_ft["주구매_대분류_cnt"] = enc.transform(test_ft[["주구매_대분류"]])

train_ft.shape, test_ft.shape

((14940, 1460), (12225, 1460))

In [82]:
train_ft = train_ft.drop(columns=cols)
test_ft = test_ft.drop(columns=cols)
train_ft.shape, test_ft.shape

((14940, 1457), (12225, 1457))

- 가중치 적용

In [83]:
train_ft["18시_21시_구매비율"] = train_ft["18시_21시_구매비율"] * 3
train_ft["18시_21시_구매횟수"] = train_ft["18시_21시_구매횟수"] * 3

test_ft["18시_21시_구매비율"] = test_ft["18시_21시_구매비율"] * 3
test_ft["18시_21시_구매횟수"] = test_ft["18시_21시_구매횟수"] * 3

In [84]:
scaler = StandardScaler()

train_ft[train_ft.columns] = scaler.fit_transform(train_ft)
test_ft[test_ft.columns] = scaler.transform(test_ft)
train_ft.head()

Unnamed: 0,내점일수,구매주기,주말방문비율,평일방문비율,주말방문횟수,평일방문횟수,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,...,공휴일_대분류_영플라자_구매횟수,공휴일_대분류_잡화_구매횟수,공휴일_대분류_케주얼_구두_아동_구매횟수,공휴일_대분류_패션잡화_구매횟수,주구매지점_1,주구매지점_2,주구매지점_3,주구매지점_4,주구매_중분류_cnt,주구매_대분류_cnt
0,-0.369867,0.002987,0.257728,-0.257728,-0.230862,-0.390544,-1.029777,0.001191,0.838272,0.338186,...,-0.162079,-0.286379,-0.209907,-0.213001,1.654066,-0.623175,-0.612523,-0.468181,-0.741478,-1.31525
1,0.14411,-0.356452,-1.008554,1.008554,-0.619841,0.412809,0.323951,-0.390607,0.620171,-0.552996,...,2.389509,-0.286379,-0.209907,4.161199,-0.604571,1.604686,-0.612523,-0.468181,-0.448069,-0.626416
2,1.943028,-0.869935,0.036742,-0.036742,1.616788,1.926821,0.798943,-0.514333,-0.304527,-0.059266,...,-0.162079,-0.286379,-0.209907,-0.213001,-0.604571,-0.623175,1.632592,-0.468181,-0.869072,1.064726
3,3.793345,-1.02398,-0.080558,0.080558,3.172703,4.429574,0.420933,-0.327474,0.008592,-0.135636,...,10.044273,-0.286379,1.301424,-0.213001,1.654066,-0.623175,-0.612523,-0.468181,1.515861,-0.248364
4,0.452496,-0.613193,0.302875,-0.302875,0.83883,0.5673,-0.752532,1.70741,-0.130285,-0.821561,...,-0.162079,4.377687,4.324087,-0.213001,1.654066,-0.623175,-0.612523,-0.468181,-0.847712,1.064726


In [85]:
target = train_target["target"]

## cv 점수 확인해보기

In [31]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

params = {'n_estimators': 800,
 'learning_rate': 0.04036413044768581,
 'max_depth': 4,
 'min_child_weight': 7,
 'subsample': 0.7505214930635562,
 'colsample_bytree': 0.6290102054237857,
 'gamma': 0.648553153047272}

# F1 매크로 스코어와 모델을 저장할 리스트
scores = []
models = []

# Stratified K-Fold 교차 검증 설정
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 교차 검증 루프
for tri, vai in cv.split(train_ft, target):
    # 학습 데이터와 검증 데이터 분리
    x_train = train_ft.iloc[tri]
    y_train = target.iloc[tri]
    x_valid = train_ft.iloc[vai]
    y_valid = target.iloc[vai]

    # 모델 초기화 및 학습
    model = XGBClassifier(**params)
    model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False)

    # 모델 저장
    models.append(model)

    # 예측 및 F1 매크로 스코어 계산
    pred = model.predict(x_valid)
    score = f1_score(y_valid, pred, average='macro')
    scores.append(score)

# F1 매크로 스코어의 평균 출력
print("Mean F1 Macro Score:", np.mean(scores))

Mean F1 Macro Score: 0.7184388251001332


## 피처 셀렉션 - SelectFromModel

In [22]:
xgb = XGBClassifier(random_state=42) # 특성 선택을 하기 위한 모델
model = LogisticRegression(random_state=42) # 학습용 모델

fs = SelectFromModel(xgb) # 특성 선택에 사용하기 위한 모델 객체를 전달해줘야함.
x = fs.fit_transform(train_ft, target) # 특성 선택이 완료된 입력 데이터가 ndarray 로 반환
scores = cross_val_score(model, x, target, cv=cv, scoring="f1_macro", n_jobs=-1)
scores.mean()

0.710522033895816

In [86]:
rfc = RandomForestClassifier(random_state=42) # 특성 선택을 하기 위한 모델
model = LogisticRegression(random_state=42) # 학습용 모델

fs = SelectFromModel(rfc) # 특성 선택에 사용하기 위한 모델 객체를 전달해줘야함.
x = fs.fit_transform(train_ft, target) # 특성 선택이 완료된 입력 데이터가 ndarray 로 반환
scores = cross_val_score(model, x, target, cv=cv, scoring="f1_macro", n_jobs=-1)
scores.mean()

0.7136594284947952

In [87]:
best_cols = fs.get_feature_names_out()
best_cols

array(['내점일수', '구매주기', '주말방문비율', '평일방문비율', '주말방문횟수', '평일방문횟수', '봄_구매비율',
       '여름_구매비율', '가을_구매비율', '겨울_구매비율', '주구매요일', '주구매_월', '주구매_시간대',
       '일별평균구매횟수', '거래개월수', '9시_12시_구매비율', '12시_15시_구매비율', '15시_18시_구매비율',
       '18시_21시_구매비율', '9시_12시_구매횟수', '12시_15시_구매횟수', '15시_18시_구매횟수',
       '18시_21시_구매횟수', '월초_구매비율', '월말_구매비율', '월초_구매횟수', '월말_구매횟수',
       '웨딩성수기_구매비율', '웨딩성수기_구매횟수', '1월_구매비율', '2월_구매비율', '3월_구매비율',
       '4월_구매비율', '5월_구매비율', '6월_구매비율', '7월_구매비율', '8월_구매비율', '9월_구매비율',
       '10월_구매비율', '11월_구매비율', '12월_구매비율', '1월_구매횟수', '2월_구매횟수',
       '3월_구매횟수', '4월_구매횟수', '5월_구매횟수', '6월_구매횟수', '7월_구매횟수', '8월_구매횟수',
       '9월_구매횟수', '10월_구매횟수', '11월_구매횟수', '12월_구매횟수', '1월_방문횟수',
       '2월_방문횟수', '3월_방문횟수', '4월_방문횟수', '5월_방문횟수', '6월_방문횟수', '7월_방문횟수',
       '8월_방문횟수', '9월_방문횟수', '10월_방문횟수', '11월_방문횟수', '12월_방문횟수',
       '공휴일_구매비율', '여름휴가_구매비율', '연말_구매비율', '공휴일_구매횟수', '여름휴가_구매횟수',
       '연말_구매횟수', '1월_총구매금액', '2월_총구매금액', '3월_총구매금액', '4월_총구매금액',
       '5월_총구매금액', '6월_총구매금액',

In [93]:
len(best_cols)

362

## Logistic Regression

- XGBoost로 피처 셀렉션

In [24]:
def objective(trial):
    hp = {
        "C": trial.suggest_float("C", 1e-4, 10.0, log=True),  # 규제 강도
        "penalty": "l2",  # L2 규제로 고정
        "solver": trial.suggest_categorical("solver", ["lbfgs", "liblinear", "sag", "saga"]),
        "max_iter": trial.suggest_int("max_iter", 50, 500, step=50)  # 반복 횟수
    }
    cv = KFold(5, shuffle=True, random_state=SEED)
    model = LogisticRegression(**hp, random_state=SEED)
    score = cross_val_score(model, train_ft[best_cols], target, cv=cv, scoring="f1_macro", n_jobs=-1).mean()
    return score

sampler = optuna.samplers.TPESampler(seed=SEED) # 대체 모델 역할은 sampler 객체

# study 객체
study = optuna.create_study(
    direction="maximize",
    sampler = sampler
)

study.optimize(objective, n_trials=100)   # optimze 메서드에 콜백 함수(objective) 전달

[I 2024-11-07 05:27:13,236] A new study created in memory with name: no-name-1ca91426-d88d-4c52-a1c2-dca06e263628
[I 2024-11-07 05:27:20,428] Trial 0 finished with value: 0.7148251014779604 and parameters: {'C': 0.0074593432857265485, 'solver': 'lbfgs', 'max_iter': 100}. Best is trial 0 with value: 0.7148251014779604.
[I 2024-11-07 05:27:22,802] Trial 1 finished with value: 0.7003528963631694 and parameters: {'C': 0.00019517224641449495, 'solver': 'lbfgs', 'max_iter': 500}. Best is trial 0 with value: 0.7148251014779604.
[I 2024-11-07 05:30:40,515] Trial 2 finished with value: 0.713223791930233 and parameters: {'C': 1.452824663751602, 'solver': 'saga', 'max_iter': 300}. Best is trial 0 with value: 0.7148251014779604.
[I 2024-11-07 05:30:55,460] Trial 3 finished with value: 0.7138479781388632 and parameters: {'C': 0.01444525102276306, 'solver': 'liblinear', 'max_iter': 200}. Best is trial 0 with value: 0.7148251014779604.
[I 2024-11-07 05:30:59,283] Trial 4 finished with value: 0.713041

In [25]:
# 최적의 하이퍼파라미터 출력
print("Best hyperparameters:", study.best_params)
print("Best F1 Score:", study.best_value)

Best hyperparameters: {'C': 0.0004732051341895587, 'solver': 'liblinear', 'max_iter': 50}
Best F1 Score: 0.7264344050715487


In [26]:
# 모델 학습
model = LogisticRegression(random_state=SEED, **study.best_params)
model.fit(train_ft[best_cols],target)

pred = model.predict(test_ft[best_cols])

submit["target"] = pred
submit.to_csv(f"{DATA_PATH}6_v3.2_Logistic.csv",index=False)

- RandomForestClassifier로 피처 셀렉션

In [91]:
def objective(trial):
    hp = {
        "C": trial.suggest_float("C", 1e-4, 10.0, log=True),  # 규제 강도
        "penalty": "l2",  # L2 규제로 고정
        "solver": trial.suggest_categorical("solver", ["lbfgs", "liblinear", "sag", "saga"]),
        "max_iter": trial.suggest_int("max_iter", 50, 500, step=50)  # 반복 횟수
    }
    cv = KFold(5, shuffle=True, random_state=SEED)
    model = LogisticRegression(**hp, random_state=SEED)
    score = cross_val_score(model, train_ft[best_cols], target, cv=cv, scoring="f1_macro", n_jobs=-1).mean()
    return score

sampler = optuna.samplers.TPESampler(seed=SEED) # 대체 모델 역할은 sampler 객체

# study 객체
study = optuna.create_study(
    direction="maximize",
    sampler = sampler
)

study.optimize(objective, n_trials=100)   # optimze 메서드에 콜백 함수(objective) 전달

[I 2024-11-07 07:26:28,930] A new study created in memory with name: no-name-346ffd31-329a-44d9-a572-ceddb8cde8fa
[I 2024-11-07 07:26:37,971] Trial 0 finished with value: 0.7132166416534913 and parameters: {'C': 0.0074593432857265485, 'solver': 'lbfgs', 'max_iter': 100}. Best is trial 0 with value: 0.7132166416534913.
[I 2024-11-07 07:26:40,791] Trial 1 finished with value: 0.6998486096924318 and parameters: {'C': 0.00019517224641449495, 'solver': 'lbfgs', 'max_iter': 500}. Best is trial 0 with value: 0.7132166416534913.
[I 2024-11-07 07:28:40,864] Trial 2 finished with value: 0.7127738003485069 and parameters: {'C': 1.452824663751602, 'solver': 'saga', 'max_iter': 300}. Best is trial 0 with value: 0.7132166416534913.
[I 2024-11-07 07:28:50,534] Trial 3 finished with value: 0.7133757872595219 and parameters: {'C': 0.01444525102276306, 'solver': 'liblinear', 'max_iter': 200}. Best is trial 3 with value: 0.7133757872595219.
[I 2024-11-07 07:28:53,077] Trial 4 finished with value: 0.71304

In [92]:
# 최적의 하이퍼파라미터 출력
print("Best hyperparameters:", study.best_params)
print("Best F1 Score:", study.best_value)

Best hyperparameters: {'C': 0.0005149778842731043, 'solver': 'liblinear', 'max_iter': 150}
Best F1 Score: 0.7216853431847031


In [None]:
# 모델 학습
model = LogisticRegression(random_state=SEED, **study.best_params)
model.fit(train_ft[best_cols],target)

pred = model.predict(test_ft[best_cols])

submit["target"] = pred
submit.to_csv(f"{DATA_PATH}6_v3.2.02_Logistic.csv",index=False)

## LDA

- XGBoost로 피처 셀렉션

In [27]:
def objective(trial):
    # Optuna가 선택할 수 있는 하이퍼파라미터 정의
    solver = trial.suggest_categorical('solver', ['svd', 'lsqr', 'eigen'])

    # 'lsqr'와 'eigen'에서만 shrinkage 하이퍼파라미터가 사용되므로 조건 추가
    if solver in ['lsqr', 'eigen']:
        shrinkage = trial.suggest_float('shrinkage', 0.0, 1.0)
    else:
        shrinkage = None

    # n_components 하이퍼파라미터: 차원 수를 1부터 X의 특성 개수까지
    n_components = trial.suggest_int('n_components', 1, min(train_ft[best_cols].shape[1], len(np.unique(target)) - 1))

    # LDA 모델 정의
    lda = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage, n_components=n_components)

    score = cross_val_score(lda, train_ft[best_cols], target, cv=5, scoring="f1_macro", n_jobs=-1).mean()
    return score

# Optuna study 설정 및 최적화 실행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

[I 2024-11-07 06:05:05,128] A new study created in memory with name: no-name-19e381e6-7522-48f8-a012-01bd1830f6af
[I 2024-11-07 06:05:09,999] Trial 0 finished with value: 0.7149756352852411 and parameters: {'solver': 'lsqr', 'shrinkage': 0.03460158595146556, 'n_components': 1}. Best is trial 0 with value: 0.7149756352852411.
[I 2024-11-07 06:05:19,657] Trial 1 finished with value: 0.7121982625954939 and parameters: {'solver': 'svd', 'n_components': 1}. Best is trial 0 with value: 0.7149756352852411.
[I 2024-11-07 06:05:33,013] Trial 2 finished with value: 0.7121982625954939 and parameters: {'solver': 'svd', 'n_components': 1}. Best is trial 0 with value: 0.7149756352852411.
[I 2024-11-07 06:05:39,825] Trial 3 finished with value: 0.7185593520883714 and parameters: {'solver': 'eigen', 'shrinkage': 0.764276345608098, 'n_components': 1}. Best is trial 3 with value: 0.7185593520883714.
[I 2024-11-07 06:05:50,272] Trial 4 finished with value: 0.7154090547423242 and parameters: {'solver': 'e

In [28]:
print("Best hyperparameters:", study.best_params)
print("Best F1 Score:", study.best_value)

Best hyperparameters: {'solver': 'lsqr', 'shrinkage': 0.5686239036737244, 'n_components': 1}
Best F1 Score: 0.7234289145781864


In [29]:
model = LinearDiscriminantAnalysis(**study.best_params)
model.fit(train_ft[best_cols],target)

pred = model.predict(test_ft[best_cols])

submit["target"] = pred
submit.to_csv(f"{DATA_PATH}6_v3.2_LDA.csv",index=False)

- RandomForestClassifier로 피처 셀렉션

In [88]:
def objective(trial):
    # Optuna가 선택할 수 있는 하이퍼파라미터 정의
    solver = trial.suggest_categorical('solver', ['svd', 'lsqr', 'eigen'])

    # 'lsqr'와 'eigen'에서만 shrinkage 하이퍼파라미터가 사용되므로 조건 추가
    if solver in ['lsqr', 'eigen']:
        shrinkage = trial.suggest_float('shrinkage', 0.0, 1.0)
    else:
        shrinkage = None

    # n_components 하이퍼파라미터: 차원 수를 1부터 X의 특성 개수까지
    n_components = trial.suggest_int('n_components', 1, min(train_ft[best_cols].shape[1], len(np.unique(target)) - 1))

    # LDA 모델 정의
    lda = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage, n_components=n_components)

    score = cross_val_score(lda, train_ft[best_cols], target, cv=5, scoring="f1_macro", n_jobs=-1).mean()
    return score

# Optuna study 설정 및 최적화 실행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2024-11-07 07:11:52,719] A new study created in memory with name: no-name-113d28c3-7586-4492-8b9c-bcc58e4639e6
[I 2024-11-07 07:11:54,929] Trial 0 finished with value: 0.7175376197357244 and parameters: {'solver': 'lsqr', 'shrinkage': 0.6323421437249352, 'n_components': 1}. Best is trial 0 with value: 0.7175376197357244.
[I 2024-11-07 07:11:57,093] Trial 1 finished with value: 0.7174755588949672 and parameters: {'solver': 'lsqr', 'shrinkage': 0.6309611804845232, 'n_components': 1}. Best is trial 0 with value: 0.7175376197357244.
[I 2024-11-07 07:11:59,808] Trial 2 finished with value: 0.7071201514749401 and parameters: {'solver': 'eigen', 'shrinkage': 0.8658086328697074, 'n_components': 1}. Best is trial 0 with value: 0.7175376197357244.
[I 2024-11-07 07:12:03,177] Trial 3 finished with value: 0.7183240000546282 and parameters: {'solver': 'lsqr', 'shrinkage': 0.5852769937969394, 'n_components': 1}. Best is trial 3 with value: 0.7183240000546282.
[I 2024-11-07 07:12:07,178] Trial 4 f

In [89]:
print("Best hyperparameters:", study.best_params)
print("Best F1 Score:", study.best_value)

Best hyperparameters: {'solver': 'eigen', 'shrinkage': 0.47608072238601795, 'n_components': 1}
Best F1 Score: 0.720020728512924


In [90]:
model = LinearDiscriminantAnalysis(**study.best_params)
model.fit(train_ft[best_cols],target)

pred = model.predict(test_ft[best_cols])

submit["target"] = pred
submit.to_csv(f"{DATA_PATH}6_v3.2.02_LDA.csv",index=False)