In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


- 데이터 경로 변수

In [2]:
DATA_PATH = "/content/drive/MyDrive/멋쟁이사자처럼/data/"
DATA_PATH

'/content/drive/MyDrive/멋쟁이사자처럼/data/'

- 시드값

In [3]:
SEED = 42

- 데이터 불러오기

In [4]:
import pandas as pd
import numpy as np

train_tr = pd.read_csv(f"{DATA_PATH}store_train_transactions.csv") # 학습용 구매기록 데이터
train_target = pd.read_csv(f"{DATA_PATH}store_train.csv") # 학습용 정답 데이터
test_tr = pd.read_csv(f"{DATA_PATH}store_test_transactions.csv") # 테스트용 구매기록 데이터
submit = pd.read_csv(f"{DATA_PATH}store_submission.csv") # 제출 양식 데이터

train_tr.shape , train_target.shape , test_tr.shape , submit.shape

((523105, 7), (14940, 2), (441196, 7), (12225, 2))

- 공통 피처 파일 불러오기

In [6]:
train_ft = pd.read_csv(f"{DATA_PATH}train_common_v2.0_1101.csv") # 학습 데이터(피처)
test_ft = pd.read_csv(f"{DATA_PATH}test_common_v2.0_1101.csv") # 테스트 데이터(피처)

train_ft.shape , test_ft.shape

((14940, 465), (12225, 465))

# 결측치 처리

In [7]:
mask = train_ft.isnull().sum() > 0
train_ft.isnull().sum()[mask]

Unnamed: 0,0


In [8]:
mask = test_ft.isnull().sum() > 0
test_ft.isnull().sum()[mask]

Unnamed: 0,0


In [9]:
train_ft["구매금액표준편차"] = train_ft["구매금액표준편차"].fillna(0)
test_ft["구매금액표준편차"] = test_ft["구매금액표준편차"].fillna(0)

# 특성 공학(Feature Engineering)

- ID 변수 제외

In [12]:
train_ft = train_ft.iloc[:,1:]
test_ft = test_ft.iloc[:,1:]
train_ft.shape, test_ft.shape

((14940, 464), (12225, 464))

- 추가 피처 만들어 보기

In [None]:
cols = [ col for col in train_ft.columns if col.startswith("pivot_cnt_") ]

In [None]:
train_ft["중분류별_구매횟수_std"] = train_ft[cols].std(axis=1)
train_ft["중분류별_구매횟수_skew"] = train_ft[cols].skew(axis=1)
train_ft["중분류별_구매횟수_kurt"] = train_ft[cols].kurt(axis=1)

test_ft["중분류별_구매횟수_std"] = test_ft[cols].std(axis=1)
test_ft["중분류별_구매횟수_skew"] = test_ft[cols].skew(axis=1)
test_ft["중분류별_구매횟수_kurt"] = test_ft[cols].kurt(axis=1)

train_ft.shape, test_ft.shape

## Feature Encoding

In [13]:
cols = train_ft.select_dtypes("object").columns.tolist()
train_ft[cols].nunique()

Unnamed: 0,0
주구매지점,4
주구매_중분류,246
주구매_대분류_수정,7
대분류_수정_평균금액최대,7


In [14]:
%pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.4-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.4-py2.py3-none-any.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.0/82.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.4


In [15]:
import category_encoders as ce

In [16]:
enc = ce.one_hot.OneHotEncoder()
tmp = enc.fit_transform(train_ft[["주구매지점","주구매_대분류_수정", "대분류_수정_평균금액최대"]])
train_ft = pd.concat([train_ft,tmp],axis =1)

tmp = enc.transform(test_ft[["주구매지점","주구매_대분류_수정", "대분류_수정_평균금액최대"]])
test_ft = pd.concat([test_ft,tmp],axis =1)

train_ft.shape, test_ft.shape

((14940, 482), (12225, 482))

In [17]:
enc = ce.count.CountEncoder()
train_ft["주구매_중분류_cnt"] = enc.fit_transform(train_ft[["주구매_중분류"]])
test_ft["주구매_중분류_cnt"] = enc.transform(test_ft[["주구매_중분류"]])

train_ft.shape, test_ft.shape

((14940, 483), (12225, 483))

- 문자열 피처 삭제

In [18]:
cols

['주구매지점', '주구매_중분류', '주구매_대분류_수정', '대분류_수정_평균금액최대']

In [19]:
train_ft = train_ft.drop(columns=cols)
test_ft = test_ft.drop(columns=cols)
train_ft.shape, test_ft.shape

((14940, 479), (12225, 479))

In [20]:
train_ft.select_dtypes("object").columns , test_ft.select_dtypes("object").columns

(Index([], dtype='object'), Index([], dtype='object'))

## Feature Scaling

In [21]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [22]:
train_ft[train_ft.columns] = scaler.fit_transform(train_ft)
test_ft[test_ft.columns] = scaler.transform(test_ft)
train_ft.head()

Unnamed: 0,내점일수,구매주기,주말방문비율,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,일별평균구매건수,거래개월수,...,주구매_대분류_수정_6,주구매_대분류_수정_7,대분류_수정_평균금액최대_1,대분류_수정_평균금액최대_2,대분류_수정_평균금액최대_3,대분류_수정_평균금액최대_4,대분류_수정_평균금액최대_5,대분류_수정_평균금액최대_6,대분류_수정_평균금액최대_7,주구매_중분류_cnt
0,-0.369867,0.002987,0.257728,-1.029777,0.001191,0.838272,0.338186,0.109631,-0.057297,0.01762,...,-0.378716,-0.187997,2.553208,-0.443004,-0.816155,-0.341822,-0.345596,-0.274521,-0.149122,-0.72697
1,0.14411,-0.356452,-1.008554,0.323951,-0.390607,0.620171,-0.552996,0.109631,0.222706,1.17601,...,-0.378716,-0.187997,-0.391664,2.257315,-0.816155,-0.341822,-0.345596,-0.274521,-0.149122,-0.510078
2,1.943028,-0.869935,0.036742,0.798943,-0.514333,-0.304527,-0.059266,-1.64337,0.277707,1.465608,...,-0.378716,-0.187997,-0.391664,2.257315,-0.816155,-0.341822,-0.345596,-0.274521,-0.149122,-0.848408
3,3.793345,-1.02398,-0.080558,0.420933,-0.327474,0.008592,-0.135636,0.109631,0.674668,1.465608,...,-0.378716,-0.187997,-0.391664,-0.443004,1.225257,-0.341822,-0.345596,-0.274521,-0.149122,1.523851
4,0.452496,-0.613193,0.302875,-0.752532,1.70741,-0.130285,-0.821561,0.693965,0.679008,0.886413,...,-0.378716,-0.187997,-0.391664,2.257315,-0.816155,-0.341822,-0.345596,-0.274521,-0.149122,-0.833157


# 정답 데이터

In [23]:
target = train_target["target"]
target

Unnamed: 0,target
0,1.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
14935,0.0
14936,0.0
14937,0.0
14938,1.0


# cv 점수 확인해보기

In [24]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
cv = KFold(n_splits=5,shuffle=True, random_state=SEED)

In [25]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(random_state=SEED)
scores = cross_val_score(model,train_ft,target,cv = cv ,scoring='f1_macro',n_jobs = -1)
np.mean(scores)

0.6010876555456812

# 모델 학습

In [26]:
model = DecisionTreeClassifier(random_state=SEED)
model.fit(train_ft,target)

# 테스트 데이터 예측

In [27]:
pred = model.predict(test_ft)
pred

array([0., 0., 0., ..., 0., 0., 0.])

# 평가를 위한 제출 파일 생성
- 예측 결과를 target 컬럼에 넣어 csv 파일로 저장후에 제출한다.

In [28]:
submit

Unnamed: 0,ID,target
0,test_0,0.5
1,test_1,0.5
2,test_2,0.5
3,test_3,0.5
4,test_4,0.5
...,...,...
12220,test_12220,0.5
12221,test_12221,0.5
12222,test_12222,0.5
12223,test_12223,0.5


In [29]:
submit["target"] = pred
submit

Unnamed: 0,ID,target
0,test_0,0.0
1,test_1,0.0
2,test_2,0.0
3,test_3,0.0
4,test_4,1.0
...,...,...
12220,test_12220,1.0
12221,test_12221,1.0
12222,test_12222,0.0
12223,test_12223,0.0


- 예측 결과를 csv 파일로 저장하여 제출

In [30]:
submit.to_csv(f"{DATA_PATH}6_v2.0_DecisonTree.csv",index=False)

# optuna 로 파라미터 튜닝하기

In [31]:
%pip install optuna

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.3-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.2/233.2 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [32]:
import optuna

## v2.0

In [46]:
def objective(trial):
    hp = {
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        "max_depth": trial.suggest_int("max_depth", 5, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
        "max_features": trial.suggest_float("max_features", 0.6, 1.0, step=0.05)
    }
    cv = KFold(5, shuffle=True, random_state=SEED)
    model = DecisionTreeClassifier(**hp, random_state=SEED)
    score = cross_val_score(model, train_ft, target, cv=cv, scoring="f1_macro", n_jobs=-1).mean()
    return score

In [None]:
def objective(trial):
    hp = {
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_samples_split": trial.suggest_int("min_samples_split", 5, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_float("max_features", 0.5, 1.0, step=0.05)
    }
    cv = KFold(5, shuffle=True, random_state=SEED)
    model = DecisionTreeClassifier(**hp, random_state=SEED)
    score = cross_val_score(model, train_ft, target, cv=cv, scoring="f1_macro", n_jobs=-1).mean()
    return score

In [47]:
sampler = optuna.samplers.TPESampler(seed=SEED) # 대체 모델 역할은 sampler 객체

# study 객체
study = optuna.create_study(
    direction="maximize",
    sampler = sampler
)

study.optimize(objective, n_trials=50)   # optimze 메서드에 콜백 함수(objective) 전달

[I 2024-11-01 07:16:46,044] A new study created in memory with name: no-name-4c0f451e-c731-4bb1-a6f1-0c9a888ffbd0
[I 2024-11-01 07:16:52,711] Trial 0 finished with value: 0.6286692217541905 and parameters: {'criterion': 'entropy', 'max_depth': 16, 'min_samples_split': 13, 'min_samples_leaf': 4, 'max_features': 0.65}. Best is trial 0 with value: 0.6286692217541905.
[I 2024-11-01 07:17:04,630] Trial 1 finished with value: 0.6301880329659696 and parameters: {'criterion': 'entropy', 'max_depth': 14, 'min_samples_split': 15, 'min_samples_leaf': 1, 'max_features': 1.0}. Best is trial 1 with value: 0.6301880329659696.
[I 2024-11-01 07:17:08,951] Trial 2 finished with value: 0.6577122763612718 and parameters: {'criterion': 'gini', 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 7, 'max_features': 0.8}. Best is trial 2 with value: 0.6577122763612718.
[I 2024-11-01 07:17:18,044] Trial 3 finished with value: 0.6239749922016665 and parameters: {'criterion': 'gini', 'max_depth': 14, 'mi

In [48]:
study.best_trial.params  # 최적의 하이퍼파라미터

{'criterion': 'entropy',
 'max_depth': 5,
 'min_samples_split': 10,
 'min_samples_leaf': 11,
 'max_features': 0.7}

In [49]:
study.best_value    # 최적의 하이퍼파라미터의 검증 점수

0.6742165742275343

In [51]:
# 모델 학습
model = DecisionTreeClassifier(random_state=SEED, **study.best_params)
model.fit(train_ft,target)

In [52]:
# 테스트 데이터 예측
pred = model.predict(test_ft)
pred

array([0., 0., 0., ..., 0., 0., 0.])

In [53]:
# 예측 결과 저장
submit["target"] = pred
submit.to_csv(f"{DATA_PATH}6_v2.0_DecisonTree.csv",index=False)

## v2.1

- 1차 하이퍼파라미터 수정

In [54]:
def objective(trial):
    hp = {
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_samples_split": trial.suggest_int("min_samples_split", 5, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_float("max_features", 0.5, 1.0, step=0.05)
    }
    cv = KFold(5, shuffle=True, random_state=SEED)
    model = DecisionTreeClassifier(**hp, random_state=SEED)
    score = cross_val_score(model, train_ft, target, cv=cv, scoring="f1_macro", n_jobs=-1).mean()
    return score

In [55]:
sampler = optuna.samplers.TPESampler(seed=SEED) # 대체 모델 역할은 sampler 객체

# study 객체
study = optuna.create_study(
    direction="maximize",
    sampler = sampler
)

study.optimize(objective, n_trials=50)   # optimze 메서드에 콜백 함수(objective) 전달

[I 2024-11-01 07:27:42,304] A new study created in memory with name: no-name-8ddd735a-86c3-4834-a15b-063dd9fbdaf3
[I 2024-11-01 07:27:55,758] Trial 0 finished with value: 0.638145934912522 and parameters: {'criterion': 'entropy', 'max_depth': 12, 'min_samples_split': 14, 'min_samples_leaf': 2, 'max_features': 0.55}. Best is trial 0 with value: 0.638145934912522.
[I 2024-11-01 07:28:08,139] Trial 1 finished with value: 0.6452403901602878 and parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 16, 'min_samples_leaf': 1, 'max_features': 1.0}. Best is trial 1 with value: 0.6452403901602878.
[I 2024-11-01 07:28:15,709] Trial 2 finished with value: 0.6604490118389292 and parameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 4, 'max_features': 0.75}. Best is trial 2 with value: 0.6604490118389292.
[I 2024-11-01 07:28:23,503] Trial 3 finished with value: 0.646365281029741 and parameters: {'criterion': 'gini', 'max_depth': 10, 'min_

In [56]:
study.best_trial.params  # 최적의 하이퍼파라미터

{'criterion': 'entropy',
 'max_depth': 6,
 'min_samples_split': 9,
 'min_samples_leaf': 9,
 'max_features': 0.55}

In [57]:
study.best_value    # 최적의 하이퍼파라미터의 검증 점수

0.6699855238056662

In [58]:
# 모델 학습
model = DecisionTreeClassifier(random_state=SEED, **study.best_params)
model.fit(train_ft,target)

In [59]:
# 테스트 데이터 예측
pred = model.predict(test_ft)
pred

array([0., 0., 0., ..., 0., 0., 0.])

In [60]:
# 예측 결과 저장
submit["target"] = pred
submit.to_csv(f"{DATA_PATH}6_v2.1_DecisonTree.csv",index=False)

## v2.2

- 2차 하이퍼파라미터 수정 > 지니계수

In [68]:
def objective(trial):
    hp = {
        "criterion": trial.suggest_categorical("criterion", ["gini"]),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_samples_split": trial.suggest_int("min_samples_split", 5, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_float("max_features", 0.5, 1.0, step=0.05)
    }
    cv = KFold(5, shuffle=True, random_state=SEED)
    model = DecisionTreeClassifier(**hp, random_state=SEED)
    score = cross_val_score(model, train_ft, target, cv=cv, scoring="f1_macro", n_jobs=-1).mean()
    return score

In [69]:
sampler = optuna.samplers.TPESampler(seed=SEED) # 대체 모델 역할은 sampler 객체

# study 객체
study = optuna.create_study(
    direction="maximize",
    sampler = sampler
)

study.optimize(objective, n_trials=50)   # optimze 메서드에 콜백 함수(objective) 전달

[I 2024-11-01 08:50:05,710] A new study created in memory with name: no-name-0584e957-a8f7-49fd-bd17-784192a540cf
[I 2024-11-01 08:50:11,445] Trial 0 finished with value: 0.6591900374567821 and parameters: {'criterion': 'gini', 'max_depth': 7, 'min_samples_split': 20, 'min_samples_leaf': 8, 'max_features': 0.8}. Best is trial 0 with value: 0.6591900374567821.
[I 2024-11-01 08:50:15,177] Trial 1 finished with value: 0.6663940794593122 and parameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 0.95}. Best is trial 1 with value: 0.6663940794593122.
[I 2024-11-01 08:50:23,730] Trial 2 finished with value: 0.6404990819798537 and parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 16, 'min_samples_leaf': 1, 'max_features': 1.0}. Best is trial 1 with value: 0.6663940794593122.
[I 2024-11-01 08:50:29,894] Trial 3 finished with value: 0.6413421537719733 and parameters: {'criterion': 'gini', 'max_depth': 13, 'min_sampl

In [70]:
study.best_trial.params  # 최적의 하이퍼파라미터

{'criterion': 'gini',
 'max_depth': 6,
 'min_samples_split': 12,
 'min_samples_leaf': 3,
 'max_features': 0.8500000000000001}

In [71]:
study.best_value    # 최적의 하이퍼파라미터의 검증 점수

0.6677739286713253

In [72]:
# 모델 학습
model = DecisionTreeClassifier(random_state=SEED, **study.best_params)
model.fit(train_ft,target)

In [73]:
# 테스트 데이터 예측
pred = model.predict(test_ft)
pred

array([0., 0., 0., ..., 0., 0., 0.])

In [74]:
# 예측 결과 저장
submit["target"] = pred
submit.to_csv(f"{DATA_PATH}6_v2.2_DecisonTree.csv",index=False)

## v2.3

- 3차 하이퍼파라미터 수정 > 엔트로피 계수

In [75]:
def objective(trial):
    hp = {
        "criterion": trial.suggest_categorical("criterion", ["entropy"]),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_samples_split": trial.suggest_int("min_samples_split", 5, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_float("max_features", 0.5, 1.0, step=0.05)
    }
    cv = KFold(5, shuffle=True, random_state=SEED)
    model = DecisionTreeClassifier(**hp, random_state=SEED)
    score = cross_val_score(model, train_ft, target, cv=cv, scoring="f1_macro", n_jobs=-1).mean()
    return score

In [76]:
sampler = optuna.samplers.TPESampler(seed=SEED) # 대체 모델 역할은 sampler 객체

# study 객체
study = optuna.create_study(
    direction="maximize",
    sampler = sampler
)

study.optimize(objective, n_trials=50)   # optimze 메서드에 콜백 함수(objective) 전달

[I 2024-11-01 08:55:37,455] A new study created in memory with name: no-name-d8d92b23-493b-4ff1-90dc-e35297f2022c
[I 2024-11-01 08:55:42,221] Trial 0 finished with value: 0.6627602697217541 and parameters: {'criterion': 'entropy', 'max_depth': 7, 'min_samples_split': 20, 'min_samples_leaf': 8, 'max_features': 0.8}. Best is trial 0 with value: 0.6627602697217541.
[I 2024-11-01 08:55:48,702] Trial 1 finished with value: 0.6661589907689445 and parameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 0.95}. Best is trial 1 with value: 0.6661589907689445.
[I 2024-11-01 08:55:56,221] Trial 2 finished with value: 0.6452403901602878 and parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 16, 'min_samples_leaf': 1, 'max_features': 1.0}. Best is trial 1 with value: 0.6661589907689445.
[I 2024-11-01 08:56:03,804] Trial 3 finished with value: 0.6429678213862976 and parameters: {'criterion': 'entropy', 'max_depth': 13

In [77]:
study.best_trial.params  # 최적의 하이퍼파라미터

{'criterion': 'entropy',
 'max_depth': 6,
 'min_samples_split': 15,
 'min_samples_leaf': 4,
 'max_features': 0.95}

In [78]:
study.best_value    # 최적의 하이퍼파라미터의 검증 점수

0.6717780633636928

In [79]:
# 모델 학습
model = DecisionTreeClassifier(random_state=SEED, **study.best_params)
model.fit(train_ft,target)

In [80]:
# 테스트 데이터 예측
pred = model.predict(test_ft)
pred

array([0., 0., 0., ..., 0., 0., 0.])

In [81]:
# 예측 결과 저장
submit["target"] = pred
submit.to_csv(f"{DATA_PATH}6_v2.3_DecisonTree.csv",index=False)

## 심심해서한 LGBM

- LGBM도 궁금한데

In [61]:
from lightgbm import LGBMClassifier

def objective(trial):
    hp = {
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),               # 리프 노드의 수
        "max_depth": trial.suggest_int("max_depth", 3, 15),                   # 최대 트리 깊이
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),  # 학습률
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),      # 부스팅 단계 수
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),  # 최소 데이터 수
        "subsample": trial.suggest_float("subsample", 0.6, 1.0, step=0.1),    # 샘플링 비율
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0, step=0.1),  # 피처 샘플링 비율
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),  # L1 규제
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True) # L2 규제
    }
    cv = KFold(5, shuffle=True, random_state=SEED)
    model = LGBMClassifier(**hp, random_state=SEED)
    score = cross_val_score(model, train_ft, target, cv=cv, scoring="f1_macro", n_jobs=-1).mean()
    return score

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [62]:
sampler = optuna.samplers.TPESampler(seed=SEED) # 대체 모델 역할은 sampler 객체

# study 객체
study = optuna.create_study(
    direction="maximize",
    sampler = sampler
)

study.optimize(objective, n_trials=50)   # optimze 메서드에 콜백 함수(objective) 전달

[I 2024-11-01 07:34:28,742] A new study created in memory with name: no-name-e3820db8-e3e8-4104-ad1d-63b3f4170e41
[I 2024-11-01 07:36:06,721] Trial 0 finished with value: 0.7090306219145696 and parameters: {'num_leaves': 69, 'max_depth': 15, 'learning_rate': 0.1205712628744377, 'n_estimators': 600, 'min_child_samples': 19, 'subsample': 0.6, 'colsample_bytree': 0.6, 'reg_alpha': 0.6245760287469893, 'reg_lambda': 0.002570603566117598}. Best is trial 0 with value: 0.7090306219145696.
[I 2024-11-01 07:36:57,322] Trial 1 finished with value: 0.699175421518103 and parameters: {'num_leaves': 112, 'max_depth': 3, 'learning_rate': 0.2708160864249968, 'n_estimators': 900, 'min_child_samples': 25, 'subsample': 0.6, 'colsample_bytree': 0.6, 'reg_alpha': 5.472429642032198e-06, 'reg_lambda': 0.00052821153945323}. Best is trial 0 with value: 0.7090306219145696.
[I 2024-11-01 07:37:36,520] Trial 2 finished with value: 0.7150207800571889 and parameters: {'num_leaves': 76, 'max_depth': 6, 'learning_rate

In [63]:
study.best_trial.params  # 최적의 하이퍼파라미터

{'num_leaves': 20,
 'max_depth': 14,
 'learning_rate': 0.027906527991630778,
 'n_estimators': 1000,
 'min_child_samples': 58,
 'subsample': 0.8,
 'colsample_bytree': 0.7,
 'reg_alpha': 0.7747077190206921,
 'reg_lambda': 0.00961644947537403}

In [64]:
study.best_value    # 최적의 하이퍼파라미터의 검증 점수 -> 기존 0.7209보다 유의미한 증가???

0.7210577559661056

In [65]:
# 모델 학습
model = LGBMClassifier(random_state=SEED, **study.best_params)
model.fit(train_ft,target)

[LightGBM] [Info] Number of positive: 5874, number of negative: 9066
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.099463 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27368
[LightGBM] [Info] Number of data points in the train set: 14940, number of used features: 434
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.393173 -> initscore=-0.433995
[LightGBM] [Info] Start training from score -0.433995


In [66]:
# 테스트 데이터 예측
pred = model.predict(test_ft)
pred

array([0., 0., 0., ..., 0., 0., 0.])

In [67]:
# 예측 결과 저장
submit["target"] = pred
submit.to_csv(f"{DATA_PATH}6_v2.0_LGBM.csv",index=False)

# AutoML