In [44]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


- 데이터 경로 변수

In [45]:
DATA_PATH = "/content/drive/MyDrive/final_pj/final_project_data/"
DATA_PATH

'/content/drive/MyDrive/final_pj/final_project_data/'

- 시드값

In [46]:
SEED = 42

- 데이터 불러오기

In [47]:
import pandas as pd
import numpy as np

train_tr = pd.read_csv(f"{DATA_PATH}store_train_transactions.csv") # 학습용 구매기록 데이터
train_target = pd.read_csv(f"{DATA_PATH}store_train.csv") # 학습용 정답 데이터
test_tr = pd.read_csv(f"{DATA_PATH}store_test_transactions.csv") # 테스트용 구매기록 데이터
submit = pd.read_csv(f"{DATA_PATH}store_submission.csv") # 제출 양식 데이터

train_tr.shape , train_target.shape , test_tr.shape , submit.shape

((523105, 7), (14940, 2), (441196, 7), (12225, 2))

- 공통 피처 파일 불러오기

In [48]:
train_ft = pd.read_csv(f"{DATA_PATH}train_common_v2.0_1101.csv") # 학습 데이터(피처)
test_ft = pd.read_csv(f"{DATA_PATH}test_common_v2.0_1101.csv") # 테스트 데이터(피처)

train_ft.shape , test_ft.shape

((14940, 465), (12225, 465))

# 결측치 처리

In [49]:
train_ft.isnull().sum().sum()

0

In [50]:
test_ft.isnull().sum().sum()

0

# 특성 공학(Feature Engineering)

- ID 변수 제외

In [51]:
train_ft = train_ft.iloc[:,1:]
test_ft = test_ft.iloc[:,1:]
train_ft.shape, test_ft.shape

((14940, 464), (12225, 464))

## Feature Encoding

In [52]:
cols = train_ft.select_dtypes("object").columns.tolist()
train_ft[cols].nunique()

Unnamed: 0,0
주구매지점,4
주구매_중분류,246
주구매_대분류_수정,7
대분류_수정_평균금액최대,7


In [53]:
cols

['주구매지점', '주구매_중분류', '주구매_대분류_수정', '대분류_수정_평균금액최대']

In [54]:
%pip install category_encoders



In [55]:
import category_encoders as ce

In [56]:
enc = ce.one_hot.OneHotEncoder()
tmp = enc.fit_transform(train_ft[cols])
train_ft = pd.concat([train_ft,tmp],axis =1)

tmp = enc.transform(test_ft[cols])
test_ft = pd.concat([test_ft,tmp],axis =1)

train_ft.shape, test_ft.shape

((14940, 728), (12225, 728))

In [57]:
#enc = ce.count.CountEncoder()
#train_ft["주구매_중분류_cnt"] = enc.fit_transform(train_ft[["주구매_중분류"]])
#test_ft["주구매_중분류_cnt"] = enc.transform(test_ft[["주구매_중분류"]])
#
#train_ft.shape, test_ft.shape

- 문자열 피처 삭제

In [58]:
cols

['주구매지점', '주구매_중분류', '주구매_대분류_수정', '대분류_수정_평균금액최대']

In [59]:
train_ft = train_ft.drop(columns=cols)
test_ft = test_ft.drop(columns=cols)
train_ft.shape, test_ft.shape

((14940, 724), (12225, 724))

In [60]:
train_ft.select_dtypes("object").columns , test_ft.select_dtypes("object").columns

(Index([], dtype='object'), Index([], dtype='object'))

## Feature Scaling

In [61]:
from sklearn.preprocessing import MinMaxScaler

In [62]:
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()

In [63]:
scaler = MinMaxScaler()
scaler.fit(train_ft)

In [64]:
train_ft[train_ft.columns] = scaler.transform(train_ft)
test_ft[test_ft.columns] = scaler.transform(test_ft)
train_ft.head()

Unnamed: 0,내점일수,구매주기,주말방문비율,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,일별평균구매건수,거래개월수,...,주구매_대분류_수정_5,주구매_대분류_수정_6,주구매_대분류_수정_7,대분류_수정_평균금액최대_1,대분류_수정_평균금액최대_2,대분류_수정_평균금액최대_3,대분류_수정_평균금액최대_4,대분류_수정_평균금액최대_5,대분류_수정_평균금액최대_6,대분류_수정_평균금액최대_7
0,0.041494,0.130682,0.25,0.05,0.25,0.4,0.3,0.5,0.090909,0.545455,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.082988,0.090909,0.02381,0.357143,0.166667,0.357143,0.119048,0.5,0.111111,0.909091,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.228216,0.034091,0.210526,0.464912,0.140351,0.175439,0.219298,0.0,0.115079,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.377593,0.017045,0.189573,0.379147,0.180095,0.236967,0.203791,0.5,0.14372,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.107884,0.0625,0.258065,0.112903,0.612903,0.209677,0.064516,0.666667,0.144033,0.818182,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# 정답 데이터

In [65]:
target = train_target["target"]
target

Unnamed: 0,target
0,1.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
14935,0.0
14936,0.0
14937,0.0
14938,1.0


In [66]:
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(5, shuffle=True, random_state=42)

In [67]:
def f1_lgbm_metric(true, pred):
    pred = (pred >= 0.5).astype(int)
    return "f1-score", f1_score(true,pred), True

In [69]:
params = {
    "random_state": 42,
    "n_estimators": 500,
     "learning_rate": 0.05,
    "max_depth": 10,
    "num_leaves": 63,
    "min_child_samples": 5,                  # 리프 노드에 필요한 최소 샘플 수 증가: 더 많은 데이터를 요구하여 유의미한 분할 유도
    #"min_data_in_leaf": 10,
    "n_jobs": -1,
    "early_stopping_round": 50,
    "force_row_wise": True,
}

scores = []
models = []
for tri, vai in cv.split(train_ft, target):
    # 학습데이터
    x_train = train_ft.iloc[tri]
    y_train = target.iloc[tri]

    # 검증 데이터
    x_valid = train_ft.iloc[vai]
    y_valid = target.iloc[vai]

    model = LGBMClassifier(**params)
    model.fit(x_train,y_train, eval_set = [(x_valid, y_valid)], eval_metric = f1_lgbm_metric)

    models.append(model)

    pred = model.predict(x_valid)
    score = f1_score(y_valid, pred, average = "macro")
    scores.append(score)

print("f1스코어 :", np.mean(scores))

[LightGBM] [Info] Number of positive: 4699, number of negative: 7253
[LightGBM] [Info] Total Bins 27108
[LightGBM] [Info] Number of data points in the train set: 11952, number of used features: 629
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.393156 -> initscore=-0.434065
[LightGBM] [Info] Start training from score -0.434065
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[101]	valid_0's binary_logloss: 0.534803	valid_0's f1-score: 0.649067
[LightGBM] [Info] Number of positive: 4699, number of negative: 7253
[LightGBM] [Info] Total Bins 27104
[LightGBM] [Info] Number of data points in the train set: 11952, number of used features: 630
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.393156 -> initscore=-0.434065
[LightGBM] [Info] Start training from score -0.434065
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[104]	valid_0's binary_logloss: 0.541077	valid_0's f1-score: 0.632935
[Lig

In [70]:
pred = model.predict(test_ft)
pred

array([0., 0., 0., ..., 0., 0., 0.])

In [71]:
submit

Unnamed: 0,ID,target
0,test_0,0.5
1,test_1,0.5
2,test_2,0.5
3,test_3,0.5
4,test_4,0.5
...,...,...
12220,test_12220,0.5
12221,test_12221,0.5
12222,test_12222,0.5
12223,test_12223,0.5


In [72]:
submit["target"] = pred
submit

Unnamed: 0,ID,target
0,test_0,0.0
1,test_1,0.0
2,test_2,0.0
3,test_3,1.0
4,test_4,1.0
...,...,...
12220,test_12220,1.0
12221,test_12221,0.0
12222,test_12222,0.0
12223,test_12223,0.0


In [73]:
submit.to_csv(f"{DATA_PATH}v1.0_LGBM.csv", index=False)

##optuna

In [35]:
%pip install optuna

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.3-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.2/233.2 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [36]:
import optuna
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, KFold

In [37]:
def objective(trial):
    hp = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "criterion": trial.suggest_categorical("criterion",["gini", "entropy"]),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "num_leaves": trial.suggest_int("num_leaves", 20, 100),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "max_features" : trial.suggest_float("max_features", 0.6, 1.0, step=0.05),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),  # 사용할 데이터 비율
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),  # 배깅 빈도
    }
    cv = KFold(5, shuffle=True, random_state=SEED)
    model = LGBMClassifier(**hp, random_state=SEED)
    score = cross_val_score(model, train_ft, target, cv=cv, scoring="f1_macro", n_jobs=-1).mean()
    return score

In [38]:
sampler = optuna.samplers.TPESampler(seed=SEED) # 대체모델역할을 하는 샘플러 객체

# 스터디 객체
study = optuna.create_study(direction="maximize",sampler = sampler)
study.optimize(objective, n_trials=100)

print("최적 :", study.best_trial.params)

[I 2024-11-04 01:45:31,006] A new study created in memory with name: no-name-7d1b1476-93ff-460c-ac60-cc260cf60e58
[I 2024-11-04 01:46:22,834] Trial 0 finished with value: 0.712033418839052 and parameters: {'n_estimators': 437, 'criterion': 'gini', 'max_depth': 10, 'num_leaves': 32, 'min_samples_split': 4, 'max_features': 0.6, 'bagging_fraction': 0.9330880728874675, 'bagging_freq': 7}. Best is trial 0 with value: 0.712033418839052.
[I 2024-11-04 01:47:42,907] Trial 1 finished with value: 0.7099364150226329 and parameters: {'n_estimators': 737, 'criterion': 'entropy', 'max_depth': 13, 'num_leaves': 37, 'min_samples_split': 5, 'max_features': 0.65, 'bagging_fraction': 0.6521211214797689, 'bagging_freq': 6}. Best is trial 0 with value: 0.712033418839052.
[I 2024-11-04 01:48:08,605] Trial 2 finished with value: 0.7121240575693404 and parameters: {'n_estimators': 489, 'criterion': 'entropy', 'max_depth': 4, 'num_leaves': 43, 'min_samples_split': 8, 'max_features': 0.8, 'bagging_fraction': 0.

최적 : {'n_estimators': 323, 'criterion': 'entropy', 'max_depth': 4, 'num_leaves': 71, 'min_samples_split': 17, 'max_features': 0.8, 'bagging_fraction': 0.8154571896905384, 'bagging_freq': 3}


In [39]:
print(study.best_trial.params)
print(study.best_value)

{'n_estimators': 323, 'criterion': 'entropy', 'max_depth': 4, 'num_leaves': 71, 'min_samples_split': 17, 'max_features': 0.8, 'bagging_fraction': 0.8154571896905384, 'bagging_freq': 3}
0.7199165213921873


In [40]:
model = LGBMClassifier(random_state=SEED, **study.best_params)
model.fit(train_ft, target)

[LightGBM] [Info] Number of positive: 5874, number of negative: 9066
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.046428 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 27221
[LightGBM] [Info] Number of data points in the train set: 14940, number of used features: 548
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.393173 -> initscore=-0.433995
[LightGBM] [Info] Start training from score -0.433995


In [41]:
pred = model.predict(test_ft)
pred



array([0., 0., 0., ..., 0., 0., 0.])

In [42]:
submit["target"] = pred
submit

Unnamed: 0,ID,target
0,test_0,0.0
1,test_1,0.0
2,test_2,0.0
3,test_3,1.0
4,test_4,1.0
...,...,...
12220,test_12220,1.0
12221,test_12221,0.0
12222,test_12222,0.0
12223,test_12223,0.0


In [43]:
submit.to_csv(f"{DATA_PATH}optuna_LGBM_v1.csv",index=False)

# cv 점수 확인해보기

In [None]:
#from sklearn.model_selection import cross_val_score
#from sklearn.model_selection import KFold
#cv = KFold(n_splits=5,shuffle=True, random_state=SEED)

In [None]:
#from lightgbm import LGBMClassifier
#
#model = LGBMClassifier(random_state=SEED)
#scores = cross_val_score(model,train_ft,target,cv = cv ,scoring='f1_macro',n_jobs = -1)
#np.mean(scores)

# 모델 학습

In [None]:
#model = LGBMClassifier(random_state=SEED)
#model.fit(train_ft,target)

# 테스트 데이터 예측

In [None]:
#pred = model.predict(test_ft)
#pred

# 평가를 위한 제출 파일 생성
- 예측 결과를 target 컬럼에 넣어 csv 파일로 저장후에 제출한다.

In [None]:
#submit

In [None]:
#submit["target"] = pred
#submit

- 예측 결과를 csv 파일로 저장하여 제출

In [None]:
#submit.to_csv(f"{DATA_PATH}submit.csv",index=False)