In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


- 데이터 경로 변수

In [2]:
DATA_PATH = "/content/drive/MyDrive/final_pj/final_project_data/"
DATA_PATH

'/content/drive/MyDrive/final_pj/final_project_data/'

- 시드값

In [3]:
SEED = 42

- 데이터 불러오기

In [4]:
import pandas as pd
import numpy as np

train_tr = pd.read_csv(f"{DATA_PATH}store_train_transactions.csv") # 학습용 구매기록 데이터
train_target = pd.read_csv(f"{DATA_PATH}store_train.csv") # 학습용 정답 데이터
test_tr = pd.read_csv(f"{DATA_PATH}store_test_transactions.csv") # 테스트용 구매기록 데이터
submit = pd.read_csv(f"{DATA_PATH}store_submission.csv") # 제출 양식 데이터

train_tr.shape , train_target.shape , test_tr.shape , submit.shape

((523105, 7), (14940, 2), (441196, 7), (12225, 2))

- 공통 피처 파일 불러오기

In [5]:
train_ft = pd.read_csv(f"{DATA_PATH}train_common_v2.0_1101.csv") # 학습 데이터(피처)
test_ft = pd.read_csv(f"{DATA_PATH}test_common_v2.0_1101.csv") # 테스트 데이터(피처)

train_ft.shape , test_ft.shape

((14940, 465), (12225, 465))

# 결측치 처리

In [6]:
mask = train_ft.isnull().sum() > 0
train_ft.isnull().sum()[mask]

Unnamed: 0,0


In [7]:
mask = test_ft.isnull().sum() > 0
test_ft.isnull().sum()[mask]

Unnamed: 0,0


# 특성 공학(Feature Engineering)

- ID 변수 제외

In [8]:
train_ft = train_ft.iloc[:,1:]
test_ft = test_ft.iloc[:,1:]
train_ft.shape, test_ft.shape

((14940, 464), (12225, 464))

## Feature Encoding

In [9]:
cols = train_ft.select_dtypes("object").columns.tolist()
train_ft[cols].nunique()

Unnamed: 0,0
주구매지점,4
주구매_중분류,246
주구매_대분류_수정,7
대분류_수정_평균금액최대,7


In [10]:
%pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.4-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.4-py2.py3-none-any.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.0/82.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.4


In [11]:
import category_encoders as ce

In [12]:
enc = ce.one_hot.OneHotEncoder()
tmp = enc.fit_transform(train_ft[["주구매지점","주구매_중분류", "주구매_대분류_수정", "대분류_수정_평균금액최대"]])
train_ft = pd.concat([train_ft,tmp],axis =1)

tmp = enc.transform(test_ft[["주구매지점","주구매_중분류", "주구매_대분류_수정", "대분류_수정_평균금액최대"]])
test_ft = pd.concat([test_ft,tmp],axis =1)

train_ft.shape, test_ft.shape

((14940, 728), (12225, 728))

In [None]:
#enc = ce.count.CountEncoder()
#train_ft["주구매_중분류_cnt"] = enc.fit_transform(train_ft[["주구매_중분류"]])
#test_ft["주구매_중분류_cnt"] = enc.transform(test_ft[["주구매_중분류"]])
#
#train_ft.shape, test_ft.shape

- 문자열 피처 삭제

In [13]:
cols

['주구매지점', '주구매_중분류', '주구매_대분류_수정', '대분류_수정_평균금액최대']

In [14]:
train_ft = train_ft.drop(columns=cols)
test_ft = test_ft.drop(columns=cols)
train_ft.shape, test_ft.shape

((14940, 724), (12225, 724))

In [15]:
train_ft.select_dtypes("object").columns , test_ft.select_dtypes("object").columns

(Index([], dtype='object'), Index([], dtype='object'))

## Feature Scaling

In [17]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()

In [18]:
scaler = MinMaxScaler()
scaler.fit(train_ft)

In [19]:
train_ft[train_ft.columns] = scaler.transform(train_ft)
test_ft[test_ft.columns] = scaler.transform(test_ft)
train_ft.head()

Unnamed: 0,내점일수,구매주기,주말방문비율,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,일별평균구매건수,거래개월수,...,주구매_대분류_수정_5,주구매_대분류_수정_6,주구매_대분류_수정_7,대분류_수정_평균금액최대_1,대분류_수정_평균금액최대_2,대분류_수정_평균금액최대_3,대분류_수정_평균금액최대_4,대분류_수정_평균금액최대_5,대분류_수정_평균금액최대_6,대분류_수정_평균금액최대_7
0,0.041494,0.130682,0.25,0.05,0.25,0.4,0.3,0.5,0.090909,0.545455,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.082988,0.090909,0.02381,0.357143,0.166667,0.357143,0.119048,0.5,0.111111,0.909091,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.228216,0.034091,0.210526,0.464912,0.140351,0.175439,0.219298,0.0,0.115079,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.377593,0.017045,0.189573,0.379147,0.180095,0.236967,0.203791,0.5,0.14372,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.107884,0.0625,0.258065,0.112903,0.612903,0.209677,0.064516,0.666667,0.144033,0.818182,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# 정답 데이터

In [20]:
target = train_target["target"]
target

Unnamed: 0,target
0,1.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
14935,0.0
14936,0.0
14937,0.0
14938,1.0


##optuna

In [21]:
%pip install optuna

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.3-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.2/233.2 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [22]:
import optuna
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, KFold

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [23]:
optuna.trial.Trial.suggest_int

In [25]:
optuna.trial.Trial.suggest_float

In [26]:
optuna.trial.Trial.suggest_categorical

In [27]:
def objective(trial):
    hp = {
        "n_estimators": trial.suggest_int("n_estimators", 80, 250),
        "criterion": trial.suggest_categorical("criterion",["gini", "entropy"]),
        "max_depth": trial.suggest_int("max_depth", 5, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "max_features" : trial.suggest_float("max_features", 0.6, 1.0, step=0.05)
    }
    cv = KFold(5, shuffle=True, random_state=SEED)
    model = LGBMClassifier(**hp, random_state=SEED)
    score = cross_val_score(model, train_ft, target, cv=cv, scoring="f1_macro", n_jobs=-1).mean()
    return score

In [28]:
sampler = optuna.samplers.TPESampler(seed=SEED) # 대체모델역할을 하는 샘플러 객체

# 스터디 객체
study = optuna.create_study(
    direction="maximize", # 평가지표에 따란 다르게 전달해야함. minimize
    sampler = sampler
)
study.optimize(objective, n_trials=50)

[I 2024-11-01 07:04:05,281] A new study created in memory with name: no-name-0a7993d3-1ada-4127-a65b-3566383e07f7
[I 2024-11-01 07:04:30,777] Trial 0 finished with value: 0.7132840341237022 and parameters: {'n_estimators': 144, 'criterion': 'gini', 'max_depth': 14, 'min_samples_split': 4, 'max_features': 0.65}. Best is trial 0 with value: 0.7132840341237022.
[I 2024-11-01 07:04:45,678] Trial 1 finished with value: 0.7116098518608682 and parameters: {'n_estimators': 89, 'criterion': 'gini', 'max_depth': 16, 'min_samples_split': 2, 'max_features': 1.0}. Best is trial 0 with value: 0.7132840341237022.
[I 2024-11-01 07:05:09,282] Trial 2 finished with value: 0.7104910374507029 and parameters: {'n_estimators': 222, 'criterion': 'gini', 'max_depth': 7, 'min_samples_split': 7, 'max_features': 0.8}. Best is trial 0 with value: 0.7132840341237022.
[I 2024-11-01 07:05:27,818] Trial 3 finished with value: 0.7131147223777637 and parameters: {'n_estimators': 153, 'criterion': 'entropy', 'max_depth'

In [29]:
study.best_params

{'n_estimators': 163,
 'criterion': 'entropy',
 'max_depth': 11,
 'min_samples_split': 19,
 'max_features': 0.8}

In [30]:
study.best_trial.params

{'n_estimators': 163,
 'criterion': 'entropy',
 'max_depth': 11,
 'min_samples_split': 19,
 'max_features': 0.8}

In [46]:
model = LGBMClassifier(random_state=SEED, **study.best_params)
model.fit(train_ft, target.astype(int))

[LightGBM] [Info] Number of positive: 5874, number of negative: 9066
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065969 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 27221
[LightGBM] [Info] Number of data points in the train set: 14940, number of used features: 548
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.393173 -> initscore=-0.433995
[LightGBM] [Info] Start training from score -0.433995


In [41]:
#pred = model.predict(test_ft)  # 예측값 (정수형으로)
#predictions_proba = model.predict_proba(test_ft)



In [47]:
pred = model.predict(test_ft)
pred



array([0, 0, 0, ..., 0, 0, 0])

In [48]:
submit["target"] = pred
submit

Unnamed: 0,ID,target
0,test_0,0
1,test_1,0
2,test_2,0
3,test_3,1
4,test_4,1
...,...,...
12220,test_12220,1
12221,test_12221,0
12222,test_12222,0
12223,test_12223,0


In [37]:
submit.to_csv(f"{DATA_PATH}ksj_submit.csv",index=False)

# cv 점수 확인해보기

In [None]:
#from sklearn.model_selection import cross_val_score
#from sklearn.model_selection import KFold
#cv = KFold(n_splits=5,shuffle=True, random_state=SEED)

In [None]:
#from lightgbm import LGBMClassifier
#
#model = LGBMClassifier(random_state=SEED)
#scores = cross_val_score(model,train_ft,target,cv = cv ,scoring='f1_macro',n_jobs = -1)
#np.mean(scores)

# 모델 학습

In [None]:
#model = LGBMClassifier(random_state=SEED)
#model.fit(train_ft,target)

# 테스트 데이터 예측

In [None]:
#pred = model.predict(test_ft)
#pred

# 평가를 위한 제출 파일 생성
- 예측 결과를 target 컬럼에 넣어 csv 파일로 저장후에 제출한다.

In [None]:
#submit

In [None]:
#submit["target"] = pred
#submit

- 예측 결과를 csv 파일로 저장하여 제출

In [None]:
#submit.to_csv(f"{DATA_PATH}submit.csv",index=False)