In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
DATA_PATH = "/content/drive/MyDrive/final_pj/final_project_data/"
DATA_PATH

'/content/drive/MyDrive/final_pj/final_project_data/'

In [3]:
SEED = 42

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, SelectPercentile, SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from lightgbm import LGBMClassifier

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [5]:

train_tr = pd.read_csv(f"{DATA_PATH}store_train_transactions.csv") # 학습용 구매기록 데이터
train_target = pd.read_csv(f"{DATA_PATH}store_train.csv") # 학습용 정답 데이터
test_tr = pd.read_csv(f"{DATA_PATH}store_test_transactions.csv") # 테스트용 구매기록 데이터
submit = pd.read_csv(f"{DATA_PATH}store_submission.csv") # 제출 양식 데이터

train_tr.shape , train_target.shape , test_tr.shape , submit.shape

((523105, 7), (14940, 2), (441196, 7), (12225, 2))

In [6]:
train_ft = pd.read_csv(f"{DATA_PATH}train_common_v3.3_피처삭제X_군집_1111.csv") # 학습 데이터(피처)
test_ft = pd.read_csv(f"{DATA_PATH}test_common_v3.3_피처삭제X_군집_1111.csv") # 테스트 데이터(피처)

train_ft.shape , test_ft.shape

((14940, 1471), (12225, 1471))

In [7]:
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 0)

In [8]:
train_ft = train_ft.iloc[:,1:]
test_ft = test_ft.iloc[:,1:]
train_ft.shape, test_ft.shape

((14940, 1470), (12225, 1470))

In [9]:
cols = train_ft.select_dtypes("object").columns.tolist()
train_ft[cols].nunique()

Unnamed: 0,0
주구매지점,4
주구매_대분류,22
주구매_중분류,238


In [10]:
cols

['주구매지점', '주구매_대분류', '주구매_중분류']

In [11]:
%pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.4-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.4-py2.py3-none-any.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.0/82.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.4


In [12]:
# 주구매지점-onehot / 주구매_대중분류 - countencoder
import category_encoders as ce
enc1 = ce.one_hot.OneHotEncoder()
tmp = enc1.fit_transform(train_ft["주구매지점"])
train_ft = pd.concat([train_ft,tmp],axis =1)

tmp = enc1.transform(test_ft["주구매지점"])
test_ft = pd.concat([test_ft,tmp],axis =1)


enc2 = ce.count.CountEncoder()
train_ft["주구매_중분류_cnt"] = enc2.fit_transform(train_ft[["주구매_중분류"]])
test_ft["주구매_중분류_cnt"] = enc2.transform(test_ft[["주구매_중분류"]])

train_ft["주구매_대분류_cnt"] = enc2.fit_transform(train_ft[["주구매_대분류"]])
test_ft["주구매_대분류_cnt"] = enc2.transform(test_ft[["주구매_대분류"]])
train_ft.shape, test_ft.shape

((14940, 1476), (12225, 1476))

In [13]:
train_ft = train_ft.drop(columns=cols)
test_ft = test_ft.drop(columns=cols)
train_ft.shape, test_ft.shape

((14940, 1473), (12225, 1473))

In [14]:
train_ft.select_dtypes("object").columns , test_ft.select_dtypes("object").columns

(Index([], dtype='object'), Index([], dtype='object'))

In [15]:
train_ft["18시_21시_구매비율"] = train_ft["18시_21시_구매비율"] * 3
train_ft["18시_21시_구매횟수"] = train_ft["18시_21시_구매횟수"] * 3

test_ft["18시_21시_구매비율"] = test_ft["18시_21시_구매비율"] * 3
test_ft["18시_21시_구매횟수"] = test_ft["18시_21시_구매횟수"] * 3

In [16]:
scaler = MinMaxScaler()
scaler.fit(train_ft)
train_ft[train_ft.columns] = scaler.transform(train_ft)
test_ft[test_ft.columns] = scaler.transform(test_ft)

In [17]:
train_ft.head()

Unnamed: 0,내점일수,구매주기,주말방문비율,평일방문비율,주말방문횟수,평일방문횟수,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,...,공휴일_대분류_잡화_구매횟수,공휴일_대분류_케주얼_구두_아동_구매횟수,공휴일_대분류_패션잡화_구매횟수,cluster,주구매지점_1,주구매지점_2,주구매지점_3,주구매지점_4,주구매_중분류_cnt,주구매_대분류_cnt
0,0.041494,0.130682,0.25,0.75,0.032258,0.029014,0.05,0.25,0.4,0.3,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.056391,0.076439
1,0.082988,0.090909,0.02381,0.97619,0.006452,0.079304,0.357143,0.166667,0.357143,0.119048,...,0.0,0.0,0.166667,1.0,0.0,1.0,0.0,0.0,0.179041,0.306655
2,0.228216,0.034091,0.210526,0.789474,0.154839,0.174081,0.464912,0.140351,0.175439,0.219298,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.003055,0.871853
3,0.377593,0.017045,0.189573,0.810427,0.258065,0.330754,0.379147,0.180095,0.236967,0.203791,...,0.0,0.066667,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.433004
4,0.107884,0.0625,0.258065,0.741935,0.103226,0.088975,0.112903,0.612903,0.209677,0.064516,...,0.272727,0.2,0.0,1.0,1.0,0.0,0.0,0.0,0.011983,0.871853


In [18]:
target = train_target["target"]
target

Unnamed: 0,target
0,1.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
14935,0.0
14936,0.0
14937,0.0
14938,1.0


In [19]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [20]:
from sklearn.linear_model import LogisticRegression
#params = {'n_estimators': 1000,'learning_rate': 0.027075406407767497,'max_depth': 5,'min_child_weight': 7,'subsample': 0.6789816859997232,'colsample_bytree': 0.6682531834544282,'gamma': 1.5046881781916308}
logi = LogisticRegression(random_state=42)

#model = XGBClassifier(random_state=42, **params) # 학습용 모델

fs = SelectFromModel(logi) # 특성 선택에 사용하기 위한 모델 객체를 전달해줘야함.
x = fs.fit_transform(train_ft, target) # 특성 선택이 완료된 입력 데이터가 ndarray 로 반환
#scores = cross_val_score(model, x, target, cv=cv, scoring="f1_macro", n_jobs=-1)
#scores.mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
best_cols_logi = fs.get_feature_names_out()
len(best_cols_logi)

570

In [27]:
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier

params_xgb = {'n_estimators': 1000,'learning_rate': 0.027075406407767497,'max_depth': 5,'min_child_weight': 7,'subsample': 0.6789816859997232,'colsample_bytree': 0.6682531834544282,'gamma': 1.5046881781916308}
params_lgbm = {'n_estimators': 813, 'learning_rate': 0.014757440400599073, 'num_leaves': 35, 'max_depth': 12, 'min_child_samples': 41, 'subsample': 0.85, 'colsample_bytree': 0.95}
params_logi = {'C': 1.2056308154836568, 'solver': 'saga', 'max_iter': 400, 'class_weight': 'balanced', 'tol': 3.689794777633075e-05} # optuna trial 30 : 0.727821596703826.
params_mlp = {'hidden_layer_sizes': (88, 89),'activation': 'relu','solver': 'adam','alpha': 0.001230314130334757,'learning_rate_init': 0.0016991055449377132,'max_iter': 500,'validation_fraction': 0.15000000000000002,'n_iter_no_change': 10,'early_stopping': True}

estimators = [
    ("lgbm", LGBMClassifier(random_state=42, **params_lgbm)),
    ("xgb", XGBClassifier(random_state=42, **params_xgb) ),
    ("logi", LogisticRegression(random_state=42, **params_logi) ),
    ("mlp", MLPClassifier(random_state=42, **params_mlp) ),

]
parmas = {
    "estimators": estimators,
    "voting" : "soft",
    "n_jobs" : -1
}


model = VotingClassifier(**parmas)
scores = cross_val_score(model, train_ft[best_cols_logi], target, cv=cv, scoring="f1_macro", n_jobs=-1)
scores.mean()
#0.7324792762779355 <- voting5
#0.7356366727534078 <- voting8
#0.7329436667642921 <- voting 10

0.7329436667642921

In [28]:
model.fit(train_ft[best_cols_logi],target)



In [29]:
DATA_PATH1 = "/content/drive/MyDrive/final_pj/1113/"
pred = model.predict(test_ft[best_cols_logi])
submit["target"] = pred
submit.to_csv(f"{DATA_PATH1}voting10_mlpadd_fs_V3.3_1113.csv",index=False)

pred_proba = model.predict_proba(test_ft[best_cols_logi])[:,1]
submit["target"] = pred_proba
submit.to_csv(f"{DATA_PATH1}voting10_mlpadd_fs_V3.3_proba_1113.csv",index=False)