In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action = 'ignore')
%matplotlib inline

# 데이터 분할
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

# 모델
from xgboost import XGBClassifier 
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# 파라미터 최적화
from bayes_opt import BayesianOptimization

# 평가지표
from sklearn.metrics import log_loss

In [2]:
train = pd.read_csv('train.csv', encoding = 'UTF-8')
test = pd.read_csv('test.csv', encoding = 'UTF-8')

In [3]:
data = pd.concat([train, test])

In [4]:
# SESS_DT 열의 type을 datetime으로 바꾸기
data['SESS_DT'] = pd.to_datetime(data['SESS_DT'], format= '%Y%m%d')

# TOT_SESS_HR_V 열의 ,를 지우고 int로 변환
data['TOT_SESS_HR_V'] = data['TOT_SESS_HR_V'].apply(lambda x : int(x.replace(',','')))

# PD_BRA_NM 열의 불필요한 특수문자 제거
data.PD_BRA_NM = data.PD_BRA_NM.apply(lambda x : x.replace('[','').replace(']',''))

In [19]:
train.PD_BRA_NM = train.PD_BRA_NM.apply(lambda x : x.replace('[','').replace(']',''))
test.PD_BRA_NM = test.PD_BRA_NM.apply(lambda x : x.replace('[','').replace(']',''))

In [21]:
p_level = 'PD_BRA_NM'  # 상품 분류 수준

# W2V 학습데이터가 부족하여 구매한 상품 목록으로부터 n배 oversampling을 수행
def oversample(x, n, seed=516):
    if n == 0:
        return list(x)
    uw = np.unique(x)
    bs = np.array([])
    np.random.seed(seed)
    for j in range(n):
        bs = np.append(bs, np.random.choice(uw, len(uw), replace=False))
    return list(bs)

train_corpus = list(train.groupby('CLNT_ID')[p_level].agg(oversample, 20))
test_corpus = list(test.groupby('CLNT_ID')[p_level].agg(oversample, 20))

num_features = 300 # 단어 벡터 차원 수
min_word_count = 5 # 최소 단어 수
context = 10 # 학습 윈도우(인접한 단어 리스트) 크기
downsampling = 1e-3

from gensim.models import word2vec

# 모델 학습
w2v = word2vec.Word2Vec(train_corpus, 
                        size = num_features, 
                        min_count = min_word_count,
                        window = context,
                        seed = 516, workers = 4, sg = 1, sample = downsampling)

w2v.init_sims(replace=True)

In [47]:
### Make features
# 구매상품에 해당하는 벡터의 평균/최소/최대 벡터를 feature로 만드는 전처리기
class EmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = num_features
    def fit(self, X):
        return self
    def transform(self, X):
        return np.array([
            np.hstack([np.mean([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0)]) 
            for words in X
        ]) 

# W2V 기반 feature 생성
train_features = pd.DataFrame(EmbeddingVectorizer(w2v.wv).fit(train_corpus).transform(train_corpus))
test_features = pd.DataFrame(EmbeddingVectorizer(w2v.wv).transform(test_corpus))

train_features.columns = ['v' + f'{c+1:03d}' for c in train_features.columns]
test_features.columns = ['v' + f'{c+1:03d}' for c in test_features.columns]

# 학습용과 제출용 데이터로 분리
pd.concat([pd.DataFrame({'CLNT_ID': np.sort(train['CLNT_ID'].unique())}), train_features], axis=1).to_csv('brand2_w2v_train.csv', index=False)
pd.concat([pd.DataFrame({'CLNT_ID': np.sort(test['CLNT_ID'].unique())}), test_features], axis=1).to_csv('brand2_w2v_test.csv', index=False)

In [46]:
w2v.wv.most_similar(positive = "카파(스포츠)", topn = 10)

[('슈베카', 0.40943026542663574),
 ('카파(프리미엄)', 0.38712936639785767),
 ('흥국농산', 0.38711971044540405),
 ('옥천농협', 0.377418577671051),
 ('마지스체어', 0.3757610619068146),
 ('행텐', 0.3752678632736206),
 ('카파(잠실)', 0.3686213493347168),
 ('베르베나스', 0.3674624264240265),
 ('비에이치씨', 0.3660779595375061),
 ('오도넬', 0.3547326326370239)]

In [22]:
list(data.PD_BRA_NM.unique())

['아디다스(퍼포먼스)',
 '아디다스(의류)',
 '아디다스키즈(아동)',
 '폴햄',
 '바나나팜스',
 '르꼬끄',
 '노스페이스',
 '헤링본',
 '클립',
 '라코스테(의류)',
 '데상트',
 '트렉스타',
 '내셔널지오그래픽(남성)',
 '아디다스',
 '겐조 (향수)',
 '히키스',
 '미미앤디디',
 '큐니걸스',
 '슈에무라',
 '바보사랑',
 '스카이페스티발',
 '듀듀(화장품)',
 '데코뷰',
 '탑텐',
 '우아미가구',
 '쿠키세븐',
 '에뛰드하우스',
 '솔로몬샵',
 '꼬망스(아동)',
 '노스페이스(레저)',
 '르샤트라1802',
 '베스트홈패션',
 '첨이첨이',
 '키엘',
 '디스커버리',
 '아디다스(슈즈)',
 '나이키',
 '샤넬',
 '오조크',
 '엘칸토',
 '케이투',
 '푸마(슈즈)',
 '블루피오레',
 '아이더',
 '매그제이(MAGJAY)',
 '이앤씨',
 '윈',
 '제이제이 지고트',
 '세라블라썸',
 '나이스클랍',
 '케네스 레이디',
 '아이잗컬렉션',
 '르까프',
 '허드슨테일러',
 '플라스틱아일랜드',
 '아디다스 언더웨어',
 '록시',
 '블루독',
 '락앤락',
 '마레스',
 '씨씨콜렉트',
 '맥',
 '토모톰스',
 '엘르 수영복',
 '레노마수영복',
 '에어워크주니어',
 '데상트 스포츠',
 '스위스런',
 '크록스',
 '지지피엑스',
 '피닉스',
 '산과들에',
 '바바라(슈즈)',
 '쉐모아',
 '제이제이지고트',
 '포커스',
 '비키',
 '고세(여성화)',
 '리스트',
 '손오공',
 '스튜디오화이트',
 '에이비에프지',
 '정관장',
 '유리아쥬',
 '난닝구',
 '뉴발란스(키즈)',
 '레노마(셔츠)',
 '레이지비(레저)',
 '헤지스남성',
 '헤지스 남성',
 '코디갤러리 바이 에스티코',
 '엠엘비키즈',
 '하다라보',
 '아토팜',
 '컨택유',
 '패션풀',
 '피핀',
 '폴로랄프로렌',
 '피에르가르뎅(여성)',
 '엘르

In [51]:
target = pd.read_csv('cust_train.csv', encoding = 'UTF-8')
label = target.LABEL

In [52]:
train = pd.read_csv('brand2_w2v_train.csv', encoding = 'UTF-8')
test = pd.read_csv('brand2_w2v_test.csv', encoding = 'UTF-8')

X_train, X_val, y_train, y_val = train_test_split(train, label, test_size = 0.3, random_state = 516, stratify = label)

In [53]:
pbounds = {'n_estimators' : (50, 500),
           'learning_rate' : (0.01, 1.0),
           'max_depth' : (2, 10),
           'num_leaves' : (10, 200),
           'min_child_samples' : (20, 100),
           'min_child_weight' : (1, 15),
           'subsample' : (0.75, 0.95),
           'colsample_bytree' : (0.75, 0.95)}


def lgbm_opt(n_estimators, learning_rate, max_depth, num_leaves, min_child_samples, min_child_weight, subsample, colsample_bytree):
    
    
    params = {'n_estimators' : int(round(n_estimators)),
              'learning_rate': learning_rate,
              'max_depth' : int(round(max_depth)),
              'num_leaves' : int(round(num_leaves)),
              'min_child_samples' : int(round(min_child_samples)),
              'min_child_weight' : min_child_weight,
              'subsample': subsample,
              'colsample_bytree' : colsample_bytree,
              'n_jobs' : -1}
    
    lgbm = LGBMClassifier(**params)
    
    skfold = StratifiedKFold(n_splits = 4, shuffle = True, random_state = 516)
    
    score = cross_val_score(lgbm, X_train, y_train, cv = skfold, scoring = 'neg_log_loss', n_jobs=-1)
    
    return np.mean(score)

BO_lgbm = BayesianOptimization(f = lgbm_opt, pbounds = pbounds, random_state = 516)

BO_lgbm.maximize(init_points=30, n_iter=30)

max_params = BO_lgbm.max['params']
max_params['n_estimators'] = int(round(max_params['n_estimators']))
max_params['max_depth'] = int(round(max_params['max_depth']))
max_params['num_leaves'] = int(round(max_params['num_leaves']))
max_params['min_child_samples'] = int(round(max_params['min_child_samples']))
display(max_params)

# Step9. 최대화 하이퍼파라미터로 재학습
lgbm_tun = LGBMClassifier(**max_params)
lgbm_tun.fit(X_train, y_train)

skfold = StratifiedKFold(n_splits = 4, shuffle = True, random_state = 516)
scores = cross_val_score(lgbm_tun, X_val, y_val, cv = skfold, scoring = 'neg_log_loss', n_jobs=-1)

print(scores)
print(f'최대성능: {max(scores)}\n평균성능: {np.mean(scores)}')

|   iter    |  target   | colsam... | learni... | max_depth | min_ch... | min_ch... | n_esti... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-1.336   [0m | [0m 0.9123  [0m | [0m 0.7113  [0m | [0m 3.45    [0m | [0m 76.68   [0m | [0m 16.06   [0m | [0m 435.0   [0m | [0m 140.1   [0m | [0m 0.789   [0m |
| [0m 2       [0m | [0m-2.076   [0m | [0m 0.7878  [0m | [0m 0.8463  [0m | [0m 7.098   [0m | [0m 95.81   [0m | [0m 4.343   [0m | [0m 337.7   [0m | [0m 76.55   [0m | [0m 0.8719  [0m |
| [0m 3       [0m | [0m-2.091   [0m | [0m 0.8565  [0m | [0m 0.7491  [0m | [0m 9.013   [0m | [0m 20.69   [0m | [0m 11.26   [0m | [0m 371.6   [0m | [0m 83.02   [0m | [0m 0.8451  [0m |
| [0m 4       [0m | [0m-1.818   [0m | [0m 0.8397  [0m | [0m 0.4175  [0m | [0m 14.82   [0m | [0m 31.53   [0m | [0m 18.8    [0m | [0m 322

KeyboardInterrupt: 