In [2]:
import pandas as pd
import numpy as np
import os


### Read data
# Both CSVs are read with the same explicit encoding.
train, test = (
    pd.read_csv(csv_name, encoding='UTF-8')
    for csv_name in ('train.csv', 'test.csv')
)


### Make corpus
# Product classification level: the column whose values become the tokens
# of each client's "sentence" when the corpus is built below.
p_level = 'PD_NM'

# The Word2Vec training data is scarce, so every client's purchased-product
# list is oversampled n times before training.
def oversample(x, n, seed=516):
    """Build an oversampled token sequence from one client's purchases.

    Each of the ``n`` rounds contributes one random permutation of the
    *unique* tokens in ``x``; the rounds are concatenated in order.

    Parameters
    ----------
    x : iterable
        Tokens (e.g. product names) bought by a single client; duplicates
        are allowed and are collapsed by ``np.unique``.
    n : int
        Number of shuffled rounds. ``0`` disables oversampling and returns
        ``x`` unchanged (duplicates kept) as a list. Negative values yield
        an empty list.
    seed : int, default 516
        Seed of the private random generator used for the permutations.

    Returns
    -------
    list
        ``n * len(np.unique(x))`` tokens, in the dtype of ``np.unique(x)``.

    Notes
    -----
    A fresh generator is seeded on every call, so equal inputs always give
    equal outputs, and the process-wide ``np.random`` state is left alone.
    ``RandomState(seed)`` yields the same draws as the legacy
    ``np.random.seed(seed)`` followed by ``np.random.choice``.
    """
    if n == 0:
        return list(x)
    uw = np.unique(x)
    # Private generator: reproducible without clobbering the global RNG.
    rng = np.random.RandomState(seed)
    rounds = [rng.choice(uw, len(uw), replace=False) for _ in range(n)]
    if not rounds:
        # n < 0: nothing was sampled (np.concatenate rejects an empty list).
        return []
    # One concatenation keeps the token dtype (no float upcast through an
    # empty float array) and avoids re-copying the buffer every round.
    return list(np.concatenate(rounds))

def _client_sentences(purchases, repeats=20):
    """Return one oversampled token list per CLNT_ID group of `purchases`."""
    per_client = purchases.groupby('CLNT_ID')[p_level]
    # The extra positional argument is forwarded to oversample() as `n`.
    return list(per_client.agg(oversample, repeats))

train_corpus = _client_sentences(train)
test_corpus = _client_sentences(test)


### Training the Word2Vec model
num_features = 200    # dimensionality of the word vectors
min_word_count = 3    # minimum token count (passed to Word2Vec as min_count)
context = 8           # training window size (list of neighbouring tokens)

# Initialise and train the model
from gensim.models import word2vec

# Train the model
w2v_params = dict(
    size=num_features,
    min_count=min_word_count,
    window=context,
    seed=516,
    workers=4,
    sg=1,
)
# NOTE(review): gensim's documentation says a fixed seed only gives a fully
# reproducible run with a single worker thread; confirm workers=4 is intended.
w2v = word2vec.Word2Vec(train_corpus, **w2v_params)

# Release memory that is no longer needed
w2v.init_sims(replace=True)


### Make features
# Preprocessor that turns each document (a client's list of purchased
# products) into one feature vector: the element-wise mean of the word
# vectors of its known tokens.
class EmbeddingVectorizer(object):
    """Mean-pooling vectorizer on top of a word -> vector mapping.

    Parameters
    ----------
    word2vec : mapping-like
        Object supporting ``token in word2vec`` and ``word2vec[token]``
        (this notebook passes ``w2v.wv``).
    dim : int, optional
        Length of the produced vectors. When omitted it is taken from
        ``word2vec.vector_size`` if that attribute exists, otherwise from
        the notebook-level ``num_features`` constant.
    """

    def __init__(self, word2vec, dim=None):
        self.word2vec = word2vec
        if dim is None:
            # Prefer the model's own dimensionality over a global constant.
            dim = getattr(word2vec, 'vector_size', None)
        if dim is None:
            dim = num_features
        self.dim = int(dim)

    def fit(self, X, y=None):
        """No-op; nothing is learned here. Returns ``self`` for chaining."""
        return self

    def transform(self, X):
        """Return an array with one mean vector per document in ``X``.

        Tokens missing from the mapping are skipped. A document without any
        known token becomes a zero vector. An empty ``X`` yields an array
        of shape ``(0, dim)``.
        """
        rows = []
        for words in X:
            vectors = [self.word2vec[w] for w in words if w in self.word2vec]
            if vectors:
                rows.append(np.mean(vectors, axis=0))
            else:
                rows.append(np.zeros(self.dim))
        if not rows:
            # Keep the column dimension even when there are no documents.
            return np.empty((0, self.dim))
        return np.array(rows)

# Build the W2V-based features
def _as_feature_frame(matrix):
    """Wrap an embedding matrix in a DataFrame with columns v001, v002, ..."""
    frame = pd.DataFrame(matrix)
    frame.columns = ['v' + f'{idx+1:03d}' for idx in frame.columns]
    return frame


def _export_features(raw, features, out_path):
    """Write `features` to `out_path` with a leading CLNT_ID column."""
    # Relies on the corpus having been built in sorted CLNT_ID order, so the
    # sorted unique IDs line up row-for-row with the feature rows.
    client_ids = pd.DataFrame({'CLNT_ID': np.sort(raw['CLNT_ID'].unique())})
    pd.concat([client_ids, features], axis=1).to_csv(out_path, index=False)


train_features = _as_feature_frame(
    EmbeddingVectorizer(w2v.wv).fit(train_corpus).transform(train_corpus)
)
test_features = _as_feature_frame(
    EmbeddingVectorizer(w2v.wv).transform(test_corpus)
)

# Save the training and the submission (test) feature sets separately
_export_features(train, train_features, 'pd_w2v_train.csv')
_export_features(test, test_features, 'pd_w2v_test.csv')

In [2]:
# Show the generated training features (one row per CLNT_ID group, columns v001, v002, ...).
train_features

Unnamed: 0,v001,v002,v003,v004,v005,v006,v007,v008,v009,v010,...,v191,v192,v193,v194,v195,v196,v197,v198,v199,v200
0,0.098969,0.078694,0.041191,0.085657,-0.043958,-0.010371,-0.097927,-0.015191,0.108780,-0.003570,...,0.061561,-0.033346,-0.091573,0.030463,-0.174682,0.132849,-0.077821,-0.168293,0.035505,0.016552
1,-0.009044,-0.019021,-0.072177,0.010475,-0.019590,0.018309,-0.052128,-0.002662,0.140838,0.038483,...,-0.090853,0.007893,-0.002655,0.033004,-0.006596,0.059820,-0.026401,0.006049,-0.010873,-0.025131
2,-0.084555,0.080026,-0.011703,0.043118,-0.011903,-0.035526,-0.124165,0.019464,-0.028551,-0.037911,...,-0.028236,0.061603,0.042524,0.049403,0.045445,0.059976,-0.019655,-0.083764,0.067434,-0.066024
3,0.054261,0.002244,0.006381,0.037801,-0.108838,-0.075260,-0.089447,0.080865,0.010528,-0.004003,...,-0.076188,-0.011502,0.022313,0.006668,0.029630,-0.039290,-0.069833,-0.062563,0.014846,-0.038086
4,-0.104426,0.046565,-0.071211,0.026810,-0.049358,-0.016980,0.022969,0.000926,0.038140,-0.012621,...,-0.029106,0.078212,-0.047820,0.007860,0.002256,0.055206,-0.022572,-0.073286,-0.000729,-0.013718
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263099,-0.013704,0.026914,0.011798,-0.016001,-0.028528,0.016610,-0.053796,0.029957,0.035165,0.023131,...,-0.038174,0.003087,0.068987,-0.002342,0.013192,0.031258,-0.031021,-0.060459,0.045274,-0.065506
263100,0.062113,-0.041186,0.080495,-0.016283,-0.049213,0.024912,-0.065230,0.027692,0.054725,0.056674,...,-0.125818,-0.115473,0.061197,0.015575,0.009241,0.058200,-0.093342,-0.001201,0.063297,0.008531
263101,-0.001082,-0.002223,0.014530,0.092002,-0.059348,-0.057525,-0.087899,0.091964,0.069786,0.016315,...,-0.071762,0.124840,0.144968,-0.052939,-0.019935,0.071264,-0.016227,-0.053440,0.094105,0.024731
263102,-0.044927,-0.050455,0.016503,0.006917,-0.048592,0.029265,-0.037317,-0.072153,0.074847,0.005330,...,0.003777,0.043821,-0.014152,0.013614,-0.010413,0.038190,-0.109581,-0.070153,-0.059916,-0.035318
