In [1]:
import random
import pickle
import torch
from collections import Counter
import json
import os

In [3]:
params = {'PAD_WORD' : 'PAD', 'PAD_IDX' : 0, 'UNK_WORD' : 'UNK', 'UNK_IDX' : 1}

In [4]:
with open('data/products.pkl', 'rb') as data:
    all_prod = pickle.load(data)

In [5]:
all_prod[0]

['추천',
 'bags',
 12016021001,
 ['여자',
  '가방',
  '쇼핑몰',
  '여자',
  '옷',
  '쇼핑몰',
  '추천',
  '대여',
  '쇼핑몰',
  '대여',
  '쇼핑몰',
  '여자',
  '지갑',
  '추천',
  '여자',
  '동전지갑',
  '귀여운',
  '동전지갑',
  '퍼',
  '동전지갑',
  '여자',
  '겨울',
  '코디',
  '여자',
  '지갑',
  '쇼핑몰'],
 ['SOS',
  ('귀여운', 'Adjective'),
  ('동전지갑', 'Noun'),
  ('♡', 'Foreign'),
  ('복실복실', 'Adverb'),
  ('귀여운', 'Adjective'),
  ('털', 'Noun'),
  ('동전지갑', 'Noun'),
  ('입니', 'Adjective'),
  ('다', 'Eomi'),
  ('.', 'Punctuation'),
  ('가방', 'Noun'),
  ('키링', 'Noun'),
  ('으로도', 'Josa'),
  ('괜찮', 'Adjective'),
  ('구요', 'Eomi'),
  ('~', 'Punctuation'),
  ('열쇠', 'Noun'),
  ('동전', 'Noun'),
  ('립스틱', 'Noun'),
  ('.', 'Punctuation'),
  ('등', 'Noun'),
  ('작은', 'Adjective'),
  ('물건', 'Noun'),
  ('들', 'Suffix'),
  ('.', 'Punctuation'),
  ('넣고', 'Verb'),
  ('다니', 'Verb'),
  ('기', 'Noun'),
  ('에도', 'Josa'),
  ('안성맞춤', 'Noun'),
  ('입니', 'Adjective'),
  ('다', 'Eomi'),
  'EOS']]

In [6]:
class Vocab:
    def __init__(self, which):
        self.which = which
        self.words = Counter()
        self.truncwords = []
        if self.which == "review":
            self.word2idx = {'UNK': 0, 'PAD' : 1, 'SOS' : 2, 'EOS' : 3}
            self.idx2word = {0 : 'UNK', 1 : 'PAD', 2 : 'SOS', 3 : 'EOS'}
        elif self.which == "tag":
            self.word2idx = {'UNK' : 0, 'PAD' : 1}
            self.idx2word = {0 : 'UNK', 1 : 'PAD'}
            
    def build_vocab(self, data):
        if self.which == "review":
            for p in data:
                tokens =  p[-1]
                self.words.update(tokens)
        elif self.which == "tag":
            for p in data:
                tokens = p[-2]
                self.words.update(tokens)
        self.trunc_words = [tok for tok, count in self.words.items()]
        
    def init_vocab(self):
        if self.which == "review":
            self.word2idx = {'UNK': 0, 'PAD' : 1, 'SOS' : 2, 'EOS' : 3}
            self.idx2word = {0 : 'UNK', 1 : 'PAD', 2 : 'SOS', 3 : 'EOS'}
        elif self.which == "tag":
            self.word2idx = {'UNK' : 0, 'PAD' : 1}
            self.idx2word = {0 : 'UNK', 1 : 'PAD'}
            
    def filter_by_freq(self, min_count):
        trunc_words = [tok for tok, count in self.words.items() if count >= min_count]
        print(len(trunc_words), "out of" , len(self.words), "words left, which is",
              len(trunc_words)/len(self.words)*100.0, "%")
        self.trunc_words = trunc_words
        self.init_vocab()
        
    def build_idx_mapping(self, min_count = 0):
        if min_count > 0:
            self.filter_by_freq(min_count)
        for t in self.trunc_words:
            if t not in self.word2idx:
                self.idx2word[len(self.word2idx)] = t
                self.word2idx[t] = len(self.word2idx)
            else:
                pass

In [7]:
tagVocab = Vocab(which = 'tag')
rvVocab = Vocab(which = 'review')

In [8]:
tagVocab.build_vocab(all_prod)
rvVocab.build_vocab(all_prod)

In [9]:
tagVocab.build_idx_mapping(min_count = 4)

3158 out of 3465 words left, which is 91.13997113997114 %


In [10]:
rvVocab.build_idx_mapping(min_count = 50)

3527 out of 37012 words left, which is 9.529341835080514 %


In [11]:
tag2idx, idx2tag = tagVocab.word2idx, tagVocab.idx2word
rv2idx, idx2rv = rvVocab.word2idx, rvVocab.idx2word

In [12]:
class Products:
    def __init__(self):
        # rating 
        self.rating2idx = {}
        # category 
        self.upcat2idx = {}
        self.lowcat2idx = {}
    
    # Rating
    def addRating(self, rating_list):
        self.rating2idx = {}
        for rate in set(rating_list):
            self.rating2idx[rate] = len(self.rating2idx)
    
    # Category
    def addCategory(self, cat_list, which):
        if which == 'upper':
            self.upcat2idx = {}
            for cat in set(cat_list):
                self.upcat2idx[cat] = len(self.upcat2idx)
        elif which == 'lower':
            self.lowcat2idx = {}
            for cat in set(cat_list):
                self.lowcat2idx[cat] = len(self.lowcat2idx)
        

## 카테고리, 만족도 정보 담은 객체 만들기

In [13]:
upper = ['top', 'outer', 'bottom', 'shoes', 'bags']

In [14]:
top = ['12015003002','12015003003','12015003004',
       '12015003005','12015003006','12015003007']
outer = ['12015001001', '12015004001', '12015004002', '12015004003', '12015004004']
bottom = ['12015009001', '12015009002', '12015009003', '12015009005', '12015009004']
shoes = ['12016013001001', '12016013003001', '12016013007001', '12016013001002', '12016013002001',
         '12016013004004', '12016013003002', '12016013002002', '12016013004005', '12016013001003',
         '12016013004002', '12016013003003', '12016013001004', '12016013005', '12016013004003',
         '12016013003004', '12016013001005', '12016013006', '12016013003005', '12016013007003',
         '12016013002003', '12016013004001', '12016013008', '12016013009', '12016013001006',
         '12016013003006', '12016013001007', '12016013003007', '12016013007004', '12016013007002',
         '12016013010', '12016013001008', '12016013001009','12016013004006']
bags = ['12016021001', '12016021002', '12016021003', '12016001001',
        '12016001004001', '12016001002', '12016001003', '12016001004002',
        '12016021004', '12016001004003', '12016001004004', '12016021005',
        '12016021006', '12016021007', '12016021008', '12016001004006',
        '12016001005', '12016001006', '12016001007', '12016001008', '12016001009']

lower = top + outer + bottom + shoes + bags

In [15]:
Rating = ['추천', '적극추천', '만족', '보통', '불만', '추천안함']

In [16]:
meta = Products()

In [17]:
meta.addCategory(cat_list = lower, which = 'lower')
meta.addCategory(cat_list = upper, which = 'upper')
meta.addRating(Rating)

In [18]:
meta.rating2idx

{'만족': 4, '보통': 5, '불만': 3, '적극추천': 2, '추천': 1, '추천안함': 0}

In [19]:
meta.lowcat2idx

{'12015001001': 53,
 '12015003002': 43,
 '12015003003': 42,
 '12015003004': 7,
 '12015003005': 70,
 '12015003006': 65,
 '12015003007': 63,
 '12015004001': 22,
 '12015004002': 47,
 '12015004003': 55,
 '12015004004': 40,
 '12015009001': 15,
 '12015009002': 10,
 '12015009003': 60,
 '12015009004': 14,
 '12015009005': 11,
 '12016001001': 18,
 '12016001002': 35,
 '12016001003': 49,
 '12016001004001': 52,
 '12016001004002': 1,
 '12016001004003': 2,
 '12016001004004': 56,
 '12016001004006': 4,
 '12016001005': 59,
 '12016001006': 61,
 '12016001007': 62,
 '12016001008': 8,
 '12016001009': 31,
 '12016013001001': 0,
 '12016013001002': 13,
 '12016013001003': 45,
 '12016013001004': 26,
 '12016013001005': 57,
 '12016013001006': 27,
 '12016013001007': 33,
 '12016013001008': 48,
 '12016013001009': 69,
 '12016013002001': 50,
 '12016013002002': 19,
 '12016013002003': 58,
 '12016013003001': 23,
 '12016013003002': 44,
 '12016013003003': 46,
 '12016013003004': 36,
 '12016013003005': 41,
 '12016013003006': 2

## 벡터로 인코딩! 드디어

In [20]:
from tqdm import tqdm_notebook

In [21]:
def review_to_num(review, metadict, vocab_tag, vocab_rv, cat = 'lower'):
    # 리뷰의 정보를 분리
    rating = review[0]
    if cat == 'both':
        uppercat = review[1]
    lowercat = str(review[2])
    tags = review[3]
    text = review[4]
        
    rating_num = torch.tensor([metadict.rating2idx.get(rating)]).type(torch.long)
    #cat_num = torch.tensor([metadict.upcat2idx.get(uppercat)]) # upper category
    cat_num = torch.tensor([metadict.lowcat2idx.get(lowercat)]).type(torch.long) # lower category
    tag_num = torch.tensor([vocab_tag.word2idx.get(t, params['UNK_IDX']) for t in tags]).type(torch.long)
    rv_num = torch.tensor([vocab_rv.word2idx.get(w, params['UNK_IDX']) for w in text]).type(torch.long)
    
    return [rating_num, cat_num, tag_num, rv_num]

In [22]:
def prepareData(all_reviews, metadict, vocab_tag, vocab_rv):
    """
    all_reviews : 모든 리뷰에 대한 리스트
    p : Product 클라스의 객체 (인코딩할때 참조!)
    """
    encode_prod = [] # 숫자의 리스트
    for review in tqdm_notebook(all_reviews): # 일단 열개만 해봅시다
        encode_prod.append(review_to_num(review, metadict, vocab_tag, vocab_rv)) 
    return encode_prod

In [23]:
encoded_data = prepareData(all_prod, metadict = meta, 
                           vocab_rv = rvVocab, vocab_tag = tagVocab)

HBox(children=(IntProgress(value=0, max=163733), HTML(value='')))




In [24]:
import numpy as np

In [26]:
def pad(batch, which):
    if which == 'tag':
        idx = 2
    elif which == 'review':
        idx = 3
    max_len = np.max([len(sample[idx]) for sample in batch])
    tag_padding = params['PAD_IDX']
    #batch
    batch_data = tag_padding*np.ones((len(batch), max_len))
    for j in range(len(batch)):
        cur_len = len(batch[j][idx])
        if cur_len > 0:
            batch_data[j][:cur_len] = np.array(batch[j][idx])
    batch_data = torch.from_numpy(batch_data)
    batch_data = torch.tensor(batch_data).type(torch.long)
    return batch_data

In [27]:
def data_iterator(data, batch_size):
    batch = random.sample(data, batch_size)
    
    rating = torch.cat([sample[0] for sample in batch], dim=-1).view(-1,1)
    category = torch.cat([sample[1] for sample in batch], dim=-1).view(-1,1)
    
    tag = pad(batch = batch, which = 'tag')
    review = pad(batch = batch, which = 'review')
    
    return rating, category, tag, review

In [28]:
r, c, t, rv = data_iterator(data = encoded_data, batch_size = 5)

tensor([[    2,  3360,  1335,   521,  2568,   355,  1749,   235,   144,
           102,   680,   235,  2315,  1188,   212,   112,    46,    94,
            40,    41,  3157,    40,    13,  1430,     3,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [    2,   140,   394,  1068,   276,   787,    12,   383,     3,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [    2,  2992,   130,   870,  1376,  3000,  1288,   770,    83,
          2952,   538,   308,   178,    45,  1192,  3305,    38,    12,
            55,    96,  1793,    29,   392,   393,  2992,   130,   869,
          1376,   503,   666,   103,   413,    13,  3184,  1614,     7,
    