In [5]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import pickle

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# 딕셔너리를 만드는 범용 함수 생성 (-> 리뷰, 태그에 대해 각각의 딕셔너리 만들어야함!)
from collections import Counter
import json
import os

In [131]:
params = {'PAD_WORD' : 'PAD', 'PAD_IDX' : 0, 'UNK_WORD' : 'UNK', 'UNK_IDX' : 1}

In [6]:
with open('data/products.pkl', 'rb') as data:
    all_prod = pickle.load(data)

In [68]:
class Vocab:
    def __init__(self, which):
        self.which = which
        self.words = Counter()
        self.truncwords = []
        if self.which == "review":
            self.word2idx = {'UNK': 0, 'PAD' : 1, 'SOS' : 2, 'EOS' : 3}
            self.idx2word = {0 : 'UNK', 1 : 'PAD', 2 : 'SOS', 3 : 'EOS'}
        elif self.which == "tag":
            self.word2idx = {'UNK' : 0, 'PAD' : 1}
            self.idx2word = {0 : 'UNK', 1 : 'PAD'}
            
    def build_vocab(self, data):
        if self.which == "review":
            for p in data:
                tokens =  p[-1]
                self.words.update(tokens)
        elif self.which == "tag":
            for p in data:
                tokens = p[-2]
                self.words.update(tokens)
        self.trunc_words = [tok for tok, count in self.words.items()]
        
    def init_vocab(self):
        if self.which == "review":
            self.word2idx = {'UNK': 0, 'PAD' : 1, 'SOS' : 2, 'EOS' : 3}
            self.idx2word = {0 : 'UNK', 1 : 'PAD', 2 : 'SOS', 3 : 'EOS'}
        elif self.which == "tag":
            self.word2idx = {'UNK' : 0, 'PAD' : 1}
            self.idx2word = {0 : 'UNK', 1 : 'PAD'}
            
    def filter_by_freq(self, min_count):
        trunc_words = [tok for tok, count in self.words.items() if count >= min_count]
        print(len(trunc_words), "out of" , len(self.words), "words left, which is",
              len(trunc_words)/len(self.words)*100.0, "%")
        self.trunc_words = trunc_words
        self.init_vocab()
        
    def build_idx_mapping(self, min_count = 0):
        if min_count > 0:
            self.filter_by_freq(min_count)
        for t in self.trunc_words:
            if t not in self.word2idx:
                self.idx2word[len(self.word2idx)] = t
                self.word2idx[t] = len(self.word2idx)
            else:
                pass

In [69]:
tagVocab = Vocab(which = 'tag')
rvVocab = Vocab(which = 'review')

In [70]:
tagVocab.build_vocab(all_prod)
rvVocab.build_vocab(all_prod)

In [74]:
tagVocab.build_idx_mapping(min_count = 4)

3158 out of 3465 words left, which is 91.13997113997114 %


In [1]:
rvVocab.build_idx_mapping(min_count = 50)

NameError: name 'rvVocab' is not defined

In [77]:
tag2idx, idx2tag = tagVocab.word2idx, tagVocab.idx2word
rv2idx, idx2rv = rvVocab.word2idx, rvVocab.idx2word

In [175]:
class Products:
    def __init__(self):
        # rating 
        self.rating2idx = {}
        # category 
        self.upcat2idx = {}
        self.lowcat2idx = {}
    
    # Rating
    def addRating(self, rating_list):
        self.rating2idx = {}
        for rate in set(rating_list):
            self.rating2idx[rate] = len(self.rating2idx)
    
    # Category
    def addCategory(self, cat_list, which):
        if which == 'upper':
            self.upcat2idx = {}
            for cat in set(cat_list):
                self.upcat2idx[cat] = len(self.upcat2idx)
        elif which == 'lower':
            self.lowcat2idx = {}
            for cat in set(cat_list):
                self.lowcat2idx[cat] = len(self.lowcat2idx)
        

## 카테고리, 만족도 정보 담은 객체 만들기

In [176]:
upper = ['top', 'outer', 'bottom', 'shoes', 'bags']

In [177]:
top = ['12015003002','12015003003','12015003004',
       '12015003005','12015003006','12015003007']
outer = ['12015001001', '12015004001', '12015004002', '12015004003', '12015004004']
bottom = ['12015009001', '12015009002', '12015009003', '12015009005', '12015009004']
shoes = ['12016013001001', '12016013003001', '12016013007001', '12016013001002', '12016013002001',
         '12016013004004', '12016013003002', '12016013002002', '12016013004005', '12016013001003',
         '12016013004002', '12016013003003', '12016013001004', '12016013005', '12016013004003',
         '12016013003004', '12016013001005', '12016013006', '12016013003005', '12016013007003',
         '12016013002003', '12016013004001', '12016013008', '12016013009', '12016013001006',
         '12016013003006', '12016013001007', '12016013003007', '12016013007004', '12016013007002',
         '12016013010', '12016013001008', '12016013001009','12016013004006']
bags = ['12016021001', '12016021002', '12016021003', '12016001001',
        '12016001004001', '12016001002', '12016001003', '12016001004002',
        '12016021004', '12016001004003', '12016001004004', '12016021005',
        '12016021006', '12016021007', '12016021008', '12016001004006',
        '12016001005', '12016001006', '12016001007', '12016001008', '12016001009']

lower = top + outer + bottom + shoes + bags

In [178]:
Rating = ['추천', '적극추천', '만족', '보통', '불만', '추천안함']

In [179]:
meta = Products()

In [180]:
meta.addCategory(cat_list = lower, which = 'lower')
meta.addCategory(cat_list = upper, which = 'upper')
meta.addRating(Rating)

In [181]:
meta.rating2idx

{'추천': 0, '적극추천': 1, '불만': 2, '보통': 3, '만족': 4, '추천안함': 5}

In [182]:
meta.lowcat2idx

{'12016013008': 0,
 '12016001007': 1,
 '12016013003004': 2,
 '12016013001001': 3,
 '12016001004002': 4,
 '12016013001006': 5,
 '12015003006': 6,
 '12016013002002': 7,
 '12016013001004': 8,
 '12016013009': 9,
 '12016013007004': 10,
 '12016013004004': 11,
 '12016013001002': 12,
 '12016013003001': 13,
 '12015004004': 14,
 '12016001002': 15,
 '12016013006': 16,
 '12016013001008': 17,
 '12016013003002': 18,
 '12016013004002': 19,
 '12016013005': 20,
 '12015003005': 21,
 '12015009004': 22,
 '12016001004004': 23,
 '12016001004003': 24,
 '12016001005': 25,
 '12016021002': 26,
 '12016013001003': 27,
 '12016021005': 28,
 '12016013001005': 29,
 '12016001004006': 30,
 '12016021008': 31,
 '12016013004003': 32,
 '12016013003007': 33,
 '12016001009': 34,
 '12015003004': 35,
 '12016013007001': 36,
 '12016001006': 37,
 '12015001001': 38,
 '12016013007002': 39,
 '12015004002': 40,
 '12015009002': 41,
 '12016013001009': 42,
 '12015004001': 43,
 '12015003002': 44,
 '12015003007': 45,
 '12016013003003': 46

## 벡터로 인코딩! 드디어

In [183]:
from tqdm import tqdm_notebook

In [184]:
all_prod[0]

['추천',
 'bags',
 12016021001,
 ['여자',
  '가방',
  '쇼핑몰',
  '여자',
  '옷',
  '쇼핑몰',
  '추천',
  '대여',
  '쇼핑몰',
  '대여',
  '쇼핑몰',
  '여자',
  '지갑',
  '추천',
  '여자',
  '동전지갑',
  '귀여운',
  '동전지갑',
  '퍼',
  '동전지갑',
  '여자',
  '겨울',
  '코디',
  '여자',
  '지갑',
  '쇼핑몰'],
 ['SOS',
  ('귀여운', 'Adjective'),
  ('동전지갑', 'Noun'),
  ('♡', 'Foreign'),
  ('복실복실', 'Adverb'),
  ('귀여운', 'Adjective'),
  ('털', 'Noun'),
  ('동전지갑', 'Noun'),
  ('입니', 'Adjective'),
  ('다', 'Eomi'),
  ('.', 'Punctuation'),
  ('가방', 'Noun'),
  ('키링', 'Noun'),
  ('으로도', 'Josa'),
  ('괜찮', 'Adjective'),
  ('구요', 'Eomi'),
  ('~', 'Punctuation'),
  ('열쇠', 'Noun'),
  ('동전', 'Noun'),
  ('립스틱', 'Noun'),
  ('.', 'Punctuation'),
  ('등', 'Noun'),
  ('작은', 'Adjective'),
  ('물건', 'Noun'),
  ('들', 'Suffix'),
  ('.', 'Punctuation'),
  ('넣고', 'Verb'),
  ('다니', 'Verb'),
  ('기', 'Noun'),
  ('에도', 'Josa'),
  ('안성맞춤', 'Noun'),
  ('입니', 'Adjective'),
  ('다', 'Eomi'),
  'EOS']]

In [185]:
review = all_prod[0]

In [186]:
tags = review[3]

In [187]:
def review_to_num(review, metadict, vocab_tag, vocab_rv, cat = 'lower'):
    # 리뷰의 정보를 분리
    rating = review[0]
    if cat == 'both':
        uppercat = review[1]
    lowercat = str(review[2])
    tags = review[3]
    text = review[4]
        
    rating_num = torch.tensor([metadict.rating2idx.get(rating)]).type(torch.long)
    #cat_num = torch.tensor([metadict.upcat2idx.get(uppercat)]) # upper category
    cat_num = torch.tensor([metadict.lowcat2idx.get(lowercat)]).type(torch.long) # lower category
    tag_num = torch.tensor([vocab_tag.word2idx.get(t, params['UNK_IDX']) for t in tags]).type(torch.long)
    rv_num = torch.tensor([vocab_rv.word2idx.get(w, params['UNK_IDX']) for w in text]).type(torch.long)
    
    return [rating_num, cat_num, tag_num, rv_num]

In [188]:
def prepareData(all_reviews, metadict, vocab_tag, vocab_rv):
    """
    all_reviews : 모든 리뷰에 대한 리스트
    p : Product 클라스의 객체 (인코딩할때 참조!)
    """
    encode_prod = [] # 숫자의 리스트
    for review in tqdm_notebook(all_reviews): # 일단 열개만 해봅시다
        encode_prod.append(review_to_num(review, metadict, vocab_tag, vocab_rv)) 
    return encode_prod

In [189]:
encoded_data = prepareData(all_prod, metadict = meta, 
                           vocab_rv = rvVocab, vocab_tag = tagVocab)

HBox(children=(IntProgress(value=0, max=184342), HTML(value='')))




In [190]:
encoded_data[10]

[tensor([ 4]),
 tensor([ 64]),
 tensor([  2,   3,   4,   2,   5,   4,   6,   7,   4,   7,   4,   2,
           8,   6,   2,   9,  10,   9,  11,   9,   2,  12,  13,   2,
           8,   4]),
 tensor([   2,  156,  157,  158,  117,  159,   88,    3])]

In [159]:
import numpy as np

In [191]:
def pad(batch, which):
    if which == 'tag':
        idx = 2
    elif which == 'review':
        idx = 3
    max_len = np.max([len(sample[idx]) for sample in batch])
    tag_padding = params['PAD_IDX']
    #batch
    batch_data = tag_padding*np.ones((len(batch), max_len))
    for j in range(len(batch)):
        cur_len = len(batch[j][idx])
        if cur_len > 0:
            batch_data[j][:cur_len] = np.array(batch[j][idx])
    batch_data = torch.LongTensor(batch_data)
    return batch_data

In [192]:
def data_iterator(data, batch_size):
    batch = random.sample(data, batch_size)
    
    rating = torch.cat([sample[0] for sample in batch], dim=-1).view(-1,1)
    category = torch.cat([sample[1] for sample in batch], dim=-1).view(-1,1)
    
    tag = pad(batch = batch, which = 'tag')
    review = pad(batch = batch, which = 'review')
    
    return rating, category, tag, review

In [193]:
r, c, t, rv = data_iterator(data = encoded_data, batch_size = 5)