In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/reviews.csv')

In [3]:
df.head()

Unnamed: 0,review,rating
0,총알파티총알파티총알파티,10
1,뭐라고 딱히 할말이 없습니다.,5
2,무슨 좀비영화 보는줄....,6
3,스토리는 괜찮은데.. 중간 중간 긴장감이 좀 떨어짐 그런데로 볼만함,6
4,이야기 전개는 스피디하지만 뻔한 결말이어서 그럭저럭 킬링타임용에 그치는군,6


In [4]:
# coding: utf-8
from torch.autograd import Variable
from torch import nn
from torch.nn import functional as F
import torch


class CNNReg(nn.Module):
    def __init__(self, vocaNum, embedding_dim):
        super(CNNReg, self).__init__()
        self.kernel_size = [2, 3, 4, 5]
        self.channel_out = 10
        self.embedding = nn.Embedding(vocaNum, embedding_dim)
        self.conv1 = nn.ModuleList([nn.Conv2d(1, self.channel_out, (k, embedding_dim)) for k in self.kernel_size])
        self.linear1 = nn.Linear(self.channel_out*len(self.kernel_size), 10)
        self.linear2 = nn.Linear(10, 1)
        self.dropout = nn.Dropout()

    def forward(self, x):
        embed = self.embedding(x)   # (N, W, D)
        embed = embed.unsqueeze(1)  # (N,1,W,D), 1: channel_in

        # [(N,Channel_out,W,1), ...] * len(Kernel_size)
        feature_maps = [F.relu(conv(embed)) for conv in self.conv1]
        
        # [(N,Channel_out,W), ...] * len(Kernel_size)
        feature_maps = [feature_map.squeeze(3) for feature_map in feature_maps]

        # [(N, Channel_out), ...] * len(Kernel_size)
        pooled_output = [F.max_pool1d(feature_map, feature_map.size(2)) for feature_map in feature_maps]
        output = torch.cat(pooled_output, 1)
        output = output.view(output.size(0), -1)
        output = self.dropout(output)
        output = F.relu(self.linear1(output))
        output = self.dropout(output)
        output = self.linear2(output)
        return output

def makeBatch(batch_sequences, max_len=100):
    batch_sequences = [torch.Tensor(sequence) for sequence in batch_sequences]
    lengths = [len(sequence) for sequence in batch_sequences]
    output = torch.zeros(len(batch_sequences), max_len)
    for i, sequence in enumerate(batch_sequences):
        length = lengths[i]
        output[i, :length] = sequence[:length]
    output = output.long()
    return output

In [5]:
from sklearn.model_selection  import train_test_split

In [6]:
from konlpy.tag import Twitter
tagger = Twitter()

In [7]:
from collections import defaultdict, Counter
from random import shuffle

In [8]:
words = defaultdict(list)
datas = list()

In [9]:
min_len, max_len = 3, 100

In [10]:
for review, rate in df.values:
    try:
        if len(review) > min_len and len(review) < max_len:
            morphs = tagger.morphs(review)
            words[int(rate)].append(morphs)
            datas.append((morphs, rate))
    except:
        pass

In [18]:
import pickle
words = pickle.load(open('words.p', 'rb'))
datas = pickle.load(open('datas.p', 'rb'))

In [19]:
voc = Counter()
for score, tags in words.items():
    for tag in tags:
        voc.update(tag)

In [20]:
voca_num = 20000
inv = [v[0] for v in voc.most_common(voca_num - 2)]
inv.append('<UNK>')
inv.insert(0, '<PAD>')

In [21]:
vin = {v: i for i, v in enumerate(inv)}

In [22]:
samples = [([(vin[word] if word in vin else vin['<UNK>']) for word in review], rating) for review, rating in datas]

In [24]:
train, test = train_test_split(samples, test_size=.2, random_state=0)

In [25]:
trainX, trainY = map(list, zip(*train))
testX, testY = map(list, zip(*test))

In [26]:
embedding_dim = 200
epoch = 10
batch_size = 32
num_iter = int(len(train) / batch_size)

In [27]:
reg = CNNReg(len(voc), embedding_dim)
reg.cuda()

criterion = nn.MSELoss()
opt = torch.optim.Adam(reg.parameters())

In [28]:
for e in range(epoch):
    for i in range(num_iter):
        opt.zero_grad()
        batchX = trainX[i*batch_size:(i+1)*batch_size]
        batchY = trainY[i*batch_size:(i+1)*batch_size]
        batchX = makeBatch(batchX)
        batchY = torch.FloatTensor(batchY)

        batchX = Variable(batchX).cuda()
        batchY = Variable(batchY).cuda()

        predict = reg(batchX)
        loss = criterion(predict, batchY)
        loss.backward()
        opt.step()

In [29]:
torch.save(reg.state_dict(), 'cnn_regression.pkl')

In [35]:
with open('../sample_data/movie_review/train/train_data') as f:
    test_text = list(map(lambda line: line.strip(), f.readlines()))
with open('../sample_data/movie_review/train/train_label') as f:
    test_label = list(map(lambda line: int(line.strip()), f.readlines()))

In [76]:
def predict(text):
    if isinstance(text, str):
        text = [[vin[word] if word in vin else vin['<UNK>'] for word in tagger.morphs(text)]]
    else:
        text = [text]
    text = makeBatch(text)
    text = Variable(text)
    return reg(text).data.tolist()[0][0]

In [58]:
reg = CNNReg(len(voc), embedding_dim)
reg.load_state_dict(torch.load("cnn_regression.pkl"))
reg.eval()

CNNReg(
  (embedding): Embedding(109035, 200)
  (conv1): ModuleList(
    (0): Conv2d (1, 10, kernel_size=(2, 200), stride=(1, 1))
    (1): Conv2d (1, 10, kernel_size=(3, 200), stride=(1, 1))
    (2): Conv2d (1, 10, kernel_size=(4, 200), stride=(1, 1))
    (3): Conv2d (1, 10, kernel_size=(5, 200), stride=(1, 1))
  )
  (linear1): Linear(in_features=40, out_features=10)
  (linear2): Linear(in_features=10, out_features=1)
  (dropout): Dropout(p=0.5)
)

In [67]:
scores = [predict(text) - label for text, label in zip(test_text, test_label)]

In [85]:
scores = [abs(predict(text) - label) for text, label in zip(testX, testY)]

In [86]:
sum(scores)/len(scores)

1.788670440079557

In [106]:
predict('초딩 스토리에 발연기들 그리고 갓배운 아마추어 캠코더구준 화면구성과 연출력 이어지는 발편집 ㅋ')

5.256997108459473

In [136]:
predict('<아마추어>')

5.256997108459473