In [1]:
import random
import re
import sys
import time
from collections import defaultdict
from functools import reduce

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [2]:
h = [3, 4, 5]
feature = 100
p = 0.5
s = 3
batch_size = 50
k = 300

In [3]:
def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [4]:
# read MR dataset
print("loading data...", end=' ')
files = ["rt-polarity.neg", "rt-polarity.pos"]
revs = []
vocab = defaultdict(float)
max_l = 0
for i in range(2):
    with open(files[i], "r", encoding="cp1252") as f:
        for line in f:
            orig_rev = clean_str(line.strip())
            words = set(orig_rev.split())
            for word in words:
                vocab[word] += 1
            datum  = {"y":i,
                    "text": orig_rev}
            if len(orig_rev.split()) > max_l:
                max_l = len(orig_rev.split())
            revs.append(datum)
print("data loaded!")

print("number of sentences: " + str(len(revs)))             # 10662
print("vocab size: " + str(len(vocab)))                     # 18764
print("max sentence length: " + str(max_l))                 # 56

loading data... data loaded!
number of sentences: 10662
vocab size: 18764
max sentence length: 56


In [5]:
# read pre-trained word2vec
print("loading word2vec vectors...", end=' ')
word_vecs = {}
with open("GoogleNews-vectors-negative300.bin", "rb") as f:
    header = f.readline()
    vocab_size, _ = map(int, header.split())  # 3000000, 300
    binary_len = 4 * k
    for line in range(vocab_size):
        word = []
        while True:
            ch = f.read(1).decode("latin1")
            if ch == ' ':
                word = ''.join(word)
                break
            if ch != '\n':
                word.append(ch)
        if word in vocab:
            word_vecs[word] = torch.from_numpy(np.frombuffer(f.read(binary_len), dtype='float32'))
        else:
            f.read(binary_len)
print("word2vec loaded!")

print("num words already in word2vec: " + str(len(word_vecs)))    # 16448

loading word2vec vectors... word2vec loaded!
num words already in word2vec: 16448


In [6]:
# Embedding layer
embedding = nn.Embedding(len(vocab)+1, k, padding_idx=0)
W = {}                                                      # torch.Size([18765, 300])
word_idx_map = {}
W["rand"] = W["vec"] = embedding(torch.LongTensor(range(len(vocab)+1)))
for word, i in zip(vocab, range(1,len(vocab)+1)):
    if word in word_vecs:
        W["vec"][i] = word_vecs[word]
    word_idx_map[word] = i
print("dataset created!")

dataset created!


In [7]:
def get_idx_from_sent(sent, word_idx_map, max_l, k=300):
    """
    Transforms sentence into a list of indices. Pad with zeroes.
    """
    x = []
    words = sent.split()
    for word in words:
        if word in word_idx_map:
            x.append(word_idx_map[word])
    while len(x) < max_l:
        x.append(0)
    return x

In [8]:
non_static = [True, False, True]
U = ["rand", "vec", "vec"]
results = []
train_x_idx, train_y, test_x_idx, test_y = [], [], [], []
random.shuffle(revs)
for rev, i in zip(revs, range(len(revs))):
    sent = get_idx_from_sent(rev["text"], word_idx_map, max_l, k)
    if i < len(revs) / 10:
        test_x_idx.append(sent)
        test_y.append(rev["y"])
    else:
        train_x_idx.append(sent)
        train_y.append(rev["y"])
train_x_idx = torch.tensor(train_x_idx)
train_y = torch.tensor(train_y)
test_x_idx = torch.tensor(test_x_idx)
test_y = torch.tensor(test_y)

In [20]:
x = [W["rand"][idx] for idx in [sent for sent in train_x_idx]] # 9595 * 64 * 300
print(len(x), x[0].size())

train_x = torch.Tensor(train_x_idx.size()[0], 1, max_l, k)
print(train_x.size())

for i in range(len(x)):
    train_x[i][0] = x[i]
print(train_x.size())

train_x = train_x.reshape(train_x.size()[0], 1, max_l * k)
print(train_x.size())

x = [W["rand"][idx] for idx in [sent for sent in test_x_idx]]
test_x = torch.Tensor(test_x_idx.size()[0], 1, max_l, k)
for i in range(len(x)):
    test_x[i][0] = x[i]
test_x = test_x.reshape(test_x.size()[0], 1, max_l * k)

9595 torch.Size([56, 300])
torch.Size([9595, 1, 56, 300])
torch.Size([9595, 1, 56, 300])
torch.Size([9595, 1, 16800])


In [10]:
train = TensorDataset(train_x, train_y)
print(train[0])
train_loader = DataLoader(train, batch_size=batch_size)

(tensor([[-0.0287,  0.2812, -0.0518,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<SelectBackward>), tensor(0))


In [11]:
class CNN(nn.Module):
    def __init__(self, hs, feature, k, p):
        super(CNN, self).__init__()
        for h in hs:
            conv = nn.Conv1d(1, feature, h * k, stride=k)
            setattr(self, 'conv%d' % h, conv)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.drop = nn.Dropout(p)
        self.fc = nn.Linear(len(hs) * feature, 2)
        self.loss = nn.LogSoftmax(dim=-1)
        self.hs = hs
        
    def forward(self, x):
        outs = []
        for h in self.hs:
            conv = getattr(self, 'conv%d' % h)
            out = self.drop(self.relu(conv(x)))
            out = self.pool(out)
            outs.append(out)
        outs = torch.cat(outs, dim=1).reshape(-1, 300)
        outs = self.fc(outs)
        return self.loss(outs)
    
model = CNN(h, feature, k, p)

In [12]:
print(model.forward(train_x[:1]).size())
'''
torch.Size([1, 1, 16800])
torch.Size([1, 100, 54])
torch.Size([1, 100, 1])
torch.Size([1, 100, 53])
torch.Size([1, 100, 1])
torch.Size([1, 100, 52])
torch.Size([1, 100, 1])
torch.Size([1, 300])
torch.Size([1, 2])
torch.Size([1, 2])
'''

torch.Size([1, 2])


'\ntorch.Size([1, 1, 16800])\ntorch.Size([1, 100, 54])\ntorch.Size([1, 100, 1])\ntorch.Size([1, 100, 53])\ntorch.Size([1, 100, 1])\ntorch.Size([1, 100, 52])\ntorch.Size([1, 100, 1])\ntorch.Size([1, 300])\ntorch.Size([1, 2])\ntorch.Size([1, 2])\n'

In [14]:
# 오차함수 객체
criterion = nn.CrossEntropyLoss()

# 최적화를 담당할 객체
optimizer = optim.SGD(model.parameters(), lr=0.01)

# 학습 시작
for epoch in range(10):
    total_loss = 0
    # 분할해 둔 데이터를 꺼내옴
    for train_x, train_y in train_loader:
        # 계산 그래프 구성
        train_x, train_y = Variable(train_x), Variable(train_y)
        # 경사 초기화
        optimizer.zero_grad()
        # 순전파 계산
        output = model(train_x)
        # 오차계산
        loss = criterion(output, train_y)
        # 역전파 계산
        loss.backward()
        # 가중치 업데이트
        optimizer.step()
        # 누적 오차 계산
        total_loss += loss.data
    print(epoch+1, total_loss)

1 tensor(115.9329)
2 tensor(113.1467)
3 tensor(109.7286)
4 tensor(105.8845)
5 tensor(103.2700)
6 tensor(100.6148)
7 tensor(97.4388)
8 tensor(94.7471)
9 tensor(92.1811)
10 tensor(91.2009)


In [21]:
# 계산 그래프 구성
test_x, test_y = Variable(test_x), Variable(test_y)
# 출력이 0 혹은 1이 되게 함
result = torch.max(model(test_x).data, 1)[1]
# 모형의 정확도 측정
accuracy = sum(test_y.data.numpy() == result.numpy()) / len(test_y.data.numpy())

# 모형의 정확도 출력
accuracy

0.7291471415182755