In [76]:
import collections
import os
import random
import tarfile
import torch
import numpy as np
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
from tqdm import tqdm

'''
https://tangshusen.me/Dive-into-DL-PyTorch/#/chapter10_natural-language-processing/10.7_sentiment-analysis-rnn
IMDB数据集：http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
'''

# Expose only GPU 0 to CUDA, then select the compute device:
# the GPU when CUDA is usable, otherwise the CPU.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [13]:
# Load the dataset: files -> list of [review_text, label] entries,
# e.g. ['i think you xx xx xx nice xx', 1]
def read_file(folder='train', data_root="/Users/jiang/OneDrive/学习/NLP/practice/3/aclImdb"):
    """Read every review under ``data_root/folder/{neg,pos}``.

    Each file is decoded as UTF-8, stripped of newline characters and
    lower-cased. Reviews from the ``pos`` sub-folder are labelled 1 and those
    from ``neg`` are labelled 0.

    Args:
        folder: split sub-directory to read (the notebook uses 'train' and 'test').
        data_root: directory that contains the split sub-directories.

    Returns:
        A list of ``[review_text, label]`` pairs, shuffled in place with
        ``random.shuffle`` before being returned.
    """
    samples = []
    # enumerate gives neg -> 0, pos -> 1, matching the label convention above.
    for score, label in enumerate(('neg', 'pos')):
        label_dir = os.path.join(data_root, folder, label)
        for name in tqdm(os.listdir(label_dir)):
            path = os.path.join(label_dir, name)
            with open(path, 'rb') as handle:
                raw = handle.read()
            text = raw.decode('utf-8').replace('\n', '').lower()
            samples.append([text, score])
    random.shuffle(samples)
    return samples
  
# Load the training split first, then the test split (same order as a tuple assignment).
train_data = read_file('train')
test_data = read_file('test')

100%|██████████| 12500/12500 [00:00<00:00, 18907.19it/s]
100%|██████████| 12500/12500 [00:00<00:00, 20979.81it/s]
100%|██████████| 12500/12500 [00:04<00:00, 2612.48it/s]
100%|██████████| 12500/12500 [00:04<00:00, 2653.14it/s]

['white chicks hold on, why couldn\'t they have dressed as black chicks, oh yeah, they wouldn\'t look different at all. can anyone give me one wayans movie where they haven\'t dressed up as ladies? don\'t be a menace doesn\'t count, jack white and michael costanza ghost wrote that (the other norton trio members acted as directors).<br /><br />in white chicks, there\'s never really any jokes. it\'s just the wayans acting like girls for 2 hours. there\'s no setups, no punchlines and no laughs. there is a lot of "i think i\'m gonna play some time crisis 3." at least for me there was (5 times to be exact).<br /><br />somebody has to tell kenan ivory, damon, marlon, shawn, damien (the only talented one), kim, rakeesha, george w., and osama bin wayans to stop making movies. its only hurting the o-zone layer.<br /><br />verdict 1/2* out of ****', 0]





In [32]:
# Split every review into individual lower-cased words.
def get_tokenized(data):
    """Tokenize the text of each ``(text, label)`` pair in ``data``.

    The text is split on single space characters only, so consecutive spaces
    produce empty-string tokens. Every token is lower-cased and the labels
    are ignored.

    Returns:
        A list with one token list per entry of ``data``.
    """
    tokenized = []
    for text, _label in data:
        pieces = text.split(' ')
        tokenized.append([piece.lower() for piece in pieces])
    return tokenized

# Build the vocabulary.
def get_word2index(data):
    """Create a torchtext ``Vocab`` from the tokens of ``data``.

    Token frequencies are counted over every review in ``data`` (tokenized
    with ``get_tokenized``) and handed to ``Vocab.Vocab`` with ``min_freq=5``.

    Returns:
        The ``Vocab.Vocab`` instance built from the frequency counter.
    """
    counter = collections.Counter()
    for tokens in get_tokenized(data):
        counter.update(tokens)
    # Filter out words that occur fewer than 5 times.
    return Vocab.Vocab(counter, min_freq=5)

# Build the vocabulary from the training reviews only. Despite its name,
# `word2index` is the object returned by get_word2index (a Vocab.Vocab), not a dict.
word2index = get_word2index(train_data)

In [47]:
# Lower-case and tokenize each review, then truncate or pad it so that every
# review has the same fixed length (500 tokens by default).
def preprocess(data, word2index, max_l=500, pad_index=0):
    """Convert labelled reviews into fixed-length index tensors.

    Args:
        data: sequence of ``[text, label]`` pairs. It is traversed twice
            (once for the texts, once for the labels), so it must be
            re-iterable.
        word2index: vocabulary object; ``word2index[word]`` must return the
            integer index of ``word``.
        max_l: number of tokens kept per review. Longer reviews are
            truncated, shorter ones are right-padded. Defaults to 500.
        pad_index: index used for padding. Defaults to 0, the value that was
            previously hard-coded.
            NOTE(review): with the default specials of the legacy torchtext
            Vocab, index 0 is presumably ``<unk>`` rather than ``<pad>`` -
            confirm before changing the default.

    Returns:
        ``(features, labels)`` where ``features`` is an integer tensor of
        shape ``(len(data), max_l)`` holding word indices and ``labels`` is a
        tensor holding the target of each review.

    Raises:
        ValueError: if ``max_l`` is smaller than 1.
    """
    if max_l < 1:
        raise ValueError("max_l must be a positive integer, got %r" % (max_l,))

    def pad(x):
        # Truncate long reviews, right-pad short ones.
        return x[:max_l] if len(x) >= max_l else x + [pad_index] * (max_l - len(x))

    tokenizer_data = get_tokenized(data)
    # Explicit dtype + reshape keep the result a (N, max_l) int64 tensor even
    # when `data` is empty (torch.tensor([]) alone would be a float 1-D tensor).
    features = torch.tensor(
        [pad([word2index[word] for word in words]) for words in tokenizer_data],
        dtype=torch.long,
    ).reshape(-1, max_l)
    labels = torch.tensor([score for _, score in data])
    return features, labels  # features: per-review word indices, labels: targets

In [53]:
BATCH_SIZE = 64

# Turn each split into (features, labels) tensors.
train_features, train_labels = preprocess(train_data, word2index)
test_features, test_labels = preprocess(test_data, word2index)

# Data.TensorDataset wraps the feature and target tensors into a dataset;
# Data.DataLoader batches it (only the training loader shuffles).
train_set = Data.TensorDataset(train_features, train_labels)
test_set = Data.TensorDataset(test_features, test_labels)
train_iter = Data.DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size=BATCH_SIZE)

# Peek at the first training batch only, to check the tensor shapes.
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break

# Last expression of the cell: echoed as the cell output.
'#batches:', len(train_iter)


X torch.Size([64, 500]) y torch.Size([64])


('#batches:', 391)

In [69]:
class BiLSTM(nn.Module):
    """Bidirectional multi-layer LSTM classifier with two output classes.

    Token indices are embedded, run through a bidirectional LSTM, and the LSTM
    outputs at the first and last time steps are concatenated and fed to a
    linear layer that produces two scores per sequence.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers):
        """Create the embedding, the bidirectional LSTM and the output layer.

        Args:
            vocab_size: number of rows of the embedding table.
            embed_size: embedding dimension (also the LSTM input size).
            num_hiddens: hidden size of each LSTM direction.
            num_layers: number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        # num_layers: number of stacked recurrent layers; bidirectional: run both directions.
        self.rnn = nn.LSTM(
            input_size=embed_size,
            hidden_size=num_hiddens,
            num_layers=num_layers,
            bidirectional=True,
        )
        # Two time steps are concatenated, each carrying 2 * num_hiddens features.
        self.decoder = nn.Linear(in_features=4 * num_hiddens, out_features=2)

    def forward(self, inputs):
        """Return class scores of shape (batch, 2) for ``inputs`` of shape (batch, seq_length)."""
        # The LSTM is not batch_first, so swap to (seq_length, batch) before embedding;
        # the embeddings are then (seq_length, batch, embed_size).
        time_major = inputs.transpose(0, 1)
        embeddings = self.embedding(time_major)
        states, _ = self.rnn(embeddings)
        first_step = states[0]
        last_step = states[-1]
        encoding = torch.cat([first_step, last_step], dim=-1)
        return self.decoder(encoding)
'''
LSTM输入: input, (h_0, c_0)
input (seq_len, batch, input_size)
h_0 (num_layers * num_directions, batch, hidden_size):保存着batch中每个元素的初始化隐状态的Tensor
c_0 (num_layers * num_directions, batch, hidden_size): 保存着batch中每个元素的初始化细胞状态的Tensor

LSTM输出 output, (h_n, c_n)

'''

'\nLSTM输入: input, (h_0, c_0)\ninput (seq_len, batch, input_size)\nh_0 (num_layers * num_directions, batch, hidden_size):保存着batch中每个元素的初始化隐状态的Tensor\nc_0 (num_layers * num_directions, batch, hidden_size): 保存着batch中每个元素的初始化细胞状态的Tensor\n\nLSTM输出 output, (h_n, c_n)\n\n'

In [88]:
# Model and optimisation hyper-parameters.
embed_size, num_hiddens, num_layers = 100, 100, 2
vocab_size = len(word2index)
EPOCH = 5

# Place the model on the device selected at the top of the notebook; previously
# `device` was computed but never used, so training always ran on the CPU.
model = BiLSTM(vocab_size, embed_size, num_hiddens, num_layers).to(device)

criterion = nn.CrossEntropyLoss()
# NOTE(review): the recorded run plateaus around 0.69 (about ln 2, i.e. chance
# level for two classes); lr=0.01 may be too high for this setup - worth tuning.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(EPOCH):
    model.train()
    losses = []
    for features, labels in train_iter:
        # Batches must live on the same device as the model parameters.
        features, labels = features.to(device), labels.to(device)
        optimizer.zero_grad()
        preds = model(features)

        loss = criterion(preds, labels)
        # .item() yields a Python float on any device; .detach().numpy()
        # raises for CUDA tensors.
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print('Epoch:', '%01d' % epoch, 'cost =', '{:.6f}'.format(np.mean(losses)))

Epoch: 0 cost = 0.824846
Epoch: 1 cost = 0.720414
Epoch: 2 cost = 0.697621
Epoch: 3 cost = 0.699350
Epoch: 4 cost = 0.691135


In [89]:
# Inference helper used to try the trained model on hand-written sentences.
def predict_sentiment(net, vocab, sentence):
    """Classify a tokenized sentence as 'positive' or 'negative'.

    Args:
        net: model that maps a ``(1, seq_len)`` tensor of token indices to
            two class scores; its first parameter determines the device.
        vocab: vocabulary exposing ``stoi``, a token -> index mapping.
        sentence: iterable of word tokens. Tokens are lower-cased before the
            lookup, matching ``get_tokenized``, which lower-cases training text.

    Returns:
        'positive' when the highest-scoring class is 1, otherwise 'negative'.

    Raises:
        ValueError: if ``sentence`` contains no tokens.
    """
    tokens = [word.lower() for word in sentence]
    if not tokens:
        raise ValueError("sentence must contain at least one token")

    device = next(net.parameters()).device
    indices = torch.tensor([vocab.stoi[tok] for tok in tokens], dtype=torch.long, device=device)

    # Predict in eval mode without autograd bookkeeping, then restore the
    # caller's train/eval mode so this helper has no lasting side effect.
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            label = torch.argmax(net(indices.view((1, -1))), dim=1)
    finally:
        net.train(was_training)
    return 'positive' if label.item() == 1 else 'negative'

# A notebook cell only echoes its last expression, so collect both predictions
# in one tuple; otherwise the first result is silently discarded.
(
    predict_sentiment(model, word2index, ['this', 'movie', 'is', 'so', 'great']),  # intended: positive
    predict_sentiment(model, word2index, ['this', 'movie', 'is', 'so', 'bad']),  # intended: negative
)