In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data.dataloader as dataloader
import torch.optim as optim
import torch.autograd as autograd
import torchtext.vocab as torchvocab
from torch.autograd import Variable
import tqdm
import os
import time
import re
import pandas as pd
import string
import gensim
import time
import random
import snowballstemmer
import collections
from collections import Counter
from nltk.corpus import stopwords
from itertools import chain
from sklearn.metrics import accuracy_score

In [2]:
# 数据下载地址：http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# 读数据
def get_data():
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    for seg in ['train', 'test']:
        for label in ['pos', 'neg']:
            path = os.path.join('./data', seg, label)
            files = os.listdir(path)
            for file in files:
                with open(os.path.join(path, file)) as line:
                    if seg == 'train':
                        X_train.append(line.read().replace('\n', ' '))
                        if label == 'pos':
                            y_train.append(1)
                        else:
                            y_train.append(0)
                    else:
                        X_test.append(line.read().replace('\n', ' '))
                        if label == 'pos':
                            y_test.append(1)
                        else:
                            y_test.append(0)
    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = get_data()

In [3]:
X_train[:5]

["Surely one of the mysteries of the modern world!! - this film is NOT considered to be within the top 100 films of all time????<br /><br />If you watched this film and thought it was anything other than wonderful please let me know how? - Al Pacino's performance is as good as it gets!",
 "After what was considered to be the official Dirty Harry trilogy with The Enforcer(1976) to be the final chapter in the series. Dirty Harry is back, older, more dirtier and grittier than ever since the original 1971 classic.<br /><br />Dirty Harry in the past has killed a psychopath killer, vigilante cops, and Vietnam veteran terrorists. But now he's after a new killer, a killer who wants payback, by gunning down her attackers.<br /><br />Sudden Impact brings a new meaning and more darker tone to Dirty Harry. Callahan is on a new murder case that is circling back to a woman(played by Sondra Locke), who was brutally raped, along with her sister, who is left traumitized. Ten years after, she's out for 

In [4]:
# 对数据进行分词
X_train = [[data.lower() for data in text.split(' ')] for text in X_train]
X_test = [[data.lower() for data in text.split(' ')] for text in X_test]

In [5]:
# 使用itertools的chain工具，将所有的词整合成一个迭代器，然后存入到set中
# *的作用是将X_train解包，chain接受到多个list
vocab = set(chain(*X_train))
vocab_size = len(vocab)

In [6]:
vocab_size

252192

In [7]:
word_to_idx = {word: index+2 for index, word in enumerate(vocab)}
word_to_idx['<PAD>'] = 0
word_to_idx['<UNK>'] = 1
idx_to_word = {value: key for key, value in word_to_idx.items()}
vocab_size += 2

In [8]:
idx_to_word[0]

'<PAD>'

In [9]:
temps = []
for sample in X_train:
    temp = []
    for data in sample:
        temp.append(word_to_idx[data])
    for _ in range(200-len(temp)):
        temp.append(0)
    temps.append(temp[:200])
X_train = temps

In [10]:
temps = []
for sample in X_test:
    temp = []
    for data in sample:
        if data in word_to_idx:
            temp.append(word_to_idx[data])
        else:
            temp.append(1)
    for _ in range(200-len(temp)):
        temp.append(0)
    temps.append(temp[:200])
X_test = temps

In [11]:
X_train = torch.tensor(X_train)
y_train = torch.tensor(y_train)
X_test = torch.tensor(X_test)
y_test = torch.tensor(y_test)

In [12]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, label_size):
        super(LSTM, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(input_size=embedding_dim,
                           hidden_size=hidden_size,
                           num_layers=num_layers)
        self.decoder = nn.Linear(hidden_size, label_size)
        
    def forward(self, inputs):
        embeddings = self.embeddings(inputs)  # [batch_size, seq_len, embedded_size]
        encoder, (h_n, c_n) = self.encoder(embeddings.permute([1, 0, 2]))  # [seq_len, batch_size, embedded_size]
        outputs = self.decoder(encoder[-1, :, :])
        return outputs

In [13]:
lstm = LSTM(vocab_size=vocab_size,
            embedding_dim=100, 
            hidden_size=128, 
            num_layers=2, 
            label_size=2)
print(lstm)

LSTM(
  (embeddings): Embedding(252194, 100)
  (encoder): LSTM(100, 128, num_layers=2)
  (decoder): Linear(in_features=128, out_features=2, bias=True)
)


In [14]:
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(lstm.parameters(), lr=1e-5)

In [15]:
train_set = torch.utils.data.TensorDataset(X_train, y_train)
test_set = torch.utils.data.TensorDataset(X_test, y_test)
train_iter = torch.utils.data.DataLoader(train_set, batch_size=100, shuffle=True)

In [16]:
for epoch in range(20):
    for feature, label in train_iter:
        lstm.zero_grad()
        pre = lstm(feature)
        loss = loss_function(pre, label)
        loss.backward()
        optimizer.step()
        print('train: ', loss)
        test_loss = loss_function(lstm(X_test), y_test)
        print('test: ', test_loss)

train:  tensor(0.6927, grad_fn=<NllLossBackward>)
test:  tensor(0.6934, grad_fn=<NllLossBackward>)
train:  tensor(0.6924, grad_fn=<NllLossBackward>)


RuntimeError: $ Torch: not enough memory: you tried to allocate 1GB. Buy new RAM! at /pytorch/aten/src/TH/THGeneral.cpp:201