In [62]:
import numpy as np
from string import punctuation
from collections import Counter
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

In [18]:
g = open('reviews.txt','r') # What we know!
reviews = list(map(lambda x:x[:-1],g.readlines()))
g.close()

g = open('labels.txt','r') # What we WANT to know!
labels = list(map(lambda x:x[:-1].upper(),g.readlines()))
g.close()

In [22]:
print(reviews[:2])
print(type(reviews))
print(labels[:20])

['bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   ', 'story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is

In [None]:
# reviews_split = []
# all_text = []
# for review in reviews:
#     for c in review:
#         all_text.extend([c for c in review if c not in punctuation])
#         reviews_split.append([c for c in review if c not in punctuation])
# print(all_text[:3000])

In [25]:
print(type(reviews))

<class 'list'>


In [30]:
reviews_split = reviews
print(reviews_split[0])

bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   


In [40]:
all_text = ' '.join(reviews_split)
words = all_text.split()

In [41]:
print(len(reviews_split))

25000


In [42]:
print(len(words))

6347388


In [43]:
print(words[:20])

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life']


In [47]:
vocab_to_int = Counter(words)
reviews_ints = []
for review in reviews_split:
    reviews_ints.append([vocab_to_int[word] for word in review.split()])

In [48]:
print('Unique words: ', len((vocab_to_int)))
#打印第一条影评的数字编码
print('Tokenized review: \n', reviews_ints[:1])

Unique words:  74073
Tokenized review: 
 [[8, 2161, 107328, 163009, 545, 3246, 327192, 96352, 238, 23513, 336713, 4053, 12724, 46933, 15747, 9163, 66, 17374, 1659, 6628, 5134, 46933, 77, 327192, 12503, 4517, 93968, 336713, 82, 65, 1310, 10773, 135720, 2505, 73245, 8, 2161, 65361, 261, 107328, 9763, 206, 135720, 987, 9919, 107328, 77, 327192, 336713, 6, 135720, 260, 29, 336713, 66, 361, 21433, 14654, 11478, 3313, 4969, 11385, 468, 77, 9, 336713, 2, 145864, 336713, 3078, 669, 23978, 157, 10773, 145864, 336713, 66, 87623, 898, 164107, 11385, 361, 327192, 14182, 87623, 3167, 336713, 1658, 93968, 12047, 163009, 392, 119, 773, 135720, 127, 3728, 336713, 1659, 87623, 462, 18, 327192, 327192, 327192, 327192, 327192, 327192, 327192, 327192, 327192, 23513, 327192, 327192, 327192, 327192, 327192, 327192, 327192, 327192, 327192, 327192, 2161, 327192, 163009, 1829, 1868, 162, 87623, 4998, 5767, 135720, 44, 26789, 145864, 5686, 77, 327192, 392, 214, 135720, 8, 2161, 327192, 87623, 1178, 73245, 6675,

In [49]:
encoded_labels = np.array([1 if label=='POSITIVE' else 0 for label in labels])

为了使影评保持标准形状，还要执行一个预处理步骤。网络要求输入文本是标准大小，所以我们需要将影评变形为特定的长度。为了满足该要求，我们将完成两大步骤：

1.删除超长或超短的影评，即离群值 2.填充或截断剩余数据，使所有影评长度一样。

In [50]:
review_lens = Counter([len(x) for x in reviews_ints])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 0
Maximum review length: 2633


In [51]:
print(len(review_lens))

1063


In [52]:
print('Number of reviews before removing outliers: ', len(reviews_ints))

non_zero_idx = [ii for ii, review in enumerate(reviews_ints) if len(review) != 0]

reviews_ints = [reviews_ints[ii] for ii in non_zero_idx]
encoded_labels = np.array([encoded_labels[ii] for ii in non_zero_idx])

print('Number of reviews after removing outliers: ', len(reviews_ints))

Number of reviews before removing outliers:  25000
Number of reviews after removing outliers:  25000


对于很短和很长的影评，我们将通过填充或截断方式使影评保持特定长度。对于短于 seq_length 的影评，我们将用 0 填充它。对于长于 seq_length 的影评，我们将截取前 seq_length 个字词。
最终 features 数组应该为二维数组，行数等于影评数，列数等于指定的 seq_length。

In [53]:
def pad_features(reviews_ints, seq_length):
    features=np.zeros([len(reviews_ints),seq_length],dtype = int)#先初始化特征
    
    for i,review in enumerate(reviews_ints):
        features[i,-len(review):] = np.array(review)[:seq_length]#很好的赋值方法
    
    return features

In [57]:
seq_length = 200

features = pad_features(reviews_ints, seq_length=seq_length)
print(len(reviews_ints[0]))
assert len(features)==len(reviews_ints), "Your features should have as many rows as reviews."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."

print(features[:30,-10:])

167
[[  2977    103 327192  16159 163009    229  73245  96352   3177  34081]
 [   135     24 164107     13     42  14654  26957   6679    136 327192]
 [336713      4  16803     23    167  30138  14654   9177  16159  30138]
 [327192 101872 101872   9158   1079   1368 336713    541   2382      4]
 [  2580  12724  87623  11478  96352  12047 107328   3739   1601 327192]
 [ 48208    222  44343  21560    861  20617    279  44343    989 327192]
 [  4445 164107   1152  30626  20617   9763  46933 163009   1310 327192]
 [  1110  22906   1246    983 145864  76000 107328  10659    146 327192]
 [ 42603      2 164107     53   3406   2588  11385   1191  10659 327192]
 [ 40155 107328    240   1132 164107    816 327192  42603    866  73245]
 [327192  20498 327192 327192 327192 101872 101872 336713     43 327192]
 [    10 327192  87623  12436  27731 135720    572  44125  73245 327192]
 [  2483 135720     93   4971 164107    734 163009    157   1866  11988]
 [327192   9763 145864 336713    769 107328    

准备好数据后，我们需要将数据拆分为训练集、验证集和测试集。

In [58]:
split_frac = 0.8
#训练集、验证集、测试集分割索引
train_id = int(len(features)*split_frac)
valid_id = (train_id+len(features))//2

#encoded_labels = np.array(encoded_labels)

train_x , train_y = features[:train_id] , encoded_labels[:train_id]
valid_x , valid_y = features[train_id:valid_id] , encoded_labels[train_id:valid_id]
test_x  , test_y  = features[valid_id:] , encoded_labels[valid_id:]


print(train_x.shape)
print(valid_x.shape)
print(test_x.shape)
type(train_y)

(20000, 200)
(2500, 200)
(2500, 200)


numpy.ndarray

DataLoader 和批处理
- 创建训练集、测试集和验证集后，再创建 DataLoader： 1.使用 TensorDataset 创建一种已知数据格式。TensorDataset 的参数包括输入数据集和目标数据集，并且第一个维度一样，然后创建一个数据集。 2.创建 DataLoader 并批处理训练、验证和测试张量数据集。

In [60]:
# Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(valid_x), torch.from_numpy(valid_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
batch_size = 50

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [61]:
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 200])
Sample input: 
 tensor([[ 76000,   1484,  48208,  ...,  40155,  12047,  48208],
        [     0,      0,      0,  ...,   9163,    237,   6890],
        [     0,      0,      0,  ...,  16803,  34230,    158],
        ...,
        [ 87623,    296, 336713,  ...,    213,   8364,   5134],
        [     0,      0,      0,  ..., 101872,    279,    169],
        [     0,      0,      0,  ..., 164107,   4007, 327192]],
       dtype=torch.int32)

Sample label size:  torch.Size([50])
Sample label: 
 tensor([1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
        0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 1], dtype=torch.int32)


层级结构： 
- 1.一个嵌入层：将字词标记（整数）转换为特定大小的嵌入。 
- 2.一个 LSTM 层级：由 hidden_state 大小和层级数量定义。
- 3.一个全连接输出层：将 LSTM 层级输出映射到期望的 output_size。
- 4.一个 S 型激活层：将所有输出转换为 0-1 之间的值；仅返回最后一个 S 型函数输出值作为网络的输出。

In [119]:
class SentimentRNN(nn.Module):
    #文本情感分析
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        #初始化模型
        super(SentimentRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # 定义所有层级
        self.embed = nn.Embedding(vocab_size,embedding_dim)
        self.lstm = nn.LSTM(embedding_dim,hidden_dim,n_layers,drop_prob,batch_first=True)
        self.dropput = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim,output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden):
        #前馈
        batch_size = x.shape[0]
        # 输出sigmoid值和隐藏层状态（下一个时间步利用）
        print(x.shape)
        x = self.embed(x)
        lstm_out,hidden = self.lstm(x,hidden)
        
        lstm_out = lstm_out.contiguous.view(-1,hidden_dim)
        out = self.fc(lstm_out)
        out = out.view(batch_size,-1)
        out = self.dropput(out)
        sig_out = self.sigmoid(out)
        sig_out = sig_out[:,-1]
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        # 为隐藏层状态和LSTM的结点状态初始化
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        return hidden

实例化网络，定义超参数。
- vocab_size：词汇表的大小或输入（字词标记）的值范围。
- output_size：期望输出的大小；希望输出的类别分数数量（正面/负面）。
- embedding_dim：嵌入查询表的列数；嵌入大小。
- hidden_dim：隐藏层的 LSTM 单元数量。
- n_layers：网络中的 LSTM 层级数量。

In [116]:
vocab_size = len(vocab_to_int)
output_size = 2
embedding_dim = 200
hidden_dim = 256
n_layers = 3

net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

print(net)

SentimentRNN(
  (embedding): Embedding(74073, 200)
  (lstm): LSTM(200, 256, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3)
  (fc): Linear(in_features=256, out_features=2, bias=True)
  (sig): Sigmoid()
)


损失函数为BCELoss，即二元交叉熵损失，对在 0-1 之间的单个值应用交叉熵损失。

In [117]:
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [None]:
epochs = 4

counter = 0
print_every = 100
clip=5 # gradient clipping

net.train()
# train for some number of epochs
for e in range(epochs):
    # 初始化隐藏层
    h = net.init_hidden(batch_size)

    # 加载器取数据
    for inputs, labels in train_loader:
        counter += 1

        # Creating new variables for the hidden state, otherwise
        #在整个训练历史中反向传播
        h = tuple([each.data for each in h])

        # 将累计的梯度清零
        net.zero_grad()
        print(inputs.shape)
        output, h = net(inputs, h)

        # 计算损失，执行反向传播
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

测试

In [None]:

test_losses = []
num_correct = 0

# 初始化隐藏层
h = net.init_hidden(batch_size)
#进入测试模式
net.eval()

for inputs, labels in test_loader:
    
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()
    
    
    output, h = net(inputs, h)
    
    
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # 根据输出概率判断类别
    pred = torch.round(output.squeeze())
    
    # 预测标签与实际标签对比
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


#测试损失求平均
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# 测试数据上的精度
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))