In [148]:
import torch
from torch import nn
import plotly.graph_objects as go
import torchkeras
import string, re
import torchtext

In [114]:
MAX_WORDS = 10000  # 仅考虑最高频的10000个词
MAX_LEN = 200  # 每个样本保留200个词的长度
BATCH_SIZE = 20

In [115]:
# 分词方法
tokenizer = lambda x: re.sub('[%s]' % string.punctuation, "", x).split(" ")


# 过滤掉低频词
def filterLowFreqWords(arr, vocab):
    arr = [[x if x < MAX_WORDS else 0 for x in example]
           for example in arr]
    return arr


TEXT = torchtext.legacy.data.Field(sequential=True, tokenize=tokenizer, lower=True,
                                   fix_length=MAX_LEN, postprocessing=filterLowFreqWords)

LABEL = torchtext.legacy.data.Field(sequential=False, use_vocab=False)

In [116]:
# 2,构建表格型dataset
# torchtext.data.TabularDataset可读取csv,tsv,json等格式
ds_train, ds_test = torchtext.legacy.data.TabularDataset.splits(
    path='../../Datasets/imdb', train='train.csv', test='test.csv', format='csv',
    fields=[('label', LABEL), ('text', TEXT)], skip_header=True)

In [117]:
# 3,构建词典
TEXT.build_vocab(ds_train)

In [118]:
# 4,构建数据管道迭代器
train_iter, test_iter = torchtext.legacy.data.Iterator.splits(
    (ds_train, ds_test),
    sort_within_batch=True,
    sort_key=lambda x: len(x.text),
    batch_sizes=(BATCH_SIZE, BATCH_SIZE)
)

In [119]:
#查看example信息
print(ds_train[0].text)
print(ds_train[0].label)
print('.' * 16)

# 查看词典信息
print(len(TEXT.vocab))
print('+' * 16)

#itos: index to string
# print(TEXT.vocab.itos)
print(TEXT.vocab.itos[0])
print(TEXT.vocab.itos[1])
print('-' * 16)

#stoi: string to index
print(TEXT.vocab.stoi['<unk>'])  #unknown 未知词
print(TEXT.vocab.stoi['<pad>'])  #padding 填充
print('=' * 16)

#freqs: 词频
print(TEXT.vocab.freqs['<unk>'])
print(TEXT.vocab.freqs['a'])
print(TEXT.vocab.freqs['good'])
print('*' * 16)

# 查看数据管道信息
# 注意有坑：text第0维是句子长度
for batch in train_iter:
    features = batch.text
    labels = batch.label
    print(features)
    print(labels)
    print(features.shape, '-->', labels.shape)
    break

['one', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching', 'just', '1', 'oz', 'episode', 'youll', 'be', 'hooked', 'they', 'are', 'right', 'as', 'this', 'is', 'exactly', 'what', 'happened', 'with', 'mebr', 'br', 'the', 'first', 'thing', 'that', 'struck', 'me', 'about', 'oz', 'was', 'its', 'brutality', 'and', 'unflinching', 'scenes', 'of', 'violence', 'which', 'set', 'in', 'right', 'from', 'the', 'word', 'go', 'trust', 'me', 'this', 'is', 'not', 'a', 'show', 'for', 'the', 'faint', 'hearted', 'or', 'timid', 'this', 'show', 'pulls', 'no', 'punches', 'with', 'regards', 'to', 'drugs', 'sex', 'or', 'violence', 'its', 'is', 'hardcore', 'in', 'the', 'classic', 'use', 'of', 'the', 'wordbr', 'br', 'it', 'is', 'called', 'oz', 'as', 'that', 'is', 'the', 'nickname', 'given', 'to', 'the', 'oswald', 'maximum', 'security', 'state', 'penitentary', 'it', 'focuses', 'mainly', 'on', 'emerald', 'city', 'an', 'experimental', 'section', 'of', 'the', 'prison', 'where', 'all', '

In [121]:
# 将数据管道组织成torch.utils.data.DataLoader相似的features,label输出形式
class DataLoader:
    def __init__(self, data_iter):
        self.data_iter = data_iter
        self.length = len(data_iter)

    def __len__(self):
        return self.length

    def __iter__(self):
        # 注意：此处调整features为 batch first，并调整label的shape和dtype
        for batch in self.data_iter:
            yield (torch.transpose(batch.text, 0, 1),
                   torch.unsqueeze(batch.label.float(), dim=1))


dl_train = DataLoader(train_iter)
dl_test = DataLoader(test_iter)

for feature, label in dl_train:
    # print(feature)
    # print(label)
    print(feature.shape, '-->', label.shape)
    break

torch.Size([20, 200]) --> torch.Size([20, 1])


In [143]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        #设置padding_idx参数后将在训练过程中将填充的token始终赋值为0向量
        self.embedding = nn.Embedding(num_embeddings=MAX_WORDS, embedding_dim=3, padding_idx=1)
        self.conv = nn.Sequential()
        self.conv.add_module("conv_1", nn.Conv1d(in_channels=3, out_channels=16, kernel_size=5))
        self.conv.add_module("pool_1", nn.MaxPool1d(kernel_size=2))
        self.conv.add_module("relu_1", nn.ReLU())
        self.conv.add_module("conv_2", nn.Conv1d(in_channels=16, out_channels=128, kernel_size=2))
        self.conv.add_module("pool_2", nn.MaxPool1d(kernel_size=2))
        self.conv.add_module("relu_2", nn.ReLU())
        self.dense = nn.Sequential()
        self.dense.add_module("flatten", nn.Flatten())
        self.dense.add_module("linear", nn.Linear(6144, 1))
        self.dense.add_module("sigmoid", nn.Sigmoid())

    def forward(self, x):
        x = self.embedding(x).transpose(1, 2)
        x = self.conv(x)
        y = self.dense(x)
        return y


model = Net()
model = torchkeras.Model(model)
model.summary(input_shape=(200,), input_dtype=torch.LongTensor)



----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
         Embedding-1               [-1, 200, 3]          30,000
            Conv1d-2              [-1, 16, 196]             256
         MaxPool1d-3               [-1, 16, 98]               0
              ReLU-4               [-1, 16, 98]               0
            Conv1d-5              [-1, 128, 97]           4,224
         MaxPool1d-6              [-1, 128, 48]               0
              ReLU-7              [-1, 128, 48]               0
           Flatten-8                 [-1, 6144]               0
            Linear-9                    [-1, 1]           6,145
          Sigmoid-10                    [-1, 1]               0
Total params: 40,625
Trainable params: 40,625
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.000763
Forward/backward pass size (MB): 0.287796
Params size (MB): 0.154972
E

In [145]:
# 准确率
def accuracy(y_pred, y_true):
    y_pred = torch.where(
        y_pred > 0.5,
        torch.ones_like(y_pred, dtype=torch.float32),
        torch.zeros_like(y_pred, dtype=torch.float32)
    )
    acc = torch.mean(1 - torch.abs(y_true - y_pred))
    return acc

In [147]:
# torchkeras

model.compile(
    loss_func=nn.BCELoss(),
    optimizer=torch.optim.Adagrad(model.parameters(), lr=0.02),
    metrics_dict={"accuracy": accuracy}
)
dfhistory = model.fit(20, dl_train, dl_val=dl_test, log_step_freq=200)

Start Training ...

{'step': 200, 'loss': 0.753, 'accuracy': 0.511}
{'step': 400, 'loss': 0.724, 'accuracy': 0.507}
{'step': 600, 'loss': 0.713, 'accuracy': 0.513}
{'step': 800, 'loss': 0.707, 'accuracy': 0.514}
{'step': 1000, 'loss': 0.703, 'accuracy': 0.52}
{'step': 1200, 'loss': 0.699, 'accuracy': 0.527}
{'step': 1400, 'loss': 0.696, 'accuracy': 0.535}
{'step': 1600, 'loss': 0.691, 'accuracy': 0.545}
{'step': 1800, 'loss': 0.686, 'accuracy': 0.554}
{'step': 2000, 'loss': 0.681, 'accuracy': 0.563}

 +-------+-------+----------+----------+--------------+
| epoch |  loss | accuracy | val_loss | val_accuracy |
+-------+-------+----------+----------+--------------+
|   1   | 0.681 |  0.563   |  0.631   |     0.65     |
+-------+-------+----------+----------+--------------+

{'step': 200, 'loss': 0.621, 'accuracy': 0.668}
{'step': 400, 'loss': 0.617, 'accuracy': 0.666}
{'step': 600, 'loss': 0.613, 'accuracy': 0.67}
{'step': 800, 'loss': 0.608, 'accuracy': 0.674}
{'step': 1000, 'loss': 0.6

In [153]:
fig = go.Figure()
fig.add_trace(go.Scatter(y=dfhistory['loss'], name='loss',mode='lines+markers'))
fig.add_trace(go.Scatter(y=dfhistory['val_loss'], name='val_loss',mode='lines+markers'))
fig.add_trace(go.Scatter(y=dfhistory['accuracy'], name='accuracy',mode='lines+markers'))
fig.add_trace(go.Scatter(y=dfhistory['val_accuracy'], name='val_accuracy',mode='lines+markers'))
fig.show()

In [167]:
# 评估
model.evaluate(dl_test)

{'val_loss': 0.4404172195643187, 'val_accuracy': 0.8140000014305114}

In [166]:
y_hat = model.predict(dl_test)
y_hat = torch.where(y_hat>0.5,
                     torch.ones_like(y_hat, dtype=torch.float32),
                     torch.zeros_like(y_hat, dtype=torch.float32))
y_ture = [y for _,y in dl_test]
print(y_ture)

fig = go.Figure()
fig.add_trace(go.Scatter(y=y_hat, name='loss',mode='lines+markers'))
fig.add_trace(go.Scatter(y=dfhistory['val_loss'], name='val_loss',mode='lines+markers'))
fig.show()


[tensor([[1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.]]), tensor([[0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.]]), tensor([[1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.]]), tensor([[0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],

[tensor([[1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.]]), tensor([[0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.]]), tensor([[1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.]]), tensor([[0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],

[tensor([[1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.]]), tensor([[0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.]]), tensor([[1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.]]), tensor([[0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],

[tensor([[1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.]]), tensor([[0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.]]), tensor([[1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.]]), tensor([[0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],

[tensor([[1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.]]), tensor([[0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.]]), tensor([[1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.]]), tensor([[0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],