In [366]:
import os
import jieba
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, TensorDataset
from NNUtils.torchwu import torchwu, cuda, metrics

In [367]:
seq_length = 100  # 句子长度
BATCH_SIZE = 50
EPOCHS = 20

In [368]:
PROJECT_ROOT_PATH = os.path.join(os.path.abspath('.'), '..', '..', '..')

# 使用jieba进行分词
f = open('new_wangfeng.txt', 'r', encoding='utf-8')
all_str = f.read().replace('\n', '').replace(' ', '')  # 去除空格
f.close()
cut_list = jieba.cut(all_str)
seg_list = []  # 分词后的文本数据
for c in cut_list:
    seg_list.append(c)

In [369]:
# 生成one-hot
vocab = sorted(list(set(seg_list)))
word_to_int = dict((w, i) for i, w in enumerate(vocab))
int_to_word = dict((i, w) for i, w in enumerate(vocab))

n_words = len(seg_list)  # 总词量
n_vocab = len(vocab)  # 词表长度
print('总词汇量：', n_words)
print('词表长度：', n_vocab)

总词汇量： 22439
词表长度： 2840


In [370]:

dataX = []
dataY = []
for i in range(0, n_words - seq_length, 1):
    seq_in = seg_list[i:i + seq_length + 1]
    dataX.append([word_to_int[word] for word in seq_in])
# 乱序
np.random.shuffle(dataX)
for i in range(len(dataX)):
    dataY.append([dataX[i][seq_length]])
    dataX[i] = dataX[i][:seq_length]

n_simples = len(dataX)
print('样本数：', n_simples)

样本数： 22339


In [371]:
X = torch.tensor(dataX, dtype=torch.long).reshape((-1, seq_length))
Y = torch.tensor(dataY, dtype=torch.float).reshape((-1, 1))
Y_onehot = torch.zeros(Y.shape[0], int(torch.max(Y).item()) + 1)
Y_onehot[torch.arange(Y.shape[0]), Y[:, 0].long()] = 1
print(Y_onehot)
Y = Y_onehot
print('Toatal: ', X.shape, '-->', Y.shape)

ds = TensorDataset(X, Y)
dl = DataLoader(ds, batch_size=BATCH_SIZE, num_workers=0)
x, y = next(iter(dl))
print('1Batch: ', x.shape, '-->', y.shape)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
Toatal:  torch.Size([22339, 100]) --> torch.Size([22339, 2840])
1Batch:  torch.Size([50, 100]) --> torch.Size([50, 2840])


In [372]:
# 定义模型
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # (batch, seq_len)
        self.embedding = nn.Embedding(num_embeddings=n_vocab, embedding_dim=3, padding_idx=1)
        # (batch, seq_len, features)
        self.lstm = nn.LSTM(input_size=3, hidden_size=60, num_layers=3, batch_first=True)
        # (batch, out_features)
        self.fc = nn.Linear(in_features=60, out_features=2840)
        # (batch, 2840)

    def forward(self, x):
        # print(x.shape)
        x = self.embedding(x)
        # print(x.shape)
        # x is input, size (batch_size, seq_len, input_size)
        x, _ = self.lstm(x)
        # x is output, size (batch_size, seq_len, hidden_size)
        x = x[:, -1, :]
        # print(x.shape)
        x = self.fc(x)
        x = F.softmax(x, dim=1)
        # x = x.view(-1, 1)
        # print(x)
        return x


model = torchwu.Model(Net())
model.summary(input_shape=(seq_length,), input_dtype=torch.LongTensor, batch_size=BATCH_SIZE)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
         Embedding-1               [50, 100, 3]           8,520
              LSTM-2              [50, 100, 60]          74,160
            Linear-3                 [50, 2840]         173,240
Total params: 255,920
Trainable params: 255,920
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.019073
Forward/backward pass size (MB): 3.486633
Params size (MB): 0.976257
Estimated Total Size (MB): 4.481964
----------------------------------------------------------------


In [373]:
DEVICE = cuda.try_gpu()
model = torchwu.Model(Net())
# model.summary(input_shape=(seq_length, 1))
model.compile(
    loss_func=F.cross_entropy,
    optimizer=torch.optim.Adam(model.parameters(), lr=1e-3),
    device=DEVICE,
    # metrics_dict={'acc': metrics.accuracy_multi_clf}
)
dfhistory = model.fit(
    epochs=EPOCHS,
    dl_train=dl,
    log_step_freq=300
)

[2022-04-10 23:50:10] Start Training ...
[2022-04-10 23:50:14] epoch=1/20, step=300/447, loss=7.828, 
[2022-04-10 23:50:16] epoch=1/20, loss=7.818, 
[2022-04-10 23:50:20] epoch=2/20, step=300/447, loss=7.798, 
[2022-04-10 23:50:22] epoch=2/20, loss=7.798, 
[2022-04-10 23:50:25] epoch=3/20, step=300/447, loss=7.798, 
[2022-04-10 23:50:27] epoch=3/20, loss=7.798, 
[2022-04-10 23:50:31] epoch=4/20, step=300/447, loss=7.798, 
[2022-04-10 23:50:33] epoch=4/20, loss=7.798, 
[2022-04-10 23:50:37] epoch=5/20, step=300/447, loss=7.798, 
[2022-04-10 23:50:38] epoch=5/20, loss=7.798, 
[2022-04-10 23:50:43] epoch=6/20, step=300/447, loss=7.798, 
[2022-04-10 23:50:44] epoch=6/20, loss=7.798, 
[2022-04-10 23:50:48] epoch=7/20, step=300/447, loss=7.798, 
[2022-04-10 23:50:50] epoch=7/20, loss=7.798, 
[2022-04-10 23:50:54] epoch=8/20, step=300/447, loss=7.798, 
[2022-04-10 23:50:55] epoch=8/20, loss=7.798, 
[2022-04-10 23:50:59] epoch=9/20, step=300/447, loss=7.798, 
[2022-04-10 23:51:01] epoch=9/20, 

KeyboardInterrupt: 

In [None]:
state_dict = model.state_dict()
torch.save(model.state_dict(), 'net_para.pkl')
print(state_dict.keys())

In [None]:
state_dict_load = torch.load('net_para.pkl')
model_clone = torchwu.Model(Net())
model_clone.load_state_dict(state_dict_load)

In [None]:
y_h1 = model.predict(dl)
y_h2 = model.predict(dl)
print(y_h1)
print(y_h2)

import d2l.torch as d2l

d2l.train_ch8()