In [1]:
# Blog post link: https://blog.csdn.net/qq_35687547/article/details/102172775


import torch
from torchtext import data

SEED = 1234

torch.manual_seed(SEED)  # seed the CPU random number generator
torch.cuda.manual_seed(SEED)  # seed the GPU (CUDA) random number generator
# Ask cuDNN to use deterministic algorithms so repeated runs give the same results.
# NOTE: this is a reproducibility setting, not a speed-up; it may make some ops slower.
torch.backends.cudnn.deterministic = True

# First create two Field objects; they describe how the text and label data are preprocessed.
# tokenize='spacy' selects the spaCy English tokenizer (similar in role to NLTK). Per the
# original author's note, without a tokenize argument the string is only split on whitespace.
# torchtext.data.Field defines how a field (text field or label field) is processed.
TEXT = data.Field(tokenize='spacy')
# LabelField is a specialised subclass of Field dedicated to labels.
LABEL = data.LabelField(dtype=torch.float)

# Load the IMDB movie-review dataset.
# NOTE(review): root is a hard-coded absolute path; adjust it for the machine running the notebook.
from torchtext import datasets
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL,root='/root/IMDB/')

In [2]:
# Show the attributes of the first training example (the 'text' entry is the token list).
print(vars(train_data.examples[0]))

{'text': ['Zentropa', 'has', 'much', 'in', 'common', 'with', 'The', 'Third', 'Man', ',', 'another', 'noir', '-', 'like', 'film', 'set', 'among', 'the', 'rubble', 'of', 'postwar', 'Europe', '.', 'Like', 'TTM', ',', 'there', 'is', 'much', 'inventive', 'camera', 'work', '.', 'There', 'is', 'an', 'innocent', 'American', 'who', 'gets', 'emotionally', 'involved', 'with', 'a', 'woman', 'he', 'does', "n't", 'really', 'understand', ',', 'and', 'whose', 'naivety', 'is', 'all', 'the', 'more', 'striking', 'in', 'contrast', 'with', 'the', 'natives.<br', '/><br', '/>But', 'I', "'d", 'have', 'to', 'say', 'that', 'The', 'Third', 'Man', 'has', 'a', 'more', 'well', '-', 'crafted', 'storyline', '.', 'Zentropa', 'is', 'a', 'bit', 'disjointed', 'in', 'this', 'respect', '.', 'Perhaps', 'this', 'is', 'intentional', ':', 'it', 'is', 'presented', 'as', 'a', 'dream', '/', 'nightmare', ',', 'and', 'making', 'it', 'too', 'coherent', 'would', 'spoil', 'the', 'effect', '.', '<', 'br', '/><br', '/>This', 'movie', 'i

In [3]:
import random

# Carve a validation set out of the training data; split_ratio defaults to 0.7.
# random.seed() returns None, so the original call passed random_state=None.
# Seed the generator first, then hand its state to split() explicitly.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

In [4]:
# Report how many examples ended up in each split.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')


Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [5]:
# Build the vocabulary from the training split and attach pretrained vectors to it:
# the vectors for the words of this corpus are pulled out of the pretrained set.
# The pretrained vectors are GloVe "glove.6B.100d" (100 dimensions per word), trained on a very large corpus.
# Our movie-review corpus is smaller, so the vectors still need fine-tuning; GloVe is used as the initial values.
# max_size=25000 caps the number of words kept; unk_init is the initializer torchtext
# applies to words that have no pretrained vector (here an in-place normal draw).
TEXT.build_vocab(train_data, max_size=25000, vectors="glove.6B.100d", unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

In [6]:
# Vocabulary sizes. The TEXT count is max_size (25000) plus the two special tokens (unk and pad).
print(f'Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}')
print(f'Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}')

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [7]:
# Index -> label string mapping of the label vocabulary.
print(LABEL.vocab.itos)

['neg', 'pos']


In [8]:
# Label string -> index mapping of the label vocabulary.
print(LABEL.vocab.stoi)

defaultdict(None, {'neg': 0, 'pos': 1})


In [9]:
# Token -> index mapping. Only the first few entries are shown; printing the whole
# mapping would dump the entire 25k-word vocabulary into the notebook.
print(list(TEXT.vocab.stoi.items())[:10])



In [10]:
# The more frequent a word is in the corpus, the smaller its index.
# The first two entries are the unk and pad special tokens by default.
# Only the head of the list is shown; the full list holds the whole vocabulary.
print(TEXT.vocab.itos[:10])



In [11]:
# The 20 most frequent tokens in the training corpus with their counts.
print(TEXT.vocab.freqs.most_common(20))

[('the', 204107), (',', 194270), ('.', 166464), ('a', 110139), ('and', 109993), ('of', 101553), ('to', 94337), ('is', 76827), ('in', 61595), ('I', 54376), ('it', 53662), ('that', 49415), ('"', 44401), ("'s", 43781), ('this', 42212), ('-', 37510), ('/><br', 35656), ('was', 35057), ('as', 30614), ('with', 30210)]


In [12]:
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Essentially this splits the samples into batches, with one extra step: examples of
# similar length are grouped into the same batch where possible, and the shorter ones
# are filled up with padding.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device
)


In [13]:
# Inspect one training batch. Fetch it once so the labels and the text shown belong
# to the same batch; the original drew two separate batches and discarded the first result.
sample_batch = next(iter(train_iterator))
print(sample_batch.label)
sample_batch.text

tensor([[ 390,   66,   11,  ..., 3924,  390,  261],
        [1536,   19,   63,  ...,  927,  366,   51],
        [  76,  103,   47,  ...,  833,  961, 7589],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])

In [14]:
# Running this again shows that the (padded) review length differs from batch to batch.
# The discarded extra next(iter(...)) call of the original was dropped: it displayed nothing.
next(iter(train_iterator)).text

tensor([[ 3714,   973,   273,  ...,    11,   637,    11],
        [   16,   853,   196,  ...,    19,     2,   458],
        [   23,   266,     7,  ...,  1321, 11801,    93],
        ...,
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1]])

In [15]:
import torch.nn as nn
import torch.nn.functional as F

class WordAVGModel(nn.Module):
  """Word-averaging classifier: embed every token, average over the sentence, apply a linear layer."""

  def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
    """
    vocab_size: number of rows of the embedding table (vocabulary length).
    embedding_dim: size of each word vector.
    output_dim: size of the model output (1 is enough for a single logit).
    pad_idx: index of the padding token; handed to nn.Embedding as padding_idx.
    """
    super().__init__()

    # Lookup table turning each token index into a word vector.
    self.embedding = nn.Embedding(
        num_embeddings=vocab_size,
        embedding_dim=embedding_dim,
        padding_idx=pad_idx,
    )

    # Maps the averaged sentence vector to the output.
    self.fc = nn.Linear(in_features=embedding_dim, out_features=output_dim)

  def forward(self, text):
    """
    text: token indices shaped [sent_len, batch_size].
    Returns a tensor shaped [batch_size, output_dim].
    """
    # [sent_len, batch_size] -> [sent_len, batch_size, emb_dim]
    vectors = self.embedding(text)

    # Swap the first two axes -> [batch_size, sent_len, emb_dim]
    vectors = vectors.transpose(0, 1)

    # The pooling window covers the whole sentence axis and a single embedding column,
    # so every embedding dimension is averaged over all positions (padding positions
    # included): [batch_size, sent_len, emb_dim] -> [batch_size, 1, emb_dim]
    sent_len = vectors.shape[1]
    averaged = F.avg_pool2d(vectors, kernel_size=(sent_len, 1))

    # Drop the singleton axis -> [batch_size, emb_dim], then project to [batch_size, output_dim]
    return self.fc(averaged.squeeze(1))


In [16]:
INPUT_DIM = len(TEXT.vocab)  # 25002
EMBEDDING_DIM = 100
OUTPUT_DIM = 1

# Index of the pad token in the vocabulary (PAD_IDX = 1)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = WordAVGModel(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)

In [17]:
# Confirm the index of the padding token.
print(PAD_IDX)

1


In [18]:
# Count the trainable parameters
def count_parameters(model):
  """Return the total number of elements over all parameters of `model` that require gradients."""
  total = 0
  for param in model.parameters():
    if param.requires_grad:
      # numel() is the number of elements in the tensor
      total += param.numel()
  return total

# Report the model size with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')


The model has 2,500,301 trainable parameters


In [19]:
# Use the word vectors selected above via vectors="glove.6B.100d" as the initial embedding weights.
# The table has one 100-dimensional row per vocabulary entry (25002 rows: 25000 words plus unk and pad).
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.6345,  0.5735, -0.3945,  ..., -0.4865,  0.1097,  0.5122],
        [ 0.6406,  0.3674,  0.8373,  ..., -0.1512, -0.0284, -0.1952],
        [-0.1123,  0.3113,  0.3317,  ..., -0.4576,  0.6191,  0.5304]])

In [20]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]  # UNK_IDX = 0

# The vocabulary has 25002 entries; the first two (unk and pad) also need initial values.
# Their rows were just overwritten by the copy above, so reset them to zero vectors.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [21]:
# Display the embedding table; the first two rows (unk, pad) should now be all zeros.
model.embedding.weight.data

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.6345,  0.5735, -0.3945,  ..., -0.4865,  0.1097,  0.5122],
        [ 0.6406,  0.3674,  0.8373,  ..., -0.1512, -0.0284, -0.1952],
        [-0.1123,  0.3113,  0.3317,  ..., -0.4576,  0.6191,  0.5304]])

In [22]:
import torch.optim as optim

# Optimizer: Adam over all model parameters, with default hyper-parameters
optimizer = optim.Adam(model.parameters())

# Loss: BCEWithLogitsLoss, the binary-classification loss that takes raw logits
# (the sigmoid is applied inside the loss)
criterion = nn.BCEWithLogitsLoss()

# Move the model and the loss to the selected device (GPU when available)
model = model.to(device)
criterion = criterion.to(device)

In [23]:
# Compute the prediction accuracy

def binary_accuracy(preds, y):
  """
  Fraction of correct predictions in a batch (8 right out of 10 gives 0.8, not 8).

  preds: raw logits; y: target labels (neg is 0, pos is 1).
  Returns a 0-dim tensor.
  """
  # Squash logits to probabilities, then round to a hard 0/1 decision.
  probabilities = torch.sigmoid(preds)
  hard_labels = probabilities.round()

  # Element-wise comparison yields a mask of hits; cast to float so it can be summed and divided.
  matches = hard_labels.eq(y).float()

  return matches.sum() / len(matches)

In [24]:
def train(model, iterator, optimizer, criterion):
  """
  Run one training epoch over `iterator` (the training iterator).

  Returns (average loss per example, average accuracy per example).
  """
  # Training mode: layers such as Dropout / BatchNorm behave in their training fashion.
  model.train()

  loss_sum, correct_sum, n_seen = 0, 0, 0

  for batch in iterator:
    labels = batch.label
    batch_size = len(labels)

    # Clear gradients left over from the previous step so they do not accumulate.
    optimizer.zero_grad()

    # batch.text is the `text` argument of the model's forward();
    # squeeze the trailing axis so the shape lines up with the labels.
    logits = model(batch.text).squeeze(1)

    loss = criterion(logits, labels)
    acc = binary_accuracy(logits, labels)

    loss.backward()   # back-propagation
    optimizer.step()  # parameter update

    # Both loss and acc are per-batch averages (assuming the criterion averages over
    # the batch), so scale them back up by the batch size before accumulating.
    loss_sum += loss.item() * batch_size
    correct_sum += acc.item() * batch_size

    # Running count of examples seen in this epoch.
    n_seen += batch_size

  # Per-example averages over the whole epoch.
  return loss_sum / n_seen, correct_sum / n_seen


In [25]:
# No optimizer is needed for evaluation
def evaluate(model, iterator, criterion):
  """
  Evaluate `model` on every batch of `iterator` without updating its weights.

  Returns (average loss per example, average accuracy per example).
  Raises ZeroDivisionError when the iterator yields no batches.

  The model is put in eval mode for the duration of the call. Afterwards the mode it
  had on entry is restored (also when an error occurs), instead of unconditionally
  forcing train mode as before.
  """
  epoch_loss = 0
  epoch_acc = 0
  total_len = 0

  # Remember the caller's mode so it can be restored on the way out.
  was_training = model.training

  # Evaluation mode: freezes dropout and similar layers.
  model.eval()

  try:
    # No back-propagation or parameter update here, so skip gradient tracking.
    with torch.no_grad():
      for batch in iterator:
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)

        # loss and acc are per-batch averages; weight them by the batch size.
        epoch_loss += loss.item() * len(batch.label)
        epoch_acc += acc.item() * len(batch.label)
        total_len += len(batch.label)
  finally:
    model.train(was_training)

  return epoch_loss / total_len, epoch_acc / total_len


In [26]:
import time 

# Report how long an epoch took
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into (whole minutes, remaining whole seconds)."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds


In [27]:
N_EPOCHS = 10

best_valid_loss = float('inf')  # start with an infinite validation loss so the first epoch always counts as an improvement

for epoch in range(N_EPOCHS):
  start_time = time.time()
  
  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
  
  end_time = time.time()
  
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)
  
  # Checkpoint the parameters whenever the validation loss improves,
  # so the file always holds the best model seen so far.
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'wordavg-model.pt')
    
  print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
  print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 30s
	Train Loss: 0.685 | Train Acc: 57.15%
	 Val. Loss: 0.622 | Val. Acc: 72.31%
Epoch: 02 | Epoch Time: 0m 29s
	Train Loss: 0.644 | Train Acc: 71.05%
	 Val. Loss: 0.514 | Val. Acc: 75.59%
Epoch: 03 | Epoch Time: 0m 28s
	Train Loss: 0.574 | Train Acc: 78.35%
	 Val. Loss: 0.453 | Val. Acc: 79.76%
Epoch: 04 | Epoch Time: 0m 28s
	Train Loss: 0.502 | Train Acc: 82.86%
	 Val. Loss: 0.441 | Val. Acc: 82.76%
Epoch: 05 | Epoch Time: 0m 29s
	Train Loss: 0.440 | Train Acc: 85.82%
	 Val. Loss: 0.436 | Val. Acc: 84.61%
Epoch: 06 | Epoch Time: 0m 28s
	Train Loss: 0.388 | Train Acc: 87.75%
	 Val. Loss: 0.451 | Val. Acc: 85.93%
Epoch: 07 | Epoch Time: 0m 29s
	Train Loss: 0.348 | Train Acc: 89.17%
	 Val. Loss: 0.477 | Val. Acc: 86.59%
Epoch: 08 | Epoch Time: 0m 29s
	Train Loss: 0.315 | Train Acc: 90.14%
	 Val. Loss: 0.496 | Val. Acc: 87.08%
Epoch: 09 | Epoch Time: 0m 27s
	Train Loss: 0.291 | Train Acc: 90.85%
	 Val. Loss: 0.521 | Val. Acc: 87.45%
Epoch: 10 | Epoch Time: 0m 2

In [28]:
# Predict with the saved parameters: restore the checkpoint with the lowest validation loss.
# map_location puts the tensors on the active device, so a checkpoint written on a GPU
# can also be loaded on a CPU-only machine.
model.load_state_dict(torch.load("wordavg-model.pt", map_location=device))

In [29]:
# spaCy is the tokenization tool used here, similar in role to NLTK.
# NOTE(review): the 'en' shortcut name is a spaCy 2.x convention; spaCy 3 expects the full
# package name (e.g. 'en_core_web_sm') — confirm against the installed version.
import spacy  
nlp = spacy.load('en')

def predict_sentiment(sentence):
  """
  Return the model's probability (between 0 and 1) that `sentence` is a positive review.

  The sentence is tokenized with spaCy, mapped to vocabulary indices and fed to the
  model as a single-example batch. Inference runs in eval mode with gradient tracking
  disabled; the model's previous train/eval mode is restored afterwards.

  Raises ValueError if the sentence produces no tokens (there is nothing to average).
  """
  # Tokenize
  tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
  if not tokenized:
    raise ValueError("sentence must contain at least one token")

  # Vocabulary index of every token
  indexed = [TEXT.vocab.stoi[t] for t in tokenized]

  tensor = torch.LongTensor(indexed).to(device)  # [seq_len]
  tensor = tensor.unsqueeze(1)                   # [seq_len, batch_size=1], same layout as batch.text

  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      prediction = torch.sigmoid(model(tensor))
  finally:
    model.train(was_training)

  return prediction.item()

In [30]:
# Mixed-signal sentence: a value near 0 means the model reads it as negative.
predict_sentiment("I love this film bad")

1.1693170563376043e-05

In [31]:
# Clearly positive sentence: a value near 1 means the model reads it as positive.
predict_sentiment("This film is great")

1.0

In [36]:
# Final check on the held-out test split, using the currently loaded model weights.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.429 | Test Acc: 84.17%


In [37]:
def predict2(model, inputs):
    """
    Prediction function handed to Clipper's PyTorch deployer.

    model: the deployed PyTorch model (called directly on an index tensor).
    inputs: sequence of JSON strings; each decodes to a list whose first element is
        the nested list of token indices to feed to the model.
    Returns a list with one string per input: str() of the model output as a NumPy array.
    """
    # The imports live inside the function so that it is self-contained when it is
    # serialized and executed away from this notebook.
    import json
    import torch
    import numpy as np

    result = []
    for raw in inputs:
        payload = json.loads(raw)
        # np.int64 replaces the np.int alias, which no longer exists in current NumPy;
        # it also guarantees the integer tensor dtype on every platform.
        indices = torch.from_numpy(np.array(payload[0], dtype=np.int64))
        # Inference only: no gradient bookkeeping needed.
        with torch.no_grad():
            pred = model(indices)
        result.append(str(pred.detach().numpy()))
    return result

In [50]:
from clipper_admin import ClipperConnection, DockerContainerManager
from clipper_admin.deployers.pytorch import deploy_pytorch_model
# Create a Clipper connection whose containers are managed through Docker.
clipper_conn = ClipperConnection(DockerContainerManager())

In [51]:
# Start the Clipper cluster (per the log output, this includes a managed Redis instance in Docker).
clipper_conn.start_clipper()

20-07-17:03:25:41 INFO     [docker_container_manager.py:184] [default-cluster] Starting managed Redis instance in Docker
20-07-17:03:25:46 INFO     [docker_container_manager.py:276] [default-cluster] Metric Configuration Saved at /tmp/tmpl3o_zeuo.yml
20-07-17:03:25:47 INFO     [clipper_admin.py:162] [default-cluster] Clipper is running


In [52]:
# Connect to the running Clipper cluster.
clipper_conn.connect()

20-07-17:03:25:51 INFO     [clipper_admin.py:172] [default-cluster] Successfully connected to Clipper cluster at localhost:1337


In [53]:
# Display the architecture of the model that is about to be deployed.
model

WordAVGModel(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (fc): Linear(in_features=100, out_features=1, bias=True)
)

In [54]:
# Register the application endpoint. Inputs are strings; slo_micros is given in
# microseconds (100000 = 100 ms). NOTE(review): default_output is presumably what Clipper
# returns when no prediction is available in time — confirm against the Clipper docs.
clipper_conn.register_application(name="pytorch-app", input_type="strings", default_output="-1.0", slo_micros=100000)

20-07-17:03:25:55 INFO     [clipper_admin.py:236] [default-cluster] Application pytorch-app was successfully registered


In [55]:
# Package the trained model together with predict2 and deploy it as version 1 of
# "pytorch-mod" (the log output shows a Docker image being built for it).
deploy_pytorch_model(
    clipper_conn,
    name="pytorch-mod",
    version=1,
    input_type="strings",
    func=predict2,
    pytorch_model=model)

20-07-17:03:26:00 INFO     [deployer_utils.py:41] Saving function to /tmp/tmpsds5peynclipper
20-07-17:03:26:00 INFO     [deployer_utils.py:51] Serialized and supplied predict function
20-07-17:03:26:00 INFO     [pytorch.py:204] Torch model saved
20-07-17:03:26:00 INFO     [pytorch.py:218] Using Python 3.6 base image
20-07-17:03:26:00 INFO     [clipper_admin.py:534] [default-cluster] Building model Docker image with model data from /tmp/tmpsds5peynclipper
20-07-17:03:26:01 INFO     [clipper_admin.py:539] [default-cluster] Step 1/2 : FROM clipper/pytorch36-container:0.4.1
20-07-17:03:26:01 INFO     [clipper_admin.py:539] [default-cluster]  ---> e3c73c7ad6b9
20-07-17:03:26:01 INFO     [clipper_admin.py:539] [default-cluster] Step 2/2 : COPY /tmp/tmpsds5peynclipper /model/
20-07-17:03:26:01 INFO     [clipper_admin.py:539] [default-cluster]  ---> 597c20b3eeb7
20-07-17:03:26:01 INFO     [clipper_admin.py:539] [default-cluster] Successfully built 597c20b3eeb7
20-07-17:03:26:01 INFO     [clipp

In [56]:
# Route queries sent to the "pytorch-app" application to the "pytorch-mod" model.
clipper_conn.link_model_to_app(
    app_name="pytorch-app",
    model_name="pytorch-mod")

20-07-17:03:26:20 INFO     [clipper_admin.py:303] [default-cluster] Model pytorch-mod is now linked to application pytorch-app


In [57]:
# Address (host:port) of the Clipper query frontend, used below to build the request URL.
query_address = clipper_conn.get_query_addr()
print(query_address)

localhost:1337


In [60]:
# Prepare the input for the deployed model: tokenize the sentence, map the tokens to
# vocabulary indices and shape them as [seq_len, 1], the same way predict_sentiment does.
sentence = "This film  is vary great"
import spacy  
nlp = spacy.load('en')

tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
# Vocabulary index of every token in the sentence
indexed = [TEXT.vocab.stoi[t] for t in tokenized]

tensor = torch.LongTensor(indexed).to(device)  # [seq_len]
tensor1 = tensor.unsqueeze(1)   # [seq_len, batch_size=1]
# Bring the tensor back to host memory first: .numpy() raises on a CUDA tensor,
# which is what `device` selects whenever a GPU is available.
arr1 = tensor1.cpu().numpy()
X2 = arr1.tolist()
import json
class MyEncoder(json.JSONEncoder):
    """JSON encoder that additionally understands NumPy scalars/arrays and date/time objects."""

    def default(self, obj):
        """
        Convert objects the stock encoder rejects:
        NumPy integers -> int, NumPy floats -> float, NumPy arrays -> nested lists,
        date / datetime / time objects -> their str() form.
        Anything else is delegated to the base class, which raises TypeError.
        """
        # Imported here so the class does not depend on which notebook cells ran before it.
        import datetime
        import numpy as np

        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # The original tested against the name `time`, which is the time *module* in this
        # notebook and makes isinstance() raise; use the datetime classes instead.
        if isinstance(obj, (datetime.date, datetime.time)):
            return str(obj)
        # The original referenced an undefined class name (NpEncoder) here.
        return super().default(obj)

# Serialize the indices to a JSON string. X2 is wrapped in an outer list because
# predict2 reads element [0] of the decoded payload.
tva = json.dumps([X2],cls=MyEncoder)


In [61]:
import requests, json, numpy as np
from datetime import datetime
# Send one prediction request to the Clipper REST endpoint and measure the round trip.
# `tva` is already a JSON string, so the request body carries JSON inside JSON; this
# matches the application's "strings" input type and the json.loads() call in predict2.
headers = {"Content-type": "application/json"}
start = datetime.now()
resu = requests.post("http://"+query_address+"/pytorch-app/predict", headers=headers, data=json.dumps({
    "input": tva})).json()
end = datetime.now()
# Wall-clock latency in milliseconds.
latency = (end - start).total_seconds() * 1000.0
print(resu,latency)

{'query_id': 1, 'output': [[21.752615]], 'default': False} 19.766


In [None]:
# The deployed model returns a raw logit; apply a sigmoid to turn it into a probability.
# NOTE(review): the hard-coded value differs from the output recorded for the request
# above (21.752615) — it looks like a leftover from an earlier run.
ten1 = torch.tensor([[44.937534]])
prediction = torch.sigmoid(ten1)
print(prediction.item())

In [63]:
# Shut down the Clipper cluster and all model containers.
clipper_conn.stop_all()

20-07-17:03:36:28 INFO     [clipper_admin.py:1424] [default-cluster] Stopped all Clipper cluster and all model containers


In [62]:
# Display the model architecture again.
# NOTE(review): this cell's execution count (62) precedes the cell above (63), so the
# notebook was run out of order.
model

WordAVGModel(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (fc): Linear(in_features=100, out_features=1, bias=True)
)