In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle
import numpy as np
import os
import json
import random
    
# Load the pickled vocabulary object from the current working directory.
# Other code in this file reads len(words) and words.stoi[token], so the
# unpickled object must provide both.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load a words.pkl that comes from a trusted source.
words_path = os.path.join(os.getcwd(), "words.pkl")
with open(words_path, 'rb') as f_words:
    words = pickle.load(f_words)
    
# Build the classification model.
class TextCNN(nn.Module):
    """Convolutional sentence classifier.

    Pipeline: embedding -> parallel Conv2d filters of several heights ->
    ReLU -> max-over-time pooling -> concatenation -> dropout -> linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, output_size, filter_num=100, filter_size=(3,4,5), dropout=0.5):
        """
        vocab_size: size of the vocabulary (number of embedding rows)
        embedding_dim: dimensionality of each word embedding
        output_size: number of output classes
        filter_num: number of convolution filters per kernel height
        filter_size: kernel heights; with the default (3, 4, 5) there are three
            groups of filter_num filters each, and every kernel is
            embedding_dim wide
        dropout: dropout probability applied before the final linear layer
        """
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Conv2d(in_channels, out_channels, kernel_size); stride defaults to 1
        # and padding defaults to 0.
        self.convs = nn.ModuleList([nn.Conv2d(1, filter_num, (k, embedding_dim)) for k in filter_size])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(filter_num * len(filter_size), output_size)

    def forward(self, x):
        """Compute class logits for a batch of token-index sequences.

        x: integer tensor of shape (batch, seq_len).
        Returns a tensor of shape (batch, output_size) holding raw logits.

        Sequences shorter than the largest kernel height are zero-padded
        (after the embedding lookup) up to that height, because Conv2d with
        padding=0 raises when the kernel is taller than its input.

        Shape walk-through, using batch=163, seq_len=20, embedding_dim=300,
        filter_num=100 and kernel height 3 as the example:

        1. Convolution. Output size is floor((n + 2p - f) / s + 1).
           input  (batch, in_channel, hin, win) = (163, 1, 20, 300)
           hout = (20 + 2 * 0 - 1 * (3 - 1) - 1) / 1 + 1 = 18
           wout = (300 + 2 * 0 - 1 * (300 - 1) - 1) / 1 + 1 = 1
           output (batch, out_channel, hout, wout) = (163, 100, 18, 1)

        2. max_pool1d. Output size is floor((l + 2p - f) / s + 1).
           input  (163, 100, 18, 1) -> squeeze(3) -> (163, 100, 18)
           lout = (18 + 2 * 0 - 18) / 18 + 1 = 1 -> (163, 100, 1)
        """
        # (batch, seq_len) -> (batch, seq_len, embedding_dim), e.g. (163, 20, 300)
        x = self.embedding(x)

        # Guarantee at least one valid position for the tallest kernel.
        min_len = max((conv.kernel_size[0] for conv in self.convs), default=0)
        seq_len = x.shape[1]
        if seq_len < min_len:
            # F.pad pairs start from the last dim: (emb_left, emb_right, seq_top, seq_bottom).
            x = F.pad(x, (0, 0, 0, min_len - seq_len))

        # Add the channel axis: [N, C, H, W], e.g. (163, 1, 20, 300)
        x = x.unsqueeze(1)
        # One feature map per kernel height: len(filter_size) * (N, filter_num, H')
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        # Max-over-time pooling: a kernel spanning the whole length leaves one
        # value per filter (max_pool1d's stride defaults to kernel_size).
        x = [F.max_pool1d(output, output.shape[2]).squeeze(2) for output in x]
        # (N, filter_num * len(filter_size)), e.g. (163, 300)
        x = torch.cat(x, 1)
        x = self.dropout(x)
        x = self.fc(x)
        return x
        
# Rebuild the network and restore its trained weights.
# NOTE(review): embedding_dim=300 and output_size=16 presumably mirror the
# values used at training time; load_state_dict fails on a shape mismatch.
model = TextCNN(len(words), 300, 16)
# Despite the ".h5" extension, this file is read with torch.load.
model_path = os.path.join(os.getcwd(), "model.h5")
# map_location="cpu" lets a checkpoint saved from a GPU load on a CPU-only
# machine; inference in this file runs on CPU tensors anyway.
model.load_state_dict(torch.load(model_path, map_location="cpu"))


<All keys matched successfully>

In [12]:
from pyhanlp import HanLP

# Module-level HanLP segmenter used by sentence_segment().
# NOTE(review): enableCustomDictionaryForcing(True) presumably makes
# custom-dictionary entries take priority over the default segmentation —
# confirm against the HanLP documentation.
segment = HanLP.newSegment().enableCustomDictionaryForcing(True)


# Tokenization: terms with selected part-of-speech tags are replaced by
# placeholder tokens. (The original note lists movie names, actor names and
# rating numbers as the entities replaced; the tags themselves come from the
# segmenter's dictionaries, which are not shown here.)
def sentence_segment(sentence):
    """Split a sentence into model input tokens.

    The sentence is segmented with the module-level ``segment`` object. A
    term whose nature is 'nnt', 'nm' or 'ng' is replaced by that tag, a term
    whose nature is 'm' is replaced by 'x', and any other term is expanded
    into its individual characters.

    Both the raw segmentation and the resulting token list are printed.
    Returns the token list.
    """
    # nature tag -> placeholder token emitted instead of the term's text
    placeholder_for = {'nnt': 'nnt', 'nm': 'nm', 'ng': 'ng', 'm': 'x'}

    terms = segment.seg(sentence)
    print(terms)

    tokens = []
    for term in terms:
        nature = str(term.nature)
        if nature in placeholder_for:
            tokens.append(placeholder_for[nature])
            continue
        # Everything else is fed to the model character by character.
        tokens.extend(list(term.word))

    print(tokens)
    return tokens

def bow(sentence, words, show_detail = True):
    """Convert a sentence into a batch of one token-index sequence.

    sentence: raw text, tokenized with sentence_segment().
    words: vocabulary object; each token is looked up in ``words.stoi``.
    show_detail: accepted but not used by this function.

    Returns a LongTensor of shape (1, number_of_tokens).
    """
    token_ids = []
    for token in sentence_segment(sentence):
        token_ids.append(words.stoi[token])
    # unsqueeze(0) adds the leading batch dimension.
    return torch.LongTensor(token_ids).unsqueeze(0)

def predict_class(sentence, model):
    """Classify a sentence and return the most probable class.

    The sentence is encoded with bow() using the module-level ``words``
    vocabulary, the model is switched to eval mode, and a forward pass is
    run without gradient tracking. The logits, the top softmax probability,
    the top index and the final result are printed.

    Returns a one-element list ``[{'intent': class_index, 'prob': probability}]``
    where both values are numpy scalars. The intent is the raw class index;
    no index-to-label mapping is applied here.
    """
    encoded = bow(sentence, words, False)

    model.eval()
    with torch.no_grad():
        outputs = model(encoded)
    print('outputs:{}'.format(outputs))

    # Probability and index of the most likely class for each row.
    class_probs = F.softmax(outputs, 1)
    predicted_prob, predicted_index = torch.max(class_probs, 1)
    print('softmax_prob:{}'.format(predicted_prob))
    print('softmax_index:{}'.format(predicted_index))

    top_prediction = {
        'intent': predicted_index.detach().numpy()[0],
        'prob': predicted_prob.detach().numpy()[0],
    }
    results = [top_prediction]
    print('result:{}'.format(results))
    return results
 
def get_response(predict_result):
    """Return the 'intent' value of the first entry in predict_result.

    predict_result: sequence of dicts, each with an 'intent' key.
    """
    top_prediction = predict_result[0]
    return top_prediction['intent']

def predict(text):
    """Classify text with the module-level ``model`` and return the predicted intent."""
    return get_response(predict_class(text, model))
# Smoke test: classify one sample question and print the predicted class index.
print(predict("成龙主演过的电影有哪些"))

[成龙/nnt, 主/ag, 演过/v, 的/ude1, 电影/n, 有/vyou, 哪些/ry]
['nnt', '主', '演', '过', '的', '电', '影', '有', '哪', '些']
outputs:tensor([[-5.6161, -1.6590, -3.5455, -2.9492,  2.9514, -2.8030,  3.4805,  9.1715,
          0.9180, -1.6946,  5.3676,  2.8006,  5.3537, -3.1495, -4.3906, -6.5853]])
softmax_prob:tensor([0.9509])
softmax_index:tensor([7])
result:[{'intent': 7, 'prob': 0.95088017}]
7
