In [7]:
# -*- coding: utf-8 -*-

import pickle
import numpy as np
import os
import random

intent_classification_path = os.path.abspath(os.path.join(os.getcwd(), '../..'))

# 训练数据路径
train_data = os.path.join(intent_classification_path,'classification_data/classification_segment_data.txt')

# 类别与索引号
class_index_data = os.path.join(intent_classification_path, 'classification_data/question_classification.txt')

# 所有不同单词
words = []
# 所有类别
classes = []
# 类别对应的索引号
classes_index = {}
# 所有文档
documents = []

with open(class_index_data, 'r', encoding='utf-8') as f_read:
    for line in f_read:
        line = line.strip()
        tokens = line.split(":")
        classes.append(tokens[1])
        classes_index[int(tokens[0])] = tokens[1]

with open(train_data, 'r', encoding='utf-8') as f_read:
    for line in f_read:
        line = line.strip()
        tokens = line.split(',')
        doc_words = tokens[1].split(' ')
        words.extend(doc_words)
        documents.append((doc_words, int(tokens[0])))

words = sorted(list(set(words)))
classes = sorted(list(set(classes)))

print('classes_index:{}'.format(classes_index))
print("####################")
print(len(documents), "documents")
print("####################")
print(len(classes), "classes", classes)
print("####################")
print(len(words), "unique words",words)

# 保存相关数据
words_path = os.path.join(os.getcwd(), 'words.pkl')
classes_path = os.path.join(os.getcwd(), 'classes.pkl')
classes_index_path = os.path.join(os.getcwd(), 'classes_index.pkl')
with open(words_path, 'wb') as f_words, open(classes_path, 'wb') as f_classes, open(classes_index_path, 'wb') as f_classes_index:
    pickle.dump(words, f_words)
    pickle.dump(classes, f_classes)
    pickle.dump(classes_index, f_classes_index)
    print('save data done!')

classes_index:{0: 'nm 评分', 1: 'nm 上映时间', 2: 'nm 类型', 3: 'nm 简介', 4: 'nm 演员列表', 5: 'nnt 介绍', 6: 'nnt ng电影作品', 7: 'nnt 电影作品', 8: 'nnt 参演评分大于 x', 9: 'nnt 参演评分小于 x', 10: 'nnt 电影类型', 11: 'nnt nnr合作电影列表', 12: 'nnt 电影数量', 13: 'nnt 出生日期', 14: '评分大于x电影', 15: '评分大于x的ng类型电影'}
####################
160 documents
####################
16 classes ['nm 上映时间', 'nm 演员列表', 'nm 简介', 'nm 类型', 'nm 评分', 'nnt ng电影作品', 'nnt nnr合作电影列表', 'nnt 介绍', 'nnt 出生日期', 'nnt 参演评分大于 x', 'nnt 参演评分小于 x', 'nnt 电影作品', 'nnt 电影数量', 'nnt 电影类型', '评分大于x电影', '评分大于x的ng类型电影']
####################
92 unique words ['ng', 'ng电影', 'nm', 'nnt', 'x', '一', '一起', '上映', '上线', '与', '中', '主演', '主要', '了', '人', '什么', '介绍', '以上', '以下', '以前', '低于', '信息', '共', '内容', '出演', '出生', '出生于', '出生日期', '分', '分在', '分数', '剧情', '剧情简介', '参', '参演', '可以', '合作', '合拍', '和', '哪一天', '哪些', '在', '多少', '大于', '小于', '影评', '影都', '影院', '得了', '情节', '放映', '故事', '故事梗概', '数量', '时候', '时间', '明星', '是', '有', '有哪些', '格调', '梗概', '演', '演员', '演过', '生日', '由', '电', '电影', '的', '的有哪些', '看到', '简

In [8]:
#创建训练和测试集(由于数据是不是过少，只训练了)，这里采用词袋模型表示文档
training = []
for doc in documents:
    #词袋 
    line_words = doc[0] #文档的词
    bag = [0] * len(words)
    for s in line_words:
        for i,w in enumerate(words):
            if w == s:
                bag[i] = 1 #词在词典中
    training.append([bag, doc[1]])
random.shuffle(training)
training = np.array(training)
train_doc = list(training[:,0])
train_target = list(training[:,1])
print("{},\n\n {}".format(train_doc,train_target))


[[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [9]:
# 建立模型并训练

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(os.getcwd()+'/log', comment='feedforward_network')

print('train_doc len:{}'.format(len(train_doc)))
print('train_target len:{}'.format(len(classes)))

class classifyModel(nn.Module):
    
    def __init__(self):
        super(classifyModel, self).__init__()
        self.model = nn.Sequential(
                nn.Linear(len(train_doc[4]), 128),
                nn.ReLU(),
                nn.Dropout(0.5),
                nn.Linear(128, 64),
                nn.ReLU(),
                nn.Dropout(0.5),
                nn.Linear(64, len(classes)))
    def forward(self, x):
            out = self.model(x)
            return out
        
model = classifyModel()

optimizer = torch.optim.SGD(model.parameters(),lr=0.1, momentum=0.9, nesterov=True)
criterion = nn.CrossEntropyLoss()

train_doc = torch.tensor(train_doc)
train_doc = train_doc.float()
train_target = torch.tensor(train_target)
#train_target = train_target.long()

print('{},{}'.format(train_doc.dtype, train_target.dtype))
           
for iter in range(300):
    out = model(train_doc)
    loss = criterion(out, train_target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if (iter+1) % 10 == 0:
            print ('iter [{}/{}], Loss: {:.4f}'.format(iter+1, 300, loss.item()))
    writer.add_graph(model, input_to_model=train_doc,verbose=False)
    writer.add_scalar('loss',loss.item(),global_step=iter+1)
writer.flush()
writer.close()
            
model_path = os.path.join(os.getcwd(), "chatbot_model.h5")
torch.save(model.state_dict(), model_path)

train_doc len:160
train_target len:16
torch.float32,torch.int64
iter [10/300], Loss: 2.7120
iter [20/300], Loss: 2.5706
iter [30/300], Loss: 2.1623
iter [40/300], Loss: 1.6695
iter [50/300], Loss: 1.3257
iter [60/300], Loss: 1.0526
iter [70/300], Loss: 0.9323
iter [80/300], Loss: 0.6981
iter [90/300], Loss: 0.5489
iter [100/300], Loss: 0.4058
iter [110/300], Loss: 0.3560
iter [120/300], Loss: 0.2767
iter [130/300], Loss: 0.2622
iter [140/300], Loss: 0.2077
iter [150/300], Loss: 0.2357
iter [160/300], Loss: 0.1447
iter [170/300], Loss: 0.1468
iter [180/300], Loss: 0.1279
iter [190/300], Loss: 0.1201
iter [200/300], Loss: 0.1189
iter [210/300], Loss: 0.0769
iter [220/300], Loss: 0.0841
iter [230/300], Loss: 0.0868
iter [240/300], Loss: 0.0838
iter [250/300], Loss: 0.0428
iter [260/300], Loss: 0.0820
iter [270/300], Loss: 0.1336
iter [280/300], Loss: 0.0824
iter [290/300], Loss: 0.0719
iter [300/300], Loss: 0.0791
