In [1]:
'''
    1.输入和加载数据
    2.数据预处理
    3.构建训练和测试集
    4.构建模型
    5.预测
'''

'\n    1.输入和加载数据\n    2.数据预处理\n    3.构建训练和测试集\n    4.构建模型\n    5.预测\n'

In [3]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import json
import pickle
import numpy as np
import os
import random

# Load the intent definitions from intents.json in the current working directory
intent_json_path = os.path.join(os.getcwd(), "intents.json")
with open(intent_json_path, 'r', encoding='utf-8') as intent_file:
    raw_text = intent_file.read()
intents = json.loads(raw_text)

print(intents)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
{'intents': [{'tag': 'greeting', 'patterns': ['Hi there', 'How are you', 'Is anyone there?', 'Hey', 'Hola', 'Hello', 'Good day'], 'responses': ['Hello, thanks for asking', 'Good to see you again', 'Hi there, how can I help?'], 'context': ['']}, {'tag': 'goodbye', 'patterns': ['Bye', 'See you later', 'Goodbye', 'Nice chatting to you, bye', 'Till next time'], 'responses': ['See you!', 'Have a nice day', 'Bye! Come back again soon.'], 'context': ['']}, {'tag': 'thanks', 'patterns': ['Thanks', 'Thank you', "That's helpful", 'Awesome, thanks', 'Thanks for helping me'], 'responses': ['Happy to help!', 'Any time!', 'My pleasure'], 'context': ['']}, {'tag': 'noanswer', 'patterns': [], 'responses': ["Sorry, can't understand you", 'Please give me more info', 'Not sure I understand'], 'context': ['']}, {'tag': 'options', 'patterns': ['How you c

In [5]:
# Preprocess the intent patterns: tokenize every pattern, collect the
# (tokens, tag) documents, assign an integer label to each tag and build the
# sorted vocabulary of lemmatized, lower-cased tokens.
lemmatizer = WordNetLemmatizer()
ignore_words = ['?', '!']  # punctuation tokens excluded from the vocabulary
documents = []             # one (token list, tag) pair per pattern
classes_index = {}         # tag -> integer label, in order of first appearance
raw_tokens = []            # every token seen, before lemmatization

for intent in intents['intents']:
    tag = intent['tag']
    for pattern in intent['patterns']:
        tokens = nltk.word_tokenize(pattern)
        raw_tokens.extend(tokens)
        documents.append((tokens, tag))
        # Labels are handed out inside the pattern loop, so an intent with
        # no patterns never receives a label.
        classes_index.setdefault(tag, len(classes_index))
index = len(classes_index)  # next free label id
print('classes_index:{}'.format(classes_index))

# Vocabulary: unique lemmas of all tokens except the ignored punctuation.
words = sorted({
    lemmatizer.lemmatize(token.lower())
    for token in raw_tokens
    if token not in ignore_words
})
# NOTE(review): `classes` is sorted alphabetically while `classes_index`
# keeps first-appearance order, so classes[i] is generally NOT the tag whose
# label is i. The training targets below are built from classes_index.
classes = sorted(classes_index)
print(len(documents), "documents")
print(len(classes), "classes", classes)
print(len(words), "unique lemmatized words",words)

# Persist the vocabulary, the class list and the label mapping
words_path = os.path.join(os.getcwd(), "words.pkl")
classes_path = os.path.join(os.getcwd(), "classes.pkl")
classes_index_path = os.path.join(os.getcwd(), "classes_index.pkl")
artifacts = (
    (words_path, words),
    (classes_path, classes),
    (classes_index_path, classes_index),
)
for target_path, payload in artifacts:
    with open(target_path, 'wb') as sink:
        pickle.dump(payload, sink)



classes_index:{'greeting': 0, 'goodbye': 1, 'thanks': 2, 'options': 3, 'adverse_drug': 4, 'blood_pressure': 5, 'blood_pressure_search': 6, 'pharmacy_search': 7, 'hospital_search': 8}
47 documents
9 classes ['adverse_drug', 'blood_pressure', 'blood_pressure_search', 'goodbye', 'greeting', 'hospital_search', 'options', 'pharmacy_search', 'thanks']
88 unique lemmatized words ["'s", ',', 'a', 'adverse', 'all', 'anyone', 'are', 'awesome', 'be', 'behavior', 'blood', 'by', 'bye', 'can', 'causing', 'chatting', 'check', 'could', 'data', 'day', 'detail', 'do', 'dont', 'drug', 'entry', 'find', 'for', 'give', 'good', 'goodbye', 'have', 'hello', 'help', 'helpful', 'helping', 'hey', 'hi', 'history', 'hola', 'hospital', 'how', 'i', 'id', 'is', 'later', 'list', 'load', 'locate', 'log', 'looking', 'lookup', 'management', 'me', 'module', 'nearby', 'next', 'nice', 'of', 'offered', 'open', 'patient', 'pharmacy', 'pressure', 'provide', 'reaction', 'related', 'result', 'search', 'searching', 'see', 'show', 

In [71]:
# Build the training set, representing each document as a bag-of-words
# vector over the vocabulary `words` (1 where the word occurs, else 0).
# Map each vocabulary word to its column once instead of scanning the whole
# vocabulary for every token.
word_position = {word: position for position, word in enumerate(words)}

training = []
for doc in documents:
    # doc is a (token list, tag) pair; normalise tokens the same way the
    # vocabulary was normalised (lower-case + lemmatize).
    pattern_words = [lemmatizer.lemmatize(word.lower()) for word in doc[0]]
    bag = [0] * len(words)
    for token in pattern_words:
        position = word_position.get(token)
        if position is not None:  # tokens outside the vocabulary are skipped
            bag[position] = 1
    training.append([bag, classes_index[doc[1]]])

random.shuffle(training)

# Split features and labels by unpacking. The rows are ragged ([list, int]),
# and np.array() on such input raises ValueError on NumPy >= 1.24 (it was a
# deprecation warning before that), so no NumPy conversion is done here.
train_doc = [bag for bag, _ in training]
train_target = [target for _, target in training]
print("{},\n\n {}".format(train_doc,train_target))

[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [73]:
#build model

import torch
import torch.nn as nn
import torch.nn.functional as F

# Report the width of one bag-of-words vector and the number of classes
feature_count = len(train_doc[0])
class_count = len(classes)
print('train_doc len:{}'.format(feature_count))
print('train_target len:{}'.format(class_count))

class classifyModel(nn.Module):
    """Feed-forward classifier: Linear(128) -> ReLU -> Dropout -> Linear(64)
    -> ReLU -> Dropout -> Linear(num_classes).

    The forward pass returns raw, un-normalised scores (no softmax layer).

    Args:
        input_size: Width of one input vector. When omitted, falls back to
            ``len(train_doc[0])`` from the notebook namespace.
        num_classes: Number of output scores. When omitted, falls back to
            ``len(classes)`` from the notebook namespace.
    """

    def __init__(self, input_size=None, num_classes=None):
        super(classifyModel, self).__init__()
        # Fall back to the notebook globals so `classifyModel()` keeps
        # working. Row 0 is used because any row gives the input width, and
        # it exists even when there are fewer than five documents.
        if input_size is None:
            input_size = len(train_doc[0])
        if num_classes is None:
            num_classes = len(classes)
        self.model = nn.Sequential(
                nn.Linear(input_size, 128),
                nn.ReLU(),
                nn.Dropout(0.5),
                nn.Linear(128, 64),
                nn.ReLU(),
                nn.Dropout(0.5),
                nn.Linear(64, num_classes))

    def forward(self, x):
        """Return the class scores for a batch ``x`` of input vectors."""
        return self.model(x)
        
# Train the classifier on the full training set (one batch per iteration)
# and save the learned weights.
NUM_ITERATIONS = 300   # number of full-batch optimisation steps
LOG_EVERY = 10         # print the loss every this many steps

model = classifyModel()

optimizer = torch.optim.SGD(model.parameters(),lr=0.1, momentum=0.9, nesterov=True)
criterion = nn.CrossEntropyLoss()

# as_tensor accepts both the original Python lists and already-converted
# tensors, so re-running this cell does not trigger a copy-construct warning.
# CrossEntropyLoss takes float inputs and int64 class-index targets, so both
# dtypes are set explicitly.
train_doc = torch.as_tensor(train_doc, dtype=torch.float32)
train_target = torch.as_tensor(train_target, dtype=torch.long)

print('{},{}'.format(train_doc.dtype, train_target.dtype))

# `step` rather than `iter`, so the builtin iter() is not shadowed for the
# rest of the notebook.
for step in range(1, NUM_ITERATIONS + 1):
    out = model(train_doc)
    loss = criterion(out, train_target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if step % LOG_EVERY == 0:
        print ('iter [{}/{}], Loss: {:.4f}'.format(step, NUM_ITERATIONS, loss.item()))

# NOTE(review): despite the .h5 extension this file is a PyTorch state_dict
# written by torch.save, not an HDF5 file. The name is kept as-is in case
# other code loads the model by this path.
model_path = os.path.join(os.getcwd(), "chatbot_model.h5")
torch.save(model.state_dict(), model_path)


train_doc len:88
train_target len:9
torch.float32,torch.int64
iter [10/300], Loss: 2.1598
iter [20/300], Loss: 2.0483
iter [30/300], Loss: 1.5746
iter [40/300], Loss: 1.1209
iter [50/300], Loss: 0.6671
iter [60/300], Loss: 0.3838
iter [70/300], Loss: 0.2423
iter [80/300], Loss: 0.1344
iter [90/300], Loss: 0.0532
iter [100/300], Loss: 0.0630
iter [110/300], Loss: 0.0119
iter [120/300], Loss: 0.0268
iter [130/300], Loss: 0.0799
iter [140/300], Loss: 0.0143
iter [150/300], Loss: 0.0473
iter [160/300], Loss: 0.0240




iter [170/300], Loss: 0.0510
iter [180/300], Loss: 0.0126
iter [190/300], Loss: 0.0149
iter [200/300], Loss: 0.0084
iter [210/300], Loss: 0.0323
iter [220/300], Loss: 0.0044
iter [230/300], Loss: 0.0086
iter [240/300], Loss: 0.0118
iter [250/300], Loss: 0.0100
iter [260/300], Loss: 0.0048
iter [270/300], Loss: 0.0075
iter [280/300], Loss: 0.0044
iter [290/300], Loss: 0.0081
iter [300/300], Loss: 0.0066
