In [1]:
import json
from nltk.stem.porter import PorterStemmer
import nltk
import numpy as np

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [3]:
with open('intents.json', 'r') as f:
    intents = json.load(f)
# print(intents)

In [4]:
all_words = []
tags = []
xy = []

In [5]:
def tokenizer(sentense):
    return nltk.word_tokenize(sentense)

In [6]:
stemmer = PorterStemmer()
def stem(word):
    return stemmer.stem(word.lower())

In [7]:
def bag_of_words(tokenized_sentense, all_words):
    tokenized_sentense = [stem(w) for w in tokenized_sentense]
    bag = np.zeros(len(all_words), dtype = np.float32)
    for idx, w, in enumerate(all_words):
        if w in tokenized_sentense:
            bag[idx] = 1.0
    return bag

In [8]:
for intent in intents['intents']:
    tag = intent['tag'] 
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenizer(pattern)
        all_words.extend(w)
        xy.append((w, tag))  
ignored_words = ['?', '!', '.', ',']
all_words = [stem(w) for w in all_words if w not in ignored_words]
all_words = sorted(set(all_words))
tags = sorted(set(tags)) 
print(tags)

['admission', 'canteen', 'college intake', 'committee', 'computerhod', 'course', 'creator', 'document', 'event', 'extchod', 'facilities', 'fees', 'floors', 'goodbye', 'greeting', 'hod', 'hostel', 'hours', 'infrastructure', 'ithod', 'library', 'location', 'menu', 'name', 'number', 'placement', 'principal', 'ragging', 'random', 'salutaion', 'scholarship', 'sem', 'sports', 'swear', 'syllabus', 'task', 'thanks', 'uniform', 'vacation']


In [9]:
X_train = []
y_train = []
for (pattern_sentence, tag) in xy:
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)

    label = tags.index(tag)
    y_train.append(label)

X_train = np.array(X_train)
y_train = np.array(y_train)

In [10]:
class ChatDataset(Dataset):
    def __init__(self):
        self.n_samples = len(X_train)
        self.x_data = X_train
        self.y_data = y_train

    # dataset[idx]
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.n_samples

In [11]:
batch_size = 8
hidden_size = 8 
output_size = len(tags)
input_size = len(X_train[0])
learning_rate = 0.001
num_epochs = 1000
dataset = ChatDataset()
train_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

In [12]:
class Model(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Model, self).__init__()
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.l1(x)
        out = self.relu(out)
        out = self.l2(out)
        out = self.relu(out)
        out = self.l3(out)
        return out

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [14]:
model = Model(input_size, hidden_size, output_size)
print(model)

Model(
  (l1): Linear(in_features=251, out_features=8, bias=True)
  (l2): Linear(in_features=8, out_features=8, bias=True)
  (l3): Linear(in_features=8, out_features=39, bias=True)
  (relu): ReLU()
)


In [15]:
criterion = nn.CrossEntropyLoss()
optimaizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [16]:
for epoch in range(num_epochs):
    for (words, labels) in train_loader:
        words = words.to(device)
        labels = labels.to(device)

        outputs = model(words)
        loss = criterion(outputs, labels)

        optimaizer.zero_grad()
        loss.backward()
        optimaizer.step()

    if (epoch +1) % 100 == 0:
        print(f'epoch {epoch+1}/{num_epochs}, loss={loss.item():.4f}')
        
print(f'final loss, loss={loss.item():.4f}')
        

epoch 100/1000, loss=0.0098
epoch 200/1000, loss=0.0048
epoch 300/1000, loss=0.1502
epoch 400/1000, loss=0.1822
epoch 500/1000, loss=0.0000
epoch 600/1000, loss=0.0000
epoch 700/1000, loss=0.0000
epoch 800/1000, loss=0.0000
epoch 900/1000, loss=0.0000
epoch 1000/1000, loss=0.0000
final loss, loss=0.0000


In [17]:
data = {
    "model_state": model.state_dict(),
    "input_size": input_size,
    "output_size": output_size,
    "hidden_size": hidden_size,
    "all_words": all_words, 
    "tads": tags
}
file = "data.pth"
torch.save(data, file)
print(f'Тренировка модели завершена, файл сохранен в {file}')

Тренировка модели завершена, файл сохранен в data.pth


In [18]:
import random

In [19]:
model.eval()
bot_name = 'Buddy'
print('Давай поболтаем: для прекращение диалога напишите "stop"')
while True:
    sentense = input('You: ')
    if sentense == 'stop':
        break
    sentense = tokenizer(sentense)
    X = bag_of_words(sentense, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X)

    output = model(X)
    _, predicted = torch.max(output, dim=1)
    tag = tags[predicted.item()]

    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]

    if prob.item() > 0.75:
        for intent in intents['intents']:
            if tag == intent['tag']:
                print(f'{bot_name}: {random.choice(intent["responses"])}')
    else:
        print(f'{bot_name}: Я не понимаю')


Давай поболтаем: для прекращение диалога напишите "stop"


You:  HI


Buddy: Hi there, how can I help?


You:  Could tell who is your creatir?


Buddy: Our university offers Information Technology, computer Engineering, Mechanical engineering,Chemical engineering, Civil engineering and extc Engineering.


You:  Ywo is your master?


Buddy: Hi there, how can I help?


You:  Hwo created you?


Buddy: College students


You:  what is your name&


Buddy: I am a Chatbot.


You:  Nice to meet you


Buddy: welcome, anything else i can assist you with?


You:  no, thank you


Buddy: You can contact at: NUMBER


You:  thanks a lot


Buddy: I am glad I helped you


You:  Goodbye


Buddy: Goodbye!


You:  stop
