<h1>Preprocessing the data</h1>

In [1]:
import json
from datapreprocessing import tokenize,stemmingText,bagOfWords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kyada\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<h3>Opening our JSON file</h3>

In [2]:
# Load the intent definitions. The encoding is given explicitly so the file is
# decoded the same way on every platform instead of using the locale default.
with open('intents.json', 'r', encoding='utf-8') as f:
    intents = json.load(f)

In [3]:
# Build the training corpus: walk every pattern of every intent.
all_word = []        # every stemmed token encountered (deduplicated below)
training_data = []   # (stemmed sentence, tag) pairs
all_sentences = []   # stemmed sentence for each pattern
tags = []            # tag of each pattern, one entry per sentence
for intent in intents['intents']:
    for pattern in intent['patterns']:
        label = intent['tag']
        # tokenize / stemmingText come from datapreprocessing (not shown here):
        # the pattern is tokenised first, then the tokens are stemmed.
        stems = stemmingText(tokenize(pattern))
        all_word += stems
        tags.append(label)
        all_sentences.append(stems)
        training_data.append((stems, label))

# Collapse to a sorted list of the unique stemmed words.
all_word = sorted(set(all_word))


# all_word is the vocabulary passed to the embedding functions
# (bagOfWords / tf_idf) in the cells below.

# training_data pairs each stemmed sentence with its corresponding tag.

In [4]:
# Deduplicate and sort the tags; sorted() accepts the set directly.
tags = sorted(set(tags))
print(tags)

# Label encoding: a tag's class id is its index in this sorted list
# (tags.index(tag) is used when building y_train).
# For the list printed below this gives:
#0-delivery
#1-funny
#2-goodbye
# etc

['delivery', 'funny', 'goodbye', 'greeting', 'items', 'payments', 'thanks']


<h1>For bag of words embedding</h1>

In [5]:
#creating our embedded data
import numpy as np

# One vector per training sentence, produced by bagOfWords (defined in
# datapreprocessing, not shown) from the sentence and the vocabulary.
X_train = np.array([np.array(bagOfWords(sen, all_word)) for (sen, _) in training_data])

# Each label is the index of the sentence's tag in the sorted `tags` list.
y_train = np.array([tags.index(tag) for (_, tag) in training_data])

print(X_train)
print(y_train)
# Up to here we have, as numpy arrays, the embedded sentences (X_train) and
# their output tag ids (y_train) that will be passed to the neural network.

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 1]]
[3 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 6 6 6 6 6 6 6 4 4 4 4 4 4 4 5 5 5 5 5 5 5
 5 5 0 0 0 0 0 0 0 0 1 1 1]


<h1>For TF-IDF embedding</h1>

In [6]:
#creating our embedded data
import numpy as np
from tfIdf import tf_idf

# NOTE: this cell reassigns X_train / y_train, replacing the bag-of-words
# arrays built in the previous cell; everything below trains on TF-IDF.

# Each label is the index of the sentence's tag in the sorted `tags` list.
y_train = np.array([tags.index(tag) for (_, tag) in training_data])

# tf_idf is defined in tfIdf.py (not shown); it is given every stemmed
# sentence plus the vocabulary and its result is converted to a numpy array.
X_train = np.array(tf_idf(all_sentences, all_word))

# The dataset below indexes both arrays with the same position, so they must
# contain the same number of rows.
assert len(X_train) == len(y_train), "tf_idf rows do not line up with the labels"

print(X_train)
print(y_train)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.30546219 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.24436975 0.         0.         ... 0.         0.         0.11056839]]
[3 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 6 6 6 6 6 6 6 4 4 4 4 4 4 4 5 5 5 5 5 5 5
 5 5 0 0 0 0 0 0 0 0 1 1 1]


<h1>Converting everything to a PyTorch dataset</h1>

In [7]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Number of samples per mini-batch handed out by the DataLoader below.
batch_size = 8

In [9]:
#inheriting Dataset class
class ChatDataset(Dataset):
    """Map-style dataset pairing each embedded sentence with its tag id.

    Args:
        x_data: indexable collection of feature vectors. When omitted, the
            notebook-level ``X_train`` is used so that the existing
            ``ChatDataset()`` call keeps working.
        y_data: indexable collection of labels, same length as ``x_data``.
            When omitted, the notebook-level ``y_train`` is used.

    Raises:
        ValueError: if the features and labels differ in length.
    """

    def __init__(self, x_data=None, y_data=None):
        # Fall back to the notebook globals only when nothing is passed in;
        # passing the arrays explicitly avoids depending on hidden state.
        if x_data is None:
            x_data = X_train
        if y_data is None:
            y_data = y_train
        if len(x_data) != len(y_data):
            raise ValueError(
                f"features and labels differ in length: {len(x_data)} != {len(y_data)}"
            )
        self.n_samples = len(x_data)
        self.x_data = x_data
        self.y_data = y_data

    # returns (features,tag) at index
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    # we can call len(dataset) to return the size
    def __len__(self):
        return self.n_samples

dataset = ChatDataset()

# train_loader iterates over the whole dataset in shuffled mini-batches of
# `batch_size` samples, loading in the main process (num_workers=0).
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)

# The DataLoader collates the numpy samples of each batch into tensors with an
# added leading batch dimension.
    

<h1>Creating Model</h1>

In [10]:
from model import NeuralNet

In [11]:
# Hyper-parameters
# Sizes derived from the data: one input per feature of an embedded sentence,
# one output per tag.
input_size = len(X_train[0])
output_size = len(tags)
hidden_size = 8

# Optimisation settings.
num_epochs = 1000
learning_rate = 0.001

print(input_size, output_size)

89 7


In [12]:
# Instantiate the network; NeuralNet is imported from model.py (not shown here).
model = NeuralNet(input_size,hidden_size,output_size)

In [13]:
# Use the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)
device

device(type='cuda')

In [14]:
# Loss function: cross-entropy between the network outputs and the tag ids.
criterion = nn.CrossEntropyLoss()

# Optimizer: Adam over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)


In [15]:
# Train the model
for epoch in range(num_epochs):  # one pass over every batch per epoch
    for batch_features, batch_labels in train_loader:  # one mini-batch at a time
        batch_features = batch_features.to(device)
        batch_labels = batch_labels.to(dtype=torch.long).to(device)

        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass through model.ffnn (an attribute of NeuralNet, defined
        # in model.py and not shown here), then the loss for this batch.
        outputs = model.ffnn(batch_features)
        loss = criterion(outputs, batch_labels)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

    # `loss` is reassigned for every batch, so the value reported here is the
    # loss of the last batch of the epoch, not an epoch average.
    if (epoch+1) % 100 == 0:
        print (f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


print(f'final loss: {loss.item():.4f}')

Epoch [100/1000], Loss: 1.5414
Epoch [200/1000], Loss: 0.1567
Epoch [300/1000], Loss: 0.0371
Epoch [400/1000], Loss: 0.0026
Epoch [500/1000], Loss: 0.0032
Epoch [600/1000], Loss: 0.0017
Epoch [700/1000], Loss: 0.0006
Epoch [800/1000], Loss: 0.0001
Epoch [900/1000], Loss: 0.0001
Epoch [1000/1000], Loss: 0.0001
final loss: 0.0001


<h1>Saving the Model</h1>

In [18]:
# Checkpoint contents: the trained weights plus the sizes, vocabulary and tag
# list that were used to build and train the model in this notebook.
data = {
    "model_state" : model.state_dict(),
    "input_size" : input_size,
    "output_size" : output_size,
    "hidden_size" : hidden_size,
    "all_words" : all_word,
    "tags" : tags
}

# This is all the data we want to store.

In [19]:
# Write the checkpoint dict to disk with torch.save and report the file name.
File = "dataTFIDF.pth"
torch.save(data, File)
print(f'training complete.file save to {File}')

training complete.file save to dataTFIDF.pth
