# Decoder-only transformer for autoregressive text generation

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import sentencepiece as spm
from sklearn.model_selection import train_test_split
import math
import re

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Configurations


In [None]:
batch_size = 128  # samples per DataLoader batch
max_seq_len = 128  # number of rows in the positional-embedding table given to Transformer
block_size = 128  # length every sequence is padded to inside TextGenDataset
seed = 28  # used for torch.random.manual_seed and train_test_split
num_epochs = 10  # NOTE(review): the visible train(...) calls pass 300 instead of this value
vocab_size = 1000  # SentencePiece vocabulary size, also the model's output width
d_model = 128  # embedding / hidden width
num_heads = 8  # attention heads per decoder layer
num_layers = 4  # number of stacked decoder layers
d_ff = 256  # hidden width of the position-wise feed-forward net
learning_rate = 5e-4  # initial Adam learning rate
dropout = 0.3  # dropout probability inside each decoder layer
weight_decay = 0.00001  # Adam weight_decay
grad_clip = 1.0  # max gradient norm passed to clip_grad_norm_

In [None]:
torch.random.manual_seed(seed)

<torch._C.Generator at 0x7810232ba8b0>

## Model Implementation

In [None]:
def pad_sequence(sequence, max_length):
  """Return a new list: ``sequence`` right-padded with 0 up to ``max_length`` items.

  Nothing is truncated; a sequence that is already ``max_length`` or longer
  comes back with the same contents.
  """
  missing = max_length - len(sequence)
  padding = [0] * missing  # a non-positive count yields an empty list
  return sequence + padding

In [None]:
def create_causal_mask(seq_len, pad_idx=None):
  """Build a (seq_len, seq_len) mask on ``device`` in which 1 marks a blocked pair.

  Entries above the main diagonal are 1, so a query position cannot see later
  key positions. When ``pad_idx`` is given, every row and every column from
  that index onwards is set to 1 as well, blocking attention to and from
  the padded tail.
  """
  blocked = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).to(device)

  if pad_idx is None:
    return blocked

  # Positions pad_idx.. are padding: block them as keys and as queries.
  blocked[:, pad_idx:] = 1
  blocked[pad_idx:, :] = 1
  return blocked

In [None]:
class ScaledDotProductAttention(nn.Module):
  """Computes softmax(Q K^T / sqrt(dk)) V with an optional additive mask."""

  def __init__(self):
    super().__init__()

  def forward(self, Q, K, V, dk, mask=None):
    """Attend over ``V`` using the scaled similarity of ``Q`` and ``K``.

    Q, K and V are expected as (batch_size, num_heads, seq_len, head_dim);
    ``dk`` is the query/key head dimension used for scaling. In ``mask`` a
    value of 1 marks a blocked position: it is multiplied by -1e20 and added
    to the scores before the softmax. A 2-D mask is reshaped to
    (dim0, 1, 1, dim1) so it broadcasts over heads and query positions.
    Returns a tensor shaped like ``Q @ K^T @ V``.
    """
    scale = 1/torch.sqrt(torch.tensor(dk))
    keys_t = torch.transpose(K, -2, -1)
    # scores: (batch_size, num_heads, seq_len, seq_len)
    scores = (Q @ keys_t) * scale
    if mask is not None:
      if mask.dim() == 2:
        mask = mask.unsqueeze(1).unsqueeze(2)
      penalty = mask * float('-1e20')
      scores += penalty
    weights = torch.softmax(scores, dim=-1)
    return weights @ V

In [None]:
class MultiHeadAttention(nn.Module):
  """Multi-head attention: project Q/K/V, attend per head, merge and project back."""

  def __init__(self, num_heads, d_model):
    """Create the projections for ``num_heads`` heads of size ``d_model // num_heads``.

    Raises:
      ValueError: if ``d_model`` is not divisible by ``num_heads`` (the merged
        heads could not be reshaped back to ``d_model`` in ``forward``).
    """
    super().__init__()
    if d_model % num_heads != 0:
      raise ValueError(f"d_model ({d_model}) must be divisible by num_heads ({num_heads})")
    self.dk, self.dv = d_model//num_heads, d_model//num_heads
    self.num_heads = num_heads
    self.q_projection = nn.Linear(d_model, self.dk * self.num_heads, bias=False)
    self.k_projection = nn.Linear(d_model, self.dk * self.num_heads, bias=False)
    self.v_projection = nn.Linear(d_model, self.dv * self.num_heads, bias=False)
    self.scaled_dot_product_attention = ScaledDotProductAttention()
    self.final_projection = nn.Linear(self.dv * self.num_heads, d_model)

  def forward(self, Q, K, V, mask=None):
    """Return attention output of shape (batch_size, query_len, d_model).

    Q is (batch_size, query_len, d_model); K and V are
    (batch_size, key_len, d_model). ``mask`` is forwarded unchanged to the
    scaled dot-product attention.
    """
    B, q_len, _ = Q.shape
    kv_len = K.shape[1]
    # Use the instance's own head count; the module-level `num_heads` may differ
    # from the value this layer was built with.
    H = self.num_heads
    Q = self.q_projection(Q).reshape(B, q_len, H, self.dk).permute(0, 2, 1, 3)
    K = self.k_projection(K).reshape(B, kv_len, H, self.dk).permute(0, 2, 1, 3)
    V = self.v_projection(V).reshape(B, kv_len, H, self.dv).permute(0, 2, 1, 3)
    heads = self.scaled_dot_product_attention(Q, K, V, self.dk, mask)
    # (B, H, q_len, dv) -> (B, q_len, H * dv)
    merged = heads.permute(0, 2, 1, 3).reshape(B, q_len, H * self.dv)
    return self.final_projection(merged)

In [None]:
class PositionWiseFeedForwardNet(nn.Module):
  """Two linear layers with a ReLU in between: d_model -> d_ff -> d_model."""

  def __init__(self, d_model, d_ff):
    super().__init__()
    self.linear1 = nn.Linear(d_model, d_ff)  # expand
    self.linear2 = nn.Linear(d_ff, d_model)  # project back
    self.relu = nn.ReLU()

  def forward(self, x):
    """Apply linear1, ReLU and linear2 to the last dimension of ``x``."""
    hidden = self.relu(self.linear1(x))
    return self.linear2(hidden)

In [None]:
def get_word_embeddings_weights(vocab_size, d_model):
  """Return a Xavier-uniform initialised (vocab_size, d_model) tensor on ``device``."""
  table = torch.empty(vocab_size, d_model)
  nn.init.xavier_uniform_(table)  # fills `table` in place
  return table.to(device)

In [None]:
def get_position_embeddings_weights(max_seq_len, d_model, target_device=None):
  """Return the sinusoidal positional-encoding table of shape (max_seq_len, d_model).

  Row ``pos`` holds ``sin(pos / 10000^(i / d_model))`` in every even column
  ``i`` and the cosine of the same angle in column ``i + 1``, so each position
  gets a distinct encoding. An odd ``d_model`` is supported: the last column
  is a sine column without a cosine partner.

  Args:
    max_seq_len: number of positions (rows).
    d_model: embedding width (columns).
    target_device: device for the result; defaults to the module-level ``device``.
  """
  positions = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)
  even_dims = torch.arange(0, d_model, 2, dtype=torch.float32)
  # 1 / 10000^(i / d_model), computed in log space
  inv_freq = torch.exp(even_dims * (-math.log(10000.0) / d_model))
  angles = positions * inv_freq  # (max_seq_len, ceil(d_model / 2))

  weights = torch.zeros(max_seq_len, d_model)
  weights[:, 0::2] = torch.sin(angles)
  weights[:, 1::2] = torch.cos(angles[:, : d_model // 2])
  return weights.to(target_device if target_device is not None else device)

In [None]:
class SingleDecoderLayer(nn.Module):
  """One decoder block: self-attention, then a feed-forward net.

  Each sublayer's output goes through dropout, is added to the sublayer's
  input and is then layer-normalised.
  """

  def __init__(self, d_model, num_heads, d_ff, dropout):
    super().__init__()
    # Self-attention sublayer.
    self.multi_head_attention1 = MultiHeadAttention(num_heads, d_model)
    self.layerNorm1 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    # Feed-forward sublayer. This layer has no second attention sublayer, which
    # is why the attribute suffix jumps from 1 to 3.
    self.pos_wise_ff = PositionWiseFeedForwardNet(d_model, d_ff)
    self.layerNorm3 = nn.LayerNorm(d_model)
    self.dropout3 = nn.Dropout(dropout)

  def forward(self, x, causal_mask=None):
    """Run both sublayers on ``x``; ``causal_mask`` is passed to the attention."""
    attended = self.dropout1(self.multi_head_attention1(x, x, x, causal_mask))
    x = self.layerNorm1(attended + x)

    transformed = self.dropout3(self.pos_wise_ff(x))
    return self.layerNorm3(transformed + x)

In [None]:
class Decoder(nn.Module):
  """Token + position embeddings, a stack of decoder layers and a vocabulary projection."""

  def __init__(self, vocab_size, d_model, max_seq_len, num_layers, num_heads, d_ff, dropout, word_embedding_weights):
    """Build the decoder.

    ``word_embedding_weights`` is a (vocab_size, d_model) tensor used as the
    initial token-embedding table. The output projection is initialised with a
    copy of the same values; the two are separate parameters afterwards (not tied).
    """
    super().__init__()
    self.word_embedding = nn.Embedding(vocab_size, d_model, _weight=word_embedding_weights)
    positional_embedding_weights = get_position_embeddings_weights(max_seq_len, d_model)
    self.position_embedding = nn.Embedding(max_seq_len, d_model, _weight=positional_embedding_weights)
    self.decoder_layers = nn.ModuleList([SingleDecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
    self.linear = nn.Linear(d_model, vocab_size)
    with torch.no_grad():
      self.linear.weight.copy_(word_embedding_weights)

  def forward(self, x, causal_mask):
    """Map token ids ``x`` (batch_size, seq_len) to logits (batch_size, seq_len, vocab_size).

    ``seq_len`` must not exceed the ``max_seq_len`` the position table was built with.
    """
    # Build the position indices on the input's own device so the module works
    # wherever it has been moved, independent of the module-level `device`.
    positions = torch.arange(x.shape[1], device=x.device)
    x = self.word_embedding(x) + self.position_embedding(positions)
    for layer in self.decoder_layers:
      x = layer(x, causal_mask)
    x = self.linear(x)
    return x


In [None]:
class Transformer(nn.Module):
  """Decoder-only model: creates the initial word-embedding table and wraps a Decoder."""

  def __init__(self, vocab_size, d_model, max_seq_len, num_layers, num_heads, d_ff, dropout):
    super().__init__()
    initial_embeddings = get_word_embeddings_weights(vocab_size, d_model)
    # Kept as a plain attribute (not registered as a parameter or buffer).
    self.word_embedding_weights = initial_embeddings
    self.decoder = Decoder(
      vocab_size, d_model, max_seq_len, num_layers, num_heads, d_ff, dropout,
      initial_embeddings,
    )

  def forward(self, x, causal_mask):
    """Delegate to the decoder and return its logits."""
    return self.decoder(x, causal_mask)

## Dataset Loading

In [None]:
# df = pd.read_csv('/content/solarsystem.csv', names=['question', 'answer'])

In [None]:
!unzip /content/pretraindataset.zip

Archive:  /content/pretraindataset.zip
replace text12.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [None]:
# Concatenate text1.txt .. text12.txt, each preceded by a single space.
filenames = [f"text{i}.txt" for i in range(1, 13)]
pieces = []
for filename in filenames:
    with open(filename, "r") as f:
        pieces.append(" " + f.read())
text = "".join(pieces)

In [None]:
# Read the extra corpus into `text1`. Reading it into `text` would discard the
# files already loaded there and leave `text1` empty.
with open('/content/database1.txt', 'r') as f:
  text1 = f.read()

In [None]:
text += "\n " + text1

In [None]:
text = re.sub(r'[^a-zA-Z0-9.\s]', '', text)

In [None]:
with open("text00.txt", "w") as f:
    f.write(text)

In [None]:
# read the text file
with open("text00.txt", "r") as f:
    text = f.read()

## Creating the Tokenizer

In [None]:
token_model_path = 'token_model.model'

In [None]:
# with open("text.txt", "w") as f:
#     for question, answer in zip(df["question"].apply(lambda x: x.lower()).tolist(), df["answer"].apply(lambda x: x.lower()).tolist()):
#         f.write(question + "\n")
#         f.write(answer +"\n")

In [None]:
model_type = "bpe"

In [None]:
spm.SentencePieceTrainer.train(input = "/content/text00.txt", model_prefix = "/content/token_model", vocab_size=vocab_size, model_type=model_type)

In [None]:
sp = spm.SentencePieceProcessor()
sp.Load(token_model_path)

True

In [None]:
# questions = df['question'].apply(lambda x: x.lower()).tolist()
# answers = df['answer'].apply(lambda x: x.lower()).tolist()

In [None]:
# total_text = " ".join(questions + answers)

In [None]:
# remove punctuation from the text
# total_text = [re.sub(r'[^\w\s]', '', text) for text in total_text.split()]

In [None]:
# word_to_id = {word: idx for idx, word in enumerate(set(total_text))}
# id_to_word = {idx: word for idx, word in enumerate(set(total_text))}

In [None]:
# def add_to_vocab(word):
#   if word not in word_to_id:
#     word_to_id[word] = len(word_to_id)
#     id_to_word[len(id_to_word)] = word

In [None]:
# add_to_vocab('<sep>')
# add_to_vocab('<pad>')
# add_to_vocab('<eos>')
# add_to_vocab('<unk>')

In [None]:
# vocab_size = len(word_to_id.keys())
# vocab_size

In [None]:
# encode = lambda x: [word_to_id[word] for word in x.split()]

In [None]:
# decode = lambda x: [id_to_word[idx] for idx in x]

In [None]:
def encode(x):
  """Tokenize text ``x`` into SentencePiece ids with the loaded ``sp`` model."""
  return sp.encode_as_ids(x)


def decode(x):
  """Turn SentencePiece ids ``x`` back into text with the loaded ``sp`` model."""
  return sp.decode_ids(x)

In [None]:
encode("hello")

[703, 636]

In [None]:
sp.vocab_size()

1000

## Training


In [None]:
# df['q-a'] = df['question'].apply(lambda x: encode(" ".join([ re.sub(r'[^\w\s]', '', text) for text in x.lower().split()])) + [sep_id]) + df['answer'].apply(lambda x: encode(" ".join([ re.sub(r'[^\w\s]', '', text) for text in x.lower().split()])))

In [None]:
text



In [None]:
# prompt: divide the text data into list of sentences
q_a_corpus = nltk.sent_tokenize(text.lower())

In [None]:
q_a_corpus = [s.replace("\n", " ") for s in q_a_corpus]

In [None]:
q_a_corpus[403]

'the effects of these processes are not wellunderstood.'

In [None]:
# prompt: divide long sentences into a list of smaller sentences of at most 64 words each (consecutive, non-overlapping chunks)

def split_into_sentences(text, max_words=64):
  """Split ``text`` into sentences, chopping long ones into word chunks.

  ``text`` is first split with ``nltk.sent_tokenize``. A sentence with at most
  ``max_words`` whitespace-separated words is kept unchanged; a longer one is
  cut into consecutive, non-overlapping chunks of at most ``max_words`` words,
  each re-joined with single spaces.

  Args:
    text: the text to split.
    max_words: maximum number of words per returned piece (default 64).

  Returns:
    A list of strings.

  Raises:
    ValueError: if ``max_words`` is smaller than 1.
  """
  if max_words < 1:
    raise ValueError("max_words must be at least 1")
  sentences = []
  for sentence in nltk.sent_tokenize(text):
    words = sentence.split()
    if len(words) <= max_words:
      sentences.append(sentence)
      continue
    for start in range(0, len(words), max_words):
      sentences.append(" ".join(words[start:start + max_words]))
  return sentences

# Try the splitter on one sample sentence. A separate name is used so the
# module-level `text` (the whole corpus) is not replaced by a single sentence.
sample_sentence = q_a_corpus[403]
split_sentences = split_into_sentences(sample_sentence)
print(split_sentences)


['the effects of these processes are not wellunderstood.']


In [None]:
q_a_corpus = [split_into_sentences(s) for s in q_a_corpus]

In [None]:
q_a_corpus = [item for sublist in q_a_corpus for item in sublist]

In [None]:
len(q_a_corpus)

37131

In [None]:
q_a_corpus = [encode(s) for s in q_a_corpus]

In [None]:
# Longest encoded sequence in the corpus, in tokens (0 for an empty corpus).
max_len = max((len(tokens) for tokens in q_a_corpus), default=0)
max_len

327

In [None]:
# prompt: truncate sequences that are more than 128 length

q_a_corpus = [s[:128] for s in q_a_corpus]


In [None]:
q_a_train_data, q_a_test_data = train_test_split(q_a_corpus, test_size=0.15, random_state=seed)

In [None]:
len(q_a_train_data), len(q_a_test_data)

(31561, 5570)

In [None]:
class TextGenDataset(torch.utils.data.Dataset):
  """Next-token prediction dataset over a list of token-id sequences."""

  def __init__(self, data, block_size):
    self.data = data  # list of token-id lists
    self.block_size = block_size  # padded length of every returned sequence

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    """Return ``(x, y, mask)`` for sequence ``idx``.

    ``x`` is the sequence without its last token and ``y`` the sequence
    without its first token, both padded with 0 to ``block_size``. ``mask`` is
    the causal mask with the padded tail blocked, with a leading axis added.
    """
    tokens = self.data[idx]
    inputs, targets = tokens[:-1], tokens[1:]
    x = torch.tensor(pad_sequence(inputs, self.block_size))
    y = torch.tensor(pad_sequence(targets, self.block_size))
    mask = create_causal_mask(self.block_size, len(inputs))
    return x, y, mask.unsqueeze(0)

In [None]:
train_dataset = TextGenDataset(q_a_train_data, block_size)
test_dataset = TextGenDataset(q_a_test_data, block_size)

In [None]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [None]:
import math

from torch.optim.lr_scheduler import LambdaLR
from torch.optim.optimizer import Optimizer

def get_lr_lambda(warmup_steps, num_training_steps, lr_max):
    """Return a step -> LR-multiplier function: linear warm-up, then cosine decay.

    The multiplier rises linearly from 0 to 1 over ``warmup_steps`` and then
    follows ``0.5 * (1 + cos(pi * progress))``, where ``progress`` is the
    fraction of the remaining steps completed (floored at 0.0). It is meant to
    be passed as ``lr_lambda`` to ``LambdaLR``.

    Args:
        warmup_steps: number of warm-up steps.
        num_training_steps: total number of scheduler steps.
        lr_max: accepted for backward compatibility and not used; the
            multiplier is relative to the optimizer's own learning rate.
    """
    def lr_lambda(current_step):
        if current_step < warmup_steps:
            return float(current_step) / float(max(1, warmup_steps))
        progress = float(current_step - warmup_steps) / float(max(1, num_training_steps - warmup_steps))
        # math is used here because numpy is never imported in this file.
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))
    return lr_lambda



In [None]:
# Train the decoder-only language model with Adam (weight decay), gradient clipping and a
# ReduceLROnPlateau schedule driven by the validation loss, to limit overfitting.
def train(model, num_epochs, batch_size, block_size, learning_rate, weight_decay, grad_clip, device):
  """Run ``num_epochs`` epochs of training, validating after each one.

  Batches are read from the module-level ``train_loader`` and ``test_loader``.
  Targets equal to 0 (the padding value) are ignored by the loss. The mean
  training and validation loss are printed per epoch, and the summed
  validation loss drives the learning-rate scheduler.

  Args:
    model: called as ``model(x, mask)``, returning logits of shape (batch, seq, vocab).
    num_epochs: number of passes over ``train_loader``.
    batch_size, block_size: unused here; kept so existing calls keep working.
    learning_rate, weight_decay: Adam hyper-parameters.
    grad_clip: maximum gradient norm.
    device: device the batches are moved to.
  """
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
  criterion = nn.CrossEntropyLoss(ignore_index=0)
  scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.9, patience=1)
  model.train()
  for epoch in range(num_epochs):
    total_loss = 0
    for x, y, mask in train_loader:
      x = x.to(device)
      y = y.to(device)
      mask = mask.to(device)
      logits = model(x, mask)
      loss = criterion(logits.reshape(-1, logits.shape[-1]), y.reshape(-1))
      total_loss += loss.item()
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
      optimizer.step()
      optimizer.zero_grad(set_to_none=True)
    print(f"Epoch {epoch+1} ============================= Total Loss {total_loss/len(train_loader)}")

    # Validation: no gradients are needed, so skip building the autograd graph.
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
      for x, y, mask in test_loader:
        x = x.to(device)
        y = y.to(device)
        mask = mask.to(device)
        logits = model(x, mask)
        loss = criterion(logits.reshape(-1, logits.shape[-1]), y.reshape(-1))
        total_val_loss += loss.item()
    print(f"Validation Loss {total_val_loss/len(test_loader)}")
    scheduler.step(total_val_loss)
    model.train()

In [None]:
# def train(model, num_epochs, batch_size, block_size, learning_rate, weight_decay, grad_clip, steps_per_epoch, num_steps_per_val, device):
#   optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
#   criterion = nn.CrossEntropyLoss()
#   model.train()
#   for epoch in range(num_epochs):
#     total_loss = 0
#     for step in range(steps_per_epoch):
#       x, y, mask = getBatch(batch_size, block_size)
#       x = x.to(device)
#       y = y.to(device)
#       mask = mask.to(device)
#       logits = model(x, mask)
#       loss = criterion(logits.reshape(-1, logits.shape[-1]), y.reshape(-1))
#       total_loss += loss.item()
#       loss.backward()
#       torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
#       optimizer.step()
#       optimizer.zero_grad(set_to_none=True)
#     print(f"Epoch {epoch+1} Total Loss {total_loss/steps_per_epoch}")

#     ## Validation
#     model.eval()
#     total_val_loss = 0
#     for step in range(num_steps_per_val):
#       x, y, mask = getBatch(batch_size, block_size, train=False)
#       x = x.to(device)
#       y = y.to(device)
#       mask = mask.to(device)
#       logits = model(x, mask)
#       loss = criterion(logits.reshape(-1, logits.shape[-1]), y.reshape(-1))
#       total_val_loss += loss.item()
#     print(f"Validation Loss {total_val_loss/num_steps_per_val}")
#     model.train()

## init

In [None]:
model = Transformer(vocab_size, d_model, max_seq_len, num_layers, num_heads, d_ff, dropout)

In [None]:
model = model.to(device)

In [None]:
train(model, 300, batch_size, block_size, learning_rate, weight_decay, grad_clip, device) #10

Validation Loss 4.838316494768316
Validation Loss 4.259146755391901
Validation Loss 4.054170743985609
Validation Loss 3.9103593826293945
Validation Loss 3.775225975296714
Validation Loss 3.650706404989416
Validation Loss 3.5411910848184065
Validation Loss 3.456516916101629
Validation Loss 3.3827827139334246
Validation Loss 3.3169872706586663
Validation Loss 3.2662028182636607
Validation Loss 3.2265270298177544
Validation Loss 3.1703719767657192
Validation Loss 3.1341535340655935
Validation Loss 3.1046248403462497
Validation Loss 3.068807937882163
Validation Loss 3.04978626424616
Validation Loss 3.0245448567650537
Validation Loss 2.9960607344453987
Validation Loss 2.9802910522981123
Validation Loss 2.956335349516435
Validation Loss 2.945240492170507
Validation Loss 2.925769865512848
Validation Loss 2.911747834899209
Validation Loss 2.89249111847444
Validation Loss 2.8801107677546414
Validation Loss 2.862288312478499
Validation Loss 2.8544835935939443
Validation Loss 2.844787532633001
Va

KeyboardInterrupt: 

## Saving the trained model

In [None]:
torch.save(model.state_dict(), 'model-final.pth')

## Testing

In [None]:
#80
def generate(model, sep_id, device, max_token_len):
  """Sample ``max_token_len`` tokens from ``model`` after the given prompt.

  Args:
    model: called as ``model(ids, mask)`` with ``ids`` of shape (1, seq_len) and a
      (1, 1, seq_len, seq_len) causal mask where 1 marks a blocked position; must
      return logits of shape (1, seq_len, vocab_size). It is switched to eval mode.
    sep_id: the prompt, either a single token id or a list of token ids.
    device: device the token tensor is created on.
    max_token_len: number of tokens to sample and append.

  Returns:
    A nested list ``[[prompt ids..., sampled ids...]]``.

  The caller must keep prompt length + ``max_token_len`` within the maximum
  sequence length the model supports.
  """
  model.eval()
  x = torch.as_tensor(sep_id, dtype=torch.long, device=device)
  if x.dim() == 0:
    x = x.reshape(1, 1)  # single token id
  elif x.dim() == 1:
    x = x.unsqueeze(0)  # list of ids -> batch of one
  with torch.no_grad():
    for _ in range(max_token_len):
      seq_len = x.shape[1]
      # The model was trained with a causal mask, so use one here too; otherwise
      # earlier positions would attend to later tokens inside the stack.
      causal_mask = torch.triu(torch.ones(seq_len, seq_len, device=x.device), diagonal=1)
      logits = model(x, causal_mask.unsqueeze(0).unsqueeze(0))
      probs = torch.softmax(logits[:, -1, :], dim=-1)
      idx = torch.multinomial(probs, 1)
      x = torch.cat([x, idx], dim=-1)
  return x.tolist()

In [None]:
idx = sp.EncodeAsIds('venus has')

In [None]:
seq = generate(model, idx, device, 5)

In [None]:
decode(seq)

['venus has bright spots outward']

# Further Training the Model with Questions and Answers

In [None]:
df = pd.read_csv('/content/solarsystem.csv', names=['question', 'answer'])

In [None]:
sep_id = 1

In [None]:
def _clean_and_encode(sentence):
  """Lower-case ``sentence``, strip non-word/non-space characters per word, then encode."""
  words = [re.sub(r'[^\w\s]', '', word) for word in sentence.lower().split()]
  return encode(" ".join(words))

# Per row: question ids + [sep_id] + answer ids.
df['q-a'] = df['question'].apply(lambda q: _clean_and_encode(q) + [sep_id]) + df['answer'].apply(_clean_and_encode)

In [None]:
q_a_corpus = df['q-a'].tolist()

In [None]:
# Longest encoded question/answer sequence, in tokens (0 for an empty corpus).
max_len = max((len(tokens) for tokens in q_a_corpus), default=0)
max_len

199

In [None]:
q_a_corpus = [i[:128] for i in q_a_corpus]

In [None]:
q_a_train_data, q_a_test_data = train_test_split(q_a_corpus, test_size=0.15, random_state=seed)

In [None]:
len(q_a_train_data), len(q_a_test_data)

(8554, 1510)

In [None]:
train_dataset = TextGenDataset(q_a_train_data, block_size)
test_dataset = TextGenDataset(q_a_test_data, block_size)

In [None]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [None]:
train(model, 300, batch_size, block_size, learning_rate, weight_decay, grad_clip, device) #10

Validation Loss 2.3316138982772827
Validation Loss 2.0652825037638345
Validation Loss 1.9600170453389485
Validation Loss 1.8894522786140442
Validation Loss 1.8356522520383198
Validation Loss 1.7888804376125336
Validation Loss 1.7576660414536793
Validation Loss 1.721193253993988
Validation Loss 1.704488029082616
Validation Loss 1.6768265664577484
Validation Loss 1.6545295814673107
Validation Loss 1.640646368265152
Validation Loss 1.630838930606842
Validation Loss 1.60333455602328
Validation Loss 1.594539205233256
Validation Loss 1.5827094912528992
Validation Loss 1.5715673565864563
Validation Loss 1.5611151655515034
Validation Loss 1.5500263174374898
Validation Loss 1.541008174419403
Validation Loss 1.5344142417112987
Validation Loss 1.5208227237065632
Validation Loss 1.5155410766601562
Validation Loss 1.5039182305335999
Validation Loss 1.5018080274264018
Validation Loss 1.4953734179337819
Validation Loss 1.4865645170211792
Validation Loss 1.4841016630331676
Validation Loss 1.4788020352

KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(), 'model-qa01.pth')

In [None]:
idx = sp.EncodeAsIds('what is the size of earth?') + [1]

In [None]:
seq = generate(model, idx, device, 30)

In [None]:
decode(seq)

['what is the size of earth ⁇  the diameter of earths diameter is approximately 682152 kilometers 32 miles valiformly billion']

# SVM implementation for intent classification

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [None]:
intent_df = pd.read_csv('intent dataset.csv', names = ['text', 'intent'])

In [None]:
intent_df.head()

Unnamed: 0,text,intent
0,What is the distance between Earth and Mars?,GetInfo
1,Tell me about the largest moon in our solar sy...,GetInfo
2,What are the phases of the moon?,GetInfo
3,How many planets are there in the solar system?,GetInfo
4,Explain the concept of black holes.,GetInfo


In [None]:
intent_df.describe()

Unnamed: 0,text,intent
count,1168,1168
unique,1031,4
top,Explain the concept of the magnetosphere of Ga...,GetInfo
freq,9,616


In [None]:
intent_df['intent_label'] = intent_df['intent'].astype('category').cat.codes

In [None]:
intent_df.head()

Unnamed: 0,text,intent,intent_label
0,What is the distance between Earth and Mars?,GetInfo,0
1,Tell me about the largest moon in our solar sy...,GetInfo,0
2,What are the phases of the moon?,GetInfo,0
3,How many planets are there in the solar system?,GetInfo,0
4,Explain the concept of black holes.,GetInfo,0


In [None]:
intent_df[intent_df['intent'] == 'GetInfo'].head(1)

Unnamed: 0,text,intent,intent_label
0,What is the distance between Earth and Mars?,GetInfo,0


In [None]:
intent_df[intent_df['intent'] == 'UserLike'].head(1)

Unnamed: 0,text,intent,intent_label
621,I'm fascinated by the mysteries of black holes...,UserLike,3


In [None]:
intent_df[intent_df['intent'] == 'Other'].head(1)

Unnamed: 0,text,intent,intent_label
7,Describe the process of photosynthesis.,Other,1


In [None]:
intent_df[intent_df['intent'] == 'UserInfo'].head(1)

Unnamed: 0,text,intent,intent_label
721,"Hi, I'm Sarah, and I'm a huge fan of space exp...",UserInfo,2


In [None]:
intent_df['intent'].unique()

array(['GetInfo', 'Other', 'UserLike', 'UserInfo'], dtype=object)

In [None]:
intent_train,  intent_test = train_test_split(intent_df, test_size=0.2, random_state=42)

In [None]:
X_train, y_train = intent_train['text'].tolist(), intent_train['intent_label'].tolist()
X_test, y_test = intent_test['text'].tolist(), intent_test['intent_label'].tolist()

In [None]:
intent_pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('svc', SVC())])

In [None]:
intent_pipeline.fit(X_train, y_train)


In [None]:
predictions = intent_pipeline.predict(X_test)

In [None]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       132
           1       0.96      0.92      0.94        25
           2       1.00      0.97      0.99        34
           3       0.91      0.98      0.94        43

    accuracy                           0.98       234
   macro avg       0.97      0.96      0.97       234
weighted avg       0.98      0.98      0.98       234



In [None]:
def get_intent(text):
    """Return the intent label ``intent_pipeline`` predicts for a single utterance."""
    predicted_labels = intent_pipeline.predict([text])
    return predicted_labels[0]

In [None]:
get_intent("What is the distance between earth and sun")

0

# Chat Bot Integration

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")



In [None]:
def get_name(text):
  """Return the text of the first PERSON entity spaCy finds in ``text``, or None."""
  doc = nlp(text)
  for ent in doc.ents:
    # `label_` is the string label; `label` is its integer id and never equals "PERSON".
    if ent.label_ == "PERSON":
      return ent.text
  return None

In [None]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
import spacy
import vaderSentiment.vaderSentiment as vs

def get_likes_dislikes(text):
    """Return ``(likes, dislikes)`` extracted from ``text``.

    The direct objects of the sentence (tokens whose dependency label is
    "dobj") are collected. They are returned as likes when the VADER compound
    score of the whole text is positive, as dislikes when it is negative, and
    not at all when it is exactly zero.
    """
    analyzer = vs.SentimentIntensityAnalyzer()
    doc = nlp(text)
    compound = analyzer.polarity_scores(text)["compound"]

    direct_objects = [token.text for token in doc if token.dep_ == "dobj"]

    if compound > 0:
        return direct_objects, []
    if compound < 0:
        return [], direct_objects
    return [], []


In [None]:
get_likes_dislikes("I like pizza")

(['pizza'], [])

In [None]:
def get_user_data(text):
  """Return ``(name, age)`` taken from the named entities spaCy finds in ``text``.

  ``name`` is the text of the last PERSON entity and ``age`` the first
  whitespace-separated word of the last DATE entity; either is None when no
  such entity is found.
  """
  name = None
  age = None

  for ent in nlp(text).ents:
    label = ent.label_
    if label == "PERSON":
      name = ent.text
    elif label == "DATE":
      # e.g. a DATE entity such as "25 years old" -> "25"
      age = ent.text.split()[0]

  return name, age

In [None]:
# User model
import os
import pickle
class User():
  """Profile of the person chatting with the bot, persisted as a pickle in ./users."""

  def __init__(self):
    self.name = ""
    self.file = ""  # path of this user's pickle; set by isNewUser()
    self.is_new_user = False
    self.age = 0
    self.likes = []
    self.dislikes = []

  # check if current user is new or not
  def isNewUser(self, name):
    """Bind this object to ``name`` and return True when no saved profile exists."""
    self.name = name
    os.makedirs("./users", exist_ok=True)
    # basename() keeps a typed name such as "../x" from pointing outside ./users.
    safe_name = os.path.basename(name) or "_"
    self.file = "./users/" + safe_name + ".pickle"
    # Recomputed on every call so an earlier True cannot go stale.
    self.is_new_user = not os.path.isfile(self.file)
    return self.is_new_user

  # load a previously saved profile
  def loadUserDetails(self):
    """Read the profile saved by saveUserDetails() from ``self.file``."""
    # NOTE: pickle can execute code while loading; only open files this class wrote.
    with open(self.file, "rb") as f:
      data = pickle.load(f)
    self.name = data.get("name", self.name)
    self.age = data.get("age", self.age)
    self.likes = list(data.get("likes", []))
    self.dislikes = list(data.get("dislikes", []))

  # gather the current user details
  def getUserDetails(self, text=""):
    """Extract name and age from ``text`` with get_user_data() and save the profile."""
    name , age = get_user_data(text)
    if name is not None:
      self.name = name
    if age is not None:
      self.age = age
    self.saveUserDetails()

  # save the user data into a pickle file
  def saveUserDetails(self):
    """Write name, age, likes and dislikes to ``self.file``."""
    data = {
      "name": self.name,
      "age": self.age,
      "likes": self.likes,
      "dislikes": self.dislikes
    }
    with open(self.file, "wb") as f:
      pickle.dump(data, f)

  def updateUserInfo(self, text):
    """Ask for the age or the name, depending on which word ``text`` mentions, and save."""
    if "age" in text:
      self.age = input("What is your age?\n")
    elif "name" in text:
      # The answer to the name question belongs in `name`, not `age`.
      self.name = input("What is your name?\n")
    self.saveUserDetails()


  def updateUserLikes(self, text):
    """Add the likes and dislikes found in ``text`` to the stored lists and save."""
    likes, dislikes = get_likes_dislikes(text)
    # Accumulate instead of overwriting so earlier preferences are not lost.
    for item in likes:
      if item not in self.likes:
        self.likes.append(item)
    for item in dislikes:
      if item not in self.dislikes:
        self.dislikes.append(item)
    self.saveUserDetails()


In [None]:
class Bot():
  """Chat front-end: classifies intents and answers questions with the language model."""

  def __init__(self, user, generate, model):
    self.user = user
    self.generate = generate  # called as generate(model, prompt_ids, device, max_tokens)
    self.model = model
    self.max_resp_len = 40  # number of tokens sampled per answer

  def get_intent(self, text):
    """Return the intent label predicted by ``intent_pipeline`` for ``text``."""
    intent = intent_pipeline.predict([text])[0]
    return intent

  def generateResponse(self, text):
    """Return the text the model generates after the question ``text``."""
    # Clean the question the way the Q&A training pairs were cleaned (lower-cased,
    # punctuation removed); raw punctuation such as "?" decodes as an unknown token.
    prompt = " ".join(re.sub(r'[^\w\s]', '', word) for word in text.lower().split())
    seq = sp.EncodeAsIds(prompt) + [1]  # 1 separates question from answer
    resp = self.generate(self.model, seq, device, self.max_resp_len)
    # Drop the prompt by position; a string replace misses whenever the decoded
    # prompt differs from what was typed.
    return decode(resp[0][len(seq):])

In [None]:
def chatbot():
  """Run the interactive planetBot loop until the user types "quit".

  Intent labels handled: 0 answers with the language model, 2 updates the
  user's details, 3 updates likes/dislikes; anything else gets a fallback reply.
  """
  user = User()
  name = input("planetBot: Hey There! Im planetBot. What is your name?\n")

  is_new = user.isNewUser(name)
  if not is_new:
    user.loadUserDetails()

  bot = Bot(user, generate, model)
  while True:
    text = input("You: ")
    if text == "quit":
      return

    intent = bot.get_intent(text)
    if intent == 0:
      print("planetBot: ", bot.generateResponse(text))
      continue
    if intent == 2:
      user.updateUserInfo(text)
      print("planetBot: I updated your details")
      continue
    if intent == 3:
      user.updateUserLikes(text)
      print("planetBot: I updated your likes and dislikes")
      continue
    print("planetBot: I don't understand")
In [None]:
chatbot()

planetBot: Hey There! Im planetBot. What is your name?
tarun
You: what does sun outer layer have?
planetBot:  what does sun outer layer have ⁇  the sun has superved rings that at the core surrounded to our solar system composition over or after sunspots and traveling of the visible inner


KeyboardInterrupt: Interrupted by user

In [None]:
class Bot():
  """Chat front-end: classifies intents and answers questions with the language model."""

  def __init__(self, user, generate, model):
    self.user = user
    self.generate = generate  # called as generate(model, prompt_ids, device, max_tokens)
    self.model = model
    self.max_resp_len = 40  # number of tokens sampled per answer

  def get_intent(self, text):
    """Return the intent label predicted by ``intent_pipeline`` for ``text``."""
    intent = intent_pipeline.predict([text])[0]
    return intent

  def generateResponse(self, text):
    """Return the text the model generates after the question ``text``."""
    # Clean the question the way the Q&A training pairs were cleaned (lower-cased,
    # punctuation removed); raw punctuation such as "?" decodes as an unknown token.
    prompt = " ".join(re.sub(r'[^\w\s]', '', word) for word in text.lower().split())
    seq = sp.EncodeAsIds(prompt) + [1]  # 1 separates question from answer
    resp = self.generate(self.model, seq, device, self.max_resp_len)
    # Drop the prompt by position; a string replace misses whenever the decoded
    # prompt differs from what was typed.
    return decode(resp[0][len(seq):])

In [None]:
class Bot():
  """Chat front-end: classifies intents and answers questions with the language model."""

  def __init__(self, user, generate, model):
    self.user = user
    self.generate = generate  # called as generate(model, prompt_ids, device, max_tokens)
    self.model = model
    self.max_resp_len = 40  # number of tokens sampled per answer

  def get_intent(self, text):
    """Return the intent label predicted by ``intent_pipeline`` for ``text``."""
    intent = intent_pipeline.predict([text])[0]
    return intent

  def generateResponse(self, text):
    """Return the text the model generates after the question ``text``."""
    # Clean the question the way the Q&A training pairs were cleaned (lower-cased,
    # punctuation removed); raw punctuation such as "?" decodes as an unknown token.
    prompt = " ".join(re.sub(r'[^\w\s]', '', word) for word in text.lower().split())
    seq = sp.EncodeAsIds(prompt) + [1]  # 1 separates question from answer
    resp = self.generate(self.model, seq, device, self.max_resp_len)
    # Drop the prompt by position; a string replace misses whenever the decoded
    # prompt differs from what was typed.
    return decode(resp[0][len(seq):])

In [None]:
class Bot():
  """Chat front-end: classifies intents and answers questions with the language model."""

  def __init__(self, user, generate, model):
    self.user = user
    self.generate = generate  # called as generate(model, prompt_ids, device, max_tokens)
    self.model = model
    self.max_resp_len = 40  # number of tokens sampled per answer

  def get_intent(self, text):
    """Return the intent label predicted by ``intent_pipeline`` for ``text``."""
    intent = intent_pipeline.predict([text])[0]
    return intent

  def generateResponse(self, text):
    """Return the text the model generates after the question ``text``."""
    # Clean the question the way the Q&A training pairs were cleaned (lower-cased,
    # punctuation removed); raw punctuation such as "?" decodes as an unknown token.
    prompt = " ".join(re.sub(r'[^\w\s]', '', word) for word in text.lower().split())
    seq = sp.EncodeAsIds(prompt) + [1]  # 1 separates question from answer
    resp = self.generate(self.model, seq, device, self.max_resp_len)
    # Drop the prompt by position; a string replace misses whenever the decoded
    # prompt differs from what was typed.
    return decode(resp[0][len(seq):])

In [None]:
class Bot():
  """Chat front-end: classifies intents and answers questions with the language model."""

  def __init__(self, user, generate, model):
    self.user = user
    self.generate = generate  # called as generate(model, prompt_ids, device, max_tokens)
    self.model = model
    self.max_resp_len = 40  # number of tokens sampled per answer

  def get_intent(self, text):
    """Return the intent label predicted by ``intent_pipeline`` for ``text``."""
    intent = intent_pipeline.predict([text])[0]
    return intent

  def generateResponse(self, text):
    """Return the text the model generates after the question ``text``."""
    # Clean the question the way the Q&A training pairs were cleaned (lower-cased,
    # punctuation removed); raw punctuation such as "?" decodes as an unknown token.
    prompt = " ".join(re.sub(r'[^\w\s]', '', word) for word in text.lower().split())
    seq = sp.EncodeAsIds(prompt) + [1]  # 1 separates question from answer
    resp = self.generate(self.model, seq, device, self.max_resp_len)
    # Drop the prompt by position; a string replace misses whenever the decoded
    # prompt differs from what was typed.
    return decode(resp[0][len(seq):])

In [None]:
class Bot():
  """Chat front-end: classifies intents and answers questions with the language model."""

  def __init__(self, user, generate, model):
    self.user = user
    self.generate = generate  # called as generate(model, prompt_ids, device, max_tokens)
    self.model = model
    self.max_resp_len = 40  # number of tokens sampled per answer

  def get_intent(self, text):
    """Return the intent label predicted by ``intent_pipeline`` for ``text``."""
    intent = intent_pipeline.predict([text])[0]
    return intent

  def generateResponse(self, text):
    """Return the text the model generates after the question ``text``."""
    # Clean the question the way the Q&A training pairs were cleaned (lower-cased,
    # punctuation removed); raw punctuation such as "?" decodes as an unknown token.
    prompt = " ".join(re.sub(r'[^\w\s]', '', word) for word in text.lower().split())
    seq = sp.EncodeAsIds(prompt) + [1]  # 1 separates question from answer
    resp = self.generate(self.model, seq, device, self.max_resp_len)
    # Drop the prompt by position; a string replace misses whenever the decoded
    # prompt differs from what was typed.
    return decode(resp[0][len(seq):])