<center><h1>Covid-19 Chatbot</h1></center>

## Import Package

In [1]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import random
import re
import os
import unicodedata
import itertools

# Prefer the GPU whenever PyTorch can see one; otherwise everything runs on the CPU.
cuda = torch.cuda.is_available()
device = torch.device("cuda") if cuda else torch.device("cpu")

## Read Data

In [52]:
import pandas as pd
# Load the FAQ spreadsheet; later cells read its 'Context' (question) and 'Answer' columns.
data = pd.read_excel('WHO_FAQ.xlsx')

Calculate how many words are in each answer.

In [53]:
# Word count of every answer, splitting on single spaces.
data['Length'] = data['Answer'].map(lambda answer: len(answer.split(' ')))

In [54]:
data

Unnamed: 0,Context,Answer,Length
0,What is a coronavirus?,Coronaviruses are a large family of viruses wh...,15
1,What is a coronavirus?,"In humans, several coronaviruses are known to ...",19
2,What is COVID-19?,COVID-19 is the infectious disease caused by t...,29
3,What are the symptoms of COVID-19?,The most common symptoms of COVID-19 are fever...,35
4,What are the symptoms of COVID-19?,Some people become infected but don’t develop ...,25
...,...,...,...
80,Are smokers and tobacco users at higher risk o...,Smokers are likely to be more vulnerable to CO...,38
81,Are smokers and tobacco users at higher risk o...,Smoking products such as water pipes often inv...,27
82,How large does a meeting or event need to be i...,High profile international sporting events suc...,26
83,How large does a meeting or event need to be i...,An event counts as a “mass gatherings” if the ...,41


### Data Cleaning

In [55]:
MAX_LENGTH = 100  

# def unicodeToAscii(s):
#     return ''.join(
#         c for c in unicodedata.normalize('NFD', s)
#         if unicodedata.category(c) != 'Mn'
#     )
 
def normalize(s):
    """Lower-case a sentence and reduce it to space-separated words and . ! ? marks.

    Args:
        s: Raw sentence text.

    Returns:
        The cleaned sentence: lower-cased, with '.', '!' and '?' split off as
        separate tokens, every other non-letter run (digits, hyphens, commas,
        ...) replaced by one space, and no leading/trailing whitespace.
        Unicode-to-ASCII folding is not applied.
    """
    lowered = s.lower().strip()
    # Detach sentence punctuation so it becomes a token of its own.
    punct_spaced = re.sub(r"([.!?])", r" \1", lowered)
    # Everything that is neither a letter nor . ! ? turns into a single space.
    letters_only = re.sub(r"[^a-zA-Z.!?]+", r" ", punct_spaced)
    # Collapse whitespace runs created by the previous substitutions.
    return re.sub(r"\s+", r" ", letters_only).strip()


First, lowercase the text and strip leading and trailing spaces, then add a space before each punctuation mark (`.`, `!`, `?`) so that punctuation is treated as a separate word. Next, replace every run of characters that is neither a letter nor one of these punctuation marks with a single space. Finally, collapse multiple spaces into one and strip leading and trailing spaces again.

In [56]:
# Build [question, answer] pairs. Each column is normalized exactly once and the
# two columns are walked in parallel; re-normalizing the whole column for every
# row index (as a per-row `column.apply(normalize)[i]` lookup does) is quadratic.
pairs = [
    [context, answer]
    for context, answer in zip(
        data['Context'].apply(normalize).map(str),
        data['Answer'].apply(normalize).map(str),
    )
]

### Build Vocabulary

This step works like a tokenizer: we define a class that stores the mapping from words to indexes as well as the reverse mapping from indexes to words. In addition, it records the number of occurrences of each word and the total number of words in the vocabulary. The class provides the `addWord` method to add a word, the `addSentence` method to add every word of a sentence, and the `trim` method to remove low-frequency words.

In [57]:
PAD_token = 0  # padding
SOS_token = 1  # start of sentence token
EOS_token = 2  # end of sentence token


class Voc:
    """Word <-> index vocabulary with per-word occurrence counts.

    Attributes:
        name: Label given to this vocabulary.
        trimmed: True once `trim` has been applied.
        word2index: Maps each known word to its integer index.
        word2count: Maps each known word to how often `addWord` saw it.
        index2word: Reverse mapping; always contains PAD, SOS and EOS.
        num_words: Number of entries in `index2word` (special tokens included).
    """

    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self._reset_mappings()

    def _reset_mappings(self):
        """Empty the vocabulary, leaving only the three special tokens."""
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # PAD, SOS, EOS

    def addSentence(self, sentence):
        """Add every single-space-separated word of `sentence`."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Register `word` with the next free index, or bump its count if known."""
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # remove token below min_count
    def trim(self, min_count):
        """Drop every word seen fewer than `min_count` times and re-index the rest.

        Only the first call has an effect. Kept words are re-added through
        `addWord`, so their counts restart at 1 afterwards.
        """
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = [word for word, count in self.word2count.items() if count >= min_count]

        total_words = len(self.word2index)
        # An empty vocabulary would otherwise divide by zero here.
        kept_ratio = len(keep_words) / total_words if total_words else 0.0
        print('keep_words {} / {} = {:.4f}'.format(len(keep_words), total_words, kept_ratio))

        # Reinitialize dictionaries
        self._reset_mappings()

        for word in keep_words:
            self.addWord(word)

Since we don't have a lot of data, we don't filter out very long sentences.

In [58]:
# def filterPair(p): 
#     return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH

# def filterPairs(pairs):
#     return [pair for pair in pairs if filterPair(pair)]

In [59]:
corpus_name = 'covid-vocabulary'
voc = Voc(corpus_name)
# Register the words of both the question and the answer of every pair.
for pair in pairs:
    for sentence in (pair[0], pair[1]):
        voc.addSentence(sentence)
print("Counted words:", voc.num_words)

Counted words: 699


In [61]:
print("pairs:")
# Show the first three normalized [question, answer] pairs.
for sample_pair in itertools.islice(pairs, 3):
    print(sample_pair)

pairs:
['what is a coronavirus ?', 'coronaviruses are a large family of viruses which may cause illness in animals or humans .']
['what is a coronavirus ?', 'in humans several coronaviruses are known to cause respiratory infections ranging from the common cold to more severe diseases']
['what is covid ?', 'covid is the infectious disease caused by the most recently discovered coronavirus . this new virus and disease were unknown before the outbreak began in wuhan china in december .']


In [68]:
batch_size = 5

In [69]:
# Draw batch_size pairs (with replacement) and order them longest question first.
pair_batch = sorted(
    (random.choice(pairs) for _ in range(batch_size)),
    key=lambda qa: len(qa[0].split(" ")),
    reverse=True,
)

In [70]:
def indexesFromSentence(voc, sentence):
    """Encode a sentence as vocabulary indexes followed by EOS_token.

    Args:
        voc: Vocabulary exposing a `word2index` mapping.
        sentence: Text whose words are separated by single spaces.

    Returns:
        List of integer indexes, one per word, terminated by EOS_token.

    Raises:
        KeyError: If a word is missing from `voc.word2index`.
    """
    token_ids = []
    for word in sentence.split(' '):
        token_ids.append(voc.word2index[word])
    token_ids.append(EOS_token)
    return token_ids

def binaryMatrix(l, value=PAD_token):
    """Build a 0/1 mask marking which entries of a padded batch are real tokens.

    Args:
        l: Iterable of token-index sequences.
        value: Index treated as padding (defaults to PAD_token).

    Returns:
        A list of lists with the same layout as `l`: 0 where the token equals
        `value`, 1 everywhere else.
    """
    # Compare against `value` so that a caller-supplied padding index is honoured.
    return [[0 if token == value else 1 for token in seq] for seq in l]

def zeroPadding(l, fillvalue=PAD_token):
    """Transpose a batch of index lists, padding short ones with `fillvalue`.

    Returns:
        A list of tuples; tuple `t` holds the `t`-th token of every sequence
        in `l`, with `fillvalue` standing in for sequences that already ended.
    """
    padded_columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return [column for column in padded_columns]

def inputVar(batch, voc):
    """Encode a batch of input sentences as a padded index tensor.

    Args:
        batch: List of sentences.
        voc: Vocabulary used to look up word indexes.

    Returns:
        (padVar, lengths): the padded LongTensor produced from the transposed
        batch, and a tensor holding each sentence's encoded length (EOS
        included).
    """
    encoded = [indexesFromSentence(voc, sentence) for sentence in batch]
    lengths = torch.tensor(list(map(len, encoded)))
    padVar = torch.LongTensor(zeroPadding(encoded))
    return padVar, lengths

def outputVar(batch, voc):
    """Encode a batch of target sentences as a padded index tensor plus mask.

    Args:
        batch: List of target sentences.
        voc: Vocabulary used to look up word indexes.

    Returns:
        (padVar, mask, max_target_len): the padded LongTensor produced from
        the transposed batch, a boolean tensor of the same layout that is
        False at padding positions, and the length of the longest encoded
        target (EOS included).
    """
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in batch]
    max_target_len = max(len(indexes) for indexes in indexes_batch)
    padList = zeroPadding(indexes_batch)
    # The mask is consumed by masked_select, which expects a boolean mask;
    # uint8 (ByteTensor) masks are deprecated there.
    mask = torch.BoolTensor(binaryMatrix(padList))
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

In [72]:
def batch2TrainData(voc, pair_batch):
    """Turn a list of [question, answer] pairs into training tensors.

    The training loop calls this helper for every batch, so it has to be
    defined in the notebook for a fresh-kernel run to succeed.

    Args:
        voc: Vocabulary used to encode the sentences.
        pair_batch: List of [question, answer] sentence pairs.

    Returns:
        (inp, lengths, output, mask, max_target_len) as produced by
        `inputVar` on the questions and `outputVar` on the answers.
    """
    # The encoder packs its input with pack_padded_sequence, which by default
    # requires sequences ordered longest first. sorted() leaves the caller's
    # list untouched.
    ordered = sorted(pair_batch, key=lambda pair: len(pair[0].split(" ")), reverse=True)
    questions = [pair[0] for pair in ordered]
    answers = [pair[1] for pair in ordered]
    inp, lengths = inputVar(questions, voc)
    output, mask, max_target_len = outputVar(answers, voc)
    return inp, lengths, output, mask, max_target_len


# Example: convert the sample batch drawn above.
input_batch = [pair[0] for pair in pair_batch]
output_batch = [pair[1] for pair in pair_batch]
input_variable, lengths, target_variable, mask, max_target_len = batch2TrainData(voc, pair_batch)

## Build Model

In [84]:
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder over embedded, length-sorted input batches.

    Args:
        hidden_size: GRU hidden size; also the GRU input size, so the
            embedding dimension must match it.
        embedding: Embedding module applied to the input indexes.
        n_layers: Number of stacked GRU layers.
        dropout: Inter-layer GRU dropout (forced to 0 for a single layer).
    """

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        # Dropout between GRU layers only applies when there are several layers.
        gru_dropout = dropout if n_layers != 1 else 0
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=gru_dropout, bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch.

        Args:
            input_seq: Padded index tensor fed to the embedding.
            input_lengths: True length of each sequence (moved to CPU for packing).
            hidden: Optional initial GRU hidden state.

        Returns:
            (outputs, hidden): per-step outputs with the two directions summed
            into `hidden_size` features, and the final GRU hidden state.
        """
        embedded = self.embedding(input_seq)
        # Pack so the GRU skips padded positions.
        packed_input = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths.cpu())
        packed_output, hidden = self.gru(packed_input, hidden)
        unpacked, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_output)
        # The last dimension holds [forward | backward] features; add the halves.
        forward_half = unpacked[:, :, :self.hidden_size]
        backward_half = unpacked[:, :, self.hidden_size:]
        return forward_half + backward_half, hidden

In [79]:
# Luong attention layer
class Attn(torch.nn.Module):
    """Luong-style global attention supporting 'dot', 'general' and 'concat' scoring.

    Args:
        method: One of 'dot', 'general' or 'concat'.
        hidden_size: Feature size of the decoder state and encoder outputs.

    Raises:
        ValueError: If `method` is not one of the supported names.
    """

    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = torch.nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = torch.nn.Linear(self.hidden_size * 2, hidden_size)
            # torch.FloatTensor(n) returns uninitialized memory (possibly NaN or
            # huge values), so allocate explicitly and give `v` a bounded start.
            self.v = torch.nn.Parameter(torch.empty(hidden_size))
            bound = hidden_size ** -0.5
            torch.nn.init.uniform_(self.v, -bound, bound)

    def dot_score(self, hidden, encoder_output):
        """Score = sum over features of hidden * encoder_output."""
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        """Score = sum over features of hidden * Linear(encoder_output)."""
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        """Score = sum over features of v * tanh(Linear([hidden; encoder_output]))."""
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights over the encoder steps.

        The raw scores are transposed, soft-maxed along dim 1 and given an
        extra middle dimension, so the weights of each batch row sum to 1.
        """
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Scores come out with the step dimension first; put the batch first.
        attn_energies = attn_energies.t()

        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [80]:
class LuongAttnDecoderRNN(nn.Module):
    """Single-step GRU decoder with Luong attention over the encoder outputs.

    Args:
        attn_model: Attention scoring method passed to `Attn`.
        embedding: Embedding module applied to the input token.
        hidden_size: GRU hidden size (and expected embedding dimension).
        output_size: Size of the output distribution.
        n_layers: Number of stacked GRU layers.
        dropout: Embedding dropout, also used between GRU layers when
            `n_layers` > 1.
    """

    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super().__init__()

        # Hyper-parameters kept for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        gru_dropout = dropout if n_layers != 1 else 0
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=gru_dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Decode one time step.

        Args:
            input_step: Index tensor holding the current input token(s).
            last_hidden: Previous GRU hidden state.
            encoder_outputs: Encoder outputs attended over.

        Returns:
            (output, hidden): softmax probabilities over `output_size`
            entries and the updated GRU hidden state.
        """
        # Embed the current token and apply dropout.
        step_embedding = self.embedding_dropout(self.embedding(input_step))
        # One step through the unidirectional GRU.
        rnn_output, hidden = self.gru(step_embedding, last_hidden)
        # Attention weights derived from the fresh GRU output.
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Weighted sum of encoder outputs -> context vector.
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)).squeeze(1)
        # Luong eq. 5: combine GRU output and context.
        combined = torch.cat((rnn_output.squeeze(0), context), 1)
        attentional = torch.tanh(self.concat(combined))
        # Luong eq. 6: project and normalize into a probability distribution.
        probabilities = F.softmax(self.out(attentional), dim=1)
        return probabilities, hidden

In [81]:
def maskNLLLoss(inp, target, mask):
    """Mean negative log-likelihood of the target tokens at unmasked positions.

    Args:
        inp: Probabilities with one row per batch element (dim 1 indexes the
            candidate tokens).
        target: Index of the correct token for each row.
        mask: Per-row flag; only rows whose flag is set contribute. Boolean
            and 0/1 integer masks are both accepted.

    Returns:
        (loss, nTotal): the mean loss over the selected rows (a tensor on the
        same device as `inp`) and the number of selected rows as a Python int.
    """
    # masked_select expects a boolean mask; coerce so uint8 masks keep working.
    mask = mask.bool()
    nTotal = mask.sum()

    target_probs = torch.gather(inp, 1, target.view(-1, 1)).squeeze(1)
    crossEntropy = -torch.log(target_probs)
    # The result already lives on inp's device, so no explicit transfer is needed.
    loss = crossEntropy.masked_select(mask).mean()
    return loss, nTotal.item()

## Train Model 

In [89]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):
    """Run one optimization step on a single batch and return its average loss.

    Reads the module-level `device`, `SOS_token` and `teacher_forcing_ratio`.
    `embedding` and `max_length` are accepted but not used in the body.

    Args:
        input_variable, lengths: Encoder input batch and its sequence lengths.
        target_variable, mask, max_target_len: Target batch, its padding mask
            and the number of decoding steps.
        encoder, decoder: Networks to train.
        encoder_optimizer, decoder_optimizer: Their optimizers.
        batch_size: Number of sequences in the batch.
        clip: Maximum gradient norm.

    Returns:
        The loss averaged over all unmasked target tokens of the batch.
    """
    for optimizer in (encoder_optimizer, decoder_optimizer):
        optimizer.zero_grad()

    # Move the whole batch to the compute device.
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)

    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Every sequence starts decoding from SOS.
    decoder_input = torch.LongTensor([[SOS_token] * batch_size]).to(device)

    # Seed the decoder with the first decoder.n_layers slices of the encoder state.
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Decide once per batch whether the ground truth is fed back in.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    print_losses = []
    n_totals = 0

    for t in range(max_target_len):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )
        if use_teacher_forcing:
            # Next input is the true token of this step.
            decoder_input = target_variable[t].view(1, -1)
        else:
            # Next input is the decoder's own best guess.
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)

        mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
        loss += mask_loss
        print_losses.append(mask_loss.item() * nTotal)
        n_totals += nTotal

    loss.backward()

    # Clip gradients in place before the parameter update.
    for network in (encoder, decoder):
        torch.nn.utils.clip_grad_norm_(network.parameters(), clip)

    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [85]:
model_name = 'cb_model'
attn_model = 'dot'  # alternatives: 'general', 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# A single embedding table is shared by the encoder and the decoder.
embedding = nn.Embedding(num_embeddings=voc.num_words, embedding_dim=hidden_size)

encoder = EncoderRNN(hidden_size, embedding, n_layers=encoder_n_layers, dropout=dropout).to(device)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words,
                              n_layers=decoder_n_layers, dropout=dropout).to(device)

In [87]:
# Training hyper-parameters.
# NOTE(review): the recorded output further down shows 4000 iterations logged
# every 10 steps, so it appears to come from a run with different values.
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 1e-4
decoder_learning_ratio = 5.0
n_iteration = 1000
print_every = 100

# One Adam optimizer per network; the decoder's step size is scaled by decoder_learning_ratio.
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)

# Training mode (enables dropout).
encoder.train()
decoder.train()

In [90]:
# Pre-sample one random batch (with replacement) per iteration.
training_batches = [
    batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
    for _ in range(n_iteration)
]

start_iteration = 1
print_loss = 0

for iteration in range(start_iteration, n_iteration + 1):
    training_batch = training_batches[iteration - 1]
    input_variable, lengths, target_variable, mask, max_target_len = training_batch

    loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                 decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
    print_loss += loss

    # Only report once every print_every iterations.
    if iteration % print_every != 0:
        continue

    print_loss_avg = print_loss / print_every
    print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
    print_loss = 0
        

  """
  allow_unreachable=True)  # allow_unreachable flag


Iteration: 10; Percent complete: 0.2%; Average loss: 6.1870
Iteration: 20; Percent complete: 0.5%; Average loss: 5.4768
Iteration: 30; Percent complete: 0.8%; Average loss: 5.2092
Iteration: 40; Percent complete: 1.0%; Average loss: 4.8273
Iteration: 50; Percent complete: 1.2%; Average loss: 4.2821
Iteration: 60; Percent complete: 1.5%; Average loss: 3.7214
Iteration: 70; Percent complete: 1.8%; Average loss: 3.1810
Iteration: 80; Percent complete: 2.0%; Average loss: 2.6134
Iteration: 90; Percent complete: 2.2%; Average loss: 2.0582
Iteration: 100; Percent complete: 2.5%; Average loss: 1.6542
Iteration: 110; Percent complete: 2.8%; Average loss: 1.2827
Iteration: 120; Percent complete: 3.0%; Average loss: 0.9944
Iteration: 130; Percent complete: 3.2%; Average loss: 0.7691
Iteration: 140; Percent complete: 3.5%; Average loss: 0.5878
Iteration: 150; Percent complete: 3.8%; Average loss: 0.4466
Iteration: 160; Percent complete: 4.0%; Average loss: 0.3599
Iteration: 170; Percent complete:

Iteration: 1340; Percent complete: 33.5%; Average loss: 0.0386
Iteration: 1350; Percent complete: 33.8%; Average loss: 0.0383
Iteration: 1360; Percent complete: 34.0%; Average loss: 0.0381
Iteration: 1370; Percent complete: 34.2%; Average loss: 0.0371
Iteration: 1380; Percent complete: 34.5%; Average loss: 0.0348
Iteration: 1390; Percent complete: 34.8%; Average loss: 0.0364
Iteration: 1400; Percent complete: 35.0%; Average loss: 0.0367
Iteration: 1410; Percent complete: 35.2%; Average loss: 0.0346
Iteration: 1420; Percent complete: 35.5%; Average loss: 0.0368
Iteration: 1430; Percent complete: 35.8%; Average loss: 0.0361
Iteration: 1440; Percent complete: 36.0%; Average loss: 0.0341
Iteration: 1450; Percent complete: 36.2%; Average loss: 0.0354
Iteration: 1460; Percent complete: 36.5%; Average loss: 0.0345
Iteration: 1470; Percent complete: 36.8%; Average loss: 0.0364
Iteration: 1480; Percent complete: 37.0%; Average loss: 0.0348
Iteration: 1490; Percent complete: 37.2%; Average loss:

Iteration: 2650; Percent complete: 66.2%; Average loss: 0.0325
Iteration: 2660; Percent complete: 66.5%; Average loss: 0.0341
Iteration: 2670; Percent complete: 66.8%; Average loss: 0.0332
Iteration: 2680; Percent complete: 67.0%; Average loss: 0.0339
Iteration: 2690; Percent complete: 67.2%; Average loss: 0.0343
Iteration: 2700; Percent complete: 67.5%; Average loss: 0.0326
Iteration: 2710; Percent complete: 67.8%; Average loss: 0.0342
Iteration: 2720; Percent complete: 68.0%; Average loss: 0.0347
Iteration: 2730; Percent complete: 68.2%; Average loss: 0.0324
Iteration: 2740; Percent complete: 68.5%; Average loss: 0.0339
Iteration: 2750; Percent complete: 68.8%; Average loss: 0.0333
Iteration: 2760; Percent complete: 69.0%; Average loss: 0.0330
Iteration: 2770; Percent complete: 69.2%; Average loss: 0.0331
Iteration: 2780; Percent complete: 69.5%; Average loss: 0.0331
Iteration: 2790; Percent complete: 69.8%; Average loss: 0.0332
Iteration: 2800; Percent complete: 70.0%; Average loss:

Iteration: 3960; Percent complete: 99.0%; Average loss: 0.0332
Iteration: 3970; Percent complete: 99.2%; Average loss: 0.0337
Iteration: 3980; Percent complete: 99.5%; Average loss: 0.0313
Iteration: 3990; Percent complete: 99.8%; Average loss: 0.0319
Iteration: 4000; Percent complete: 100.0%; Average loss: 0.0331


## Test Model

In [92]:
class GreedySearchDecoder(nn.Module):
    """Greedy decoding: at every step feed back the most probable token.

    Args:
        encoder: Trained encoder module.
        decoder: Trained decoder module (must expose `n_layers`).
    """

    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Decode `max_length` tokens for one input sequence.

        Args:
            input_seq: Encoded input passed to the encoder.
            input_length: Length tensor passed to the encoder.
            max_length: Number of decoding steps to run.

        Returns:
            (all_tokens, all_scores): the chosen token index and its
            probability for every step, concatenated along dim 0.
        """
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Use the wrapped decoder's own depth rather than whatever global
        # `decoder` happens to exist in the notebook namespace.
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Decoding starts from a single SOS token.
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        for _ in range(max_length):
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Most probable token and its probability.
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Add the leading dimension back so the token can be fed to the decoder again.
            decoder_input = torch.unsqueeze(decoder_input, 0)
        return all_tokens, all_scores

In [151]:
def evaluate(searcher, voc, sentence, max_length=MAX_LENGTH):
    """Generate a reply for one already-normalized sentence.

    Args:
        searcher: Callable taking (input_batch, lengths, max_length) and
            returning (tokens, scores).
        voc: Vocabulary used for encoding and decoding.
        sentence: Normalized input sentence.
        max_length: Number of decoding steps requested from `searcher`.

    Returns:
        The decoded words joined by spaces, cut at the first 'EOS' and with
        'PAD' entries left out.

    Raises:
        KeyError: If `sentence` contains a word missing from the vocabulary.
    """
    # Batch of one sentence.
    indexes_batch = [indexesFromSentence(voc, sentence)]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch]).to(device)
    # Transpose so the token positions run along dim 0.
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1).to(device)

    tokens, scores = searcher(input_batch, lengths, max_length)

    decoded_words = [voc.index2word[token.item()] for token in tokens]
    before_eos = itertools.takewhile(lambda word: word != 'EOS', decoded_words)
    return ' '.join(word for word in before_eos if word != 'PAD')

In [152]:
def Interactive(searcher, voc):
    """Ask the user for one question and print the chatbot's reply.

    A question containing a word that is not in the vocabulary makes the
    lookup raise KeyError; in that case a hint is printed instead of a reply.
    """
    try:
        question = normalize(input('Please input what you want to ask about COVID-19 : '))
        reply = evaluate(searcher, voc, question)
    except KeyError:
        print("Please ask again. You can ask \"What is coronavirus?\", etc. ")
    else:
        print('COVID-19 Bot:\n', reply)

In [154]:
# Switch both networks to evaluation mode (turns dropout off) before decoding.
decoder.eval()
encoder.eval()

searcher = GreedySearchDecoder(encoder=encoder, decoder=decoder)

Interactive(searcher=searcher, voc=voc)

Please input what you want to ask about COVID-19 : virus
COVID-19 Bot:
 while there are a number of prevention or treatment of covid . they should only be used as directed by a physician to treat a bacterial infection .


## Save Model

In [113]:
# Persist model weights, optimizer states and the vocabulary in one checkpoint file.
torch.save(
    dict(
        en=encoder.state_dict(),
        de=decoder.state_dict(),
        en_opt=encoder_optimizer.state_dict(),
        de_opt=decoder_optimizer.state_dict(),
        voc_dict=voc.__dict__,
        embedding=embedding.state_dict(),
    ),
    'savedWeight.tar',
)