In [None]:
! pip install evaluate
! pip install torch
! pip install transformers

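Dense Passage Retriever (DPR): the question and each passage are embedded by separate encoders and scored by the dot product of their embeddings.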

In [None]:
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

# Load the question-side tokenizer and encoder from one shared checkpoint name.
question_checkpoint = "dpr-question_encoder-single-nq-base"
tokenizer_question = DPRQuestionEncoderTokenizer.from_pretrained(question_checkpoint)
model_question = DPRQuestionEncoder.from_pretrained(question_checkpoint)

# Tokenize the query into PyTorch tensors and keep only the token ids.
encoded_query = tokenizer_question("what is ChatGPT ?", return_tensors="pt")
input_ids = encoded_query["input_ids"]

# The encoder's pooled output serves as the query embedding.
question_output = model_question(input_ids)
embeddings_q = question_output.pooler_output
print(embeddings_q)

In [None]:
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

# Load the passage-side tokenizer and encoder from one shared checkpoint name.
context_checkpoint = "dpr-ctx_encoder-single-nq-base"
tokenizer_context = DPRContextEncoderTokenizer.from_pretrained(context_checkpoint)
model_context = DPRContextEncoder.from_pretrained(context_checkpoint)

# Tokenize passage 1 into PyTorch tensors and keep only the token ids.
encoded_passage = tokenizer_context(
    "ChatGPT is an artificial intelligence chatbot developed by OpenAI.",
    return_tensors="pt",
)
input_ids = encoded_passage["input_ids"]

# The encoder's pooled output serves as the passage embedding.
context_output = model_context(input_ids)
embeddings_p1 = context_output.pooler_output
print(embeddings_p1.shape)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


torch.Size([1, 768])


In [None]:
import torch
# Relevance score: matrix product of the query embedding with the transposed
# passage embedding; the first row is moved to the CPU and turned into a list.
similarity = torch.matmul(embeddings_q, embeddings_p1.t())
score1 = similarity[0].cpu().tolist()
print('The score between the user query and the passage 1 is: ', score1)

The score between the user query and the passage 1 is:  [94.91859436035156]


In [None]:
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
# Embed passage 2 with the already-loaded context tokenizer and encoder.
encoded_passage = tokenizer_context(
    "A chatbot amis to simulate human communication.",
    return_tensors="pt",
)
input_ids = encoded_passage["input_ids"]
context_output = model_context(input_ids)
embeddings_p2 = context_output.pooler_output
print(embeddings_p2.shape)

torch.Size([1, 768])


In [None]:
# Relevance score of passage 2: matrix product of the query embedding with the
# transposed passage embedding. Stored under its own name (score2) so the
# passage 1 score computed earlier is not overwritten.
score2 = torch.mm(embeddings_q, embeddings_p2.transpose(0, 1))[0].cpu().tolist()
print('The score between the user query and the passage 2 is: ', score2)

The score between the user query and the passage 2 is:  [70.62039184570312]


## DualEncoder

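Here the question encoder alone embeds both the query and the passages; retrieval again scores each passage by the dot product with the query embedding.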

In [None]:
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

# Reload the question tokenizer and encoder; in this section they embed both
# the query and the passages.
question_checkpoint = "dpr-question_encoder-single-nq-base"
tokenizer_question = DPRQuestionEncoderTokenizer.from_pretrained(question_checkpoint)
model_question = DPRQuestionEncoder.from_pretrained(question_checkpoint)

# Tokenize the query and take the pooled encoder output as its embedding.
encoded_query = tokenizer_question("what is ChatGPT ?", return_tensors="pt")
input_ids = encoded_query["input_ids"]
question_output = model_question(input_ids)
embeddings_q = question_output.pooler_output
print(embeddings_q.shape)

torch.Size([1, 768])


In [None]:
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

# Embed passage 1 with the question tokenizer and encoder (not the context pair).
encoded_passage = tokenizer_question(
    "ChatGPT is an artificial intelligence chatbot developed by OpenAI.",
    return_tensors="pt",
)
input_ids = encoded_passage["input_ids"]
passage_output = model_question(input_ids)
embeddings_p1 = passage_output.pooler_output
print(embeddings_p1.shape)

torch.Size([1, 768])


In [None]:
import torch

# Relevance score: matrix product of the query embedding with the transposed
# passage 1 embedding; the first row is moved to the CPU and turned into a list.
similarity = torch.matmul(embeddings_q, embeddings_p1.t())
score1 = similarity[0].cpu().tolist()
print('The score between the user query and the passage 1 is: ', score1)

The score between the user query and the passage 1 is:  [89.18319702148438]


In [None]:
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

# Embed passage 2 with the question tokenizer and encoder (not the context pair).
encoded_passage = tokenizer_question(
    "A chatbot amis to simulate human communication.",
    return_tensors="pt",
)
input_ids = encoded_passage["input_ids"]
passage_output = model_question(input_ids)
embeddings_p2 = passage_output.pooler_output

print(embeddings_p2.shape)

torch.Size([1, 768])


In [None]:
import torch
# Relevance score: matrix product of the query embedding with the transposed
# passage 2 embedding; the first row is moved to the CPU and turned into a list.
similarity = torch.matmul(embeddings_q, embeddings_p2.t())
score2 = similarity[0].cpu().tolist()
print('The score between the user query and the passage 2 is: ', score2)

The score between the user query and the passage 2 is:  [78.11785125732422]


# Chatbots

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
# Load the GPT-2 tokenizer and language-model head from the "personGPT" checkpoint.
chatbot_checkpoint = "personGPT"
tokenizer = GPT2Tokenizer.from_pretrained(chatbot_checkpoint)
model = GPT2LMHeadModel.from_pretrained(chatbot_checkpoint)
# Move the model to the GPU when CUDA is available.
if torch.cuda.is_available():
    model = model.cuda()


## utility functions ##
def flatten(l):
    """Return one flat list holding every item of every sub-sequence in ``l``."""
    return [item for sublist in l for item in sublist]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def to_data(x):
    """Convert a tensor into a NumPy array on the host.

    Args:
        x: A ``torch.Tensor`` on any device; it may be tracked by autograd.

    Returns:
        The tensor's values as a ``numpy.ndarray``.
    """
    # detach() drops autograd tracking (replacing the legacy ``.data`` access).
    # cpu() is a no-op for tensors that already live on the host, so the copy
    # is driven by the tensor's own device instead of by whether CUDA happens
    # to be available on the machine.
    return x.detach().cpu().numpy()

In [None]:
def to_var(x):
    """Return ``x`` as a tensor, placed on the GPU when CUDA is available.

    Non-tensor inputs are wrapped with ``torch.Tensor``; inputs that are
    already tensors are passed through unchanged apart from the device move.
    """
    tensor = x if torch.is_tensor(x) else torch.Tensor(x)
    return tensor.cuda() if torch.cuda.is_available() else tensor

In [None]:
def display_dialog_history(dialog_hx):
    """Print the decoded dialog, alternating user and bot turns.

    Entries at even positions are printed as user messages and entries at odd
    positions as bot messages; a blank line follows every bot message. Each
    entry is decoded with the module-level ``tokenizer``.
    """
    for turn_idx, token_ids in enumerate(dialog_hx):
        text = tokenizer.decode(token_ids)
        is_user_turn = turn_idx % 2 == 0
        if is_user_turn:
            print(">> User: "+ text)
            continue
        print("Bot: "+text)
        print()

In [None]:
def generate_next(bot_input_ids, do_sample=True, top_k=10, top_p=.92,
                  max_length=1000, pad_token=tokenizer.eos_token_id):
    """Generate a continuation of ``bot_input_ids`` with the module-level ``model``.

    Args:
        bot_input_ids: Prompt token ids handed to ``model.generate``.
        do_sample: Forwarded to ``model.generate`` as ``do_sample``.
        top_k: Forwarded to ``model.generate`` as ``top_k``.
        top_p: Forwarded to ``model.generate`` as ``top_p``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        pad_token: Forwarded to ``model.generate`` as ``pad_token_id``; the
            default is the tokenizer's EOS id, captured when this function
            is defined.

    Returns:
        The token ids of the first generated sequence that follow the prompt,
        converted by ``to_data``.
    """
    # Honor the caller's do_sample / pad_token instead of hard-coding them;
    # with the default arguments the call is the same as before.
    full_msg = model.generate(bot_input_ids, do_sample=do_sample,
                              top_k=top_k, top_p=top_p,
                              max_length=max_length, pad_token_id=pad_token)
    # The generated sequence starts with the prompt; slice it off.
    prompt_len = bot_input_ids.shape[-1]
    msg = to_data(full_msg.detach()[0])[prompt_len:]
    return msg

In [None]:
# Defining personality of the Chatbot
# Ask for three persona facts; each one is terminated with the EOS token.
persona_facts = []
for fact_no in range(1, 4):
    fact = input(">> Fact %d: "%fact_no)
    persona_facts.append(fact + tokenizer.eos_token)
# Surround the facts with the control tokens and encode the whole prefix once.
persona_prompt = ''.join(['<|p2|>', *persona_facts, '<|sep|>', '<|start|>'])
personas = tokenizer.encode(persona_prompt)

>> Fact 1:  I live in Munich
>> Fact 2:  I like playing football.
>> Fact 3:  I have two dogs.


In [None]:
# converse for 8 turns (4 user messages, each followed by one bot reply)
dialog_hx = []
for turn in range(4):
    # read the user message, terminate it with EOS and encode it
    user_text = input(">> User: ")
    user_inp = tokenizer.encode(user_text + tokenizer.eos_token)
    # append to the chat history
    dialog_hx.append(user_inp)

    # prompt = persona prefix + flattened history, as a batch of one;
    # generate_next is called with its defaults (max_length=1000)
    history_ids = flatten(dialog_hx)
    bot_input_ids = to_var([personas + history_ids]).long()
    msg = generate_next(bot_input_ids)
    dialog_hx.append(msg)
    reply = tokenizer.decode(msg, skip_special_tokens=True)
    print("Bot: {}".format(reply))

>> User:  Hello, How are you?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Bot: hello i am doing great! what about yourself?


>> User:  I am good. I saw your dogs.


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Bot: that is nice. i like playing football with my dogs.


>> User:  Really, how many dogs do you have?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Bot: two small dogs.


>> User:  Great! I also want to play football.


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Bot: do you play professional football at all?


In [None]:
## available actions, action space ##
action_space = [ 'ask about kids.', "ask about pets.", 'talk about work.',
               'ask about marital status.', 'talk about travel.', 'ask about age and gender.',
        'ask about hobbies.', 'ask about favorite food.', 'talk about movies.',
        'talk about music.', 'talk about politics.']
# converse for 8 turns
dialog_hx = []
for step in range(4):
    # choose an action
    act = None
    while act not in action_space:
        display_dialog_history(dialog_hx)
        print()
        print(" actions: ")
        for k,v in enumerate(action_space): print(k,v)
        try:
            act = action_space[int(input(" input [0-10]: " ))]
        except:
            act = None
    print()
    # format into prefix code
    action_prefix = tokenizer.encode(''.join(['<|act|> '] + [act] + ['<|p1|>'] + [] + ['<|sep|>'] + ['<|start|>']))
    bot_input_ids = to_var([action_prefix + flatten(dialog_hx)]).long()

    # generate query conditioned on action
    msg = generate_next(bot_input_ids, top_k=10, top_p=0.92, max_length=100)
    dialog_hx.append(msg)

    # generate bot response
    bot_input_ids = to_var([personas+ flatten(dialog_hx)]).long()
    msg = generate_next(bot_input_ids, top_k=10, top_p=0.92, max_length=100)
    dialog_hx.append(msg)
display_dialog_history(dialog_hx)


 actions: 
0 ask about kids.
1 ask about pets.
2 talk about work.
3 ask about marital status.
4 talk about travel.
5 ask about age and gender.
6 ask about hobbies.
7 ask about favorite food.
8 talk about movies.
9 talk about music.
10 talk about politics.


 input [0-10]:  5





A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


>> User: hello what is your age and gender?<|endoftext|>
Bot: hi there. i'm a 30 year old male.<|endoftext|>


 actions: 
0 ask about kids.
1 ask about pets.
2 talk about work.
3 ask about marital status.
4 talk about travel.
5 ask about age and gender.
6 ask about hobbies.
7 ask about favorite food.
8 talk about movies.
9 talk about music.
10 talk about politics.


 input [0-10]:  1


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



>> User: hello what is your age and gender?<|endoftext|>
Bot: hi there. i'm a 30 year old male.<|endoftext|>

>> User: do you have any pets?<|endoftext|>
Bot: i have 2 dogs.<|endoftext|>


 actions: 
0 ask about kids.
1 ask about pets.
2 talk about work.
3 ask about marital status.
4 talk about travel.
5 ask about age and gender.
6 ask about hobbies.
7 ask about favorite food.
8 talk about movies.
9 talk about music.
10 talk about politics.


 input [0-10]:  4


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



>> User: hello what is your age and gender?<|endoftext|>
Bot: hi there. i'm a 30 year old male.<|endoftext|>

>> User: do you have any pets?<|endoftext|>
Bot: i have 2 dogs.<|endoftext|>

>> User:  where you from?<|endoftext|>
Bot: i live in m nchen. i love football.<|endoftext|>


 actions: 
0 ask about kids.
1 ask about pets.
2 talk about work.
3 ask about marital status.
4 talk about travel.
5 ask about age and gender.
6 ask about hobbies.
7 ask about favorite food.
8 talk about movies.
9 talk about music.
10 talk about politics.


 input [0-10]:  9


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



>> User: hello what is your age and gender?<|endoftext|>
Bot: hi there. i'm a 30 year old male.<|endoftext|>

>> User: do you have any pets?<|endoftext|>
Bot: i have 2 dogs.<|endoftext|>

>> User:  where you from?<|endoftext|>
Bot: i live in m nchen. i love football.<|endoftext|>

>> User: do you play an instrument?<|endoftext|>
Bot: yes. i play the flute.<|endoftext|>



## Evaluation

Generated sentence: There is 1 expensive modern european restaurant. Do you want me to book it for you?

Ground truth: I found 2 expensive european restaurants and 1 expensive modern european restaurant. Which kind would you prefer?

In [None]:
from evaluate import load
# Score the generated sentence against the ground-truth sentence with the
# "sacrebleu" metric and print the resulting score.
bleu = load("sacrebleu")
generated_sentence = "There is 1 expensive modern european restaurant. Do you want me to book it for you?"
ground_truth = "I found 2 expensive european restaurants and 1 expensive modern european restaurant. Which kind would you prefer?"
bleu_result = bleu.compute(predictions=[generated_sentence], references=[ground_truth])
print(bleu_result['score'])

26.896741624879542
