In [1]:
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer, util



In [2]:
def get_dataset(path):
    dataset = pd.read_excel(path, engine='openpyxl') 
    return dataset.astype(str)

In [3]:
# load the knowledge base
path = "../data/dataset.xlsx"
dataset = get_dataset(path)

In [4]:
# load a sentence-transformer model
model = SentenceTransformer('../models/paraphrase-distilroberta-base-v1')

In [5]:
# encode queries from knowledge base to create corpus embeddings
corpus_embeddings = model.encode(dataset['Query'].tolist(), convert_to_tensor=True)

In [6]:
# user query
query = "I want to pay my bill"

In [7]:
# find the closest `top_k` queries of the corpus for the user query based on cosine similarity
top_k = 3

# encode user query
query_embedding = model.encode(query, convert_to_tensor=True)

In [8]:
# use cosine-similarity and torch.topk to find the highest `top_k` scores
cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
cos_scores

tensor([0.5136, 0.3416, 0.2005, 0.0468, 0.3057, 0.5580, 0.3149, 0.7042, 0.1120])

In [9]:
top_results = torch.topk(cos_scores, k=min(top_k, dataset.shape[0]))
top_results

torch.return_types.topk(
values=tensor([0.7042, 0.5580, 0.5136]),
indices=tensor([7, 5, 0]))

In [10]:
# filter dataframe by list of index
df = dataset.iloc[top_results[1], :]

# add matched score
df['Score'] = ["{:.4f}".format(value) for value in top_results[0]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [11]:
# show the result
df.head()

Unnamed: 0,Query,Response,Score
7,How can I pay my bill?,Thank you for connecting. You can pay by visit...,0.7042
5,Please send outstanding bills for my service,Thank you for connecting. You will receive the...,0.558
0,I would like to pay online?,Thank you for connecting. You can pay by visit...,0.5136


In [12]:
# convert the result to dict
df.to_dict('records')

[{'Query': 'How can I pay my bill?',
  'Response': 'Thank you for connecting. You can pay by visiting http://abc.com/pay',
  'Score': '0.7042'},
 {'Query': 'Please send outstanding bills for my service',
  'Response': 'Thank you for connecting. You will receive the outstanding bills on registered email address.',
  'Score': '0.5580'},
 {'Query': 'I would like to pay online?',
  'Response': 'Thank you for connecting. You can pay by visiting http://abc.com/pay',
  'Score': '0.5136'}]

In [13]:
# lets put it all together
def get_query_responses(query, top_k=3):
    '''find the closest `top_k` queries of the corpus for the user query based on cosine similarity'''
    
    # encode user query
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # use cosine-similarity and torch.topk to find the highest `top_k` scores
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=min(10, dataset.shape[0]))
    
    # filter dataframe by list of index
    df = dataset.iloc[top_results[1], :]
    
    # add matched score
    df['Score'] = ["{:.4f}".format(value) for value in top_results[0]]
    
    # select top_k responses
    responses = df.to_dict('records')
    
    return responses

In [14]:
# show the result response
responses = get_query_responses(question, top_k=1)
responses

NameError: name 'question' is not defined