In [85]:
from sentence_transformers import SentenceTransformer, util

import pandas as pd
import numpy as np
from typing import List

# Instantiate the sentence-embedding model once; the helper functions in the
# following cells read it through the module-level name `model`.
model = SentenceTransformer(
    model_name_or_path='sentence-transformers/all-mpnet-base-v2',
)

In [86]:
def _file_load(file_path:str):
    return pd.read_csv(file_path)

def _compute_scores(query, docs_embed):
    """Return the cosine similarities between ``query`` and ``docs_embed`` as a list.

    ``util.cos_sim`` is called with the two embeddings; only the first row of
    its result is kept (the row for the first/only query embedding), moved to
    the CPU and converted to a plain Python list.
    """
    similarity = util.cos_sim(query, docs_embed)
    first_row = similarity[0]
    return first_row.cpu().tolist()

def _print_results(doc:str, score:str, ticket:str):
    print(f"The most simillar ticket is: {doc} -- with {round(score, 4)} -- number of tickets is {ticket}")
    
def _model_predict(text:List[str]):
    """Encode ``text`` with the module-level ``model`` and return the result.

    The value is passed straight to ``model.encode``; the caller in this
    notebook uses it both for a single query string and for a list of
    documents.
    """
    embeddings = model.encode(text)
    return embeddings

In [87]:
def most_similar(query: str, docs: List[str] = None, ticket_numbers: List[int] = None,
                 k_most_simillar: int = 5, docs_embed = None):
    """Print the ``k_most_simillar`` documents that score highest against ``query``.

    Args:
        query: Text to search for; it is echoed and then embedded with
            ``_model_predict``.
        docs: Candidate documents. When omitted, the notebook-level variable
            ``train_abstract`` is used if it exists.
        ticket_numbers: Values printed alongside each document, paired with
            ``docs`` by position. When omitted, the notebook-level variable
            ``ticket_number`` is used if it exists.
        k_most_simillar: How many of the top-ranked results to print.
        docs_embed: Embeddings for ``docs``. When ``None`` they are computed
            here with ``_model_predict(docs)``.

    Returns:
        None. Results are printed, one line per document.

    Raises:
        ValueError: If ``docs``/``ticket_numbers`` are neither passed nor
            available as notebook globals, or if scoring fails.
    """
    # These defaults used to be `= train_abstract` / `= ticket_number` in the
    # signature, which is evaluated when the cell runs and raised NameError on
    # a fresh kernel. Looking them up at call time keeps the old fallback
    # working without making the definition depend on hidden state.
    if docs is None:
        docs = globals().get("train_abstract")
    if ticket_numbers is None:
        ticket_numbers = globals().get("ticket_number")
    if docs is None or ticket_numbers is None:
        raise ValueError("You have to pass 'docs' and 'ticket_numbers'")

    print(f"Your query is: '{query}'")

    query_embed = _model_predict(query)
    if docs_embed is None:
        docs_embed = _model_predict(docs)

    try:
        scores = _compute_scores(query_embed, docs_embed)
    except Exception as exc:
        # Catch Exception, not everything: a bare `except:` would also turn
        # KeyboardInterrupt/SystemExit into ValueError. Chain the cause so the
        # real failure stays visible in the traceback.
        raise ValueError("You have to set up 'docs_embed' parameter") from exc

    # zip() stops at the shortest input, so extra docs, scores or ticket
    # numbers beyond the common length are silently ignored.
    ranked = sorted(
        zip(docs, scores, ticket_numbers),
        key=lambda triple: triple[1],
        reverse=True,
    )

    for doc, score, ticket in ranked[:k_most_simillar]:
        _print_results(doc, score, ticket)

In [88]:
# Where the ticket table (CSV) and the stored embeddings array (.npy) live,
# relative to the notebook.
file_path = '../datasets/ML_DATASET_Hackathon_Supervised.csv'
embed_path = '../data/embeddings_mpnet.npy'

# Load the embeddings array first, then the ticket table.
# NOTE(review): presumably the rows of the array line up with the rows of the
# CSV; nothing here checks that.
docs_embeddings = np.load(embed_path)
df = _file_load(file_path)

# Extract the two columns used for retrieval as plain Python lists.
abstracts, ticket_n = (
    df[column].to_list() for column in ('Problem_Abstract', 'Ticket')
)

In [91]:
# Example query, scored against the loaded abstracts using the stored embeddings.
query = "Order me some pizza"

most_similar(
    query,
    docs=abstracts,
    ticket_numbers=ticket_n,
    k_most_simillar=5,
    docs_embed=docs_embeddings,
)

Your query is: 'Order me some pizza'
The most simillar ticket is: Orders Issued -- with 0.3476 -- number of tickets is 301662522
The most simillar ticket is: Order Missing  -- with 0.2674 -- number of tickets is 313374392
The most simillar ticket is: Please assign this request to Derek Pang -- with 0.2668 -- number of tickets is 307262726
The most simillar ticket is: Missing Order  -- with 0.2476 -- number of tickets is 309374247
The most simillar ticket is: 9 orders need to be removed from the Pending RIB report - they are either complete or have been canceled. -- with 0.2393 -- number of tickets is 312301539
