In [1]:
# Uncomment to install the pinned Milvus client into this kernel's environment:
# %pip install pymilvus==2.3.1

In [2]:
from dotenv import load_dotenv
import pandas as pd
import os
from pymilvus import connections, Collection, utility
from sentence_transformers import SentenceTransformer

### Vectorization using Sentence Transformers

In [3]:
class TextVectorizer:
    '''
    Sentence-embedding extractor backed by a Sentence Transformers model.

    The model is created the first time `vectorize` is called and is then
    reused, so repeated calls do not construct it again.
    '''
    def __init__(self, model_name: str = 'bert-base-nli-mean-tokens'):
        '''
        Parameters
        ----------
        model_name : str
            Name passed to `SentenceTransformer` when the model is first needed.
        '''
        self.model_name = model_name
        # Filled in lazily by _get_model(); constructing the vectorizer stays cheap.
        self._model = None

    def _get_model(self):
        '''Return the cached model, constructing it on the first call only.'''
        if self._model is None:
            self._model = SentenceTransformer(self.model_name)
        return self._model

    def vectorize(self, x: pd.Series, dataset: str = "train"):
        '''
        Encode the given sentences into embeddings.

        Parameters
        ----------
        x : pd.Series
            Sentences to encode. Any iterable of strings is accepted; a single
            string is treated as a one-sentence batch. The input is copied into
            a plain list before encoding, so the caller's object is not touched
            and a Series is encoded by value order regardless of its index.
        dataset : str
            Unused; kept so existing callers that pass it keep working.

        Returns
        -------
        Whatever `model.encode` returns for the list of sentences.
        '''
        sentences = [x] if isinstance(x, str) else list(x)
        return self._get_model().encode(sentences)

In [4]:
# Shared vectorizer instance; find_similar_news() uses it to embed the query text.
vectorizer = TextVectorizer()

In [5]:
# Load secrets.env into the environment, then read the Milvus URI and API token from it.
load_dotenv('secrets.env')
uri = os.getenv("URI")
token = os.getenv("TOKEN")

In [6]:
# Open the "default" connection alias to the database with the URI and token read earlier.
connections.connect(alias="default", uri=uri, token=token)
print("Connected to DB")

Connected to DB


In [7]:
# The collection name is read from the environment.
collection_name = os.getenv("COLLECTION_NAME")
check_collection = utility.has_collection(collection_name)
check_collection  # displays whether the collection exists

True

In [8]:
# Get a handle on the collection and load it before running any query against it.
collection = Collection(collection_name)
collection.load()

In [9]:
def find_similar_news(text: str, top_n: int=3):
    '''
    Search the collection for the stored articles closest to `text`.

    The text is embedded with the module-level `vectorizer`, the module-level
    `collection` is searched with the L2 metric, a short report is printed,
    and the matches are returned.

    Parameters
    ----------
    text : str
        Query text. It is embedded and returned exactly as given; only the
        printed report strips surrounding whitespace.
    top_n : int, default 3
        Value passed as the search `limit`.

    Returns
    -------
    dict
        {"input_text": text, "similar_texts": [<article_desc of each hit>, ...]}
    '''
    search_params = {"metric_type": "L2"}
    search_vec = vectorizer.vectorize([text])
    # NOTE(review): guarantee_timestamp=1 is kept from the original; it presumably
    # selects eventual consistency. Confirm against the pymilvus documentation.
    result = collection.search(search_vec,
                               anns_field='article_embed',  # vector field searched; assumed to match the collection schema
                               param=search_params,
                               limit=top_n,
                               guarantee_timestamp=1,
                               output_fields=['article_desc'])  # fields returned with each hit

    # `result` holds one list of hits per query vector; flatten them all.
    similar_texts = [hit.entity.get('article_desc') for hits in result for hit in hits]
    output_dict = {"input_text": text, "similar_texts": similar_texts}
    similar_txt = '\n\n'.join(similar_texts)

    # Put the input on its own line under the dashes, as the SIMILAR NEWS header
    # already does. Stripping and using fixed separators makes the layout
    # independent of whitespace around the input; a text wrapped in single
    # leading/trailing newlines prints exactly as it did before.
    shown_text = text.strip()
    print(f"INPUT\n{'-'*5}\n{shown_text}\n\n\nSIMILAR NEWS\n{'-'*12}\n{similar_txt}")
    return output_dict

In [10]:
# Example query: a sports headline about Formula E.
text = '''
Formula E nominated for BBC Green Sport Award. 
The ABB FIA Formula E World Championship has been nominated by a BBC panel for its Green Sport Awards, which celebrate the good news
'''

# Assigning to `_` keeps the returned dict from being echoed; the function prints its own report.
_ = find_similar_news(text, top_n=1)

INPUT
-----
Formula E nominated for BBC Green Sport Award. 
The ABB FIA Formula E World Championship has been nominated by a BBC panel for its Green Sport Awards, which celebrate the good news


SIMILAR NEWS
------------
FIFA has come under pressure from several European soccer federations who want to support a human rights campaign against discrimination at the World Cup.


In [11]:
# Example query: a UK politics headline.
text = '''
Lib Dem conference: Ed Davey pounds Tories in election warm-up speech
Sir Ed Davey makes it clear who the Lib Dems are targeting at the general election - but will it work?
'''

# Assigning to `_` keeps the returned dict from being echoed; the function prints its own report.
_ = find_similar_news(text, top_n=1)

INPUT
-----
Lib Dem conference: Ed Davey pounds Tories in election warm-up speech
Sir Ed Davey makes it clear who the Lib Dems are targeting at the general election - but will it work?


SIMILAR NEWS
------------
The state's general treasurer is slated to face former Cranston Mayor Allan Fung (R) in the general election.
