In [1]:
import pandas as pd

# Load the labelled news dataset (semicolon-delimited CSV).
pdf = pd.read_csv("labelled_newscatcher_dataset.csv", sep=";")
# Expose the row index as an explicit "id" column.
pdf["id"] = pdf.index

In [2]:
from sentence_transformers import InputExample

# Number of leading rows to work with; kept small so the demo runs quickly.
SUBSET_SIZE = 1000
pdf_subset = pdf.head(SUBSET_SIZE)

def example_create_fn(doc1: str) -> InputExample:
    """
    Wrap a single piece of text in a sentence_transformers ``InputExample``.

    Only ``texts`` is supplied; no guid or label is passed here.

    Args:
        doc1: The text to wrap. The caller in this notebook passes one row's
            ``title`` value.

    Returns:
        An ``InputExample`` built with ``texts=[doc1]``.
    """
    return InputExample(texts=[doc1])

# Build one InputExample per title. Iterating the column directly avoids
# constructing a full row Series for every record just to read one field.
faiss_train_examples = [
    example_create_fn(title) for title in pdf_subset["title"]
]

In [3]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model by name.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every title of the subset in a single call.
subset_titles = pdf_subset["title"].tolist()
faiss_title_embedding = model.encode(subset_titles)

# Display (number of embedded titles, length of one embedding).
(len(faiss_title_embedding), len(faiss_title_embedding[0]))

(1000, 384)

In [4]:
import numpy as np
import faiss

# Key the subset by "id" (kept as a column as well) so ids returned by the
# index can be looked up with .loc.
pdf_to_index = pdf_subset.set_index(["id"], drop=False)
# FAISS ids are 64-bit integers; "int" is platform-dependent, so request
# int64 explicitly.
id_index = pdf_to_index["id"].to_numpy(dtype=np.int64)

# Normalize a copy (normalize_L2 modifies its argument in place) so the raw
# embeddings stay untouched. With unit-length vectors the dot product equals
# cosine similarity.
content_encoded_normalized = faiss_title_embedding.copy()
faiss.normalize_L2(content_encoded_normalized)

# IndexFlatIP builds the inner-product index; no vector compression is involved.
# IndexIDMap translates search results to our IDs, i.e. it maintains the
# mapping between IDs and embeddings:
# https://faiss.ai/cpp_api/file/IndexIDMap_8h.html#_CPPv4I0EN5faiss18IndexIDMapTemplateE
embedding_dim = content_encoded_normalized.shape[1]
index_content = faiss.IndexIDMap(faiss.IndexFlatIP(embedding_dim))
index_content.add_with_ids(content_encoded_normalized, id_index)

#### We define a search function below that first vectorizes the query text and then searches the index for the most similar vectors (highest inner product, which equals cosine similarity because the vectors are L2-normalized).



In [5]:
def search_content(query, pdf_to_index, k=3):
    """
    Return the rows of ``pdf_to_index`` whose embeddings best match ``query``.

    The query is encoded with the notebook-level ``model``, L2-normalized, and
    searched against the notebook-level ``index_content``.

    Args:
        query: Text to search for.
        pdf_to_index: DataFrame whose index holds the ids stored in
            ``index_content``.
        k: Maximum number of results to return.

    Returns:
        A new DataFrame with the matching rows in the order returned by the
        index, plus a ``similarities`` column holding each row's score.
        Fewer than ``k`` rows are returned when the index has fewer than
        ``k`` vectors.
    """
    query_vector = model.encode([query])
    faiss.normalize_L2(query_vector)

    # search() returns (scores, ids), each with one row per query vector;
    # we only sent one query, so take row 0 of each.
    scores, ids = index_content.search(query_vector, k)

    # FAISS pads the id row with -1 when it finds fewer than k results;
    # drop those slots so the .loc lookup below cannot raise KeyError.
    hits = [
        (doc_id, score)
        for doc_id, score in zip(ids[0].tolist(), scores[0].tolist())
        if doc_id != -1
    ]
    hit_ids = [doc_id for doc_id, _ in hits]
    hit_scores = [score for _, score in hits]

    # assign() returns a new frame, leaving pdf_to_index unmodified.
    return pdf_to_index.loc[hit_ids].assign(similarities=hit_scores)

In [6]:
# Look up the titles most similar to "meteor" and print them as plain text.
meteor_hits = search_content("meteor", pdf_to_index)
print(meteor_hits)

       topic                                               link  \
id                                                                
294  SCIENCE  https://www.haveeru.com.mv/perseid-meteor-show...   
113  SCIENCE  https://www.goshennews.com/news/rachel-shenk-m...   
316  SCIENCE  https://www.firstpost.com/tech/science/perseid...   

             domain       published_date  \
id                                         
294  haveeru.com.mv  2020-08-10 21:13:55   
113  goshennews.com  2020-08-17 02:15:00   
316   firstpost.com  2020-08-12 03:17:12   

                                                 title lang   id  similarities  
id                                                                              
294  Perseid meteor shower 2020: How and when to lo...   en  294      0.590013  
113  RACHEL SHENK: Meteor shower is reminder of nat...   en  113      0.573253  
316  Perseid meteor shower peaks tonight: Interesti...   en  316      0.561288  


#### Using ChromaDB to actually retrieve, augment and generate text based on search queries

In [14]:
import chromadb
from chromadb.config import Settings

# Configure Chroma with the DuckDB + Parquet backend.
# persist_directory is optional; if you don't supply it, the data will be ephemeral.
chroma_settings = Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory=".chromadb/",
)
chroma_client = chromadb.Client(chroma_settings)

Using embedded DuckDB with persistence: data will be stored in: .chromadb/


#### "collection" here is an index that stores one set of your documents
#### It is where you will store your embeddings, documents and additional metadata

In [15]:
collection_name = "my_news"

# If the collection was created before, it must be deleted first.
# Check every existing collection, not only the first one returned.
existing_names = [existing.name for existing in chroma_client.list_collections()]
if collection_name in existing_names:
    chroma_client.delete_collection(name=collection_name)

print(f"Creating collection: '{collection_name}'")
collection = chroma_client.create_collection(name=collection_name)

No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction


Creating collection: 'my_news'


In [16]:
# Inspect the subset as plain text; the printed form is abbreviated
# (rows elided with ".." in the recorded output).
print(pdf_subset)

          topic                                               link  \
0       SCIENCE  https://www.eurekalert.org/pub_releases/2020-0...   
1       SCIENCE  https://www.pulse.ng/news/world/an-irresistibl...   
2       SCIENCE  https://www.express.co.uk/news/science/1322607...   
3       SCIENCE  https://www.ndtv.com/world-news/glaciers-could...   
4       SCIENCE  https://www.thesun.ie/tech/5742187/perseid-met...   
..          ...                                                ...   
995  TECHNOLOGY  https://www.androidcentral.com/mate-40-will-be...   
996     SCIENCE  https://www.cnn.com/2020/08/17/africa/stone-ag...   
997      HEALTH  https://www.tenterfieldstar.com.au/story/68776...   
998      HEALTH  https://news.sky.com/story/coronavirus-trials-...   
999      HEALTH  https://www.techexplorist.com/study-demonstrat...   

                     domain       published_date  \
0            eurekalert.org  2020-08-06 13:59:45   
1                  pulse.ng  2020-08-12 15:14:19   
2  

In [17]:
# Load the first 100 rows into the collection: titles as documents,
# topics as metadata.
chroma_docs = pdf_subset.head(100)
collection.add(
    documents=chroma_docs["title"].tolist(),
    metadatas=[{"topic": topic} for topic in chroma_docs["topic"].tolist()],
    # Derive the ids from the rows actually selected so documents, metadatas
    # and ids always have the same length.
    ids=[f"id{x}" for x in range(len(chroma_docs))],
)

In [18]:
import json

# Query the collection for the 10 documents closest to "space".
results = collection.query(
    query_texts=["space"],
    n_results=10,
)

# Pretty-print the result as indented JSON.
formatted_results = json.dumps(results, indent=4)
print(formatted_results)

{
    "ids": [
        [
            "id72",
            "id7",
            "id30",
            "id26",
            "id23",
            "id76",
            "id69",
            "id40",
            "id47",
            "id75"
        ]
    ],
    "embeddings": null,
    "documents": [
        [
            "Beck teams up with NASA and AI for 'Hyperspace' visual album experience",
            "Orbital space tourism set for rebirth in 2021",
            "NASA drops \"insensitive\" nicknames for cosmic objects",
            "\u2018It came alive:\u2019 NASA astronauts describe experiencing splashdown in SpaceX Dragon",
            "Hubble Uses Moon As \u201cMirror\u201d to Study Earth\u2019s Atmosphere \u2013 Proxy in Search of Potentially Habitable Planets Around Other Stars",
            "Australia's small yet crucial part in the mission to find life on Mars",
            "NASA Astronauts in SpaceX Capsule Splashdown in Gulf Of Mexico",
            "SpaceX's Starship spacecraft saw 150 mete

In [19]:
# Same "space" query, restricted to documents whose "topic" metadata is "SCIENCE".
collection.query(
    query_texts=["space"],
    n_results=10,
    where={"topic": "SCIENCE"},
)

{'ids': [['id7',
   'id30',
   'id26',
   'id23',
   'id76',
   'id69',
   'id40',
   'id47',
   'id75',
   'id52']],
 'embeddings': None,
 'documents': [['Orbital space tourism set for rebirth in 2021',
   'NASA drops "insensitive" nicknames for cosmic objects',
   '‘It came alive:’ NASA astronauts describe experiencing splashdown in SpaceX Dragon',
   'Hubble Uses Moon As “Mirror” to Study Earth’s Atmosphere – Proxy in Search of Potentially Habitable Planets Around Other Stars',
   "Australia's small yet crucial part in the mission to find life on Mars",
   'NASA Astronauts in SpaceX Capsule Splashdown in Gulf Of Mexico',
   "SpaceX's Starship spacecraft saw 150 meters high",
   'NASA’s InSight lander shows what’s beneath Mars’ surface',
   'Alien base on Mercury: ET hunters claim to find huge UFO',
   'SpaceX Crew-1 mission with NASA, first fully operational crewed mission to space to launch in October']],
 'metadatas': [[{'topic': 'SCIENCE'},
   {'topic': 'SCIENCE'},
   {'topic': '