### Step 1: Reading data

In [7]:
from datasets import load_dataset

In [8]:
xsum_dataset = load_dataset(
    "xsum", version="1.2.0"
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [9]:
# Taking a sample of 100 rows
xsum_sample = xsum_dataset["train"].select(range(1000)).to_pandas()
xsum_sample = xsum_sample[["document", "summary"]]

In [4]:
from sentence_transformers import InputExample
import pandas as pd

pdf_subset = xsum_sample

def example_create_fn(doc1: pd.Series) -> InputExample:
    """
    Helper function that outputs a sentence_transformer guid, label, and text
    """
    return InputExample(texts=[doc1])

faiss_train_examples = pdf_subset.apply(
    lambda x: example_create_fn(x["document"]), axis=1
).tolist()

### Step 2: Vectorize text into embedding vectors

In [5]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "all-MiniLM-L6-v2", 
) 
faiss_title_embedding = model.encode(pdf_subset.document.values.tolist())
len(faiss_title_embedding), len(faiss_title_embedding[0])

(1000, 384)

### Step 3: Saving embedding vectors to FAISS index

In [6]:
import numpy as np
import faiss

id_index = np.array(pdf_subset.index.values).astype("int")
content_encoded_normalized = faiss_title_embedding.copy()
faiss.normalize_L2(content_encoded_normalized)

# Index1DMap translates search results to IDs: https://faiss.ai/cpp_api/file/IndexIDMap_8h.html#_CPPv4I0EN5faiss18IndexIDMapTemplateE
# The IndexFlatIP below builds index
index_content = faiss.IndexIDMap(faiss.IndexFlatIP(len(faiss_title_embedding[0])))
index_content.add_with_ids(content_encoded_normalized, id_index)

### Step 4: Search for relevant documents

In [7]:
def search_content(query, pdf_subset, k=3):
    query_vector = model.encode([query])
    faiss.normalize_L2(query_vector)

    # We set k to limit the number of vectors we want to return
    top_k = index_content.search(query_vector, k)
    ids = top_k[1][0].tolist()
    similarities = top_k[0][0].tolist()
    results = pdf_subset.loc[ids]
    results["similarities"] = similarities
    return results

In [8]:
search_content("harry potter", pdf_subset)

Unnamed: 0,document,summary,similarities
860,The four-part CGI animated mini-series will br...,"John Boyega, James McAvoy and Sir Ben Kingsley...",0.452037
269,"The play, written by Jack Thorne, is set 19 ye...",Harry Potter and the Cursed Child has won five...,0.409222
365,Channel 4 has also fined Glasspool an undisclo...,Hollyoaks actor Parry Glasspool has been suspe...,0.315844


In [9]:
pdf_subset.iloc[860]['summary']

'John Boyega, James McAvoy and Sir Ben Kingsley are among an all-star cast set to provide voices in a new adaptation of Watership Down.'

In [10]:
pdf_subset.iloc[269]['summary']

'Harry Potter and the Cursed Child has won five-star reviews from critics, with one describing it as "a game-changing production".'

In [11]:
pdf_subset.iloc[365]['summary']

'Hollyoaks actor Parry Glasspool has been suspended after posting a video in which he mimicked a woman threatening to stab her boyfriend to death.'

### Vector Database: Chroma

In [14]:
import chromadb
from chromadb.config import Settings

chroma_client  = chromadb.PersistentClient()

In [15]:
collection_name = "my_harry_potter_news"

if len(chroma_client.list_collections()) > 0 and collection_name in [chroma_client.list_collections()[0].name]:
    chroma_client.delete_collection(name=collection_name)

print(f"Creating collection: '{collection_name}'")
collection = chroma_client.create_collection(name=collection_name)

Creating collection: 'my_harry_potter_news'


### Step 1: Add data to collection

In [16]:
display(pdf_subset)

Unnamed: 0,document,summary
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...
...,...,...
995,"If confirmed, the ""exomoon"" is likely to be ab...",A team of astronomers has potentially discover...
996,"Jacob Murphy fired in a 25-yard shot, his firs...",Norwich City made a great start to life back i...
997,"Speaking to reporters at the G20 summit, the u...",David Cameron has made an impassioned defence ...
998,"The Swans are up to 12th in the table, 13 poin...",Swansea City head coach Francesco Guidolin bel...


In [18]:
collection.add(
    documents=pdf_subset["document"][:100].tolist(),
    metadatas=[{"document": document} for document in pdf_subset["document"][:100].tolist()],
    ids=[f"id{x}" for x in range(100)],
)

/Users/linghuang/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 79.3M/79.3M [00:05<00:00, 15.7MiB/s]


### Step 2: Query for 10 relevant documents on "harry potter"

In [31]:
import json

results = collection.query(query_texts=["harry potter"], n_results=100)

print(json.dumps(results, indent=1))

{
 "ids": [
  [
   "id92",
   "id24",
   "id26",
   "id80",
   "id11",
   "id44",
   "id70",
   "id54",
   "id16",
   "id3",
   "id90",
   "id65",
   "id98",
   "id58",
   "id77",
   "id50",
   "id85",
   "id15",
   "id14",
   "id39",
   "id91",
   "id25",
   "id22",
   "id67",
   "id71",
   "id48",
   "id32",
   "id82",
   "id72",
   "id55",
   "id20",
   "id78",
   "id75",
   "id60",
   "id96",
   "id68",
   "id29",
   "id47",
   "id45",
   "id63",
   "id84",
   "id40",
   "id38",
   "id5",
   "id51",
   "id76",
   "id61",
   "id9",
   "id37",
   "id23",
   "id0",
   "id53",
   "id1",
   "id17",
   "id41",
   "id59",
   "id56",
   "id10",
   "id35",
   "id62",
   "id94",
   "id21",
   "id12",
   "id83",
   "id97",
   "id99",
   "id46",
   "id89",
   "id52",
   "id64",
   "id42",
   "id4",
   "id43",
   "id8",
   "id88",
   "id28",
   "id18",
   "id6",
   "id7",
   "id66",
   "id34",
   "id30",
   "id74",
   "id49",
   "id69",
   "id81",
   "id33",
   "id57",
   "id95",
   "id2",
   "

In [33]:
collection.query(query_texts=["harry potter"], where={"id": "id269"}, n_results=1)

{'ids': [[]],
 'distances': [[]],
 'metadatas': [[]],
 'embeddings': None,
 'documents': [[]]}

In [34]:
collection.delete(ids=["id0"])

In [35]:
collection.get(
    ids=["id0"],
)

{'ids': [], 'embeddings': None, 'metadatas': [], 'documents': []}

In [37]:
collection.get(
    ids=["id1"],
)

{'ids': ['id1'],
 'embeddings': None,
 'metadatas': [{'document': 'A fire alarm went off at the Holiday Inn in Hope Street at about 04:20 BST on Saturday and guests were asked to leave the hotel.\nAs they gathered outside they saw the two buses, parked side-by-side in the car park, engulfed by flames.\nOne of the tour groups is from Germany, the other from China and Taiwan. It was their first night in Northern Ireland.\nThe driver of one of the buses said many of the passengers had left personal belongings on board and these had been destroyed.\nBoth groups have organised replacement coaches and will begin their tour of the north coast later than they had planned.\nPolice have appealed for information about the attack.\nInsp David Gibson said: "It appears as though the fire started under one of the buses before spreading to the second.\n"While the exact cause is still under investigation, it is thought that the fire was started deliberately."'}],
 'documents': ['A fire alarm went off a

In [3]:
import pinecone

pinecone_api_key = "-"
pinecone_env = "default"

In [21]:
# Taking a sample of 100 rows
df = xsum_sample

In [22]:
from sentence_transformers import SentenceTransformer

# We will use embeddings from this model to apply to our data
model = SentenceTransformer(
    "all-MiniLM-L6-v2"
)  

In [23]:
pinecone_index_name = "news"

### Create the index


We specify the index name (required), embedding vector dimension (required), and a custom similarity metric (cosine is the default) when creating our index.

In [19]:
dataset = {xsum_sample}

In [52]:
from pinecone import Pinecone

existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# initialize connection to pinecone (get API key at app.pc.io)
api_key = "-"
environment = "default"

# configure client
pc = Pinecone(api_key=api_key)

In [46]:
spec = PodSpec(environment=environment)

In [50]:
index_name = 'semantic-search-test'

In [49]:
pc.list_indexes()

{'indexes': []}

In [51]:
# check if index already exists (it shouldn't if this is first time)
if index_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=384,  # dimensionality of minilm
        metric='cosine',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()

NotFoundException: (404)
Reason: Not Found
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'X-Cloud-Trace-Context': '8457f3d771f16efb3682760951794140', 'Date': 'Thu, 01 Feb 2024 20:30:55 GMT', 'Server': 'Google Frontend', 'Content-Length': '82', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"NOT_FOUND","message":"Resource default not found"},"status":404}


In [40]:
pc.Index(index_name)

NotFoundException: (404)
Reason: Not Found
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'X-Cloud-Trace-Context': '091023c3642762bfb67ada260ae3a61b', 'Date': 'Thu, 01 Feb 2024 20:24:34 GMT', 'Server': 'Google Frontend', 'Content-Length': '95', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"NOT_FOUND","message":"Resource semantic-search-news not found"},"status":404}


In [41]:
index

NameError: name 'index' is not defined