In [28]:
# %%bash
# pip install chromadb

In [2]:
import env

In [3]:
import pandas as pd

# Read the first 5000 rows of the metadata CSV; the literal strings '[]', ""
# and " " are parsed as missing values.
df = pd.read_csv(
    'nyt-metadata.csv',
    nrows=5000,
    na_values=['[]', "", " "],
)
print(df.columns)

Index(['abstract', 'web_url', 'snippet', 'lead_paragraph', 'print_section',
       'print_page', 'source', 'multimedia', 'headline', 'keywords',
       'pub_date', 'document_type', 'news_desk', 'section_name', 'byline',
       'type_of_material', '_id', 'word_count', 'uri', 'subsection_name'],
      dtype='object')


In [4]:
# Print every field of the row labelled 6, one "column : value" line per column.
for col in df.columns:
    value = df.loc[6, col]
    print(col , ": \t" , value)

abstract : 	 Article on Florida State offensive tackle Todd Williams notes that he earned football scholarship even though he was living on streets when he was 15; photo (M)
web_url : 	 https://www.nytimes.com/2000/01/01/sports/college-football-from-homeless-to-a-home-at-florida-state.html
snippet : 	 Article on Florida State offensive tackle Todd Williams notes that he earned football scholarship even though he was living on streets when he was 15; photo (M)
lead_paragraph : 	 The former high school superstars thought they had it tough. One by one, the Florida State freshman football players vented their frustrations last year during their first meeting with the university's director of student development, Pam Overton:
print_section : 	 D
print_page : 	 3.0
source : 	 The New York Times
multimedia : 	 nan
headline : 	 {'main': 'COLLEGE FOOTBALL; From Homeless to a Home at Florida State', 'kicker': None, 'content_kicker': None, 'print_headline': 'COLLEGE FOOTBALL; From Homeless to a H

In [5]:
# Display the column index of the loaded frame.
df.columns

Index(['abstract', 'web_url', 'snippet', 'lead_paragraph', 'print_section',
       'print_page', 'source', 'multimedia', 'headline', 'keywords',
       'pub_date', 'document_type', 'news_desk', 'section_name', 'byline',
       'type_of_material', '_id', 'word_count', 'uri', 'subsection_name'],
      dtype='object')

In [6]:
# Columns needed downstream: the article URI plus the three text fields.
required = ['uri', 'headline', 'abstract', 'lead_paragraph']

# Keep only those columns and drop rows missing any of them. Chaining
# `.dropna()` (instead of `inplace=True` on a column-subset frame) yields a
# fresh frame, so later column assignments do not trigger
# SettingWithCopyWarning.
df = df[required].dropna()

In [7]:
# The article id is the last "/"-separated segment of the URI.
uri_parts = df['uri'].str.split("/")
df["article_id"] = uri_parts.apply(lambda parts: parts[-1])

In [8]:
import ast

# Each raw headline is the text of a Python dict literal; keep only its 'main'
# entry. `ast.literal_eval` parses literals without executing code, unlike
# `eval`, which would run whatever expression the CSV happens to contain.
df['headline'] = df['headline'].apply(lambda raw: ast.literal_eval(raw)['main'])

In [9]:
# Preview the first three rows after cleaning.
df.head(3)

Unnamed: 0,uri,headline,abstract,lead_paragraph,article_id
0,nyt://article/01111a48-3502-5021-8096-bc929379...,"Playoffs or No, Dallas Provides The Motivation",Article on upcoming New York Giants-Dallas Cow...,Waiting in the visiting locker room at Texas S...,01111a48-3502-5021-8096-bc9293797d54
1,nyt://article/02328edc-dad1-5eb0-900e-917162e4...,"On This First Day, a Fanfare for the New Era; ...",Jeanne C Pond letter expresses hope that spiri...,To the Editor:,02328edc-dad1-5eb0-900e-917162e46dcd
2,nyt://article/02a8f89b-153f-5b84-983c-e328de5b...,Internet's Cheering Squad Nervously Watches Clock,Many experts on Y2K computer problem report th...,As the world slid nervously yesterday through ...,02a8f89b-153f-5b84-983c-e328de5bf811


In [10]:
# One lower-cased document per article: headline, abstract and lead paragraph
# joined by newlines.
text_columns = ['headline', 'abstract', 'lead_paragraph']
lowered = [df[name].str.lower() for name in text_columns]
corpus = lowered[0] + "\n" + lowered[1] + "\n" + lowered[2]

# Model VertexAI

In [11]:
from typing import List, Optional

# Load the "Vertex AI Embeddings for Text" model
from vertexai.preview.language_models import TextEmbeddingModel

# Module-level handle to the pretrained "textembedding-gecko@001" model; it is
# read as a global by encode_texts_to_embeddings.
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")


# Define an embedding method that uses the model
def encode_texts_to_embeddings(sentences: List[str]) -> List[Optional[List[float]]]:
    """Embed one batch of texts with the module-level ``model``.

    Args:
        sentences: Texts sent together in a single ``model.get_embeddings`` call.

    Returns:
        The ``values`` of each embedding returned by the model, or a list with
        one ``None`` per input text if the call raised any exception.
    """
    import logging  # local import keeps this cell runnable on its own

    try:
        embeddings = model.get_embeddings(sentences)
        return [embedding.values for embedding in embeddings]
    except Exception as exc:
        # Best-effort by design: a failed batch must not abort the whole run,
        # but the failure is logged so it is not silently lost.
        logging.getLogger(__name__).warning(
            "Embedding request failed for a batch of %d text(s): %r",
            len(sentences),
            exc,
        )
        return [None] * len(sentences)

Define two more helper functions for converting text to embeddings:

* generate_batches: According to the documentation, each request can handle up to 5 text instances. Therefore, this method splits sentences into batches of 5 before sending to the embedding API.
* encode_text_to_embedding_batched: This method calls generate_batches to handle batching and then calls the embedding API via encode_texts_to_embeddings. It also handles rate-limiting using time.sleep. For production use cases, you would want a more sophisticated rate-limiting mechanism that takes retries into account.



In [12]:
import functools
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Generator, List, Tuple

import numpy as np
from tqdm.auto import tqdm

import math
# Generator function to yield batches of sentences
def generate_batches(
    sentences: List[str], batch_size: int
) -> Generator[List[str], None, None]:
    """Yield consecutive slices of ``sentences`` with at most ``batch_size`` items each."""
    total = len(sentences)
    batch_starts = range(0, total, batch_size)
    yield from (sentences[start : start + batch_size] for start in batch_starts)


def encode_text_to_embedding_batched(
    sentences: List[str], api_calls_per_second: int = 10, batch_size: int = 5
) -> Tuple[List[bool], np.ndarray]:
    """Embed ``sentences`` in batches, pacing submissions to the embedding API.

    Batches come from ``generate_batches`` and are submitted to a thread pool,
    each one running ``encode_texts_to_embeddings``. After every submission the
    loop sleeps ``1 / api_calls_per_second`` seconds as a simple rate limit.

    Args:
        sentences: Texts to embed.
        api_calls_per_second: Maximum number of batch submissions per second.
        batch_size: Number of texts per request.

    Returns:
        A tuple ``(is_successful, embeddings)``. ``is_successful`` has one flag
        per collected result, ``False`` where the result was ``None``.
        ``embeddings`` stacks only the successful results and is passed through
        ``np.squeeze``, so a single successful embedding comes back 1-D. When
        nothing succeeded (or ``sentences`` is empty) an empty ``(0, 0)`` array
        is returned instead of letting ``np.stack`` raise on an empty list.
    """
    embeddings_list: list = []

    # Prepare the batches lazily using the generator.
    batches = generate_batches(sentences, batch_size)

    seconds_per_job = 1 / api_calls_per_second

    with ThreadPoolExecutor() as executor:
        futures = []
        for batch in tqdm(
            batches, total=math.ceil(len(sentences) / batch_size), position=0
        ):
            futures.append(executor.submit(encode_texts_to_embeddings, batch))
            # Throttle submissions so the request rate stays under the limit.
            time.sleep(seconds_per_job)

        # Collect in submission order so results line up with the inputs.
        for future in futures:
            embeddings_list.extend(future.result())

    is_successful = [embedding is not None for embedding in embeddings_list]
    successful = [embedding for embedding in embeddings_list if embedding is not None]

    if not successful:
        # np.stack raises ValueError on an empty list; return an empty array.
        return is_successful, np.empty((0, 0))

    return is_successful, np.squeeze(np.stack(successful))

In [13]:
# Embed every document in the corpus. The Series is converted to a plain list
# because encode_text_to_embedding_batched declares `sentences: List[str]`.
is_successful, embeddings = encode_text_to_embedding_batched(
    sentences=corpus.tolist()
)

  0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
# Record ids and per-record metadata, both in the frame's row order.
ids = df.article_id.astype(str).tolist()
metadata = df.to_dict(orient='records')

# Show a short preview instead of dumping every record into the cell output.
metadata[:3]

[{'uri': 'nyt://article/01111a48-3502-5021-8096-bc9293797d54',
  'headline': 'Playoffs or No, Dallas Provides The Motivation',
  'abstract': 'Article on upcoming New York Giants-Dallas Cowboys game; photo (M)',
  'lead_paragraph': 'Waiting in the visiting locker room at Texas Stadium late tomorrow afternoon, the Giants will know whether the Green Bay Packers, who play earlier against Arizona, have won or are comfortably ahead.',
  'article_id': '01111a48-3502-5021-8096-bc9293797d54'},
 {'uri': 'nyt://article/02328edc-dad1-5eb0-900e-917162e46dcd',
  'headline': 'On This First Day, a Fanfare for the New Era; Knowing the World',
  'abstract': 'Jeanne C Pond letter expresses hope that spiritual development, artistic knowledge and skills and self-esteem flourish in new century; drawing',
  'lead_paragraph': 'To the Editor:',
  'article_id': '02328edc-dad1-5eb0-900e-917162e46dcd'},
 {'uri': 'nyt://article/02a8f89b-153f-5b84-983c-e328de5bf811',
  'headline': "Internet's Cheering Squad Nervous

# Initialize Vector Database

In [32]:
import chromadb

# Persistent client backed by the local 'chromadb' directory.
client = chromadb.PersistentClient('chromadb')

# get_or_create_collection keeps this cell re-runnable: create_collection
# raises once "vector_db" already exists in the persisted store.
collection = client.get_or_create_collection("vector_db")

In [22]:
# `embeddings` holds rows only for texts that were embedded successfully, so
# ids and metadata must be filtered the same way or the three lists misalign.
kept_ids = [doc_id for doc_id, ok in zip(ids, is_successful) if ok]
kept_metadata = [record for record, ok in zip(metadata, is_successful) if ok]

# np.squeeze in the encoder collapses a single embedding to 1-D; restore the
# (n_documents, dim) layout before converting to nested lists.
embedding_rows = np.atleast_2d(embeddings).tolist()
assert len(kept_ids) == len(embedding_rows), "ids and embeddings are misaligned"

collection.add(
    embeddings=embedding_rows,
    metadatas=kept_metadata,
    ids=kept_ids,
)

In [23]:
# Display the collection object's repr.
collection

Collection(name=vector-db)

In [None]:
# Ask the user for a free-text preference and embed it like the corpus.
user_perference = input("What kind of News do you like ? ")

# Use a named flag rather than `_`, which IPython reserves for the last output.
query_ok, vector = encode_text_to_embedding_batched(
    sentences=[user_perference]
)
if not all(query_ok):
    raise RuntimeError("Could not embed the query text; nothing to search with.")

# A single embedding comes back squeezed to 1-D; pass it as an explicit list
# holding one query embedding.
results = collection.query(
    query_embeddings=np.atleast_2d(vector).tolist(),
    n_results=5,
)

In [27]:
# Display the raw query response returned by collection.query.
results

{'ids': [['01111a48-3502-5021-8096-bc9293797d54',
   '07865d60-30eb-55ce-a609-d9f1a5d8ab5b',
   '09a24998-4414-5b07-adb4-7c307e364ce5',
   '0654cc64-c37f-594d-9290-1ce578cd9c7a',
   '02328edc-dad1-5eb0-900e-917162e46dcd']],
 'distances': [[0.7745091542354977,
   0.8418997007391053,
   0.9059741735082196,
   0.9169106296502243,
   0.9391524051940504]],
 'metadatas': [[{'abstract': 'Article on upcoming New York Giants-Dallas Cowboys game; photo (M)',
    'article_id': '01111a48-3502-5021-8096-bc9293797d54',
    'headline': 'Playoffs or No, Dallas Provides The Motivation',
    'lead_paragraph': 'Waiting in the visiting locker room at Texas Stadium late tomorrow afternoon, the Giants will know whether the Green Bay Packers, who play earlier against Arizona, have won or are comfortably ahead.',
    'uri': 'nyt://article/01111a48-3502-5021-8096-bc9293797d54'},
   {'abstract': 'Article on Florida State offensive tackle Todd Williams notes that he earned football scholarship even though he was