In [None]:
from langchain_community.document_loaders import (
    BSHTMLLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import time
from pinecone import Pinecone
from pinecone import PodSpec


In [None]:

# Directory holding the downloaded RBI notification pages.
folder_path = "./toc_notifications_2023_1991/rbi_notification_2023_1991"

# os.listdir() returns entries in arbitrary order; sort them so the document
# order (and anything derived from position, such as chunk numbering) is the
# same on every run.
files = sorted(os.listdir(folder_path))

# One loader per .html file; other files in the folder are skipped.
loaders = [
    BSHTMLLoader(os.path.join(folder_path, file))
    for file in files
    if file.endswith(".html")
]

# Flatten the per-loader results into one flat list of documents.
content = [doc for loader in loaders for doc in loader.load()]

In [None]:
# Splitter configured with chunk_size=1000 and chunk_overlap=100, then applied
# to every loaded document.
text_split = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
chunks = text_split.split_documents(content)

In [None]:
# Keep only the text of each chunk.
chunks_text = list(map(lambda chunk: chunk.page_content, chunks))

type(chunks_text)  # list

In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# API keys are read from the environment variables named "OpenAI" and
# "pinecone". Note that the name `OpenAI` is rebound later in this notebook by
# `from openai import OpenAI`.
OpenAI = os.getenv("OpenAI")
pinecone = os.getenv("pinecone")

# Show only whether each key was found. Cell output is stored in the notebook
# file, so the key values themselves must never be displayed.
{"OpenAI": bool(OpenAI), "pinecone": bool(pinecone)}

In [None]:
from openai import OpenAI

# Build the client from the "OpenAI" environment variable (populated from .env
# by load_dotenv) instead of a key literal embedded in the notebook.
client = OpenAI(api_key=os.getenv("OpenAI"))

In [None]:
MODEL = "text-embedding-3-small"

# Try the embeddings endpoint on a two-string batch.
sample_inputs = [
    "Sample document text goes here",
    "there will be several phrases in each batch",
]
response = client.embeddings.create(model=MODEL, input=sample_inputs)


In [None]:
MODEL = "text-embedding-3-small"

# Sanity-check the input without flooding the output: show the container type,
# the number of chunks, and at most one sample chunk (empty-safe slice).
print(type(chunks_text))
print(len(chunks_text), "chunks; first:", chunks_text[:1])

# The embeddings endpoint is given a list of strings; coerce anything else.
if not all(isinstance(item, str) for item in chunks_text):
    chunks_text = [str(item) for item in chunks_text]

# Embed every chunk in a single request.
# NOTE(review): the API limits how many inputs one request may carry - confirm
# the corpus stays under that limit, or switch to batching.
res = client.embeddings.create(
    input=chunks_text,
    model=MODEL,
)

# Display the number of returned records rather than the full response, which
# contains every embedding vector.
len(res.data)

In [None]:
# One embedding vector per record in the response.
embeds = list(rec.embedding for rec in res.data)
len(embeds)  # 1470 in the recorded run

In [None]:
# Back-of-the-envelope embedding cost for a 40-million-token corpus.
total_tokens = 40_000_000

# Pricing assumptions used by the estimate: tokens in one "page", and how many
# such pages one dollar pays for.
tokens_per_page = 800
pages_per_dollar = 62_500

# tokens -> pages -> dollars
total_pages = total_tokens / tokens_per_page
total_cost = total_pages / pages_per_dollar

print(total_cost)

In [None]:
# Pinecone client, authenticated with the key read from the environment.
pc = Pinecone(api_key=pinecone)

index_name = "rbi-notification"
spec = PodSpec(environment="gcp-starter")

# Create the index only when it is not already listed. Its dimension is taken
# from the length of the first embedding computed earlier.
existing_indexes = pc.list_indexes().names()
if index_name not in existing_indexes:
    pc.create_index(
        index_name,
        dimension=len(embeds[0]),
        metric="dotproduct",
        spec=spec,
    )
    # Poll once a second until the new index reports itself ready.
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

# Open a handle to the index, pause briefly, then show its statistics.
index = pc.Index(index_name)
time.sleep(1)
index.describe_index_stats()

In [None]:
from datasets import load_dataset

# First 1000 rows of the "trec" dataset's train split.
# NOTE(review): this dataset is not derived from the RBI notification files
# loaded above - confirm it is still wanted in this notebook.
trec_split = "train[:1000]"
trec = load_dataset("trec", split=trec_split)

In [None]:
from tqdm.auto import tqdm

# Number of chunks embedded and upserted per request.
batch_size = 100

# Index the RBI notification chunks (chunks_text). The index is named
# "rbi-notification" and is queried about RBI notifications, so it must hold
# that corpus rather than an unrelated sample dataset.
for i in tqdm(range(0, len(chunks_text), batch_size)):
    lines_batch = chunks_text[i : i + batch_size]
    # Vector ids are the chunk positions; derive the range from the actual
    # batch length so the final, shorter batch stays aligned.
    ids_batch = [str(n) for n in range(i, i + len(lines_batch))]
    # Embed this batch. Batch-local names are used so the notebook-level
    # `res` and `embeds` from earlier cells are not overwritten.
    batch_res = client.embeddings.create(input=lines_batch, model=MODEL)
    batch_embeds = [record.embedding for record in batch_res.data]
    # Store the chunk text as metadata so query results can print it.
    meta = [{"text": line} for line in lines_batch]
    # Upsert (id, vector, metadata) tuples to Pinecone.
    index.upsert(vectors=list(zip(ids_batch, batch_embeds, meta)))

In [None]:
query = "what was the notification in 2004 by rbi?"

# Embed the question with the same model and keep the first (only) vector.
query_response = client.embeddings.create(input=query, model=MODEL)
xq = query_response.data[0].embedding
xq


In [None]:
# Fetch the 5 nearest vectors together with their stored metadata.
# `vector` is passed as the flat list of floats for the single query embedding;
# wrapping it in another list would send a nested list instead.
res = index.query(vector=xq, top_k=5, include_metadata=True)

In [None]:
# One line per hit: similarity score to two decimals, then the stored text.
matches = res['matches']
for match in matches:
    print(f"{match['score']:.2f}: {match['metadata']['text']}")