### testing functions from the video

In [1]:
import os
from dotenv import load_dotenv
from openai import OpenAI
from chromadb.utils import embedding_functions

In [None]:
# Populate the process environment from a local .env file, if one exists.
load_dotenv()

# The OpenAI key is read from the generic "API_KEY" variable (None when unset).
openai_key = os.environ.get("API_KEY")

# Embedding function backed by OpenAI's "text-embedding-3-small" model.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_key,
    model_name="text-embedding-3-small",
)


### initialize chroma client with persistence

In [4]:
from chromadb import PersistentClient

# Client whose data lives in the "chroma_persistent_storage" path.
chroma_client = PersistentClient(path="chroma_persistent_storage")

# Fetch the collection if it already exists, otherwise create it; either way
# it is bound to the OpenAI embedding function.
collection_name = "document_qa_collection"
collection = chroma_client.get_or_create_collection(
    name=collection_name,
    embedding_function=openai_ef,
)



### create the object to call LLMs from groq

In [10]:
from groq import Groq

# Reload .env so this cell can be run without the earlier setup cell.
load_dotenv()

# Groq client authenticated with the GROQ_API_KEY environment variable.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Single-turn smoke test: one user message sent to the chosen model.
chat_completion = client.chat.completions.create(
    model="llama-3.3-70b-versatile",
    messages=[
        {"role": "user", "content": "what planet in the solar system has more moons?"},
    ],
)

# Display only the text of the first returned choice.
print(chat_completion.choices[0].message.content)

The planet in our solar system with the most moons is Jupiter. As of my knowledge cutoff in 2023, Jupiter has a total of 92 confirmed moons. However, it's essential to note that this number might change over time as new discoveries are made or classifications are updated.

Some of the most notable moons of Jupiter include:
1. Ganymede: The largest moon in the solar system, even bigger than the planet Mercury.
2. Io: The most volcanically active body in the solar system.
3. Europa: A moon with a potential subsurface ocean, making it a fascinating target for astrobiological research.
4. Callisto: The outermost of the four largest moons of Jupiter, known for its cratered surface.

Saturn is the second planet with the most moons, with a total of 83 confirmed moons as of my knowledge cutoff in 2023. The other planets in our solar system have significantly fewer moons.


### reading pdf files and transforming to text

In [20]:
import pdfplumber

# PDFs are read from the "data" folder under the current working directory.
current_wd = os.getcwd()
data_path = os.path.join(current_wd, 'data')

# Keep only PDF files (stray entries such as .DS_Store or checkpoints would
# make pdfplumber.open fail) and sort them so the document order is stable
# across runs and platforms.
raw_documents = sorted(
    name for name in os.listdir(data_path) if name.lower().endswith('.pdf')
)

# One {"id": <file name>, "text": <flattened text>} record per PDF.
documents = []

for file_name in raw_documents:
    with pdfplumber.open(os.path.join(data_path, file_name)) as pdf:
        # extract_text() can yield None/empty for pages without a text layer
        # (e.g. scanned images); treat those pages as empty strings.
        page_texts = [(p.extract_text() or '').replace('\n', ' ') for p in pdf.pages]
    # Join pages with a space so the last word of one page is not fused with
    # the first word of the next.
    text = ' '.join(page_texts)
    documents.append({"id": file_name, "text": text})
len(documents)

2

### chunk the text from the documents

In [None]:
def split_text(text, chunk_size=1000, chunk_overlap=20):
    """Split ``text`` into consecutive, overlapping character chunks.

    Each chunk holds at most ``chunk_size`` characters and starts
    ``chunk_size - chunk_overlap`` characters after the previous one, so
    neighbouring chunks share ``chunk_overlap`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be > 0.
        chunk_overlap: Number of characters shared by adjacent chunks; must
            satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        A list of chunks in order of appearance; ``[]`` for empty text.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``chunk_overlap`` is
            negative or not smaller than ``chunk_size`` (the window would
            never advance, producing an endless loop).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    step = chunk_size - chunk_overlap
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Stop once this chunk reaches the end of the text; otherwise a final
        # chunk containing only the already-covered overlap would be emitted.
        if end >= len(text):
            break
        start += step
    return chunks

# Flatten every document into chunk records whose ids are
# "<document id>_chunk<N>", with N counted from 1 within each document.
chunked_documents = []
for doc in documents:
    chunks = split_text(doc["text"])
    print("==== Splitting docs into chunks ====")
    chunked_documents.extend(
        {"id": f"{doc['id']}_chunk{position}", "text": piece}
        for position, piece in enumerate(chunks, start=1)
    )


==== Splitting docs into chunks ====
==== Splitting docs into chunks ====


2126

### generate embeddings

In [23]:
from transformers import AutoTokenizer, AutoModel
import torch

# Tokenizer and encoder weights for the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Sample sentence used to try out the embedding pipeline.
text = "Hello, how are you?"

# Encode the sentence as PyTorch tensors, with padding and truncation enabled.
inputs = tokenizer(
    text,
    return_tensors="pt",
    padding=True,
    truncation=True,
)


  from .autonotebook import tqdm as notebook_tqdm


In [26]:
# Forward pass only: inference mode switches off gradient tracking.
with torch.inference_mode():
    outputs = model(**inputs)

# Last-layer hidden states, shape (batch_size, sequence_length, hidden_size).
last_hidden_states = outputs.last_hidden_state

# Take the vector at sequence position 0 of each item -> (batch_size, hidden_size).
cls_embedding = last_hidden_states[:, 0]
print(cls_embedding.shape)  # Output: torch.Size([1, 768])


torch.Size([1, 768])
