### testing functions from the video

In [1]:
import os
from dotenv import load_dotenv

### create the object to call LLMs from groq

In [14]:
from groq import Groq

load_dotenv()

client = Groq(
    api_key = os.getenv("GROQ_API_KEY"),
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "what planet in the solar system has more moons?",
        }
    ],
    model="llama-3.3-70b-versatile",
)

print(chat_completion.choices[0].message.content)

The planet with the most moons in our solar system is Jupiter. As of 2023, Jupiter has a total of 92 confirmed moons. The four largest moons of Jupiter, known as the Galilean moons, are Io, Europa, Ganymede, and Callisto. These four moons were discovered by Galileo Galilei in 1610 and are some of the largest moons in the solar system.

Here's a list of the top 5 planets with the most moons in our solar system:

1. Jupiter - 92 confirmed moons
2. Saturn - 83 confirmed moons
3. Uranus - 27 confirmed moons
4. Neptune - 14 confirmed moons
5. Mars - 2 confirmed moons (Phobos and Deimos)

It's worth noting that the number of moons can change as new discoveries are made, and some sources may group smaller objects like moonlets or ring particles differently. However, as of now, Jupiter has the most moons in our solar system.


### reading pdf files and trasnforming to pdf

In [6]:
import pdfplumber
current_wd = os.getcwd()
data_path = os.path.join(current_wd, 'data')
raw_documents = os.listdir(data_path)

documents = []

for file_name in raw_documents:
    with pdfplumber.open(os.path.join(data_path, file_name)) as pdf:
        text = ''
        for p in pdf.pages:
            text += p.extract_text().replace('\n', ' ')
        documents.append({"id": file_name, "text": text})
len(documents)

2

### chunk the text from the documents

In [7]:
def split_text(text, chunk_size=1000, chunk_overlap=20):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - chunk_overlap
    return chunks

chunked_documents = []
for doc in documents:
    chunks = split_text(doc["text"])
    print("==== Splitting docs into chunks ====")
    for i, chunk in enumerate(chunks):
        chunked_documents.append({"id": f"{doc['id']}_chunk{i+1}", "text": chunk})


==== Splitting docs into chunks ====
==== Splitting docs into chunks ====


### generate embeddings

In [None]:
# from transformers import AutoTokenizer, AutoModel
# import torch

# # Load Pretrained Model and Tokenizer
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# model = AutoModel.from_pretrained("bert-base-uncased")

# # Example Text
# text = "Hello, how are you?"

# # Tokenize the input text
# inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

In [26]:
# # Run the input through the model
# with torch.inference_mode():  # Disable gradient calculation for efficiency
#     outputs = model(**inputs)

# # Extract the hidden states (last layer embeddings)
# last_hidden_states = outputs.last_hidden_state  # Shape: (batch_size, sequence_length, hidden_size)

# cls_embedding = last_hidden_states[:, 0, :]  # Shape: (batch_size, hidden_size)
# print(cls_embedding.shape)  # Output: torch.Size([1, 768])


torch.Size([1, 768])


In [9]:
from sentence_transformers import SentenceTransformer
from chromadb import PersistentClient
from chromadb.utils.embedding_functions import EmbeddingFunction

class MyEmbeddingFunction(EmbeddingFunction):
    def __init__(self):
        self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

    def __call__(self, input_texts):
        if isinstance(input_texts, str):
            input_texts = [input_texts]
        embeddings = self.model.encode(input_texts, convert_to_numpy=True)
        return embeddings.tolist()


custom_embeddings = MyEmbeddingFunction()


chroma_client = PersistentClient(path="chroma_persistent_storage")
test_collection = chroma_client.get_or_create_collection(
    name="ghg_collection",
    embedding_function=custom_embeddings
)

In [10]:

for chunk in chunked_documents:
    test_collection.add(
        ids=[chunk["id"]],
        embeddings=custom_embeddings([chunk["text"]]),
        metadatas=[{"source": chunk["id"]}],
        documents=[chunk["text"]]
    )

print(f"Total chunks stored in ChromaDB: {test_collection.count()}")

Total chunks stored in ChromaDB: 2126



### Query Documents

In [11]:
def query_documents(question, n_results=2):

    query_embedding = custom_embeddings([question])

    results = test_collection.query(query_embeddings=query_embedding, n_results=n_results)

    relevant_chunks = [doc for sublist in results["documents"] for doc in sublist]
    print("==== Returning relevant chunks ====")
    return relevant_chunks

### Generating AI Response

In [17]:
import groq

def generate_response(question, relevant_chunks):
    context = "\n\n".join(relevant_chunks)
    prompt = (
        "You are a digital consultant specializing in Australia's evolving greenhouse gas (GHG) emission regulations. "
        "Your task is to help companies navigate the complexities of compliance, accurate emission calculations, and industry-specific scope definitions. "
        "Use the following context to provide tailored, concise, and accurate guidance. Ensure the response is practical, actionable, and aligned with the most recent regulatory updates. "
        "If the answer is not available or unclear, state that you do not know. "
        "Use five sentences maximum and keep the answer concise."
        "\n\nContext:\n" + context + "\n\nQuestion:\n" + question
    )

    client = groq.Client(api_key=os.getenv("GROQ_API_KEY"))

    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant."},
            {"role": "user", "content": prompt},
        ],
    )

    answer = response.choices[0].message.content
    return answer

In [18]:
question = "How should my company calculate emissions for Scope 3?"
relevant_chunks = query_documents(question)
response = generate_response(question, relevant_chunks)
print(response)

==== Returning relevant chunks ====
To calculate Scope 3 emissions, your company should refer to the relevant industry segment description in Section 2 of the API Compendium and identify the operations and sources that need to be assessed. Section 3 provides guidance on equipment classification and inventory accuracy. For actual calculation methodologies, your company may need to look beyond the provided context, as it mainly focuses on direct emissions (Scope 1) from combustion devices, waste gas disposal, process and operational venting, and fugitive emission sources. I am not aware of specific guidance for Scope 3 emissions in the given context. You may need to consult additional resources or regulatory updates for accurate Scope 3 emission calculations.
