## 1. Preprocessing tests

In [1]:
from nlp.data_preprocessing import DataPreprocessor

# Smoke test for the project's DataPreprocessor, constructed with no arguments.
preprocessor = DataPreprocessor()

# Step 1: read the PDF files and report how many documents came back.
documents = preprocessor.read_pdf_files()
print(f"Number of documents processed: {len(documents)}")

# Step 2: chunk the first document, or report that nothing was loaded.
if not documents:
    print("No PDF documents found in the data directory")
else:
    first_doc = documents[0]
    print(f"\nProcessing document: {first_doc['id']}")

    chunks = preprocessor.split_text(first_doc['text'])
    print(f"Number of chunks created: {len(chunks)}")

    # Preview the first chunk only when splitting produced at least one.
    if chunks:
        print("\nSample chunk (first 200 characters):")
        print(chunks[0][:200] + "...")


  from .autonotebook import tqdm as notebook_tqdm


Initializing DataPreprocessor...
The function _initialize_collection is running
The function read_pdf_files is running
Number of documents processed: 2

Processing document: C2024C00572.pdf
The function split_text is running
Number of chunks created: 297

Sample chunk (first 200 characters):
National Greenhouse and Energy Reporting Act 2007 No. 175, 2007 Compilation No. 26 Compilation date: 14 October 2024 Includes amendments: Act No. 39, 2024 Prepared by the Office of Parliamentary Couns...


## 2. ChatBot tests

### testing functions from the video

In [1]:
import os
from dotenv import load_dotenv

### create the object to call LLMs from groq

In [2]:
from groq import Groq

# Call load_dotenv() first, then build the Groq client from the
# GROQ_API_KEY environment variable.
load_dotenv()
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Single-turn sanity check: send one user message and print the text of the
# first returned choice.
chat_completion = client.chat.completions.create(
    model="llama-3.3-70b-versatile",
    messages=[
        {"role": "user", "content": "what planet in the solar system has more moons?"},
    ],
)
print(chat_completion.choices[0].message.content)

The planet with the most moons in our solar system is Jupiter. As of now, Jupiter has a total of 92 confirmed moons, with many more smaller, irregular moons waiting to be confirmed. 

Some of the largest and most notable moons of Jupiter include:
- Io
- Europa
- Ganymede
- Callisto

These four moons are known as the Galilean moons, as they were discovered by Galileo Galilei in 1610. They are some of the largest moons in the solar system and offer valuable insights into Jupiter's formation and evolution.

Saturn is the second planet with the most moons, having a total of 83 confirmed moons. The other planets in our solar system have fewer moons, with Uranus having 27 and Neptune having 14.


### reading pdf files and transforming to text

In [3]:
import pdfplumber
# Read every PDF under ./data (relative to the current working directory) and
# keep one {"id": <file name>, "text": <extracted text>} record per file.
current_wd = os.getcwd()
data_path = os.path.join(current_wd, 'data')
raw_documents = os.listdir(data_path)

# os.listdir returns every directory entry in arbitrary order. Keep only
# *.pdf names (a stray non-PDF file would otherwise be handed to
# pdfplumber.open) and sort them so the resulting list is deterministic.
pdf_file_names = sorted(
    name for name in raw_documents if name.lower().endswith('.pdf')
)

documents = []

for file_name in pdf_file_names:
    with pdfplumber.open(os.path.join(data_path, file_name)) as pdf:
        # `or ''` guards against pages for which extract_text() yields None
        # (e.g. a page with no text layer).
        page_texts = [
            (page.extract_text() or '').replace('\n', ' ') for page in pdf.pages
        ]
    # Join pages with a space so the last word of one page does not fuse with
    # the first word of the next; empty pages are skipped. A single join also
    # avoids repeated string concatenation inside the loop.
    text = ' '.join(page_text for page_text in page_texts if page_text)
    documents.append({"id": file_name, "text": text})
len(documents)

2

### chunk the text from the documents

In [4]:
def split_text(text, chunk_size=1000, chunk_overlap=20):
    """Split ``text`` into fixed-size character chunks that overlap.

    Consecutive chunks share ``chunk_overlap`` characters. Splitting stops as
    soon as a chunk reaches the end of ``text``, so no trailing chunk is
    emitted that would be fully contained in the previous one.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        chunk_overlap: Number of characters shared by consecutive chunks;
            must satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        List of chunk strings in order of appearance; ``[]`` for empty text.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # With overlap >= chunk_size the window would never advance (infinite loop).
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must be >= 0 and smaller than chunk_size")

    step = chunk_size - chunk_overlap
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            # This chunk already covers the tail of the text; stepping back by
            # the overlap would only re-emit characters already stored.
            break
        start += step
    return chunks

# Flatten every document into id-tagged chunk records; ids have the form
# "<document id>_chunk<N>" with N counted from 1 within each document.
chunked_documents = []
for doc in documents:
    chunks = split_text(doc["text"])
    print("==== Splitting docs into chunks ====")
    chunked_documents.extend(
        {"id": f"{doc['id']}_chunk{position}", "text": piece}
        for position, piece in enumerate(chunks, start=1)
    )


==== Splitting docs into chunks ====
==== Splitting docs into chunks ====


### generate embeddings

In [None]:
# from transformers import AutoTokenizer, AutoModel
# import torch

# # Load Pretrained Model and Tokenizer
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# model = AutoModel.from_pretrained("bert-base-uncased")

# # Example Text
# text = "Hello, how are you?"

# # Tokenize the input text
# inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

In [26]:
# # Run the input through the model
# with torch.inference_mode():  # Disable gradient calculation for efficiency
#     outputs = model(**inputs)

# # Extract the hidden states (last layer embeddings)
# last_hidden_states = outputs.last_hidden_state  # Shape: (batch_size, sequence_length, hidden_size)

# cls_embedding = last_hidden_states[:, 0, :]  # Shape: (batch_size, hidden_size)
# print(cls_embedding.shape)  # Output: torch.Size([1, 768])


torch.Size([1, 768])


In [5]:
from sentence_transformers import SentenceTransformer
from chromadb import PersistentClient
from chromadb.utils.embedding_functions import EmbeddingFunction

class MyEmbeddingFunction(EmbeddingFunction):
    """Embedding function for Chroma backed by a SentenceTransformer model."""

    def __init__(self):
        # The model is loaded once per instance and reused for every call.
        self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

    def __call__(self, input_texts):
        """Encode one string or a collection of strings.

        A bare string is wrapped in a one-element list before encoding. The
        encoded NumPy array is returned converted to plain Python lists.
        """
        batch = [input_texts] if isinstance(input_texts, str) else input_texts
        vectors = self.model.encode(batch, convert_to_numpy=True)
        return vectors.tolist()


# One shared embedding function instance, used for both indexing and querying.
custom_embeddings = MyEmbeddingFunction()

# Client persisting to the local "chroma_persistent_storage" path; the
# collection is fetched if it already exists and created otherwise.
chroma_client = PersistentClient(path="chroma_persistent_storage")
test_collection = chroma_client.get_or_create_collection(
    embedding_function=custom_embeddings,
    name="ghg_collection",
)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:

# Embed and store the chunks in batches instead of one encode call plus one
# database write per chunk. `upsert` is used rather than `add` so that
# re-running this cell refreshes records whose ids already exist in the
# persistent collection instead of depending on how duplicates are handled.
BATCH_SIZE = 256  # chunks embedded and written per round trip

for batch_start in range(0, len(chunked_documents), BATCH_SIZE):
    batch = chunked_documents[batch_start:batch_start + BATCH_SIZE]
    batch_ids = [chunk["id"] for chunk in batch]
    batch_texts = [chunk["text"] for chunk in batch]
    test_collection.upsert(
        ids=batch_ids,
        embeddings=custom_embeddings(batch_texts),
        # Each record's metadata carries its own chunk id as the "source".
        metadatas=[{"source": chunk_id} for chunk_id in batch_ids],
        documents=batch_texts,
    )

print(f"Total chunks stored in ChromaDB: {test_collection.count()}")

Total chunks stored in ChromaDB: 2126



### Query Documents

In [6]:
def query_documents(question, n_results=2):
    """Return the stored chunk texts retrieved for ``question``.

    The question is embedded with ``custom_embeddings``, ``test_collection``
    is queried for ``n_results`` matches, and the per-query document lists in
    the result are flattened into a single list.
    """
    hits = test_collection.query(
        query_embeddings=custom_embeddings([question]),
        n_results=n_results,
    )

    # results["documents"] holds one list per query embedding; merge them.
    relevant_chunks = []
    for per_query_docs in hits["documents"]:
        relevant_chunks.extend(per_query_docs)

    print("==== Returning relevant chunks ====")
    return relevant_chunks

### Generating AI Response

In [7]:
import groq

def generate_response(question, relevant_chunks):
    """Answer ``question`` with the Groq chat API, grounded in retrieved chunks.

    Args:
        question: The user's question text.
        relevant_chunks: Iterable of chunk strings; joined with blank lines
            to form the context section of the prompt.

    Returns:
        The message content of the first choice in the chat completion.
    """
    context = "\n\n".join(relevant_chunks)

    # Fixed instruction preamble; the context and question are appended below.
    instructions = (
        "You are a digital consultant specializing in Australia's evolving greenhouse gas (GHG) emission regulations. "
        "Your task is to help companies navigate the complexities of compliance, accurate emission calculations, and industry-specific scope definitions. "
        "Use the following context to provide tailored, concise, and accurate guidance. Ensure the response is practical, actionable, and aligned with the most recent regulatory updates. "
        "If the answer is not available or unclear, state that you do not know. "
        "Use five sentences maximum and keep the answer concise."
    )
    prompt = instructions + "\n\nContext:\n" + context + "\n\nQuestion:\n" + question

    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": prompt},
    ]

    # A new client is created on every call, keyed by GROQ_API_KEY.
    client = groq.Client(api_key=os.getenv("GROQ_API_KEY"))
    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=messages,
    )
    return response.choices[0].message.content

In [8]:
# End-to-end check of the pipeline: retrieve the chunks for one sample
# question, then pass the question and those chunks to the LLM.
question = "How should my company calculate emissions for Scope 3?"
# query_documents is called with its default n_results (2).
relevant_chunks = query_documents(question)
response = generate_response(question, relevant_chunks)
print(response)

==== Returning relevant chunks ====
To calculate Scope 3 emissions, your company should follow the Australian Government's National Greenhouse and Energy Reporting (NGER) scheme and the GHG Protocol Corporate Standard. Scope 3 emissions include indirect emissions from sources not owned or controlled by your company, such as supply chain, transportation, and employee commuting. You can use the GHG Protocol's Scope 3 calculation guidance and tools to estimate these emissions. The Australian Government also provides resources and tools to support Scope 3 emissions calculation, including the NGER Scope 3 Emissions Estimation Tool. It is recommended to consult with a specialist or the relevant Australian authorities to ensure compliance with the latest regulatory requirements.
