#### Import

In [1]:
import sys
import os
from pathlib import Path

# Add the Application root to Python path
ROOT_PATH = os.path.abspath(os.path.join( os.getcwd(), '..'))
sys.path.insert(0, ROOT_PATH)
print(f"Added {ROOT_PATH} to the Python path.")
from src.rag.retrieve_data import RAGRetriever

Added c:\Users\erdrr\OneDrive\Desktop\KB\Projects\GameWeaverAI to the Python path.


In [2]:
import os
import PyPDF2
import torch
import json
from src.models.hf_models_manager import HFModelsManager
from src.rag.retrieve_data import RAGRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from chromadb import PersistentClient
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load environment variables
load_dotenv()

True

### Setup Paths

In [4]:
#Load Env Variables
DOCS_PATH = os.path.join(ROOT_PATH,os.getenv("DOCS_PATH"))  
MODELS_PATH = os.path.join(ROOT_PATH,os.getenv("MODELS_BASE_DIR"))
VECTORSTORE_PATH = os.path.join(ROOT_PATH,os.getenv("VECTORSTORE_PATH"))
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
COLLECTION_NAME = os.getenv("CHROMADB_COLLECTION")
COLLECTION_MAPPING_PATH = os.path.join(VECTORSTORE_PATH, f"{COLLECTION_NAME}.json")
CHUNK_SIZE = os.getenv("CHUNK_SIZE")
CHUNK_OVERLAP = os.getenv("CHUNK_OVERLAP")

print(f"MODELS_PATH: {MODELS_PATH} | VECTORSTORE_PATH: {VECTORSTORE_PATH} | DOCS_PATH: {DOCS_PATH}")
print(f"EMBEDDING_MODEL: {EMBEDDING_MODEL}")
print(f"COLLECTION_NAME: {COLLECTION_NAME} | COLLECTION_MAPPING_PATH: {COLLECTION_MAPPING_PATH}")
print(f"CHUNK_SIZE: {CHUNK_SIZE} | CHUNK_OVERLAP: {CHUNK_OVERLAP}")

MODELS_PATH: c:\Users\erdrr\OneDrive\Desktop\KB\Projects\GameWeaverAI\models | VECTORSTORE_PATH: c:\Users\erdrr\OneDrive\Desktop\KB\Projects\GameWeaverAI\vectorstore | DOCS_PATH: c:\Users\erdrr\OneDrive\Desktop\KB\Projects\GameWeaverAI\data/docs
EMBEDDING_MODEL: None
COLLECTION_NAME: None | COLLECTION_MAPPING_PATH: c:\Users\erdrr\OneDrive\Desktop\KB\Projects\GameWeaverAI\vectorstore\None.json
CHUNK_SIZE: None | CHUNK_OVERLAP: None


In [None]:
# Initialize HFModelsManager
hf_manager = HFModelsManager(EMBEDDING_MODEL, model_path=MODELS_PATH)
model, tokenizer = hf_manager.initialize_model() 

In [6]:
def setup_chromadb():
    """Set up ChromaDB client with persistent storage."""
    client = PersistentClient(path=VECTORSTORE_PATH)
    try:
        collection = client.get_collection(COLLECTION_NAME)
    except Exception:
        collection = client.create_collection(COLLECTION_NAME)
    return client, collection

In [7]:
client , collection = setup_chromadb()

In [8]:
def get_collection_mapping():
    """Reads the Collection Master Mapping file."""
    if os.path.exists(COLLECTION_MAPPING_PATH):
        with open(COLLECTION_MAPPING_PATH, 'r') as f:
            return json.load(f)
    return []

In [None]:
collection_mapping = get_collection_mapping()
collection_mapping

In [11]:
def write_collection_mapping(game_rules):
    """Writes the updated game rules to the game_rules.json file."""
    with open(COLLECTION_MAPPING_PATH, 'w') as f:
        json.dump(game_rules, f, indent=4)

In [12]:
def get_or_create_game_id(game_name):
    """Check if a game already exists in the collection mapping and return its ID, or create a new one."""
    collection_mapping = get_collection_mapping()

    # Check if the game already exists in the JSON
    for game in collection_mapping:
        if game["Game Name"] == game_name:
            return game["ID"], collection_mapping

    # If game doesn't exist, create a new ID
    new_id = len(collection_mapping) + 1
    return new_id, collection_mapping

In [13]:
def read_pdf_sections(file_path):
    """Reads a PDF file and extracts text into sections based on predefined headings."""
    sections = {}
    with open(file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    
    # Split sections based on known section titles
    section_titles = ["Overview", "Game Setup", "How to Play", "Winning the Game", "Game Strategy", "End of Game"]
    current_section = None
    for line in text.splitlines():
        if any(title in line for title in section_titles):
            current_section = line.strip()
            sections[current_section] = []
        elif current_section:
            sections[current_section].append(line.strip())

    # Join lines for each section
    for section in sections:
        sections[section] = " ".join(sections[section])

    return sections

In [None]:
data = read_pdf_sections(f"{DOCS_PATH}/tic_tac_toe.pdf")
data

In [15]:
def preprocess_and_split_sections(sections, chunk_size=50, chunk_overlap=5):
    """Splits sections into chunks using the RecursiveCharacterTextSplitter with specified chunk size and overlap."""
    chunks = {}
    
    # Define the text splitter with specified chunk size and overlap
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len  # Split purely based on character count
    )
    
    for section_name, section_text in sections.items():
        # Ensure that each section is split into chunks based on chunk_size and chunk_overlap
        section_chunks = text_splitter.split_text(section_text)
        
        # Debugging output to verify if splitting works correctly
        print("*" * 100)
        print(f"Section Name: {section_name}")
        print(f"Original Section Text Length: {len(section_text)}")
        print(f"Section Text: {section_text}")
        print(f"Section Chunks ({len(section_chunks)} chunks): {section_chunks}")
        print("*" * 100)
        
        # Store the chunks by section name
        chunks[section_name] = section_chunks
    
    return chunks

In [None]:
sections = {
    "Overview": "Tic-Tac-Toe is a simple, two-player game where the objective is to be the first player to align three of your marks (either X or O) in a horizontal, vertical, or diagonal row on a 3x3 grid."
}

chunks = preprocess_and_split_sections(sections, chunk_size=10, chunk_overlap=2)

# Print the chunks for the "Overview" section
for section_name, section_chunks in chunks.items():
    print(f"Section: {section_name}")
    for i, chunk in enumerate(section_chunks):
        print(f"Chunk {i + 1}: {chunk}")

In [18]:
def create_embeddings(chunks):
    """Creates embeddings for the text chunks using the HF model."""
    embeddings = {}
    for section_name, chunk_list in chunks.items():
        embeddings[section_name] = []
        for chunk in chunk_list:
            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
                embedding = torch.mean(outputs.last_hidden_state, dim=1).squeeze().numpy()
                embeddings[section_name].append(embedding)
    return embeddings

In [20]:
def ingest_document(file_path):
    """Ingests each section of the PDF as separate documents with embeddings into ChromaDB."""
    # Get the game ID (either existing or new)
    game_name = os.path.basename(file_path).replace(".pdf", "")
    game_id, game_rules = get_or_create_game_id(game_name)
    
    # Read sections from the PDF
    sections = read_pdf_sections(file_path)
    
    # Preprocess and split each section into chunks of size 300
    chunks = preprocess_and_split_sections(sections, chunk_size=300, chunk_overlap=50)
    
    # Create embeddings for the chunks
    embeddings = create_embeddings(chunks)
    
    # Ingest each chunk with metadata
    for section_name, chunk_list in chunks.items():
        # Get the full section text
        full_section_text = sections[section_name]

        for i, chunk in enumerate(chunk_list):
            # Create a custom document ID in the format game_name_section_name_chunk_id
            document_id = f"{game_name}_{section_name.replace(' ', '_')}_{i}"
            
            print(f"Ingesting document {document_id}...")
            print(f"Full Section Text: {full_section_text}")
            print(f"Chunk Text: {chunk}")
            print(f"Embedding: {embeddings[section_name][i]}")
            print(f"Metadata: {game_id}, {game_name}, {section_name}")
            
            # Store both the full section text and individual chunk text
            document_metadata = {
                "Game_ID": str(game_id),
                "Game_Name": game_name,
                "Section_name": section_name,
                "Text": full_section_text,  # Store the entire section text once
                "Chunk_Text": chunk  # Store the chunk text here
            }

            # Upsert the document into ChromaDB
            collection.upsert(
                ids=[document_id],
                documents=[chunk],  # Store the chunk text in the document
                embeddings=[embeddings[section_name][i].tolist()],  # Store embeddings separately
                metadatas=[document_metadata]  # Metadata with the complete section text and chunk text
            )
    
    # Update the JSON file if it's a new game
    if not any(rule["Game Name"] == game_name for rule in game_rules):
        game_rules.append({"ID": game_id, "Game Name": game_name})
        write_collection_mapping(game_rules)

In [None]:
file_path = f"{DOCS_PATH}/tic_tac_toe.pdf"
ingest_document(file_path)

In [None]:
game_rules = get_collection_mapping()

In [22]:
def fetch_document_metadata(game_id):
    """Fetches all metadata and chunks related to a specific game ID from ChromaDB."""
    # Load the game rules JSON to find the corresponding GAME_NAME for the given game_id
    game_rules = get_collection_mapping()

    # Find the game entry with the specified ID
    game_entry = next((rule for rule in game_rules if str(rule["ID"]) == str(game_id)), None)
    
    if not game_entry:
        print(f"No game found with ID: {game_id}")
        return None

    # Get the GAME_NAME from the entry
    game_name = game_entry["Game Name"]
    
    # Set up the ChromaDB collection
    client, collection = setup_chromadb()

    # Fetch all documents from the collection
    try:
        all_documents = collection.get()  # Fetch all documents
    except Exception as e:
        print(f"Error fetching documents from ChromaDB: {e}")
        return None

    if not all_documents:
        print(f"No documents retrieved from ChromaDB for Game_Name: {game_name}")
        return None

    # Initialize the response dictionary
    response = {
        "ID": game_id,
        "Game_Name": game_name,
        "Chunks": {}
    }

    # Filter the documents whose IDs start with the format 'GAME_NAME_'
    search_pattern = f"{game_name}_"
    
    if all_documents.get("ids") and all_documents.get("metadatas"):
        for i, chunk_id in enumerate(all_documents["ids"]):
            if chunk_id.startswith(search_pattern):  # Filter by IDs starting with GAME_NAME_
                section_name = all_documents["metadatas"][i].get("Section_name", "Unknown Section")
                chunk_text = all_documents["metadatas"][i].get("Chunk_Text", "No Chunk Available")
                full_text = all_documents["metadatas"][i].get("Text", "No Full Text Available")

                # Group chunks under the same section and store the complete text once
                if chunk_id not in response["Chunks"]:
                    response["Chunks"][chunk_id] = {
                        "Section_name": section_name,
                        "Text": full_text,  # Complete section text
                        "Chunk_Text": [chunk_text]  # List of chunk texts
                    }
                else:
                    response["Chunks"][chunk_id]["Chunk_Text"].append(chunk_text)

    return response


In [23]:
meta_data = fetch_document_metadata(1)

In [None]:
meta_data

# Metadata Viewer

In [1]:
import sys
import os
from pathlib import Path

# Add the Application root to Python path
ROOT_PATH = os.path.abspath(os.path.join( os.getcwd(), '..'))
sys.path.insert(0, ROOT_PATH)
print(f"Added {ROOT_PATH} to the Python path.")
from dotenv import load_dotenv
load_dotenv()

Added c:\Users\erdrr\OneDrive\Desktop\KB\Projects\GameWeaverAI to the Python path.


True

In [7]:
from src.rag.retrieve_data import RAGRetriever
retriever = RAGRetriever()
code_prompt = retriever.get_metadata_prompt(1)

2024-09-07 10:48:51,439 - DEBUG - connectionpool.py - https://huggingface.co:443 "GET /api/whoami-v2 HTTP/11" 200 765
2024-09-07 10:48:51,545 - DEBUG - connectionpool.py - https://huggingface.co:443 "GET /api/models/dunzhang/stella_en_1.5B_v5/revision/main HTTP/11" 200 376631


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\erdrr\.cache\huggingface\token
Login successful
Connected to Huggingface


2024-09-07 10:48:51,616 - DEBUG - _api.py - Attempting to acquire lock 2432863477584 on c:\Users\erdrr\OneDrive\Desktop\KB\Projects\GameWeaverAI\models\dunzhang\stella_en_1.5B_v5\.cache\huggingface\download\.gitattributes.lock
2024-09-07 10:48:51,618 - DEBUG - _api.py - Lock 2432863477584 acquired on c:\Users\erdrr\OneDrive\Desktop\KB\Projects\GameWeaverAI\models\dunzhang\stella_en_1.5B_v5\.cache\huggingface\download\.gitattributes.lock
2024-09-07 10:48:51,618 - DEBUG - _api.py - Attempting to acquire lock 2432863483216 on c:\Users\erdrr\OneDrive\Desktop\KB\Projects\GameWeaverAI\models\dunzhang\stella_en_1.5B_v5\.cache\huggingface\download\1_Pooling\config.json.lock
2024-09-07 10:48:51,619 - DEBUG - _api.py - Attempting to acquire lock 2432863405776 on c:\Users\erdrr\OneDrive\Desktop\KB\Projects\GameWeaverAI\models\dunzhang\stella_en_1.5B_v5\.cache\huggingface\download\2_Dense\config.json.lock
2024-09-07 10:48:51,619 - DEBUG - _api.py - Attempting to acquire lock 2432863409104 on c:\Us

Model 'dunzhang/stella_en_1.5B_v5' is available at C:\Users\erdrr\OneDrive\Desktop\KB\Projects\gameweaverai\models\dunzhang\stella_en_1.5B_v5


2024-09-07 10:48:55,185 - INFO - retrieve_data.py - Fetching metadata prompt for game ID: 1
2024-09-07 10:48:55,186 - INFO - retrieve_data.py - Fetching document metadata for game ID: 1
2024-09-07 10:48:55,187 - DEBUG - retrieve_data.py - Found game: Tic Tac Toe with ID: 1
2024-09-07 10:48:55,188 - DEBUG - retrieve_data.py - All Documents: {'ids': ['Tic Tac Toe_End_of_Game:_0', 'Tic Tac Toe_End_of_Game:_1', 'Tic Tac Toe_End_of_Game:_2', 'Tic Tac Toe_Game_Setup:_0', 'Tic Tac Toe_Game_Strategy:_0', 'Tic Tac Toe_Game_Strategy:_1', 'Tic Tac Toe_Game_Strategy:_2', 'Tic Tac Toe_How_to_Play:_0', 'Tic Tac Toe_How_to_Play:_1', 'Tic Tac Toe_Overview:_0', 'Tic Tac Toe_Winning_the_Game:_0', 'Tic Tac Toe_Winning_the_Game:_1'], 'embeddings': None, 'metadatas': [{'Chunk_Text': '- The game ends when either a player wins or the grid is completely ﬁlled with no winner. - Players can choose to play multiple rounds and keep score of wins, losses, and draws. Multiplayer Mode: - In multiplayer mode, two hum

Model and tokenizer for 'dunzhang/stella_en_1.5B_v5' initialized successfully.


In [8]:
print(code_prompt)


    You are a Python expert and you are tasked with generating the code for a game. Here are the components of the game:

    Overview: Tic-Tac-Toe is a simple, two -player game where the objective is to be the ﬁrst player to align three of your marks (either X or O) in a horizontal, vertical, or diagonal row on a 3x3 grid.
    Game Setup: - The game is played on a 3x3 grid. - Player 1 uses the symbol "X" and Player 2 uses the symbol "O". - Players take turns placing their symbol in one of the empty squares on the grid. 
    How to Play: 1. The game starts with an empty 3x3 grid. 2. Player 1 (X) makes the ﬁrst move by placing their symbol in any of the 9 squares. 3. Player 2 (O) then places their symbol in one of the remaining empty squares. 4. Players continue to alternate turns until one player achieves a winning combination or the grid is full. 
    Winning the Game: - A player wins by placing three of their symbols in a row, which can be: - Horizontally: Any of the three horizonta

2024-09-07 10:48:55,264 - DEBUG - connectionpool.py - https://us.i.posthog.com:443 "POST /batch/ HTTP/11" 200 15
