In [1]:
# CELL 1: Import all required libraries

import os
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_groq import ChatGroq
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings  # ‚úÖ Local embeddings
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

print("‚úÖ All imports successful!")


  from pydantic.v1.fields import FieldInfo as FieldInfoV1


‚úÖ All imports successful!


In [2]:
# CELL 2: Read the provider API keys from the local .env file

load_dotenv()

# Both keys are looked up once and kept as module-level names.
gemini_key = os.getenv('GEMINI_API_KEY')
groq_key = os.getenv('GROQ_API_KEY')

if not (gemini_key or groq_key):
    # Neither variable is set: point the user at the .env file.
    print("‚ö†Ô∏è No API keys found! Create .env file with your keys")
else:
    # At least one key exists: acknowledge each one that was found.
    if gemini_key:
        print("‚úÖ Gemini API key loaded")
    if groq_key:
        print("‚úÖ Groq API key loaded")


‚úÖ Gemini API key loaded
‚úÖ Groq API key loaded


In [4]:
# CELL 3: Configure the chat model (Groq) and the local embedding model
# Defines three module-level names used by later cells: MODEL_CHOICE, llm, embeddings.

# Groq is used for its free tier. The quota figures printed below are
# hard-coded text, not values queried from the service.
# MODEL_CHOICE is only a label here: ChatGroq is constructed unconditionally.
MODEL_CHOICE = "groq"

# The API key is read from the environment at construction time.
llm = ChatGroq(
    model="llama-3.1-8b-instant",
    groq_api_key=os.getenv('GROQ_API_KEY'),
    temperature=0.3
)

print("üöÄ Using Groq Llama 3.1 8B for chat")
print("   - Free tier: 14,400 requests/day (720x more than Gemini!)")
print(f"   - Model: {llm.model_name}")

# Local embeddings: computed in-process on the CPU, so no API quota applies.
print("\nüì• Loading local embedding model...")

# NOTE(review): the recorded output echoes this constructor line as a warning -
# presumably a deprecation notice for the langchain_community class; confirm.
# normalize_embeddings=True presumably yields unit-length vectors - TODO confirm.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

print("\n‚úÖ Setup complete!")
print("   - LLM: Groq Llama 3.1 8B (14,400/day)")
print("   - Embeddings: Local (unlimited)")


üöÄ Using Groq Llama 3.1 8B for chat
   - Free tier: 14,400 requests/day (720x more than Gemini!)
   - Model: llama-3.1-8b-instant

üì• Loading local embedding model...


  embeddings = HuggingFaceEmbeddings(


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.



‚úÖ Setup complete!
   - LLM: Groq Llama 3.1 8B (14,400/day)
   - Embeddings: Local (unlimited)


In [5]:
# CELL 4: Load all PDF dictionaries from 'dictionaries/' folder
# IMPORTANT: put your PDF files in the 'dictionaries/' folder before running this cell.
# Defines `documents`, which the chunking cell (Cell 5) consumes.

print("üìö Loading Ilonggo dictionaries from 'dictionaries/' folder...")
print("   (Any PDF filename works - all PDFs will be loaded)")

# Load all PDFs from dictionaries folder
loader = PyPDFDirectoryLoader("./dictionaries/")

# Only the load itself is guarded, so the handler reports loader failures and nothing else.
try:
    documents = loader.load()
except Exception as e:
    print(f"\n‚ùå Error loading PDFs: {e}")
    print("\nTroubleshooting:")
    print("  1. Make sure 'dictionaries/' folder exists")
    print("  2. Check PDFs are not password-protected")
    print("  3. Verify file paths are correct")
    # Re-raise: `documents` was never assigned, so continuing would only make
    # the next cell fail with an unrelated NameError and hide the real cause.
    raise
else:
    print(f"\n‚úÖ Loaded {len(documents)} pages from PDF dictionaries")

    # Show first 200 characters to verify
    if documents:
        print(f"\nüìñ Sample text from first page:")
        print(f"{documents[0].page_content[:200]}...")
    else:
        print("\n‚ö†Ô∏è No PDFs found!")
        print("   1. Create 'dictionaries/' folder in project root")
        print("   2. Add your Ilonggo dictionary PDFs to it")
        print("   3. Run this cell again")


üìö Loading Ilonggo dictionaries from 'dictionaries/' folder...
   (Any PDF filename works - all PDFs will be loaded)

‚úÖ Loaded 596 pages from PDF dictionaries

üìñ Sample text from first page:
English ‚Äì Hiligaynon (Ilongo)
a ( indefinite article) isa 
aback ( to be taken aback) palak 
abandon pabayaan , abandonar 
abandoned sim-ong 
abatoir ihawan 
abbreviation lip-ot 
ABC abakada 
abdomen ...


In [6]:
# CELL 5: Split dictionary pages into searchable chunks
# Consumes `documents` from Cell 4 and defines `chunks` for the indexing cell.

print("‚úÇÔ∏è Splitting dictionary into chunks...")

# Configure text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,        # Upper bound per chunk; small chunks suit dictionary entries
    chunk_overlap=50,      # Overlap to avoid cutting words mid-definition
    separators=["\n\n", "\n", ".", " "],  # Split on paragraphs first
    length_function=len
)

# Split documents
chunks = text_splitter.split_documents(documents)

print(f"\n‚úÖ Created {len(chunks)} searchable chunks")
# The figure below is the configured chunk_size; the measured average is
# printed in the statistics section further down.
print(f"   - Average chunk size: ~500 characters")
print(f"   - Overlap: 50 characters")

# Show sample chunk (guarded: an empty `documents` list produces no chunks,
# and indexing chunks[0] would raise IndexError)
if chunks:
    print(f"\nüìù Sample chunk preview:")
    print(f"{chunks[0].page_content[:150]}...")
else:
    print("\n‚ö†Ô∏è No chunks were produced - check that Cell 4 loaded at least one page")

# Show statistics
total_chars = sum(len(chunk.page_content) for chunk in chunks)
avg_chunk_size = total_chars // len(chunks) if chunks else 0
print(f"\nüìä Statistics:")
print(f"   - Total chunks: {len(chunks)}")
print(f"   - Total characters: {total_chars:,}")
print(f"   - Average chunk size: {avg_chunk_size} chars")


‚úÇÔ∏è Splitting dictionary into chunks...

‚úÖ Created 9622 searchable chunks
   - Average chunk size: ~500 characters
   - Overlap: 50 characters

üìù Sample chunk preview:
English ‚Äì Hiligaynon (Ilongo)
a ( indefinite article) isa 
aback ( to be taken aback) palak 
abandon pabayaan , abandonar 
abandoned sim-ong 
abatoir ...

üìä Statistics:
   - Total chunks: 9622
   - Total characters: 4,501,736
   - Average chunk size: 467 chars


In [7]:
# CELL 6: Create searchable vector database with FAISS and local embeddings
# This creates the RAG index - run this ONCE, then use Cell 6B to reload.
# Requires `chunks` (Cell 5) and `embeddings` (Cell 3); defines `vectorstore`.
# The recorded run took about 123 seconds for 9,622 chunks.

print("üîç Creating searchable vector database with FAISS...")
print("‚è≥ This may take 1-3 minutes for large dictionaries...")
print("   (Using LOCAL embeddings - no rate limits!)")

# NOTE(review): this import sits here rather than in the Cell 1 import block.
import time
start_time = time.time()

# Create vector store with FAISS from the chunks, using the local embedding model
vectorstore = FAISS.from_documents(
    documents=chunks,
    embedding=embeddings
)

# Save to disk (so you can reload later without reprocessing).
# The relative path "faiss_index" is the same one Cell 6B loads from.
vectorstore.save_local("faiss_index")

# Wall-clock duration of the build plus the save
elapsed_time = time.time() - start_time

print(f"\n‚úÖ Vector database created with FAISS!")
print(f"   - Time taken: {elapsed_time:.1f} seconds")
print(f"   - Stored in: ./faiss_index/")
print(f"   - Indexed {len(chunks)} dictionary entries")
print(f"   - Total vectors: {vectorstore.index.ntotal}")
print("\nüí° Next time, use Cell 6B to load instantly from disk!")


üîç Creating searchable vector database with FAISS...
‚è≥ This may take 1-3 minutes for large dictionaries...
   (Using LOCAL embeddings - no rate limits!)

‚úÖ Vector database created with FAISS!
   - Time taken: 122.8 seconds
   - Stored in: ./faiss_index/
   - Indexed 9622 dictionary entries
   - Total vectors: 9622

üí° Next time, use Cell 6B to load instantly from disk!


In [8]:
# CELL 6B: Load existing FAISS database from disk
# Use this INSTEAD of Cell 6 after you've created the database once
# This is much faster than recreating the database!
# Requires `embeddings` (Cell 3). On success it (re)binds `vectorstore`;
# on failure `vectorstore` is left as it was, or undefined if never set.

print("üîÑ Loading existing FAISS vector database from disk...")

try:
    # NOTE(review): the flag name indicates the loader deserializes data that
    # could execute code (presumably pickle - confirm). Only load an index
    # directory that you created yourself with Cell 6.
    vectorstore = FAISS.load_local(
        "faiss_index", 
        embeddings,
        allow_dangerous_deserialization=True  # Safe for your own data
    )
    
    print(f"\n‚úÖ Vector database loaded from disk!")
    print(f"   - Location: ./faiss_index/")
    print(f"   - Total vectors: {vectorstore.index.ntotal}")
    print("   - Ready to use!")
    print("\nüí° This is much faster than re-processing PDFs!")
    
# NOTE(review): confirm that load_local raises FileNotFoundError for a missing
# index; if it raises another type, the generic handler below reports it.
except FileNotFoundError:
    print("\n‚ùå FAISS index not found!")
    print("   Run Cell 6 first to create the database.")
except Exception as e:
    # Any other failure is reported and swallowed, so the cell itself never raises.
    print(f"\n‚ùå Error loading database: {e}")
    print("   You may need to recreate it with Cell 6.")


üîÑ Loading existing FAISS vector database from disk...

‚úÖ Vector database loaded from disk!
   - Location: ./faiss_index/
   - Total vectors: 9622
   - Ready to use!

üí° This is much faster than re-processing PDFs!


In [9]:
# CELL 7: Build the translation chain using LangChain Expression Language (LCEL)
# This part defines `retriever`, `template` and `prompt`.
# Requires `vectorstore` from Cell 6 or Cell 6B.

print("üîó Building retrieval chain...")

# Create retriever from vectorstore
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}  # Return top 3 most relevant dictionary entries
)

# Custom prompt template for translation.
# It has two placeholders, {context} and {question}. Everything else in the
# string is sent to the model verbatim, so edits here change model behaviour.
template = """You are an Ilonggo-English dictionary assistant. Use the dictionary entries below to help translate.

Dictionary Context:
{context}

User Question: {question}

Instructions:
- If the word is in the dictionary, provide the definition/translation
- If it's English to Ilonggo, search for the English word
- If it's Ilonggo to English, search for the Ilonggo word
- If not found, say "I couldn't find that word in the dictionary"
- Be helpful, friendly, and concise
- Provide pronunciation help if available in the dictionary

Translation:"""

# Wrap the raw template so it can be piped into the model
prompt = ChatPromptTemplate.from_template(template)

# Function to format retrieved documents
def format_docs(docs):
    """Join the text of each retrieved document into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values separated by a blank line; an empty
        iterable yields an empty string.
    """
    page_texts = [entry.page_content for entry in docs]
    return "\n\n".join(page_texts)

# Build the translation chain using LCEL.
# The chain is invoked with a plain question string. The dict stage feeds that
# string to the retriever (whose documents are joined by format_docs) to
# produce `context`, and passes the same string through as `question`.
# The filled prompt then goes to the LLM, and the reply is parsed into a str.
translator_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)

print("\n‚úÖ Ilonggo Translator Chatbot is ready!")
# Static examples only; nothing below calls the chain.
print("\nüí¨ Example questions you can ask:")
print("   - 'What does mahal mean?'")
print("   - 'How do you say love in Ilonggo?'")
print("   - 'Translate pagkaon to English'")
print("   - 'What is the Ilonggo word for beautiful?'")


üîó Building retrieval chain...

‚úÖ Ilonggo Translator Chatbot is ready!

üí¨ Example questions you can ask:
   - 'What does mahal mean?'
   - 'How do you say love in Ilonggo?'
   - 'Translate pagkaon to English'
   - 'What is the Ilonggo word for beautiful?'


In [10]:
# CELL 8: Test function to translate and show results

def translate(question):
    """
    Answer one translation question and report how many entries were retrieved.

    Prints the question, the chain's answer and the number of retrieved
    documents, framed by 60-character rules.

    Args:
        question: The user's question, passed unchanged to the chain and
            to the retriever.

    Returns:
        A ``(answer, docs)`` tuple: the chain's reply and the documents
        returned by the retriever for the same question.
    """
    rule = '='*60

    print(f"\n{rule}")
    print(f"üáµüá≠ YOU: {question}")
    print(rule)

    # Ask the chain first so the answer is printed before the source count.
    reply = translator_chain.invoke(question)
    print(f"üí¨ TRANSLATOR: {reply}")

    # This is a second, separate retriever call made only for reporting;
    # the chain does its own retrieval internally.
    sources = retriever.invoke(question)
    print(f"\nüìö Used {len(sources)} dictionary entries")
    print(rule)

    return reply, sources

# Run test queries.
# Each entry is sent through translate(), so every query costs one chain call
# (retrieval plus LLM request) and one extra retrieval for the source count.
test_queries = [
    "What does 'mahal' mean in English?",
    "How do you say 'hello' in Ilonggo?",
    "Translate 'kumusta' to English"
]

print("üß™ Running test translations...\n")
for query in test_queries:
    # The (answer, docs) return value is discarded; translate() prints its own report.
    translate(query)
    print()  # Blank line between consecutive reports


üß™ Running test translations...


üáµüá≠ YOU: What does 'mahal' mean in English?
üí¨ TRANSLATOR: The word 'mahal' is found in the dictionary. 

According to the dictionary, 'mahal' means: 
Dear, high-priced, expensive, costly, precious, esteemed, valuable, estimable; to make or become dear, cost much; to appreciate, esteem highly.

Pronunciation: mah-HAHL

Note: The dictionary also has a diminutive form 'mah√°l-mah√°l' which means rather dear, expensive, costly, precious, valuable.

üìö Used 3 dictionary entries


üáµüá≠ YOU: How do you say 'hello' in Ilonggo?
üí¨ TRANSLATOR: I'd be happy to help you with your Ilonggo-English dictionary questions.

However, I noticed you didn't ask a question about a specific word. Could you please ask how to say "hello" in Ilonggo, and I'll be more than happy to assist you?

If you'd like to ask a different question, feel free to do so, and I'll do my best to help.

(If you'd like to ask about a specific word, just let me know, and I'll be ha

In [11]:
# CELL 9: Interactive chat loop for continuous translation
# NOTE: this cell blocks on input() until the user types 'quit', 'exit' or 'q',
# interrupts the kernel, or stdin is closed.

print("="*60)
print("üáµüá≠ ILONGGO DICTIONARY TRANSLATOR")
print("="*60)
print("Ask me to translate between Ilonggo and English!")
print("\nCommands:")
print("  - Type your question to translate")
print("  - Type 'sources' to see dictionary sources for last query")
print("  - Type 'quit' or 'exit' to end")
print("="*60)


def _print_sources(docs, preview_chars=300):
    """Print a numbered preview of each document, marking truncated ones with '...'."""
    print("="*60)
    for i, doc in enumerate(docs, 1):
        print(f"\nSource {i}:")
        print(doc.page_content[:preview_chars])
        if len(doc.page_content) > preview_chars:
            print("...")
        print("-"*60)


def _say_goodbye():
    """Print the farewell shown whenever the chat loop ends."""
    print("\nüëã Salamat! (Thank you!)")
    print("Goodbye! üáµüá≠")


# Store last query for source viewing
last_query = None

# The outer try turns Ctrl-C / kernel interrupt and end-of-input into a clean
# exit with the farewell instead of a traceback.
try:
    while True:
        user_input = input("\nüáµüá≠ You: ").strip()

        # Check for exit commands
        if user_input.lower() in ['quit', 'exit', 'q']:
            _say_goodbye()
            break

        # Skip empty input
        if not user_input:
            continue

        # Special command to show sources from last query
        if user_input.lower() == 'sources':
            if last_query:
                # Re-runs retrieval for the stored question
                docs = retriever.invoke(last_query)
                print(f"\nüìö Dictionary sources for '{last_query}':")
                _print_sources(docs)
            else:
                print("‚ö†Ô∏è No previous query. Ask a question first!")
            continue

        # Process translation
        try:
            # Get translation
            answer = translator_chain.invoke(user_input)
            print(f"\nüí¨ Translator: {answer}")

            # Store query for potential source viewing
            last_query = user_input

            # Ask if user wants to see sources
            show_sources = input("\nüìö Show dictionary sources? (y/n): ").strip().lower()
            if show_sources == 'y':
                docs = retriever.invoke(user_input)
                print(f"\nüìñ Found {len(docs)} relevant entries:")
                _print_sources(docs)

        except Exception as e:
            # Keep the session alive on chain or retrieval failures
            print(f"\n‚ùå Error: {e}")
            print("Try rephrasing your question or check your connection.")
except (EOFError, KeyboardInterrupt):
    _say_goodbye()


üáµüá≠ ILONGGO DICTIONARY TRANSLATOR
Ask me to translate between Ilonggo and English!

Commands:
  - Type your question to translate
  - Type 'sources' to see dictionary sources for last query
  - Type 'quit' or 'exit' to end

üí¨ Translator: I'd be happy to help you with the translation.

The word "nakatungdan" is in the dictionary. 

According to the dictionary, "nakatungdan" is the past tense of the word "tungdan," which is not directly listed in the dictionary. However, the word "tungdan" seems to be related to the word "t√°hud," which means "To agree, pull well together, live in peace or amity." 

Since "t√°hud" is the closest match, I'll provide the translation for "nakatungdan" based on its possible relationship with "t√°hud."

If "nakatungdan" is indeed related to "t√°hud," it might mean "to agree" or "to be in harmony" in the past tense. However, please note that this is an educated guess, and the actual meaning of "nakatungdan" might be different.

If you have more context

In [13]:
# CELL 10: Direct search function for exploring the dictionary

def search_dictionary(word, k=5):
    """
    Search the vector database directly and print the matching entries.

    Args:
        word: Query text passed to the vector store's similarity search.
        k: Maximum number of entries to request (default 5).

    Returns:
        The list returned by the vector store, in its original order, or an
        empty list when nothing matched.
    """
    results = vectorstore.similarity_search(word, k=k)

    print(f"\nüîç Searching dictionary for: '{word}'")
    print("="*60)

    if not results:
        print("‚ùå No matches found")
        return []

    # Rank is shown against the number of entries actually returned, so it
    # always agrees with the "Found N" line even if fewer than k came back.
    total = len(results)
    print(f"‚úÖ Found {total} relevant entries:\n")

    for i, doc in enumerate(results, 1):
        print(f"üìñ Result {i} (Relevance rank: {i}/{total}):")
        print("-"*60)
        print(doc.page_content)
        print("-"*60)
        print()

    return results

# Example usage (uncomment to test):
# search_dictionary("mahal", k=3)

# Single print call: sep="\n" puts each argument on its own output line.
print(
    "‚úÖ Search function ready!",
    "\nUsage examples:",
    "  search_dictionary('love', k=3)",
    "  search_dictionary('kumusta', k=5)",
    "  search_dictionary('maganda', k=2)",
    sep="\n",
)


‚úÖ Search function ready!

Usage examples:
  search_dictionary('love', k=3)
  search_dictionary('kumusta', k=5)
  search_dictionary('maganda', k=2)


In [14]:
# CELL 11: Batch translate multiple words at once

def batch_translate(words_list):
    """
    Ask the translator chain about every word in ``words_list``.

    Each word is wrapped in the question "What does <word> mean?". Progress
    and a preview of each answer (first 100 characters) are printed as the
    batch runs. A failure on one word is recorded and does not stop the batch.

    Args:
        words_list: Sized iterable of words to look up.

    Returns:
        A list with one dict per outcome, holding the keys ``word``,
        ``translation`` (the answer, or the error text) and ``status``
        (``"success"`` or ``"error"``).
    """
    divider = "="*60

    print(f"\nüìù Batch Translation ({len(words_list)} words)")
    print(divider)

    results = []

    for position, word in enumerate(words_list, start=1):
        print(f"\n{position}. Translating '{word}'...")
        try:
            answer = translator_chain.invoke(f"What does {word} mean?")
            results.append({"word": word, "translation": answer, "status": "success"})
            print(f"   ‚úÖ {answer[:100]}{'...' if len(answer) > 100 else ''}")
        except Exception as e:
            # Record the failure and carry on with the next word.
            results.append({"word": word, "translation": str(e), "status": "error"})
            print(f"   ‚ùå Error: {e}")

    print("\n" + divider)
    print("‚úÖ Batch translation complete!")

    return results

# Example usage (uncomment to test):
# words_to_translate = ["mahal", "kumusta", "salamat", "maganda"]
# results = batch_translate(words_to_translate)

# Single print call: sep="\n" puts each argument on its own output line.
print(
    "‚úÖ Batch translation function ready!",
    "\nUsage:",
    "  words = ['mahal', 'kumusta', 'salamat']",
    "  results = batch_translate(words)",
    sep="\n",
)


‚úÖ Batch translation function ready!

Usage:
  words = ['mahal', 'kumusta', 'salamat']
  results = batch_translate(words)


In [15]:
# CELL 12: Complete system test and diagnostics
# Runs five independent checks (vector store, retriever, embeddings, LLM, full
# chain). A failing check is reported and recorded, and the remaining checks
# still run. The final banner reflects what actually passed.

print("üß™ Running complete system diagnostics...\n")
print("="*60)

# Labels of the checks that raised; drives the status banner at the end.
failed_checks = []

# Test 1: Check vectorstore
print("Test 1: Vector Database")
try:
    test_results = vectorstore.similarity_search("test", k=1)
    print(f"‚úÖ Vector database operational")
    print(f"   - Total vectors: {vectorstore.index.ntotal}")
    print(f"   - Test query returned: {len(test_results)} results")
except Exception as e:
    failed_checks.append("Vector Database")
    print(f"‚ùå Vector database error: {e}")

# Test 2: Check retriever
print("\nTest 2: Retriever")
try:
    test_retrieval = retriever.invoke("hello")
    # Guarded average: an empty retrieval is reported as 0 chars rather than
    # surfacing as a misleading division-by-zero "retriever error".
    retrieved_chars = sum(len(d.page_content) for d in test_retrieval)
    avg_doc_length = retrieved_chars // len(test_retrieval) if test_retrieval else 0
    print(f"‚úÖ Retriever operational")
    print(f"   - Retrieved: {len(test_retrieval)} documents")
    print(f"   - Average doc length: {avg_doc_length} chars")
except Exception as e:
    failed_checks.append("Retriever")
    print(f"‚ùå Retriever error: {e}")

# Test 3: Check embeddings
print("\nTest 3: Embeddings")
try:
    test_embedding = embeddings.embed_query("test")
    print(f"‚úÖ Embeddings operational")
    print(f"   - Embedding dimensions: {len(test_embedding)}")
    print(f"   - Model: sentence-transformers/all-MiniLM-L6-v2")
    print(f"   - Rate limits: UNLIMITED (local)")
except Exception as e:
    failed_checks.append("Embeddings")
    print(f"‚ùå Embeddings error: {e}")

# Test 4: Check LLM
print("\nTest 4: Language Model")
try:
    test_response = llm.invoke("Say hello in one word")
    print(f"‚úÖ LLM operational")
    print(f"   - Model: {MODEL_CHOICE}")
    print(f"   - Test response: {test_response.content[:50]}")
except Exception as e:
    failed_checks.append("Language Model")
    print(f"‚ùå LLM error: {e}")

# Test 5: Check full translation chain
print("\nTest 5: Translation Chain")
try:
    test_translation = translator_chain.invoke("What does test mean?")
    print(f"‚úÖ Translation chain operational")
    print(f"   - Sample response: {test_translation[:80]}...")
except Exception as e:
    failed_checks.append("Translation Chain")
    print(f"‚ùå Translation chain error: {e}")

# System summary
print("\n" + "="*60)
print("üìä SYSTEM STATUS")
print("="*60)
if failed_checks:
    # Do not claim success when any check above raised.
    print(f"‚ùå {len(failed_checks)} of 5 checks failed: {', '.join(failed_checks)}")
    print("="*60)
    print("‚ö†Ô∏è Fix the errors reported above, then re-run this cell.")
    print("="*60)
else:
    print(f"‚úÖ Vector Database: {vectorstore.index.ntotal:,} entries indexed")
    print(f"‚úÖ Embeddings: Local (unlimited usage)")
    print(f"‚úÖ LLM: {MODEL_CHOICE} (API-based)")
    # `chunks` exists only if the chunking cell ran in this kernel session.
    # When the index is loaded from disk with Cell 6B it may be undefined.
    if "chunks" in globals():
        print(f"‚úÖ Total chunks: {len(chunks):,}")
    else:
        print("‚ö†Ô∏è Total chunks: n/a (Cell 5 not run in this session)")
    print("="*60)
    print("üéâ All systems operational! Ready to translate! üáµüá≠")
    print("="*60)


üß™ Running complete system diagnostics...

Test 1: Vector Database
‚úÖ Vector database operational
   - Total vectors: 9622
   - Test query returned: 1 results

Test 2: Retriever
‚úÖ Retriever operational
   - Retrieved: 3 documents
   - Average doc length: 371 chars

Test 3: Embeddings
‚úÖ Embeddings operational
   - Embedding dimensions: 384
   - Model: sentence-transformers/all-MiniLM-L6-v2
   - Rate limits: UNLIMITED (local)

Test 4: Language Model
‚úÖ LLM operational
   - Model: groq
   - Test response: Hello.

Test 5: Translation Chain
‚úÖ Translation chain operational
   - Sample response: You're looking for the word "test". Let me check the dictionary for you.

Accord...

üìä SYSTEM STATUS
‚úÖ Vector Database: 9,622 entries indexed
‚úÖ Embeddings: Local (unlimited usage)
‚úÖ LLM: groq (API-based)
‚úÖ Total chunks: 9,622
üéâ All systems operational! Ready to translate! üáµüá≠
