In [6]:
import os
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_groq import ChatGroq
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Reaching this line means every import statement above completed without raising.
print("‚úÖ All imports successful!")


‚úÖ All imports successful!


In [7]:
# Pull variables from the local .env file into the process environment.
load_dotenv()

# Look up both provider keys; each is None when the variable is not set.
gemini_key = os.environ.get('GEMINI_API_KEY')
groq_key = os.environ.get('GROQ_API_KEY')

# Report what was found: a single warning when neither key is present,
# otherwise one confirmation line per key that is available.
if not (gemini_key or groq_key):
    print("‚ö†Ô∏è No API keys found! Create .env file with your keys")
else:
    if gemini_key:
        print("‚úÖ Gemini API key loaded")
    if groq_key:
        print("‚úÖ Groq API key loaded")


‚úÖ Gemini API key loaded
‚úÖ Groq API key loaded


In [8]:
# Select the chat model used by the rest of the notebook.
# OPTION 1: Use Gemini Flash 2.0 (Recommended - Free 1500 req/day)
MODEL_CHOICE = "gemini"  # Change to "groq" if you prefer

# Fail fast on a typo. Without this check neither branch below would run,
# `llm`/`embeddings` would stay undefined (or keep stale values from an earlier
# run of this cell), and the cell would still report the model as configured.
if MODEL_CHOICE not in ("gemini", "groq"):
    raise ValueError(
        f"Unsupported MODEL_CHOICE {MODEL_CHOICE!r}; expected 'gemini' or 'groq'"
    )

# Embeddings always come from Gemini, whichever chat model is selected
# (Groq doesn't have embedding model), so build them once here.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=os.getenv('GEMINI_API_KEY')
)

if MODEL_CHOICE == "gemini":
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash-exp",
        google_api_key=os.getenv('GEMINI_API_KEY'),
        temperature=0.3,  # Lower = more precise translations
        # NOTE(review): this flag may be deprecated in newer
        # langchain-google-genai releases - confirm against the installed version.
        convert_system_message_to_human=True
    )

    print("üöÄ Using Gemini Flash 2.0")
    print("   - Free tier: 1,500 requests/day")

else:  # MODEL_CHOICE == "groq" (validated above)
    llm = ChatGroq(
        model="llama-3.1-8b-instant",
        groq_api_key=os.getenv('GROQ_API_KEY'),
        temperature=0.3
    )

    print("üöÄ Using Groq Llama 3.1 8B")
    print("   - Free tier: 14,400 requests/day")

print(f"\n‚úÖ Model configured: {MODEL_CHOICE}")


üöÄ Using Gemini Flash 2.0
   - Free tier: 1,500 requests/day

‚úÖ Model configured: gemini


In [9]:
# ‚ö†Ô∏è IMPORTANT: Put your PDF dictionaries in 'dictionaries/' folder!
# File names don't matter - all PDFs will be loaded

print("üìö Loading Ilonggo dictionaries from 'dictionaries/' folder...")

# Load all PDFs from dictionaries folder
loader = PyPDFDirectoryLoader("./dictionaries/")

# Reset before loading so `documents` is always defined after this cell.
# Otherwise a failed load would leave the name undefined on a fresh kernel,
# or silently keep the pages from a previous successful run.
documents = []

try:
    documents = loader.load()
    print(f"‚úÖ Loaded {len(documents)} pages from PDF dictionaries")

    # Show first 200 characters to verify
    if documents:
        print("\nüìñ Sample text from dictionary:")
        print(f"{documents[0].page_content[:200]}...")
    else:
        print("‚ö†Ô∏è No PDFs found! Add your Ilonggo dictionary PDFs to 'dictionaries/' folder")

except Exception as e:
    # Deliberately broad: any loader failure is reported with troubleshooting
    # hints instead of a traceback; `documents` stays an empty list.
    print(f"‚ùå Error loading PDFs: {e}")
    print("Make sure:")
    print("  1. 'dictionaries/' folder exists")
    print("  2. PDF files are inside it")
    print("  3. PDFs are not password-protected")


üìö Loading Ilonggo dictionaries from 'dictionaries/' folder...
‚úÖ Loaded 596 pages from PDF dictionaries

üìñ Sample text from dictionary:
English ‚Äì Hiligaynon (Ilongo)
a ( indefinite article) isa 
aback ( to be taken aback) palak 
abandon pabayaan , abandonar 
abandoned sim-ong 
abatoir ihawan 
abbreviation lip-ot 
ABC abakada 
abdomen ...


In [10]:
# Split dictionary into searchable chunks
print("‚úÇÔ∏è Splitting dictionary into chunks...")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,        # Small chunks for dictionary entries
    chunk_overlap=50,      # Overlap to avoid cutting words
    separators=["\n\n", "\n", ".", " "]  # Split on paragraphs first
)

chunks = text_splitter.split_documents(documents)

print(f"‚úÖ Created {len(chunks)} searchable chunks")

# Only preview a chunk when one exists; indexing chunks[0] on an empty list
# (no documents were passed in, or the splitter produced nothing) would raise IndexError.
if chunks:
    print("\nüìù Sample chunk:")
    print(f"{chunks[0].page_content[:150]}...")
else:
    print("No chunks were created - check that the dictionary PDFs loaded and contain extractable text")


‚úÇÔ∏è Splitting dictionary into chunks...
‚úÖ Created 9622 searchable chunks

üìù Sample chunk:
English ‚Äì Hiligaynon (Ilongo)
a ( indefinite article) isa 
aback ( to be taken aback) palak 
abandon pabayaan , abandonar 
abandoned sim-ong 
abatoir ...
