# In [None]:  (Jupyter cell marker; the %pip lines below are IPython magics and need a notebook runtime)
# Install required packages
%pip install chromadb langchain datasets sentence-transformers groq gradio huggingface_hub
%pip install -U langchain-community

# Import libraries
from huggingface_hub import login, hf_hub_download
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import requests
import gradio as gr
import os

# Login to Hugging Face
# The access token is taken from the HF_TOKEN environment variable so that no
# credential has to be written into the notebook source.
hf_token = os.environ.get("HF_TOKEN", "").strip()
if hf_token:
    login(hf_token)
else:
    # Without a token the downloads below run anonymously.
    # NOTE(review): this assumes the dataset repo is public — confirm.
    print("⚠️ HF_TOKEN is not set; skipping Hugging Face login (anonymous access only).")

# Load Harry Potter books
print("📚 Loading Harry Potter books from text files...")

# File paths are relative to the dataset repository root; the first path
# component doubles as the split name used as the key in `book_files`.
# NOTE(review): the titles attached to "Book 3" and "Book 4" are swapped
# relative to publication order. They are kept verbatim on the assumption that
# they mirror the repository's actual file names — confirm against the repo.
train_paths = [
    "train/Harry Potter-Book 1-The Sorcerers Stone.txt",
    "train/Harry Potter-Book 2-The Chamber of Secrets.txt",
    "train/Harry Potter-Book 3-The Goblet of Fire.txt",
    "train/Harry Potter-Book 4-The Prisoner of Azkaban.txt",
    "train/Harry Potter-Book 5-The Order of the Phoenix.txt",
]
validation_paths = ["validation/Harry Potter-Book 6-The Half-Blood Prince.txt"]
heldout_paths = ["test/Harry Potter-Book 7-The Deathly Hallows.txt"]

# Split name -> list of files belonging to that split (insertion order matters
# only for the order in which books are downloaded and printed).
book_files = {
    "train": train_paths,
    "validation": validation_paths,
    "test": heldout_paths,
}

# Download every file listed in `book_files` and collect the text.
#   texts         -> list of full book texts, in download order
#   book_contents -> book name (file name without ".txt") -> full text
#   failed_books  -> names of books that could not be downloaded or read
HP_REPO_ID = "WutYee/HarryPotter_books_1to7"


def _fetch_book(file_path):
    """Download one file from the dataset repository and return its text.

    Args:
        file_path: Path of the file inside the dataset repository.

    Returns:
        The file content decoded as UTF-8. Bytes that cannot be decoded are
        dropped silently (``errors='ignore'``).

    Raises:
        Exception: Anything raised by ``hf_hub_download`` or ``open`` is
            propagated; the caller decides how to report it.
    """
    local_file = hf_hub_download(
        repo_id=HP_REPO_ID,
        filename=file_path,
        repo_type="dataset",
    )
    with open(local_file, 'r', encoding='utf-8', errors='ignore') as f:
        return f.read()


texts = []
book_contents = {}
failed_books = []

for split, files in book_files.items():
    print(f"\n📖 Loading {split} books...")

    for file_path in files:
        book_name = file_path.split('/')[-1].replace('.txt', '')
        print(f"  ⬇️ Downloading: {book_name}")

        # Best effort per book: one failing download must not abort the rest.
        try:
            content = _fetch_book(file_path)
        except Exception as e:
            print(f"    ❌ Failed to load {book_name}: {e}")
            failed_books.append(book_name)
            continue

        book_contents[book_name] = content
        texts.append(content)

        print(f"    ✅ Loaded {book_name}")
        print(f"    📄 Total characters: {len(content):,}")

if texts:
    # Only announce success when at least one book actually loaded.
    print("\n🎉 Successfully loaded Harry Potter dataset!")
    print(f"📊 Total books: {len(book_contents)}")
    print(f"📊 Total text entries: {len(texts)}")
    if failed_books:
        print(f"⚠️ Skipped {len(failed_books)} book(s): {', '.join(failed_books)}")
else:
    # Per-book errors are handled inside the loop, so "nothing loaded" is the
    # condition that has to trigger the single-book fallback.
    print("❌ Main loading process failed: no book could be loaded")

    # Fallback: try loading just one book
    print("\n🔄 Trying to load just one book as a test...")
    try:
        content = _fetch_book("train/Harry Potter-Book 1-The Sorcerers Stone.txt")
    except Exception as fallback_error:
        print(f"❌ Fallback also failed: {fallback_error}")
    else:
        texts = [content]
        book_contents = {"Harry Potter-Book 1-The Sorcerers Stone": content}
        print(f"✅ Fallback successful! Loaded 1 book with {len(content):,} characters")

# Only proceed if we have texts
if not texts:
    raise ValueError("❌ No texts were loaded! Please check your Hugging Face authentication and internet connection.")

# Split texts into chunks
print("🔨 Splitting books into chunks...")
# Chunks of size 1000 with an overlap of 200 between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
docs = splitter.create_documents(texts)

print(f"✅ Total chunks created: {len(docs)}")
first_chunk = docs[0] if docs else None
if first_chunk is not None:
    # Show only the first 300 characters of the first chunk as a sanity check.
    preview = first_chunk.page_content[:300]
    print(f"🧩 Sample chunk:\n{preview}...")

# Load embedding model
print("📥 Loading local embedding model (all-MiniLM-L6-v2)...")
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Create vector store
print("🧠 Creating Chroma vector store...")
chroma_options = {
    "documents": docs,
    "embedding": embedding,
    "persist_directory": "/content/chroma_db",
}
vectorstore = Chroma.from_documents(**chroma_options)
# NOTE(review): newer Chroma releases are reported to persist automatically,
# which would make this call a deprecated no-op — confirm for the installed version.
vectorstore.persist()

# Create retriever
# Each query returns the 3 best-matching chunks.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))
print("✅ Vectorstore ready and persisted at /content/chroma_db")

# Groq chat-completions helper: every failure is returned as a user-facing
# string instead of being raised, so the UI callback never crashes.
def groq_chat(prompt, model="llama-3.1-8b-instant", api_key=None):
    """Send a single-turn prompt to Groq's chat-completions endpoint.

    Args:
        prompt: Text sent as the only ``user`` message.
        model: Groq model identifier.
        api_key: Groq API key. When ``None`` the key is read from the
            ``GROQ_API_KEY`` environment variable, so no secret has to live
            in the source file.

    Returns:
        The assistant's reply text on success, otherwise a sentence starting
        with "Sorry," that describes what went wrong (missing key, HTTP
        error, timeout, network failure, malformed response).
    """
    if api_key is None:
        api_key = os.environ.get("GROQ_API_KEY", "")
    api_key = api_key.strip()
    if not api_key:
        # Fail fast with an actionable message instead of a 401 round trip.
        return "Sorry, the Groq API key is not configured. Set the GROQ_API_KEY environment variable."

    headers = {
        # The endpoint expects the "Bearer <key>" scheme, not the bare key.
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "max_tokens": 1000,
        "temperature": 0.7
    }

    try:
        response = requests.post("https://api.groq.com/openai/v1/chat/completions",
                                 headers=headers, json=payload, timeout=30)

        if response.status_code != 200:
            error_msg = f"Groq API Error {response.status_code}: {response.text}"
            print(f"❌ {error_msg}")
            return f"Sorry, I encountered an API error: {error_msg}"

        return response.json()["choices"][0]["message"]["content"]

    except requests.exceptions.Timeout:
        return "Sorry, the request timed out. Please try again."
    except requests.exceptions.RequestException as e:
        return f"Sorry, there was a network error: {str(e)}"
    except (KeyError, IndexError, TypeError) as e:
        # Missing keys, an empty "choices" list, or a non-dict payload.
        return f"Sorry, unexpected response format from API: {str(e)}"
    except Exception as e:
        return f"Sorry, an unexpected error occurred: {str(e)}"

# RAG entry point used as the UI callback: retrieve context, build the prompt,
# ask the LLM. Errors are reported as strings, never raised.
def rag_pipeline(query):
    """Answer a question using retrieved book passages as context.

    Args:
        query: The user's question. Surrounding whitespace is ignored.

    Returns:
        The text returned by ``groq_chat`` for the assembled prompt, or a
        short explanatory message when the query is empty, nothing is
        retrieved, or any step raises.
    """
    try:
        # Validate input
        if not query or not query.strip():
            return "Please enter a valid question about Harry Potter."
        question = query.strip()

        # Retrieve relevant documents.
        # `invoke` is the current retriever entry point; fall back to the
        # older `get_relevant_documents` for retrievers that lack it.
        retrieve = getattr(retriever, "invoke", None) or retriever.get_relevant_documents
        context_docs = retrieve(question)

        if not context_docs:
            return "Sorry, I couldn't find relevant information in the Harry Potter books for your query."

        # Create context from retrieved documents
        context = "\n\n".join(doc.page_content for doc in context_docs)

        # Create prompt
        prompt = f"""You are a Harry Potter expert. Use the following context to answer the user's question accurately and comprehensively.

Context:
{context}

Question: {question}

Answer (be specific and reference the books when possible):"""

        # Get response from Groq
        return groq_chat(prompt)

    except Exception as e:
        error_msg = f"An error occurred in the RAG pipeline: {str(e)}"
        print(f"❌ {error_msg}")
        return f"Sorry, {error_msg}"

# Launch Gradio interface
print("🚀 Launching chatbot interface...")

# Input widget: a two-line text box for the user's question.
question_box = gr.Textbox(
    label="Ask a question about Harry Potter",
    placeholder="What is Harry's patronus? Who is Sirius Black? Tell me about Hogwarts...",
    lines=2,
)

# Output widget: a ten-line text box that shows the generated answer.
answer_box = gr.Textbox(label="Answer", lines=10)

# Example questions offered in the UI.
example_questions = [
    "What is Harry Potter's patronus?",
    "Who is Sirius Black?",
    "Tell me about the Triwizard Tournament",
    "What are the Deathly Hallows?",
    "Who is Professor Snape?",
]

# Wire the RAG pipeline to the widgets; flagging is turned off.
interface = gr.Interface(
    fn=rag_pipeline,
    inputs=question_box,
    outputs=answer_box,
    title="⚡ Harry Potter RAG Chatbot",
    description="Ask questions about the Harry Potter books! This system uses RAG (Retrieval-Augmented Generation) with local embeddings and Groq's LLM.",
    examples=example_questions,
    theme=gr.themes.Soft(),
    allow_flagging="never",
)

# Launch with a shareable link (share=True) and debug mode enabled.
interface.launch(share=True, debug=True)