In [19]:
!pip install -q langchain langchain-community langchain-openai
!pip install -q chromadb sentence-transformers
!pip install -q pypdf
!pip install -q gradio



In [23]:
import os
import tempfile
from typing import List
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

# Create sample documents immediately for testing
sample_doc1 = """
Artificial Intelligence and Machine Learning

Artificial Intelligence (AI) is a branch of computer science that aims to create
intelligent machines that can perform tasks that typically require human intelligence.
Machine Learning (ML) is a subset of AI that enables computers to learn and improve
from experience without being explicitly programmed.

Key concepts in machine learning include:
- Supervised Learning: Learning with labeled data
- Unsupervised Learning: Finding patterns in unlabeled data
- Reinforcement Learning: Learning through interaction with environment
- Deep Learning: Using neural networks with multiple layers

Applications of AI include natural language processing, computer vision,
robotics, and recommendation systems.
"""

sample_doc2 = """
Python Programming Best Practices

Python is a high-level programming language known for its simplicity and readability.
Here are some best practices for Python development:

1. Code Style:
   - Follow PEP 8 style guidelines
   - Use meaningful variable names
   - Write docstrings for functions and classes

2. Error Handling:
   - Use try-except blocks appropriately
   - Handle specific exceptions
   - Provide meaningful error messages

3. Code Organization:
   - Use modules and packages
   - Separate concerns into different functions
   - Keep functions small and focused

4. Testing:
   - Write unit tests
   - Use pytest or unittest frameworks
   - Aim for high test coverage
"""

# Save sample documents
with open("ai_ml_guide.txt", "w") as f:
    f.write(sample_doc1)

with open("python_best_practices.txt", "w") as f:
    f.write(sample_doc2)

print(" Sample documents created for testing!")


Libraries imported successfully!
 Sample documents created for testing!


In [24]:
class SimpleRAG:
    def __init__(self):
        # Initialize embeddings
        print("Initializing embeddings...")
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'}
        )

        # Initialize text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=100,
        )

        self.vectorstore = None
        self.documents = []
        print(" SimpleRAG initialized!")

    def load_documents(self, file_paths):
        """Load and process documents"""
        all_docs = []

        for file_path in file_paths:
            print(f"Loading: {file_path}")
            if file_path.endswith('.pdf'):
                loader = PyPDFLoader(file_path)
            else:
                loader = TextLoader(file_path, encoding='utf-8')

            docs = loader.load()
            all_docs.extend(docs)

        # Split into chunks
        texts = self.text_splitter.split_documents(all_docs)
        self.documents = texts

        # Create vector store
        print("Creating vector database...")
        self.vectorstore = Chroma.from_documents(
            documents=texts,
            embedding=self.embeddings
        )

        return f"Loaded {len(texts)} chunks from {len(file_paths)} documents"

    def search_documents(self, query, k=3):
        """Search for relevant documents"""
        if not self.vectorstore:
            return "Please load documents first!"

        docs = self.vectorstore.similarity_search(query, k=k)
        return docs

    def answer_question(self, question):
        """Answer question based on documents"""
        if not self.vectorstore:
            return "Please load documents first!"

        # Get relevant documents
        relevant_docs = self.search_documents(question, k=2)

        # Create simple answer by extracting relevant text
        context = "\n\n".join([doc.page_content for doc in relevant_docs])

        # Simple keyword-based answer extraction
        sentences = context.split('.')
        question_words = set(question.lower().split())

        # Find most relevant sentences
        scored_sentences = []
        for sentence in sentences:
            sentence = sentence.strip()
            if len(sentence) > 20:  # Skip very short sentences
                sentence_words = set(sentence.lower().split())
                score = len(question_words.intersection(sentence_words))
                if score > 0:
                    scored_sentences.append((score, sentence))

        # Sort by relevance and take top sentences
        scored_sentences.sort(reverse=True)

        if scored_sentences:
            answer = ". ".join([sent[1] for sent in scored_sentences[:2]]) + "."
        else:
            answer = relevant_docs[0].page_content[:300] + "..." if relevant_docs else "No relevant information found."

        # Add source information
        sources = []
        for i, doc in enumerate(relevant_docs):
            sources.append(f"Source {i+1}: {doc.page_content[:100]}...")

        return {
            "answer": answer,
            "sources": sources
        }

# Initialize the system
rag_system = SimpleRAG()

# Load sample documents automatically
print("Loading sample documents...")
result = rag_system.load_documents(["ai_ml_guide.txt", "python_best_practices.txt"])
print(result)


Initializing embeddings...
 SimpleRAG initialized!
Loading sample documents...
Loading: ai_ml_guide.txt
Loading: python_best_practices.txt
Creating vector database...
Loaded 4 chunks from 2 documents


In [25]:
test_questions = [
    "What is machine learning?",
    "What are Python best practices?",
    "Tell me about supervised learning",
    "How should I organize Python code?"
]

print(" Testing the RAG system:")
print("=" * 50)

for question in test_questions:
    print(f"\n Question: {question}")
    result = rag_system.answer_question(question)
    print(f"Answer: {result['answer']}")
    print(" Sources:")
    for source in result['sources']:
        print(f"   - {source}")
    print("-" * 30)


def process_question(question):
    """Process user question"""
    if not question.strip():
        return "Please enter a question!"

    result = rag_system.answer_question(question)

    # Format response
    response = f"**Answer:** {result['answer']}\n\n"
    response += "**Sources:**\n"
    for source in result['sources']:
        response += f"• {source}\n"

    return response

# Create simple interface
interface = gr.Interface(
    fn=process_question,
    inputs=gr.Textbox(
        label="Ask a question about AI/ML or Python",
        placeholder="e.g., What is machine learning?",
        lines=2
    ),
    outputs=gr.Markdown(label="Answer with Sources"),
    title="🧠 Personal Knowledge Assistant",
    description="Ask questions about the loaded documents (AI/ML and Python best practices)",
    examples=[
        "What is machine learning?",
        "What are Python best practices?",
        "Tell me about supervised learning",
        "How should I organize Python code?",
        "What is deep learning?",
        "What are the types of machine learning?"
    ]
)

print("Gradio interface created!")


🧪 Testing the RAG system:

 Question: What is machine learning?
Answer: Machine Learning (ML) is a subset of AI that enables computers to learn and improve
from experience without being explicitly programmed. Machine Learning (ML) is a subset of AI that enables computers to learn and improve
from experience without being explicitly programmed.
 Sources:
   - Source 1: Artificial Intelligence and Machine Learning

Artificial Intelligence (AI) is a branch of computer s...
   - Source 2: Artificial Intelligence and Machine Learning

Artificial Intelligence (AI) is a branch of computer s...
------------------------------

 Question: What are Python best practices?
Answer: Here are some best practices for Python development:

1. Here are some best practices for Python development:

1.
 Sources:
   - Source 1: Python Programming Best Practices

Python is a high-level programming language known for its simplic...
   - Source 2: Python Programming Best Practices

Python is a high-level program

In [26]:
print("Launching Knowledge Assistant...")
print(" Pre-loaded documents:")
print("- AI and Machine Learning guide")
print("- Python Programming Best Practices")
print("\n Try asking questions from the examples!")

# Launch the interface
interface.launch(
    share=True,
    debug=False,  # Set to False to avoid infinite loop
    show_error=True
)

print(" App launched successfully!")


Launching Knowledge Assistant...
 Pre-loaded documents:
- AI and Machine Learning guide
- Python Programming Best Practices

 Try asking questions from the examples!
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://359d83160c6da60b29.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


 App launched successfully!
