<a href="https://colab.research.google.com/github/hongqin/AI4Health/blob/main/RAG_wikipedia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Python Tutorial: Chat with a Local Document using an Open-Source LLM on Google Colab

# Step 1: Install Required Libraries
# To get started, we'll use the `langchain` library for building language model applications and the `transformers` library for loading open-source LLMs.

# Install libraries
!pip install langchain transformers sentence_transformers chromadb beautifulsoup4 requests protobuf==3.20.3

# Step 2: Load the Open-Source Language Model
# We'll use a pre-trained model like `tiiuae/falcon-7b` or similar. Make sure to select a model suitable for your needs.

from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model and tokenizer
def load_model():
    model_name = "tiiuae/falcon-7b"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")
    return model, tokenizer

model, tokenizer = load_model()

# Step 3: Fetch and Index Text from Wikipedia
# We'll scrape text from a Wikipedia page, process it, and index it for querying.

import requests
from bs4 import BeautifulSoup
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Fetch text from a Wikipedia page
def fetch_wikipedia_text(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = soup.find_all('p')
    text = "".join([para.get_text() for para in paragraphs])
    return text

# Index the document
def index_document_from_text(text):
    # Split text into manageable chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    docs = text_splitter.split_text(text)

    # Use embeddings for indexing
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    db = Chroma.from_texts(docs, embeddings)
    return db

# Example: Fetch and index a Wikipedia page
wiki_url = "https://en.wikipedia.org/wiki/Computer#History"
wikipedia_text = fetch_wikipedia_text(wiki_url)
db = index_document_from_text(wikipedia_text)

# Step 4: Build a Chat Interface
# Create a function to interact with your document using the LLM and the indexed data.

from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Define a query-answering chain
def create_qa_chain(model, db):
    prompt_template = PromptTemplate(template="""
    Use the context below to answer the question.

    Context:
    {context}

    Question:
    {question}

    Answer:
    """, input_variables=["context", "question"])

    retriever = db.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(llm=model, retriever=retriever, prompt_template=prompt_template)
    return qa_chain

qa_chain = create_qa_chain(model, db)

# Step 5: Query the Document
# Finally, input a question and get an answer based on your document.

# Ask a question
def ask_question(question):
    response = qa_chain.run(question)
    return response

question = "What is the history of computers?"
answer = ask_question(question)
print("Answer:", answer)

# Step 6: Optional Enhancements
# - **Streamlit UI:** Build a web interface for interaction.
# - **Model Fine-Tuning:** Customize the model for your specific data.
# - **Cloud Integration:** Store and index large documents using cloud storage.

# This completes the tutorial on using an open-source LLM to chat with a Wikipedia page on Google Colab.




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")


ValidationError: 2 validation errors for LLMChain
llm.is-instance[Runnable]
  Input should be an instance of Runnable [type=is_instance_of, input_value=FalconForCausalLM(
  (tra...res=65024, bias=False)
), input_type=FalconForCausalLM]
    For further information visit https://errors.pydantic.dev/2.10/v/is_instance_of
llm.is-instance[Runnable]
  Input should be an instance of Runnable [type=is_instance_of, input_value=FalconForCausalLM(
  (tra...res=65024, bias=False)
), input_type=FalconForCausalLM]
    For further information visit https://errors.pydantic.dev/2.10/v/is_instance_of