In [36]:
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini
from lavague.contexts.openai import OpenaiContext

from llama_index.multi_modal_llms.openai import OpenAIMultiModal


# Gemini supplies both the embedding model and the text LLM.
embed_model = GeminiEmbedding(
    model_name="models/text-embedding-004",
)
llm = Gemini(
    model_name="models/gemini-1.5-flash-latest",
)

# Start from the stock OpenAI context, then replace its LLM and embedding
# with the Gemini models created above.
context = OpenaiContext(llm="gpt-4o")
context.llm = llm
context.embedding = embed_model

# Multimodal model: GPT-4o with temperature 0.0.
mm_llm = OpenAIMultiModal(
    model="gpt-4o",
    temperature=0.0,
)


In [37]:
import requests
# Page to download, and the question to ask about its content.
url = "https://huggingface.co/tiiuae/falcon-11B"
instruction = "How was falcon-11B trained?"

# Alternative target (disabled):
# url = "https://fr.wikipedia.org/wiki/Yann_Le_Cun"
# instruction = "where does Yann LeCun work ?"

In [38]:
from llama_index.core.tools import FunctionTool
import trafilatura
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.agent import ReActAgent


# Download the target page. A timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of silently saving (and later indexing) an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content

# Save the raw bytes to disk so the extraction tool can be handed a file path.
with open('example.html', 'wb') as f:
    f.write(html)

# Location of the saved page.
state = {
    'html_file': 'example.html',
}


# NOTE: this function is wrapped with FunctionTool.from_defaults without an
# explicit description, so its docstring (including the few-shot examples) is
# what the agent sees as the tool description. Keep the examples accurate.
def extract_text_content(instruction, html_file):
    """
    Extract the text content of the HTML page and use it to answer an instruction.

    Args:
        instruction: The question or task to answer from the page content.
        html_file: Path of the HTML file to read.

    Returns:
        The response text produced by querying the extracted page content.

    Raises:
        ValueError: If no text content could be extracted from the HTML file.

    Examples:
    {
        "user": "Use the content of the HTML page to answer the question 'How was falcon-11B trained?'",
        "agent": "I need to use the 'extract_text_content' tool to get information about how Falcon-11B was trained.",
        "action": "extract_text_content",
        "action_input": {
            "instruction": "How was Falcon-11B trained?",
            "html_file": "example.html"
        },
    },
    {
        "user": "Use the content of the HTML page to answer the question 'What are the main types of renewable energy?'",
        "agent": "I need to use the 'extract_text_content' tool to get information about the main types of renewable energy.",
        "action": "extract_text_content",
        "action_input": {
            "instruction": "What are the main types of renewable energy?",
            "html_file": "example.html"
        },
    },
    {
        "user": "Use the content of Yann LeCun's Wikipedia page to make a summary of his life.",
        "agent": "I need to use the 'extract_text_content' tool to get information from Yann LeCun's Wikipedia page to make a summary of his life.",
        "action": "extract_text_content",
        "action_input": {
            "instruction": "Make a summary of Yann LeCun's life.",
            "html_file": "example.html"
        },
    }
    """
    # The file is only read, never deleted: removing it here would make the
    # tool single-use (a second call would fail to open the file), and
    # deleting a file while its handle is still open fails on Windows.
    with open(html_file, 'rb') as f:
        raw_html = f.read()

    # trafilatura returns None when it cannot find any main content; fail
    # with a clear message instead of an opaque error from Document(...).
    page_content = trafilatura.extract(raw_html)
    if not page_content:
        raise ValueError(f"No text content could be extracted from {html_file!r}")

    # NOTE(review): no embed_model is passed, so the index presumably uses
    # LlamaIndex's globally configured embedding model rather than the
    # notebook's `embed_model` -- confirm this is intended.
    documents = [Document(text=page_content)]
    index = VectorStoreIndex.from_documents(documents)

    # Answer the instruction against the indexed page using the notebook-level LLM.
    query_engine = index.as_query_engine(llm=llm)
    return query_engine.query(instruction).response

# Expose the extractor as an agent tool. return_direct=True asks the agent to
# hand back the tool's output as its response.
extract_tool = FunctionTool.from_defaults(
    fn=extract_text_content,
    return_direct=True,
)


In [39]:
# Build a verbose ReAct agent whose only tool is the page extractor, then ask
# it a question about the saved page.
agent = ReActAgent.from_tools(
    [extract_tool],
    llm=llm,
    verbose=True,
)
answer = agent.chat("Extract the code snippet to use Falcon11-b")

Thought: I need to use the 'extract_text_content' tool to get the code snippet to use Falcon11-b.
Action: extract_text_content
Action Input: {'instruction': 'Extract the code snippet to use Falcon11-b', 'html_file': 'example.html'}
[0m[1;3;34mObservation: ```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

model = "tiiuae/falcon-11B"
tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
sequences = pipeline(
    "Can you explain the concepts of Quantum Computing?",
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")
```