# Install required frameworks

In [None]:
# Install/upgrade the Python packages used below: sentence-transformers,
# llama-index, and the llama-index integrations for Ollama (LLM) and
# HuggingFace (embeddings).
# NOTE(review): bs4, tqdm and epo.tipdata are imported in later cells but are
# not installed here - presumably preinstalled in this environment; confirm.
!pip install -U sentence_transformers llama-index llama-index-llms-ollama llama-index-embeddings-huggingface

# If you are lucky enough to be in the sudoers list:

In [None]:
!sudo curl -fsSL https://ollama.com/install.sh | sh
!ollama pull llama3

# But you are most likely not in the sudoers list, and the kernel does not allow you to run processes in the background.
# So, I suggest you execute the following commands in your terminal instead

In [None]:
# User-space install (no root needed). When typing these in a terminal, drop
# the leading "!" - that prefix is only the notebook's shell escape.
# Download the pinned v0.3.9 Linux/amd64 release archive.
!wget https://github.com/ollama/ollama/releases/download/v0.3.9/ollama-linux-amd64.tgz
# Unpack it in the current directory; the commands below expect ./bin/ollama.
!tar xvfz ollama-linux-amd64.tgz
# Start the Ollama server as a background job ("&") so the shell stays usable.
!./bin/ollama serve &
# Download the llama3 model (presumably requires the server started above to
# be running - confirm).
!./bin/ollama pull llama3

# Prepare the patent data

In [None]:
!mkdir -p data
!rm data/*
from bs4 import BeautifulSoup
from epo.tipdata.epab import EPABClient  # needed here: this cell runs before the indexing cell
from tqdm import tqdm

# Query the EPAB dataset for publications whose description contains all of
# `terms`, then write each description to data/<epab_doc_id>.txt so that the
# indexing cell can load the "data" directory.

# PROD : >7M publications
# TEST : ~10K publications
epab = EPABClient(env="PROD")
# q = epab.query_inventor_name("JOURLIN, Pierre")
terms = ["medical", "device"]
q = epab.query_description(text=",".join(terms), match_all=True, ignore_case=True)
print(f"Found {q} publications containing the all following terms: {terms}")
limit = 500
tab = q.get_results(
    "epab_doc_id, title.fr, abstract, description, publication, inventor", limit=limit
)

doc_ids = tab["epab_doc_id"]
descriptions = tab["description.text"]
# The query may return fewer than `limit` rows; iterate over what actually
# came back instead of assuming `limit` entries exist.
count = len(doc_ids)
print(f"Storing {count} patents to disk...")
for doc_id, data in tqdm(zip(doc_ids, descriptions), total=count):
    # Explicit UTF-8 so the output does not depend on the platform's default
    # encoding (patent text routinely contains non-ASCII characters).
    with open(f"data/{doc_id}.txt", "w", encoding="utf-8") as file:
        print(data, file=file)
    # NOTE(review): descriptions are stored verbatim, markup included if any.
    # If plain text is wanted for indexing, write
    # BeautifulSoup(data, "html.parser").get_text() instead - confirm intent.

# Index the patents and query them with the LLM

In [None]:
from epo.tipdata.epab import EPABClient
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

# Read every file found in the "data" directory as a document to index.
documents = SimpleDirectoryReader("data").load_data()

# Embedding model, set on the shared llama-index Settings object.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="NeuML/pubmedbert-base-embeddings-matryoshka"
)

# LLM: llama3 through Ollama, with a generous request timeout (360.0).
Settings.llm = Ollama(model="llama3", request_timeout=360.0)

# Build the vector index over the loaded documents.
print("Indexing patents can take some time...")
index = VectorStoreIndex.from_documents(documents)
print("Indexing patents completed...")

query_engine = index.as_query_engine()

# Interactive loop: keep answering questions until the user types exactly "bye".
while (query := input("How can I help ? (answer 'bye' to quit) \n>")) != "bye":
    response = query_engine.query(query)
    print(response)