In [None]:
# https://github.com/run-llama/llama_parse/blob/main/examples/demo_advanced.ipynb
# https://github.com/run-llama/llama_index/blob/main/llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py

import os
import pickle

import arxiv
import torch
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_parse import LlamaParse

# When True, the PDFs are re-downloaded from arXiv and re-parsed; when False,
# the previously pickled documents are loaded from disk instead.
REFRESH_DOCUMENTS = False

# Hugging Face model id used for both the LLM weights and its tokenizer.
# Uncomment one of the alternatives below to switch models.
llm_model_name = "openai-community/gpt2"
# llm_model_name = "mistralai/Mistral-7B-v0.1"
# llm_model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Hugging Face model id for the text-embedding model.
embed_model_name = "BAAI/bge-base-en-v1.5"
# embed_model_name = "BAAI/bge-large-en-v1.5"

# Hugging Face model id for a reranker.
# NOTE(review): not referenced by any cell visible here — confirm it is used later.
reranker_model_name = "BAAI/bge-reranker-base"
# reranker_model_name = "BAAI/bge-reranker-large"

In [None]:
# Instantiate the embedding model selected in the configuration cell.
embed = HuggingFaceEmbedding(model_name=embed_model_name)

# Smoke test: embed a short string and display the vector's dimensionality.
sample_text = "Hello World!"
embedding = embed.get_text_embedding(sample_text)
len(embedding)

In [None]:
# TODO
# Half precision halves GPU memory use, but float16 is poorly supported (and on
# some PyTorch builds unsupported) for CPU inference, so fall back to float32
# when no CUDA device is available.
llm_torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Local Hugging Face LLM used for answer generation.
llm = HuggingFaceLLM(
    context_window=1024,
    max_new_tokens=256,
    # Greedy decoding, so repeated runs produce the same output.
    generate_kwargs={"do_sample": False},
    # system_prompt=system_prompt,
    # query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=llm_model_name,
    model_name=llm_model_name,
    device_map="auto",
    # Kept equal to context_window above; change both together.
    tokenizer_kwargs={"max_length": 1024},
    model_kwargs={"torch_dtype": llm_torch_dtype},
)

In [None]:
# Register the embedding model and LLM on the shared llama-index Settings object
# (presumably picked up as defaults by the index build later on — confirm).
Settings.embed_model = embed
Settings.llm = llm
# Chunking parameters for splitting documents.
# NOTE(review): units are presumably tokens per the llama-index docs — confirm.
Settings.chunk_size = 512
Settings.chunk_overlap = 128

In [None]:
# Where downloaded PDFs live and where the parsed documents are cached.
DOCUMENTS_DIR = "./documents"
DOCUMENTS_CACHE = "output/documents.pkl"

if REFRESH_DOCUMENTS:
    # Download the most recently submitted arXiv PDFs matching the query.
    # The target directory must exist before anything is written into it.
    os.makedirs(DOCUMENTS_DIR, exist_ok=True)
    client = arxiv.Client()
    search = arxiv.Search(
        query="machine learning",
        max_results=10,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    for result in client.results(search):
        result.download_pdf(dirpath=DOCUMENTS_DIR)

    # Parse the PDFs with LlamaParse.
    # The API key is read from the environment so that it never lands in the
    # notebook or in version control; a missing variable raises KeyError here.
    # Any key that was previously committed in this cell should be revoked.
    parser = LlamaParse(
        api_key=os.environ["LLAMA_CLOUD_API_KEY"],
        result_type="markdown",  # "markdown" and "text" are available
        verbose=True,
    )
    file_extractor = {".pdf": parser}
    reader = SimpleDirectoryReader(DOCUMENTS_DIR, file_extractor=file_extractor)
    documents = reader.load_data(num_workers=4)

    # Cache the parsed documents on disk so later runs can skip the
    # download/parse step; create the output directory on first use.
    os.makedirs(os.path.dirname(DOCUMENTS_CACHE), exist_ok=True)
    with open(DOCUMENTS_CACHE, "wb") as f:
        pickle.dump(documents, f)
else:
    # SECURITY: pickle.load can execute arbitrary code. Only load a cache file
    # that this notebook produced itself; never one from an untrusted source.
    with open(DOCUMENTS_CACHE, "rb") as f:
        documents = pickle.load(f)

In [None]:
# Display how many documents were parsed or loaded from the cache.
len(documents)

In [None]:
# Preview the first 1000 characters of the first document's text.
preview = documents[0].text[:1000]
print(f"{preview}...")

In [None]:
# Build a vector index from the loaded documents.
# NOTE(review): presumably chunks and embeds them using the values registered
# on Settings earlier in the notebook — confirm.
index = VectorStoreIndex.from_documents(documents)