In [None]:
import os
import json
import glob
import pickle
import openparse
from tqdm import tqdm
from huggingface_hub import login
import google.generativeai as genai
from llama_index.core import Settings
from llama_index.llms.gemini import Gemini
from transformers import BitsAndBytesConfig
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.core.llms.chatml_utils import messages_to_prompt, completion_to_prompt

In [None]:
# Local cache locations for downloaded Hugging Face models and tiktoken encodings.
HF_CACHE_DIR = "../models/hf"
TIKTOKEN_CACHE_DIR = "../models/tiktoken"

# Export both locations through the environment in a single step.
# NOTE(review): huggingface_hub / transformers are imported in the import cell
# before HF_HOME is set here — confirm the cache location is still honored.
os.environ.update({
    'HF_HOME': HF_CACHE_DIR,
    "TIKTOKEN_CACHE_DIR": TIKTOKEN_CACHE_DIR,
})
# assert os.path.exists(os.path.join(TIKTOKEN_CACHE_DIR, "9b5ad71b2ce5302211f9c61530b329a4922fc6a4"))

In [None]:
# Load the API credentials from a JSON file (SHOULD BE CREATED BY USER).
with open('../reqs/tokens.json', 'r') as file:
    tokens = json.loads(file.read())

# Each credential is taken with [0], i.e. the first element of the value
# stored under its key.
HF_ACCESS_TOKEN = tokens['HF_ACCESS_TOKEN'][0]
GOOGLE_API_KEY = tokens['GOOGLE_API_KEY'][0]

# Authenticate against the Hugging Face Hub with the token read above.
login(token=HF_ACCESS_TOKEN)

# Test: Compare LLMs (with different settings)

In [2]:
# # API Test
# genai.configure(api_key=GOOGLE_API_KEY)
# llm = genai.GenerativeModel("gemini-1.0-pro")
# response = llm.generate_content("Write a story about a magic backpack.")
# print(response.text)

In [3]:
# All Gemini variants under comparison share the same model and API key and
# differ only in sampling temperature, so they are built from a single
# name -> temperature table instead of five copy-pasted constructor calls.
GEMINI_MODEL = "models/gemini-1.0-pro"
GEMINI_TEMPERATURES = {
    "gemini_100": 1,
    "gemini_75": 0.75,
    "gemini_50": 0.5,
    "gemini_25": 0.25,
    "gemini_0": 0.,
}

# Mapping from variant name to its configured Gemini LLM (insertion order
# follows GEMINI_TEMPERATURES, i.e. from highest to lowest temperature).
llms = {
    name: Gemini(
        api_key = GOOGLE_API_KEY,
        model = GEMINI_MODEL,
        temperature = temperature,
    )
    for name, temperature in GEMINI_TEMPERATURES.items()
}

# Keep the individual module-level names available for any cell that refers
# to a specific variant directly.
llm_gemini_100 = llms["gemini_100"]
llm_gemini_75 = llms["gemini_75"]
llm_gemini_50 = llms["gemini_50"]
llm_gemini_25 = llms["gemini_25"]
llm_gemini_0 = llms["gemini_0"]

In [None]:
# file = glob.glob("../data/finetune/docs/test/*.pdf")[0]
# pdf = openparse.Pdf(file)

# parser = openparse.DocumentParser(
#     table_args = {
#         # 1. PyMuPDF has some table detection functionality like OCR. (FAST but still GOOD)
#         "parsing_algorithm": "pymupdf",

#         # # 2. unitable is a transformers based approach with state-of-the-art performance. (SLOW)
#         # "parsing_algorithm": "unitable",
#         # "min_table_confidence": 0.8,
#     },
# )

# _nodes = parser.parse(file, ocr=True)
# nodes = _nodes.to_llama_index_nodes()
# with open('../data/finetune/docs/test/nodes.pkl', 'wb') as _file: pickle.dump(nodes, _file)

# print(len(nodes))
# # pdf.display_with_bboxes(_nodes.nodes)
# pdf.export_with_bboxes(_nodes.nodes, output_pdf=file.replace(".pdf", "_annotated.pdf"))

In [None]:
# Load the pickled document nodes for the test document.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# a nodes.pkl that you generated yourself (e.g. via the parsing cell above).
with open('../data/finetune/docs/test/nodes.pkl', 'rb') as nodes_file:
    nodes = pickle.load(nodes_file)

# Generate one QA dataset per LLM variant and keep every result, keyed by the
# variant name, so the runs can be compared afterwards.
test_datasets = {}
for llm_name, llm_model in llms.items():
    # Per the original author's note: generation first loads a previously
    # written JSON file at output_path (if any) and continues from there.
    test_dataset = generate_qa_embedding_pairs(
        llm = llm_model,
        nodes = nodes,
        num_questions_per_chunk = 2,
        output_path = f"../data/finetune/datasets/test/test_{llm_name}.json",
    )
    # `test_dataset` still ends up holding the last variant's result, as before.
    test_datasets[llm_name] = test_dataset

In [6]:
# for node in _nodes.nodes:
#     display(node)
#     print("-------------------")

In [7]:
# class GoogleEmbeddings:
#     def __init__(self, model, api_key):
#         genai.configure(api_key=api_key)
#         self.model = TextEmbeddingModel.from_pretrained(model)

#     def embed_many(self, texts):
#         res = []
#         for i in range(0, len(texts), self.batch_size):
#             batch_texts = texts[i : i + self.batch_size]
#             api_resp = genai.embed_content(
#                 model = self.model,
#                 content = batch_texts,
#                 task_type = "SEMANTIC_SIMILARITY",
#             )
#             batch_res = [val.embedding for val in api_resp.data]
#             res.extend(batch_res)
#         return res

# def cosine_similarity(a, b):
#     return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


# def get_node_similarities(nodes, embedding_client):
#     # get the similarity of each node with the node that precedes it
#     embeddings = embedding_client.embed_many([node.text for node in nodes])
#     similarities = []
#     for i in range(1, len(embeddings)):
#         similarities.append(cosine_similarity(embeddings[i - 1], embeddings[i]))

#     similarities = [round(sim, 2) for sim in similarities]
#     return [0] + similarities


# class MinimalIngestionPipeline(openparse.processing.IngestionPipeline):
#     def __init__(self):
#         self.transformations = [
#             # combines bullets and weird formatting
#             openparse.processing.CombineNodesSpatially(
#                 x_error_margin=10,
#                 y_error_margin=2,
#                 criteria="both_small",
#             ),
#             openparse.processing.CombineHeadingsWithClosestText(),
#             openparse.processing.CombineBullets(),
#             openparse.processing.RemoveMetadataElements(),
#             openparse.processing.RemoveNodesBelowNTokens(min_tokens=10),
#         ]

In [8]:
# embedding_client = GoogleEmbeddings(
#     api_key = GOOGLE_API_KEY,
#     model = "models/text-embedding-004",
# )

# # perform very basic parsing to clean up the document
# doc = openparse.Pdf(file=file)
# parser = openparse.DocumentParser(
#     table_args = {
#         # 1. PyMuPDF has some table detection functionality like OCR. (FAST but still GOOD)
#         "parsing_algorithm": "pymupdf",

#         # # 2. unitable is a transformers based approach with state-of-the-art performance. (SLOW)
#         # "parsing_algorithm": "unitable",
#         # "min_table_confidence": 0.8,
#     },
#     processing_pipeline = MinimalIngestionPipeline(),
# )
# parsed_content = parser.parse(file, ocr=True)

# # annotate the document with similarity scores
# annotations = get_node_similarities(parsed_content.nodes, embedding_client)
# doc.display_with_bboxes(
#     parsed_content.nodes, annotations=annotations, page_nums=[2, 3, 4]
# )