In [1]:
from typing import Optional

In [2]:
# Execute the two sibling notebooks before anything else in this notebook runs.
# NOTE(review): presumably these define the `Firebase` and `Bucket` classes used
# in later cells (neither name is imported anywhere else here) — confirm.
%run Firebase.ipynb
%run Bucket.ipynb

In [None]:
from sklearn.model_selection import train_test_split
import os
from dotenv import load_dotenv
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode

# Pull settings from a local .env file into the process environment.
load_dotenv()

# Tunable settings for this run, kept together so they are easy to find.
DOCUMENT_LIMIT = 50  # passed as `limit` to Firebase.get_all_documents
TEST_FRACTION = 0.2  # share of nodes held out as the test split
SPLIT_SEED = 42      # fixed seed: without it every run yields a different split

# `Firebase` is not imported in this notebook.
# NOTE(review): presumably it comes from `%run Firebase.ipynb` — confirm.
firebase = Firebase()
llama_index_documents = firebase.get_all_documents(
  limit=DOCUMENT_LIMIT
)

# Break the documents into nodes using SentenceSplitter's default settings.
parser = SentenceSplitter()
nodes = parser.get_nodes_from_documents(llama_index_documents, show_progress=True)

# Seeded so the train/test datasets generated from these nodes are reproducible.
train_nodes, test_nodes = train_test_split(
    nodes,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)
#####################################################
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.llms.openai import OpenAI

# Fail early with a readable message. The previous
# `os.environ[...] = os.getenv(...)` raised an opaque
# "TypeError: str expected, not NoneType" whenever the key was missing.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to the environment or the .env file."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key

# One LLM configuration shared by both splits, so train and test question
# generation cannot drift apart through a one-sided edit.
qa_llm = OpenAI(
    model="gpt-3.5-turbo",
    logprobs=False,
    default_headers={},
)

# Generate question/context pairs for each split of nodes.
train_dataset = generate_qa_embedding_pairs(llm=qa_llm, nodes=train_nodes)
test_dataset = generate_qa_embedding_pairs(llm=qa_llm, nodes=test_nodes)

In [None]:
from pathlib import Path

# Local file that gets uploaded; the first argument of `upload_object` is a
# local path (the model upload later in this notebook uses it the same way).
QA_DATASET_PATH = 'qa_finetune_dataset.json'

# Nothing else in this notebook writes this file, so a fresh
# "Restart & Run All" had nothing to upload. Create it when it is absent and
# leave an existing file untouched.
if not Path(QA_DATASET_PATH).exists():
    # NOTE(review): assumes the training split is what this file should hold;
    # confirm whether the test split (or both) belongs here instead.
    train_dataset.save_json(QA_DATASET_PATH)

bucket = Bucket('rag-outputs-pdf')
bucket.upload_object(QA_DATASET_PATH, 'evaluation/qa_finetune_dataset.json')

In [None]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine
import os
from dotenv import load_dotenv

# Re-load .env so this cell also works when run on its own.
load_dotenv()

# Only export the token when one is configured. The previous unconditional
# `os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")` raised
# "TypeError: str expected, not NoneType" if the variable was missing.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    os.environ["HF_TOKEN"] = hf_token
else:
    print("HF_TOKEN is not set; continuing without a Hugging Face token.")

# Configure fine-tuning of the base embedding model on the generated QA pairs;
# `test_dataset` is supplied as the validation set.
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="BAAI/bge-m3",
    model_output_path="test_model",
    val_dataset=test_dataset,
)

In [None]:
# Run the fine-tuning job on the engine configured in the previous cell.
# NOTE(review): presumably writes the result under the engine's
# model_output_path ("test_model") — confirm against the llama_index docs.
finetune_engine.finetune()

In [None]:
# Fetch the fine-tuned model from the engine; later cells pickle this object.
embed_model = finetune_engine.get_finetuned_model()

In [None]:
# Display the concrete class of the returned model as the cell output.
embed_model.__class__

In [13]:
import os
import pickle

MODEL_PICKLE_PATH = 'models/finetuned_model.pkl'

# open(..., 'wb') does not create missing parent directories, so make sure
# `models/` exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(os.path.dirname(MODEL_PICKLE_PATH), exist_ok=True)

# Serialize the fine-tuned embedding model to disk.
with open(MODEL_PICKLE_PATH, 'wb') as f:
    pickle.dump(embed_model, f)

In [None]:
# Upload the pickled model using the same relative path as the object name.
# Relies on `bucket` created in the earlier dataset-upload cell.
bucket.upload_object('models/finetuned_model.pkl', 'models/finetuned_model.pkl')

In [None]:
import pickle

# Read from the same relative path this notebook writes and uploads
# ('models/finetuned_model.pkl'). The previous '../models/...' pointed at a
# different directory than the file produced above.
# NOTE(security): pickle.load can execute arbitrary code; only load files that
# this pipeline produced itself, never ones from an untrusted source.
with open('models/finetuned_model.pkl', 'rb') as f:
    embedding_model = pickle.load(f)