In [None]:
# Run the helper notebooks before anything else in this notebook.
# NOTE(review): presumably these define the `Firebase` and `Bucket` classes used
# in later cells (neither name is imported anywhere else here) — confirm.
%run Firebase.ipynb
%run Bucket.ipynb

In [None]:
from sklearn.model_selection import train_test_split
import os
from dotenv import load_dotenv
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode

# Read variables from a local .env file into the process environment.
load_dotenv()

# Fetch at most 50 documents through the Firebase helper.
firebase = Firebase()
llama_index_documents = firebase.get_all_documents(limit=50)

# Chunk the documents into nodes using SentenceSplitter's default settings.
parser = SentenceSplitter()
nodes = parser.get_nodes_from_documents(llama_index_documents, show_progress=True)

# Train/test split is currently disabled: every node is used downstream.
# train_nodes, test_nodes = train_test_split(nodes, test_size=0.2)

In [None]:
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.llms.openai import OpenAI

# The OpenAI LLM below needs an API key, so fail fast with a readable message
# when it is absent. The previous self-assignment
# `os.environ[...] = os.getenv(...)` was a no-op when the key was set and raised
# an opaque `TypeError: str expected, not NoneType` when it was not.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "before running this cell."
    )

# Build the question/answer embedding pairs from the nodes produced earlier,
# using gpt-3.5-turbo as the generating LLM.
training_dataset = generate_qa_embedding_pairs(
    llm=OpenAI(
        model="gpt-3.5-turbo",
        logprobs=False,
        default_headers={},
    ),
    nodes=nodes,
)


In [None]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine
from config import settings
import torch

# Publish the Hugging Face token from project settings as HF_TOKEN.
# NOTE(review): assumes settings.hf_token is a non-empty str — os.environ
# rejects None; confirm against config.
os.environ["HF_TOKEN"] = settings.hf_token

# Prefer Apple's Metal backend (the original hard-coded choice), but fall back
# to CUDA and then CPU so the notebook also runs on machines without MPS.
if torch.backends.mps.is_available():
    training_device = torch.device("mps")
elif torch.cuda.is_available():
    training_device = torch.device("cuda")
else:
    training_device = torch.device("cpu")

# Legacy name kept so any other cell that still refers to it keeps working.
mps_device = training_device

# Configure fine-tuning of BAAI/bge-m3 on the generated QA dataset; output is
# written under "test_model".
finetune_engine = SentenceTransformersFinetuneEngine(
    training_dataset,
    model_id="BAAI/bge-m3",
    model_output_path="test_model",
    device=training_device,
)

In [None]:
# Start the fine-tuning run using the engine configured in the previous cell.
finetune_engine.finetune()

In [None]:
# Retrieve the fine-tuned model from the engine; it is pickled in the next cell.
embed_model = finetune_engine.get_finetuned_model()

In [7]:
import pickle
from pathlib import Path

# Serialise the fine-tuned model to disk so it can be uploaded afterwards.
# NOTE: pickle files execute code on load — only unpickle this artefact from a
# trusted location.
model_path = Path('models/finetuned_model.pkl')

# Opening a file for writing does not create missing parent directories, so
# make sure `models/` exists (previously a FileNotFoundError on a fresh checkout).
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open('wb') as f:
    pickle.dump(embed_model, f)

In [None]:
# Upload the pickled model via the Bucket helper; the same string is passed
# for both arguments of upload_object.
model_key = 'models/finetuned_model.pkl'
bucket = Bucket(settings.aws_s3_output_bucket)
bucket.upload_object(model_key, model_key)