In [None]:
from llama_index.llms import LlamaCPP

# Direct-download URL of the quantised Mistral-7B-Instruct GGUF weights.
# Hugging Face serves the raw file under /resolve/<revision>/; the /blob/ form
# is the HTML viewer page and cannot be used to download the model.
model_url = "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q5_K_M.gguf"
# Local location of the same GGUF file (placeholder path: adapt to your machine).
path_model = "/path_to_the_model_mistral/mistral-7b-instruct-v0.1.Q5_K_M.gguf"


# Local llama.cpp-backed LLM used by the rest of the notebook.
# NOTE(review): LlamaCPP appears to prefer `model_path` when both arguments are
# given and to use `model_url` only when no path is provided — confirm against
# the installed llama_index version.
# NOTE(review): n_gpu_layers=-1 is presumably "offload every layer to the GPU"
# in llama.cpp — confirm.
llm = LlamaCPP(
    model_url=model_url,
    model_path=path_model,
    model_kwargs={"n_gpu_layers": -1},
    verbose=True,
    temperature=0.2,
)

In [None]:
from llama_index import StorageContext, load_index_from_storage
from llama_index import Prompt

# Directory the vector index is loaded from ("<vector_database>" is a
# placeholder to replace with a real path).
path_vector_db = "<vector_database>"
# Build a storage context on that directory, then load the index it holds.
storage_context = StorageContext.from_defaults(persist_dir=path_vector_db)
index = load_index_from_storage(storage_context)

# Creating the conversation engine
## index.as_chat_engine() if you want a chatbot
## index.as_query_engine() if you want it to answer your questions somewhat better, but without interaction

In [None]:
# The similarity_top_k parameter sets how many text chunks from your database
# are going to be used!
# Question-answering prompt. {context_str} and {query_str} are left as literal
# placeholders here — presumably filled in by the query engine with the
# retrieved chunks and the user's question (confirm against llama_index docs).
template = (
    "Voici du context pour répondre à une question que je vais te poser \n"
    "{context_str} \n"
    "Réponds à {query_str} à partir des informations précédentes \n"
)

# Wrap the raw template string in a llama_index Prompt object.
qa_template = Prompt(template)

# chatbot = index.as_chat_engine()
# Query engine built on the loaded index, using the custom prompt above.
QA_engine = index.as_query_engine(verbose=True,
                                  text_qa_template=qa_template,
                                  similarity_top_k=3)

In [None]:
# chat = chatbot.chat("bonjour j'ai une question")
# Send one question through the query engine; the answer is kept in `response`
# (it is not displayed by this cell).
response = QA_engine.query("bonjour j'ai une question")

# Next: a script to fine-tune your embedding model, should you need it

In [None]:
# Amount of data generated (passed as n_texts to random_texts in the training cell)
N_TEXTS = 100

# Number of fine-tuning epochs: lower it if training takes too long,
# raise it if you want better performance
N_EPOCHS = 10


In [None]:
from question_answering_dataset_generator import random_texts, questions_generation
from sentence_transformers import InputExample
from llama_index import StorageContext, load_index_from_storage

from torch.utils.data import DataLoader
import random

import joblib
from sentence_transformers import losses, SentenceTransformer
import torch
from tqdm import tqdm

# Pick the torch device to train on. The availability checks have to be
# *called*: a bare `torch.cuda.is_available` is a function object, which is
# always truthy. Device strings are lower-case, and the MPS check lives under
# torch.backends.
if torch.cuda.is_available():
    GPU_DEVICE = "cuda"
elif torch.backends.mps.is_available():
    GPU_DEVICE = "mps"
else:
    GPU_DEVICE = "cpu"

# NOTE(review): machine-specific absolute path containing a "<...>" placeholder;
# point it at your own persisted index directory.
path_vector_db = "/Users/jean-baptistechaudron/Documents/Uruk/<uruk_maquette_vicuna>"

# Reload the persisted index that the training texts are drawn from.
storage_context = StorageContext.from_defaults(persist_dir=path_vector_db)
index = load_index_from_storage(storage_context)

# Draw N_TEXTS texts from the index and have the LLM produce a question for
# each one. NOTE(review): `dataset` is assumed to be a dict mapping
# question -> answer text, based on how its items are unpacked below — confirm
# against question_answering_dataset_generator.
answers = random_texts(index, n_texts=N_TEXTS)
dataset = questions_generation(answers, llm)

# Random 70 % / 30 % train / evaluation split over the (question, answer) pairs.
all_data = list(dataset.items())
train_index = random.sample(range(len(all_data)), k=int(0.7 * len(all_data)))
train_positions = set(train_index)  # O(1) membership test for the split below
train_data = [all_data[i] for i in train_index]
eval_data = [pair for i, pair in enumerate(all_data) if i not in train_positions]

# Persist both splits so the exact same data can be reused later.
joblib.dump(train_data, "training_qa_dataset.joblib")
joblib.dump(eval_data, "evaluation_qa_dataset.joblib")

if not train_data:
    raise ValueError(
        "No training pairs were generated; increase N_TEXTS or check the dataset generation step."
    )

# One InputExample per (question, answer) pair.
train_examples = [InputExample(texts=[q, a]) for (q, a) in train_data]

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)

# Fine-tuning starts from the published base checkpoint; `embedding_model` is
# where the fine-tuned weights are written.
# NOTE(review): the original loaded the "-finetuned" name and called save()
# without a path; loading the base model and saving to the "-finetuned" path
# is the presumed intent — confirm.
base_embedding_model = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
embedding_model = "sentence-transformers/multi-qa-mpnet-base-dot-v1-finetuned"
model = SentenceTransformer(base_embedding_model, device=GPU_DEVICE)
train_loss = losses.MultipleNegativesRankingLoss(model=model)

model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=N_EPOCHS)
# save() requires the output directory as an argument.
model.save(embedding_model)

# Next: evaluate how well the new model performs compared with the other one

In [None]:
def dot_product_scores(st_model, qa_pairs):
    """Score each (question, answer) pair by the dot product of its embeddings.

    Args:
        st_model: a SentenceTransformer used to embed both texts.
        qa_pairs: iterable of (question, answer) pairs — assumed to be plain
            strings, as produced by the dataset split in the training cell.

    Returns:
        A list of Python floats, one per pair, in input order.
    """
    scores = []
    for question, answer in tqdm(qa_pairs):
        # encode() takes a single sentence and returns one embedding vector.
        # tokenize() expects a *list* of sentences, so handing it a bare string
        # makes it treat every character as a separate sentence.
        emb_q = st_model.encode(question, convert_to_tensor=True, show_progress_bar=False)
        emb_a = st_model.encode(answer, convert_to_tensor=True, show_progress_bar=False)
        scores.append(torch.dot(emb_q, emb_a).item())
    return scores


# Scores of the fine-tuned model on the held-out pairs.
result = dot_product_scores(model, eval_data)

# Baseline: the published checkpoint, loaded on the same device as the
# fine-tuned model so both are evaluated under identical conditions.
vanilla_embedding_model = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
model_vanilla = SentenceTransformer(vanilla_embedding_model, device=model.device)
results_vanilla = dot_product_scores(model_vanilla, eval_data)

# Side-by-side summary instead of one printed tensor per pair.
for label, scores in (("fine-tuned", result), ("vanilla", results_vanilla)):
    if scores:
        mean_score = sum(scores) / len(scores)
        print(f"{label}: mean dot-product score = {mean_score:.4f} over {len(scores)} pairs")
    else:
        print(f"{label}: no evaluation pairs to score")