In [1]:
import os
from dotenv import load_dotenv

load_dotenv(override=True)

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")

## Load the csv into a DF

In [2]:
import pandas as pd

csv_html = 'data/vejledninger_html_2-11-2023.csv'

df_html = pd.read_csv(csv_html, names=['Document Title', 'Document Content'], header=0)
df_html.head()

Unnamed: 0,Document Title,Document Content
0,Vejledning om beskæftigelseskravet for ret til...,"<div class=""document-content"" id=""restylingRoo..."
1,Vejledning om lungesygdomme,"<div class=""document-content"" id=""restylingRoo..."
2,At-vejledning 13.0.1-1 om undervisningspligtig...,"<div class=""document-content"" id=""restylingRoo..."
3,Vejledning om behandlingsudgifter,"<div class=""document-content"" id=""restylingRoo..."
4,Arbejde med nanomaterialer,"<div class=""document-content"" id=""restylingRoo..."


In [3]:
csv_tekst = 'data/vejledninger_tekst_2-11-2023.csv'

df_tekst = pd.read_csv(csv_tekst, names=['Document Title', 'Document Content'], header=0)
#Drop first row as it simply contains the strings "Key" and "value"
df_tekst.head()

Unnamed: 0,Document Title,Document Content
0,Vejledning om beskæftigelseskravet for ret til...,Vejledning om beskæftigelseskravet for ret til...
1,Vejledning om lungesygdomme,Vejledning om lungesygdomme\nKapitel 7 i Vejle...
2,At-vejledning 13.0.1-1 om undervisningspligtig...,At-vejledning 13.0.1-1 om undervisningspligtig...
3,Vejledning om behandlingsudgifter,Vejledning om behandlingsudgifter\nIndledning\...
4,Arbejde med nanomaterialer,Arbejde med nanomaterialer\nDe vigtigste regle...


## Load CSV into LlamaIndex document

**First loading from DF into a dict**

In [4]:
document_dict = df_tekst.set_index('Document Title')['Document Content'].to_dict()

**Then transforming the dict into a LlamaIndex document**

Containing the text, and the document title as meta-data

In [6]:
from llama_index import Document
documents = [Document(text=content, metadata={"file_name": name}) for name, content in document_dict.items()]



In [7]:
print(f'Number of documents: {len(documents)}')

Number of documents: 329


## Chuck documents into 'nodes'

In [9]:
from llama_index.node_parser import SimpleNodeParser

node_parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=20)
nodes = node_parser.get_nodes_from_documents(documents, show_progress=True)

Parsing documents into nodes:   0%|          | 0/329 [00:00<?, ?it/s]

In [10]:
#33.418 at chuck size 256
#14.237 at chuck size 512
print(f'Number of chunks (nodes): {len(nodes)}')

Number of chunks (nodes): 14237


In [11]:
print(nodes[5].text)

Der vil normalt ikke være tvivl om, hvorvidt der er tale om et lønmodtagerforhold. For nærmere gennemgang af praksis vedrørende afgrænsningen mellem lønmodtagere og selvstændige henvises til vejledning om sygedagpengeforsikringerne i sygedagpengelovens §§ 45 og 55.

2.1.2.Opgørelse af 160 timer i de seneste 4 afsluttede kalendermåneder
De seneste 4 afsluttede kalendermåneder regnes fra fraværets begyndelse. Det betyder, at en lønmodtager, der påbegynder en fraværsperiode f.eks. den 18. april 2022, skal have opgjort beskæftigelseskravet for ret til barselsdagpenge fra Udbetaling Danmark ud fra kalendermånederne december 2021 og januar, februar og marts 2022. I disse 4 kalendermåneder skal der være mindst 160 indberettede timer i indkomstregistret, og de skal være fordelt således, at der i 3 af kalendermånederne er mindst 40 timer i hver måned.

Det følger af lovens § 27, stk. 2, at der ved opgørelse af de 160 timer kan medregnes timer, hvor lønmodtageren har haft lønarbejde, har drevet 

# Generate synthetic Questions

In [17]:
from llama_index.finetuning import generate_qa_embedding_pairs

In [None]:
from llama_index.prompts import PromptTemplate


# Define your custom prompt template in Danish
qa_sagsbehandler_tmlp = (
""" Nedenfor er et uddrag (kontekst) fra en længere tekst:
---------------------
{context_str}
---------------------
Givet ovenstående uddrag (kontekst) og ingen forudgående viden, er din opgave at generere spørgsmål til teksten.
Du er en erfaren sagsbehandler, og din opgave er at stille præcis {num_questions_per_chunk} spørgsmål til teksten.
Spørgsmålene skal være af forskellig karakter og dække teksten bredt, men stilles i et sprog som en borger uden juridisk ekspertise kan forstå.
Svaret til spørgsmålet, skal kunne findes i ovenstående uddrag (kontekst).
"""
)

qa_sagsbehandler_tmlp = PromptTemplate(qa_sagsbehandler_tmlp)

train_dataset = generate_qa_embedding_pairs(qa_generate_prompt_tmpl=qa_sagsbehandler_tmlp, nodes=nodes[0:300], num_questions_per_chunk=2)
test_dataset = generate_qa_embedding_pairs(qa_generate_prompt_tmpl=qa_sagsbehandler_tmlp, nodes=nodes[300:400], num_questions_per_chunk=2)


**Save dataset**

In [76]:
train_dataset.save_json("train_dataset.json")
test_dataset.save_json("val_dataset.json")

In [19]:
from llama_index.finetuning import EmbeddingQAFinetuneDataset
# Load the datasets from the avove json files
train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
test_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")

## Evaluate performance (using Llamainde)


In [None]:
from llama_index.evaluation import RetrieverEvaluator
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext

vector_index = VectorStoreIndex(nodes, service_context=service_context)
retriever = vector_index.as_retriever(similarity_top_k=2)

retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever
)

## Evaluate performance


In [20]:
def evaluate(dataset, embed_model, top_k=5, verbose=False,
):
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    service_context = ServiceContext.from_defaults(embed_model=embed_model)
    nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()]
    index = VectorStoreIndex(
        nodes, service_context=service_context, show_progress=True
    )
    retriever = index.as_retriever(similarity_top_k=top_k)

    eval_results = []
    for query_id, query in tqdm(queries.items()):
        retrieved_nodes = retriever.retrieve(query)
        retrieved_ids = [node.node.node_id for node in retrieved_nodes]
        expected_id = relevant_docs[query_id][0]
        is_hit = expected_id in retrieved_ids  # assume 1 relevant doc

        eval_result = {
            "is_hit": is_hit,
            "retrieved": retrieved_ids,
            "expected": expected_id,
            "query": query_id,
        }
        eval_results.append(eval_result)
    return eval_results

In [21]:
from llama_index.embeddings import OpenAIEmbedding
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

In [22]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model_hf = HuggingFaceEmbedding(
    model_name='intfloat/multilingual-e5-small',
    normalize=True,
    #Instruction to prepend query
    query_instruction='query:',
    #Instruction to prepend text
    text_instruction='passage:'
)

In [23]:
e5_small_eval = evaluate(test_dataset, embed_model=embed_model_hf, top_k=1, verbose=True)

Generating embeddings:   0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [24]:
e5_small_eval_df = pd.DataFrame(e5_small_eval)
e5_small_eval_df.head()

Unnamed: 0,is_hit,retrieved,expected,query
0,True,[03c49ab1-609f-4fe8-bdd3-62ab0c4182fc],03c49ab1-609f-4fe8-bdd3-62ab0c4182fc,d61ae5cf-c632-4bee-9b55-489bfc4d5c1d
1,True,[03c49ab1-609f-4fe8-bdd3-62ab0c4182fc],03c49ab1-609f-4fe8-bdd3-62ab0c4182fc,3d726de4-9a35-4f9c-a537-47333a94d967
2,True,[4d905db2-bfed-4f81-a700-61fb69a35241],4d905db2-bfed-4f81-a700-61fb69a35241,5c9730a0-60ef-42ad-aa05-a29b9aeb1c8a
3,True,[4d905db2-bfed-4f81-a700-61fb69a35241],4d905db2-bfed-4f81-a700-61fb69a35241,fb4e4d5f-09a2-4096-a771-898ff88c019a
4,True,[5d1f1c84-b843-49d6-81c4-93b25d75cf47],5d1f1c84-b843-49d6-81c4-93b25d75cf47,28e572c3-8e24-4f40-85a8-c47bdc9b7d94


In [25]:
#Count how many is_hit true and divide by total number of queries

e5_small_eval_df.is_hit.sum() / len(e5_small_eval_df)

0.58

In [133]:
#Return the rows with is_hit == False
e5_small_eval_df[e5_small_eval_df.is_hit == False]

Unnamed: 0,is_hit,retrieved,expected,query
11,False,[c9112a99-5aca-43fd-9350-e8f2a6c85023],94b9fa70-2da2-4f2b-b41b-dcf1ad9353d8,a1dc3a85-1602-4403-b90f-87f59086f96d
12,False,[c9112a99-5aca-43fd-9350-e8f2a6c85023],89a74290-f47d-4623-860c-6f1164681ae8,785f7312-b806-45d1-b896-5b08b35366fd
13,False,[560545c6-62ff-420c-a09a-ae75e179d831],89a74290-f47d-4623-860c-6f1164681ae8,5a5e052d-92d2-4fd2-9196-9fbf1dd6f00e
21,False,[a009ed0a-f566-4838-95bf-37416447924e],a7fd88ad-2e38-43de-a96e-81bf6542cb26,e20d599c-aa42-49ed-be26-c1684cf8c0b3
22,False,[89a74290-f47d-4623-860c-6f1164681ae8],2b02f79f-3328-416f-ba5a-31c2561f530c,e60be730-2a61-4393-bb1f-22db589dd900
...,...,...,...,...
190,False,[f5ee60a2-0bc2-48e1-915d-f9e0aed2c927],01a24e38-3494-4553-9ca8-1d7bd37d7b8e,5e9d792f-0c9b-47f8-80b1-82aa317259cf
191,False,[f5ee60a2-0bc2-48e1-915d-f9e0aed2c927],01a24e38-3494-4553-9ca8-1d7bd37d7b8e,4afcca0c-0d08-4e32-88da-2250d678212a
196,False,[7c75d96b-8b60-4e64-81f3-ae6e3afc9e44],ee84b63b-e6fe-4132-8462-869aea4a97af,9595133a-e10d-49d5-879e-60d5d399c028
197,False,[7c75d96b-8b60-4e64-81f3-ae6e3afc9e44],ee84b63b-e6fe-4132-8462-869aea4a97af,4e4acfeb-b576-45ef-ba34-2513cf43a15f


In [75]:
e5_small_eval_k3 = evaluate(test_dataset, embed_model=embed_model_hf, top_k=3, verbose=True)

Generating embeddings:   0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [76]:
e5_small_eval_k3_df = pd.DataFrame(e5_small_eval_k3)
e5_small_eval_k3_df.is_hit.sum() / len(e5_small_eval_k3_df)

0.82

### E5 Large

In [13]:
embed_model_hf_large = HuggingFaceEmbedding(
    model_name='intfloat/multilingual-e5-large',
    normalize=True,
    #Instruction to prepend query
    query_instruction='query:',
    #Instruction to prepend text
    text_instruction='passage:'
)

In [135]:
e5_large_eval = evaluate(test_dataset, embed_model=embed_model_hf_large, top_k=1, verbose=True)

Generating embeddings:   0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [136]:
#Count how many is_hit true and divide by total number of queries
e5_large_eval_df = pd.DataFrame(e5_large_eval)
e5_large_eval_df.head()
e5_large_eval_df.is_hit.sum() / len(e5_large_eval)

0.635

In [14]:
e5_large_eval_k3 = evaluate(test_dataset, embed_model=embed_model_hf_large, top_k=3, verbose=True)

Generating embeddings:   0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [15]:
#Count how many is_hit true and divide by total number of queries
e5_large_eval_k3_df = pd.DataFrame(e5_large_eval_k3)
e5_large_eval_k3_df.is_hit.sum() / len(e5_large_eval_k3)

0.865

### OpenAI

In [17]:
embed_model_oai= OpenAIEmbedding(
    mode="text_search",  # default, alternatively set to "similarity"
    model="text-embedding-ada-002"  # default
)

In [139]:
oai_ada_eval = evaluate(test_dataset, embed_model_oai, top_k=1, verbose=True)

Generating embeddings:   0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [140]:
#Count how many is_hit true and divide by total number of queries
oai_ada_eval_df = pd.DataFrame(oai_ada_eval)
oai_ada_eval_df.head()
oai_ada_eval_df.is_hit.sum() / len(oai_ada_eval_df)

0.655

In [18]:
oai_ada_k3_eval = evaluate(test_dataset, embed_model_oai, top_k=3, verbose=True)
#Count how many is_hit true and divide by total number of queries
oai_ada_eval_k3_df = pd.DataFrame(oai_ada_k3_eval)
oai_ada_eval_k3_df.is_hit.sum() / len(oai_ada_eval_k3_df)

Generating embeddings:   0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

0.855

### Cohere V3

In [48]:
#Cohere
from llama_index.embeddings.cohereai import CohereEmbedding
import cohere

co = cohere.Client(COHERE_API_KEY)

In [58]:
# with input_typ='search_query'
embed_model_cohere = CohereEmbedding(
    cohere_api_key=COHERE_API_KEY,
    model_name="embed-multilingual-v3.0",
    input_type="search_document",
)

In [59]:
from llama_index.embeddings import cohereai
import importlib
importlib.reload(cohereai)
from llama_index.embeddings.cohereai import CohereEmbedding

In [60]:
# with input_typ='search_query'
embed_model_cohere_light = CohereEmbedding(
    cohere_api_key=COHERE_API_KEY,
    model_name="embed-multilingual-light-v3.0",
    input_type="search_document"
)

In [150]:
cohere_eval = evaluate(test_dataset, embed_model=embed_model_cohere, top_k=1, verbose=True)

Generating embeddings:   0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [153]:
#Count how many is_hit true and divide by total number of queries
cohere_eval_df = pd.DataFrame(cohere_eval)
cohere_eval_df.head()
cohere_eval_df.is_hit.sum() / len(cohere_eval_df)

0.665

In [21]:
cohere_eval_k3 = evaluate(test_dataset, embed_model=embed_model_cohere, top_k=3, verbose=True)
cohere_eval_k3_df = pd.DataFrame(cohere_eval_k3)
cohere_eval_k3_df.is_hit.sum() / len(cohere_eval_k3_df)

Generating embeddings:   0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

0.855

In [61]:
cohere_eval_k3_light = evaluate(test_dataset, embed_model=embed_model_cohere_light, top_k=3, verbose=True)
cohere_eval_k3_light_df = pd.DataFrame(cohere_eval_k3_light)
cohere_eval_k3_light_df.is_hit.sum() / len(cohere_eval_k3_light_df)

Generating embeddings:   0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

0.88

## Extract the wrong question and passage pairs (querry & context)

In [62]:
from sentence_transformers import InputExample

train_dataset = EmbeddingQAFinetuneDataset.from_json("data/train_dataset.json")

examples = []
for query_id, query in train_dataset.queries.items():
    node_id = train_dataset.relevant_docs[query_id][0]
    text = train_dataset.corpus[node_id]
    
    example = InputExample(texts=[query, text])
    examples.append(example)

In [64]:
cohere_eval_k3_light_df.head()

Unnamed: 0,is_hit,retrieved,expected,query
0,True,"[03c49ab1-609f-4fe8-bdd3-62ab0c4182fc, c3e3aaf...",03c49ab1-609f-4fe8-bdd3-62ab0c4182fc,d61ae5cf-c632-4bee-9b55-489bfc4d5c1d
1,True,"[03c49ab1-609f-4fe8-bdd3-62ab0c4182fc, 1543449...",03c49ab1-609f-4fe8-bdd3-62ab0c4182fc,3d726de4-9a35-4f9c-a537-47333a94d967
2,True,"[4d905db2-bfed-4f81-a700-61fb69a35241, 01a24e3...",4d905db2-bfed-4f81-a700-61fb69a35241,5c9730a0-60ef-42ad-aa05-a29b9aeb1c8a
3,True,"[4d905db2-bfed-4f81-a700-61fb69a35241, 6250f6f...",4d905db2-bfed-4f81-a700-61fb69a35241,fb4e4d5f-09a2-4096-a771-898ff88c019a
4,True,"[5d1f1c84-b843-49d6-81c4-93b25d75cf47, 3aacf45...",5d1f1c84-b843-49d6-81c4-93b25d75cf47,28e572c3-8e24-4f40-85a8-c47bdc9b7d94


In [69]:
#Filter and display the rows where is_hit is False
cohere_eval_k3_light_df[cohere_eval_k3_light_df.is_hit == False]

Unnamed: 0,is_hit,retrieved,expected,query
12,False,"[c9112a99-5aca-43fd-9350-e8f2a6c85023, 189c5e7...",89a74290-f47d-4623-860c-6f1164681ae8,785f7312-b806-45d1-b896-5b08b35366fd
13,False,"[189c5e70-12fc-4054-8f9f-197bafd087bf, 3f4fbb9...",89a74290-f47d-4623-860c-6f1164681ae8,5a5e052d-92d2-4fd2-9196-9fbf1dd6f00e
40,False,"[3f4fbb9c-cc49-429c-9d06-40982463bff1, 5cf0e29...",24c7080c-e093-4efe-ab5b-ed951f70e2af,576d5998-ebdf-4ab9-b96c-95fa56499607
43,False,"[3f4fbb9c-cc49-429c-9d06-40982463bff1, 5cf0e29...",1ae3b5a2-a2da-40e9-9e92-a2c5a3db85a5,62bd527f-17da-4a4b-8808-822d043ff791
67,False,"[f7f6787f-0d43-4247-8527-6642e48d52cc, 767a249...",81b84de0-0ffe-4b5c-b8c1-da690166fc6a,f7abfae4-d0a0-42c9-9642-79b9133d665b
69,False,"[767a249d-d1d2-478c-8ef4-1aa4f9b471f6, a7fd88a...",8f3d5a4f-9fa8-4bd8-ad67-6e44c9225add,e55dabb6-c047-4080-a697-b9e5888193c5
81,False,"[c3e3aaf0-77c5-47ec-92c2-9d987a5c62b7, 81badc9...",fb3544b1-e418-46ea-b0f4-83c2166afb27,3469d908-c8e2-4ea5-b072-916e26a11584
86,False,"[853a197c-e154-4b04-98e8-b160b4003699, 953890d...",4dfcb00b-9d73-4153-9389-b20dff850896,eecf2142-d92e-4af3-80c9-658e4cc32128
99,False,"[2c21c9d8-17d0-49d4-afc5-746c042035aa, 6fb6913...",a0481927-cc50-405e-9c05-4bf129f3ebf9,09f790bd-2684-4d2f-918a-c1d84332f54b
100,False,"[c29d2276-1c16-499f-8d79-f034557ec12a, a048192...",041dbad1-de47-4d9b-8777-9b5b3443e43e,36655037-36b4-4a7a-8a5a-5b6004e699c4


In [70]:
#Look up the query id of row 1 in cohere_eval_k3_light_df and return the query (text) from the test dataset
query_id = cohere_eval_k3_light_df.iloc[13]['query']
query = test_dataset.queries[query_id]
query


'Hvad er nogle af de helbredsrisici, der kan opstå som følge af ensidigt og belastende arbejde?'

In [71]:
#Print the expected text (the text that the query should return) by looking up the expected id from row 1 in the test dataset
expected_id = test_dataset.relevant_docs[query_id][0]
expected_text = test_dataset.corpus[expected_id]
expected_text

'Det er især udbredt i fremstillingsfag, fx i nærings- og nydelsesmiddelindustrien og i forbindelse med fremstilling af varer.\nEBA forekommer også i håndværksfag, inden for kontor og administration, i transportsektoren, fx ved chauffør- og transportarbejde, og i en række servicefunktioner som rengørings-, frisør- og tandlægearbejde.\n1.2.Hvornår er EBA et problem?\nLavintensive påvirkninger udgør normalt ingen risiko, hvis de kun forekommer i kort tid. Men hvis disse påvirkninger forekommer lang tid i forbindelse med det daglige arbejde, kan EBA udgøre en helbredsrisiko.\nJo mere intensive belastningerne er, desto kortere skal varigheden være, før der er tale om en helbredsrisiko.\n2.Helbredsrisici og risikobelastning\n2.1.Helbredsrisici\nVed ensidigt, belastende arbejde kan der på kortere eller længere sigt opstå træthed, ømhed, smerter, hævelse og stivhed i muskler, sener og led, ofte med nedsat funktionsevne, nedsat opmærksomhed samt øget risiko for ulykker.'

In [72]:
#Print the retrieved texts (the 3 text that the query actually returned) by looking up the retrieved id from row 1 in the test dataset
retrieved_ids = cohere_eval_k3_light_df.iloc[13]['retrieved']
retrieved_texts = [test_dataset.corpus[id_] for id_ in retrieved_ids]
retrieved_texts

['Årsagen til, at man kan få skader som følge af ensidigt, belastende arbejde, menes at hænge sammen med en relativt langvarig spænding af musklerne, tryk på eller strækning af vævet, påvirkning af blodforsyning eller nerver. Skaderne forekommer især i bevægeapparatet - det vil sige kroppens muskler, sener, led og tilhørende kar og nervevæv. Skaderne ses hyppigst i nakke, skuldre, arme og hænder, men kan opstå i alle områder af kroppen. Ensidigt, belastende arbejde kan også være årsag til psykiske symptomer, fx stress, monotonitilstand, psykisk træthed og nedsat vitalitet.\n2.2.Risikobelastning\nFølgende forhold bestemmer, om ensidigt, belastende arbejde er en risikobelastning:\n\n–Arbejdets varighed pr. dag, uge eller år\n–Intensiteten af belastningen, fx hvor hyppigt belastningen gentages, graden af fastlåsning eller kravet til opmærksomhed\n–Andre faktorer, fx kraftkrav og arbejdsstilling.',
 'Arbejdet vurderes ikke som ensidigt, gentaget, hvis de ensartede bevægelser udføres mindre

## Other stuff

In [84]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer
from pathlib import Path


def evaluate_st(
    dataset,
    model_id,
    name,
):
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    evaluator = InformationRetrievalEvaluator(
        queries, corpus, relevant_docs, name=name
    )
    model = SentenceTransformer(model_id)
    output_path = "results/"
    Path(output_path).mkdir(exist_ok=True, parents=True)
    return evaluator(model, output_path=output_path)

In [90]:
service_context = ServiceContext.from_defaults(embed_model= embed_model_hf)

## FIne tune

KeyError: 0

In [8]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model_hf = HuggingFaceEmbedding(
    model_name='intfloat/multilingual-e5-small',
    normalize=True,
    #Instruction to prepend query
    query_instruction='query:',
    #Instruction to prepend text
    text_instruction='passage:'
)

In [23]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine

finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="intfloat/multilingual-e5-small",
    model_output_path="models/test_model",
    val_dataset=test_dataset,
)

In [159]:
#Finetune
#finetune_engine.finetune()

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/60 [00:00<?, ?it/s]

Iteration:   0%|          | 0/60 [00:00<?, ?it/s]

In [24]:
#Load the model
finetuned_model = finetune_engine.get_finetuned_model()

In [19]:
finetuned = "local:models/test_model"
val_results_finetuned = evaluate(test_dataset, embed_model=finetuned_model, top_k=1, verbose=True)

Generating embeddings:   0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [20]:
#Count how many is_hit true and divide by total number of queries
e5_small_finetuned_df = pd.DataFrame(val_results_finetuned)
e5_small_finetuned_df.head()
e5_small_finetuned_df.is_hit.sum() / len(e5_small_finetuned_df)

0.625

In [74]:
finetuned = "local:models/test_model"
e5_small_k3_finetuned = evaluate(test_dataset, embed_model=finetuned, top_k=3, verbose=True)
e5_small_k3_finetuned = pd.DataFrame(e5_small_k3_finetuned)
e5_small_k3_finetuned.is_hit.sum() / len(e5_small_eval_k3_df)

Generating embeddings:   0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

0.845

## Evaluation


In [85]:
#Gather all the results from the k3 evaluations in a single dataframe, calculated as .sum/len for each model>
#Also include the dimensions of the embedding models as a column
#Have the model name as a column as well and the score as a column
#
"""
'model_name'= ['e5_small', 'e5_large', 'oai_ada', 'cohere', 'cohere_light', 'e5_small_finetuned'],
    'model_dim'= [384, 1024, 1024, 1024, 384, 384],
    'score'= [e5_small_eval_k3_df.is_hit.sum() / len(e5_small_eval_k3_df), e5_large_eval_k3_df.is_hit.sum() / len(e5_large_eval_k3_df), oai_ada_eval_k3_df.is_hit.sum() / len(oai_ada_eval_k3_df), cohere_eval_k3_df.is_hit.sum() / len(cohere_eval_k3_df), cohere_eval_k3_light_df.is_hit.sum() / len(cohere_eval_k3_light_df), e5_small_k3_finetuned.is_hit.sum() / len(e5_small_k3_finetuned)]
"""

results = pd.DataFrame({
    'model_name': ['e5_small', 'e5_large', 'OpenAI_ada', 'Cohere v3 large', 'cohere v3 small', 'e5_small_finetuned'],
    'model_dim': [384, 1024, 1024, 1024, 384, 384],
    "k3 % correct": [e5_small_eval_k3_df.is_hit.sum() / len(e5_small_eval_k3_df), e5_large_eval_k3_df.is_hit.sum() / len(e5_large_eval_k3_df), oai_ada_eval_k3_df.is_hit.sum() / len(oai_ada_eval_k3_df), cohere_eval_k3_df.is_hit.sum() / len(cohere_eval_k3_df), cohere_eval_k3_light_df.is_hit.sum() / len(cohere_eval_k3_light_df), e5_small_k3_finetuned.is_hit.sum() / len(e5_small_k3_finetuned)]
})

results

Unnamed: 0,model_name,model_dim,k3 % correct
0,e5_small,384,0.82
1,e5_large,1024,0.865
2,OpenAI_ada,1024,0.855
3,Cohere v3 large,1024,0.855
4,cohere v3 small,384,0.88
5,e5_small_finetuned,384,0.845


# Build retrieval myself

## E5 large finetune / Crashes python

In [None]:
embed_model_hf_large = HuggingFaceEmbedding(
    model_name=embed_model_ft,
    normalize=True,
    #Instruction to prepend query
    query_instruction='query:',
    #Instruction to prepend text
    text_instruction='passage:'
)

In [21]:
finetune_engine_large = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="intfloat/multilingual-e5-large",
    model_output_path="models/e5_large",
    val_dataset=test_dataset,
)

In [22]:
finetune_engine_large.finetune()

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/60 [00:00<?, ?it/s]

: 

In [None]:
embed_model_ft_large = finetune_engine.get_finetuned_model()

In [None]:
from llama_index.finetuning import EmbeddingAdapterFinetuneEngine
from llama_index.embeddings import resolve_embed_model
import torch

base_embed_model = resolve_embed_model("local:BAAI/bge-small-en")

finetune_engine = EmbeddingAdapterFinetuneEngine(
    train_dataset,
    base_embed_model,
    model_output_path="model_output_test",
    # bias=True,
    epochs=4,
    verbose=True,
    # optimizer_class=torch.optim.SGD,
    # optimizer_params={"lr": 0.01}
)

In [77]:
from llama_index.evaluation import (
    DatasetGenerator,
    QueryResponseDataset,
)