In [234]:
import datasets  
# Hugging Face Hub identifier of the dataset this notebook enriches.
dataset_name = "DiscoResearch/germanrag"  
# Load every available split; the cell output below shows a single 'train' split.
dataset = datasets.load_dataset(dataset_name) 
# Bare expression so the notebook displays the DatasetDict summary.
dataset


DatasetDict({
    train: Dataset({
        features: ['contexts', 'question', 'answer', 'positive_ctx_idx'],
        num_rows: 3362
    })
})

In [235]:
from openai import OpenAI
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import os
import dotenv
# Load environment variables (e.g. from a local .env file) before reading the key.
dotenv.load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI(api_key=api_key)

# Connectivity smoke test: a single minimal chat completion, shown as the cell output.
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of Germany?"},
    ],
)
response



[07/Mar/2024 03:48:09] INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


ChatCompletion(id='chatcmpl-8zy5gZl7vCmobeBkz02xaeWlGccjZ', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='The capital of Germany is Berlin.', role='assistant', function_call=None, tool_calls=None))], created=1709779688, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint='fp_2b778c6b35', usage=CompletionUsage(completion_tokens=7, prompt_tokens=24, total_tokens=31))

In [236]:
def get_embeddings(texts, model="text-embedding-ada-002", batch_size=20, api_client=None):
    """
    Fetch embeddings for a list of texts using OpenAI's API.

    Parameters:
    - texts: Iterable of strings to embed.
    - model: Name of the embedding model passed to the API.
    - batch_size: Maximum number of texts sent in a single request.
    - api_client: Object exposing `embeddings.create(input=..., model=...)`.
      Defaults to the notebook-level `client`.

    Returns:
    - A NumPy array built from the returned embeddings, in request order.
      An empty input yields an empty array without calling the API.

    Raises:
    - ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    api = client if api_client is None else api_client
    # Plain list slicing keeps every batch at or below batch_size and avoids
    # round-tripping the texts through a fixed-width NumPy string array.
    texts = list(texts)
    embeddings = []
    for start in range(0, len(texts), batch_size):
        text_batch = texts[start:start + batch_size]
        response = api.embeddings.create(input=text_batch, model=model)
        embeddings.extend(item.embedding for item in response.data)
    return np.array(embeddings)

def compute_similarity_scores(embeddings_1, embeddings_2):
    """
    Return the pairwise cosine similarities between two sets of embeddings.

    Both inputs are rescaled with `normalize` first, so the dot product of the
    rescaled rows is the cosine similarity. Entry [i, j] of the result compares
    row i of `embeddings_1` with row j of `embeddings_2`.
    """
    left_unit, right_unit = (normalize(block) for block in (embeddings_1, embeddings_2))
    return np.dot(left_unit, right_unit.T)

In [237]:
import hashlib
def generate_hash(*contexts):
    """
    Generate an order-independent MD5 hex digest for a combination of contexts.

    Parameters:
    - contexts: Any number of strings.

    Returns:
    - A 32-character hexadecimal digest. The same contexts in any order give
      the same digest.
    """
    digest = hashlib.md5()
    # Sort so that argument order does not affect the hash.
    for context in sorted(contexts):
        encoded = context.encode('utf-8')
        # Length-prefix each context: plain concatenation would make
        # ("ab", "c") and ("a", "bc") indistinguishable.
        digest.update(f"{len(encoded)}:".encode('utf-8'))
        digest.update(encoded)
    return digest.hexdigest()

In [238]:
import numpy as np
import pandas as pd
from itertools import chain

def filter_candidate_contexts(question_embedding, candidate_contexts_df, pos_context_embedding, hard_negatives_embeddings, similarity_intervals):
    """
    Filters candidate contexts based on similarity scores to the question, positive context, and hard negatives.

    Parameters:
    - question_embedding: The embedding of the current question (1-D array or list).
    - candidate_contexts_df: DataFrame of candidate contexts with an 'embeddings' column.
    - pos_context_embedding: The embedding of the positive context for the current question.
    - hard_negatives_embeddings: List or array of embeddings for hard negative contexts; may be empty or None.
    - similarity_intervals: A dict with inclusive (low, high) intervals under the keys
      'question' and 'positive', plus 'hard_negative' when hard negatives are given.

    Returns:
    - A filtered copy of the candidate rows, with the columns 'similarity_to_question'
      and 'similarity_to_positive' added (and 'similarity_to_negatives' when hard
      negatives are given). The input DataFrame is not modified.
    """
    def _within(scores, interval):
        # Inclusive on both ends.
        return (scores >= interval[0]) & (scores <= interval[1])

    candidate_contexts_df = candidate_contexts_df.copy()
    # len() rather than truthiness, so a NumPy array of hard negatives works too.
    has_hard_negatives = hard_negatives_embeddings is not None and len(hard_negatives_embeddings) > 0

    if len(candidate_contexts_df) == 0:
        # Nothing to score; an empty matrix cannot be normalised, so return
        # early with the same columns a non-empty call would produce.
        if has_hard_negatives:
            candidate_contexts_df['similarity_to_negatives'] = np.empty(0, dtype=float)
        candidate_contexts_df['similarity_to_question'] = np.empty(0, dtype=float)
        candidate_contexts_df['similarity_to_positive'] = np.empty(0, dtype=float)
        return candidate_contexts_df

    # Build the candidate matrix once instead of once per comparison.
    candidate_matrix = np.array(candidate_contexts_df['embeddings'].tolist())
    question_row = np.asarray(question_embedding).reshape(1, -1)
    positive_row = np.asarray(pos_context_embedding).reshape(1, -1)

    if has_hard_negatives:
        similarity_to_negatives = compute_similarity_scores(np.array(hard_negatives_embeddings), candidate_matrix)
        # Keep, per candidate, its similarity to the closest hard negative.
        candidate_contexts_df['similarity_to_negatives'] = np.max(similarity_to_negatives, axis=0)

    # Each call returns a (1, n_candidates) matrix; row 0 holds the scores.
    candidate_contexts_df['similarity_to_question'] = compute_similarity_scores(question_row, candidate_matrix)[0]
    candidate_contexts_df['similarity_to_positive'] = compute_similarity_scores(positive_row, candidate_matrix)[0]

    keep = (
        _within(candidate_contexts_df['similarity_to_question'], similarity_intervals['question'])
        & _within(candidate_contexts_df['similarity_to_positive'], similarity_intervals['positive'])
    )
    if has_hard_negatives:
        keep = keep & _within(candidate_contexts_df['similarity_to_negatives'], similarity_intervals['hard_negative'])

    return candidate_contexts_df[keep]


In [239]:
import pandas as pd
import logging
from itertools import chain

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_contexts_in_dataframe(dataset, model):
    """
    Flatten every context of dataset['train'] into one DataFrame row each.

    Returns a DataFrame with the columns 'context' (the text), 'question_id'
    (position of the owning row in dataset['train']) and 'embeddings' (the
    vector returned by get_embeddings for that text, stored as a list).
    """
    context_question_pairs = [
        (context_text, row_number)
        for row_number, row in enumerate(dataset['train'])
        for context_text in row['contexts']
    ]
    context_texts = [context_text for context_text, _ in context_question_pairs]
    owning_rows = [row_number for _, row_number in context_question_pairs]

    # One embedding request series for all contexts at once.
    context_vectors = get_embeddings(context_texts, model=model)

    return pd.DataFrame({
        'context': context_texts,
        'question_id': owning_rows,
        'embeddings': context_vectors.tolist(),
    })


def add_easy_negatives(dataset, model="text-embedding-ada-002", question_similarity_interval=(0.5, 0.75), hard_negative_similarity_interval=(0.5, 0.75), positive_similarity_interval=(0.5, 0.75), max_negative_examples=5):
    """
    Add easy negative examples to a dataset by finding similar contexts for each question.

    For every row of dataset['train'], contexts belonging to *other* rows are
    filtered by their cosine similarity to the question, to the row's positive
    context and to the row's hard negatives. Up to `max_negative_examples` of
    the survivors (in DataFrame order) become the row's easy negatives.

    Parameters:
    - dataset: Mapping with a 'train' split providing 'contexts', 'question'
      and 'positive_ctx_idx'.
    - model: Embedding model name forwarded to get_embeddings.
    - question_similarity_interval / hard_negative_similarity_interval /
      positive_similarity_interval: Inclusive (low, high) similarity bounds.
    - max_negative_examples: Maximum number of easy negatives kept per question.

    Returns:
    - The same `dataset` object, whose 'train' split has been replaced by a copy
      with an added 'easy_negatives' column (the mapping is modified in place).
    """
    all_contexts = load_contexts_in_dataframe(dataset, model)
    logger.info(f"Loaded {len(all_contexts)} contexts for {len(dataset['train'])} questions.")
    unique_combinations_set = set()
    assert len(all_contexts) == len(all_contexts['context'])
    assert all_contexts['question_id'].nunique() == len(dataset['train'])

    all_questions = dataset['train']['question']
    question_embeddings = get_embeddings(all_questions, model=model)

    easy_negatives = []
    for i, row in enumerate(dataset['train']):
        condition = all_contexts['question_id'] == i
        current_row_contexts = all_contexts[condition]
        candidate_contexts = all_contexts[~condition]
        assert len(all_contexts) == len(current_row_contexts) + len(candidate_contexts)

        row_embeddings = current_row_contexts['embeddings'].tolist()
        pos_idx = row['positive_ctx_idx']
        similarity_intervals = {
            'question': question_similarity_interval,
            'positive': positive_similarity_interval,
            'hard_negative': hard_negative_similarity_interval
        }

        if pos_idx >= 0:
            pos_context_embedding = row_embeddings[pos_idx]
        else:
            # A negative index would silently wrap around to the last context.
            # NOTE(review): germanrag appears to use -1 for "no relevant context";
            # confirm against the dataset card. Such rows have no positive, so
            # pass a placeholder embedding and disable the positive filter.
            pos_context_embedding = question_embeddings[i]
            similarity_intervals['positive'] = (float('-inf'), float('inf'))
        hard_negatives_embeddings = [emb for ctx_idx, emb in enumerate(row_embeddings) if ctx_idx != pos_idx]

        candidate_contexts = filter_candidate_contexts(question_embeddings[i], candidate_contexts, pos_context_embedding, hard_negatives_embeddings, similarity_intervals)

        # The same text can appear under several questions: drop repeats and any
        # text that is already one of this row's own contexts, keeping order.
        own_contexts = set(row['contexts'])
        distinct_candidates = [text for text in dict.fromkeys(candidate_contexts['context']) if text not in own_contexts]
        easy_negative_contexts = distinct_candidates[:max_negative_examples]

        # checking unicity of the combination: shrink the negative list until the
        # (negatives + own contexts) combination has not been produced before.
        context_combinations_hash = generate_hash(*(easy_negative_contexts + row['contexts']))
        while context_combinations_hash in unique_combinations_set and len(easy_negative_contexts) > 0:
            easy_negative_contexts.pop()
            context_combinations = easy_negative_contexts + row['contexts']
            context_combinations_hash = generate_hash(*context_combinations)

        easy_negatives.append(easy_negative_contexts)
        unique_combinations_set.add(context_combinations_hash)
        logger.info(f"Successfully added easy negatives for question {i}.")
    dataset['train'] = dataset['train'].add_column("easy_negatives", easy_negatives)
    return dataset



# Work on the first 100 training rows only, to keep the embedding calls cheap.
dataset10 = {'train': dataset['train'].select(range(100))}
#we can use text-embedding-3-large for better results default set to text-embedding-ada-002
new_dataset = add_easy_negatives(
    dataset10,
    question_similarity_interval=(0.2, 0.75),
    hard_negative_similarity_interval=(0.2, 0.8),
    positive_similarity_interval=(0.2, 0.75),
    max_negative_examples=5,
)['train']

# Spot-check the first two rows.
for question_label, negative_label, row_position in (
    ("first question", "first easy negative", 0),
    ("second question", "second easy negative", 1),
):
    print(question_label, new_dataset['question'][row_position])
    print(negative_label, new_dataset['easy_negatives'][row_position])


[07/Mar/2024 03:48:09] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[07/Mar/2024 03:48:11] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[07/Mar/2024 03:48:12] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[07/Mar/2024 03:48:13] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[07/Mar/2024 03:48:14] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[07/Mar/2024 03:48:15] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[07/Mar/2024 03:48:16] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[07/Mar/2024 03:48:17] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[07/Mar/2024 03:48:18] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[07/Mar/2024 03:48:19] INFO - HTTP Request: POST https://api.ope

In [240]:
import json

# Split each row's contexts into the positive ("primary") context and the
# remaining hard negatives.
primary_contexts = []
lists_of_hard_negatives = []
for row in new_dataset:
    contexts = row['contexts']
    pos_idx = row['positive_ctx_idx']
    if pos_idx < 0:
        # A negative index would wrap around and pick the last context as the
        # primary one while also leaving it among the hard negatives.
        # NOTE(review): germanrag appears to use -1 for "no relevant context";
        # confirm against the dataset card. Such rows get an empty primary
        # context and keep every context as a hard negative.
        primary_contexts.append("")
        lists_of_hard_negatives.append(list(contexts))
    else:
        primary_contexts.append(contexts[pos_idx])
        lists_of_hard_negatives.append([text for idx, text in enumerate(contexts) if idx != pos_idx])


def prepare_dataset_entry(question, context, hard_negatives, easy_negatives, answer):
    """
    Build one export record with the keys 'context', 'question' and 'answer'.

    The 'context' value is a text block naming the primary context, followed
    by the hard and easy negatives as "- " bullet lists under their headings.
    An empty list leaves its heading without bullets.

    Parameters:
    - question: Question text, stored unchanged.
    - context: Primary (positive) context text.
    - hard_negatives: Iterable of hard negative context strings.
    - easy_negatives: Iterable of easy negative context strings.
    - answer: Answer text, stored unchanged.
    """
    lines = [
        f"Primary Context: {context}",
        "",
        "Supplementary Contexts:",
        "Hard Negatives:",
    ]
    # One bullet per item; joining with a leading "- " would print a dangling
    # empty bullet for an empty list.
    lines.extend(f"- {negative}" for negative in hard_negatives)
    lines.append("Easy Negatives:")
    lines.extend(f"- {negative}" for negative in easy_negatives)

    entry = {
        "context": "\n".join(lines),
        "question": question,
        "answer": answer
    }

    return entry

def export_to_jsonl(entries, filename):
    """
    Write `entries` to `filename` as JSON Lines (one JSON document per line).

    The file is UTF-8 encoded and non-ASCII characters such as German umlauts
    are written as-is rather than as \\uXXXX escapes. An existing file is
    overwritten.

    Parameters:
    - entries: Iterable of JSON-serialisable objects.
    - filename: Path of the file to write.
    """
    with open(filename, 'w', encoding='utf-8') as file:
        for entry in entries:
            # ensure_ascii=False keeps the text readable; json.dumps still
            # escapes newlines, so every entry stays on one line.
            file.write(json.dumps(entry, ensure_ascii=False) + '\n')

# Assemble one export record per question, then write them all out as JSON Lines.
entries = [
    prepare_dataset_entry(question, primary, hard_negs, easy_negs, answer)
    for question, primary, hard_negs, easy_negs, answer in zip(
        new_dataset['question'],
        primary_contexts,
        lists_of_hard_negatives,
        new_dataset['easy_negatives'],
        new_dataset['answer'],
    )
]

export_to_jsonl(entries, 'enriched_german_rag.jsonl')


In my approach to enhancing the germanrag dataset for ellamind, I've identified several areas where innovative techniques could significantly improve the dataset's utility for fine-tuning language models on German-language RAG applications. Here are my proposals:

1. Dynamic Difficulty Scaling: Recognizing the importance of progressively challenging the model to ensure continuous learning and adaptation, I propose developing an algorithm that dynamically adjusts the difficulty level of questions and their associated negatives based on the model's evolving performance. Such a system would ensure that the model is always pushed to its learning edge, while also preventing it from being overwhelmed by questions that are too complex too soon. Implementing this algorithm involves categorizing the dataset into tiers of difficulty and incrementally exposing the model to more complex questions as its accuracy and confidence improve.

2. Advanced Negative Selection: To further refine the model's ability to distinguish relevant information from irrelevant or misleading information, I suggest enhancing how negatives are selected. This involves two key innovations:
   - Algorithmic Refinement: Deploying advanced algorithms that go beyond semantic similarity to include logical and thematic divergence from the question's focus. This could leverage deep learning techniques to assess not just textual similarity but also contextual relevance and potential for confusion, ensuring the negatives are sophisticated and challenging.
   - Incorporation of Misinformation Negatives: In an era where misinformation is rampant, training the model to identify and disregard such content is crucial. I recommend including negatives that represent common misconceptions or misinformation within the dataset's domain. This strategy will improve not only the model's accuracy but also its applicability in real-world scenarios where discerning truth from falsehood is essential.

3. Inclusion of Metadata: Understanding the context in which information is presented is pivotal for assessing its relevance and credibility. To this end, I propose augmenting the dataset with metadata that describes the source, reliability, and date of each context. This addition will enable the model to consider not just the content of the information but also its origin and timeliness, factors that are often critical in determining the accuracy and relevance of an answer.