# Clean and convert synthetic data
Notebook to clean and convert https://huggingface.co/collections/ThatsGroes/nordic-embedding-training-data-678f53542163a7eaf5d2194e to a unified and clean dataset for embedding finetuning

In [4]:
import os
from huggingface_hub import HfApi
from dotenv import load_dotenv
load_dotenv()
api = HfApi(token=os.getenv("HF_TOKEN"))


In [None]:
import re
from datasets import load_dataset, concatenate_datasets
import json_repair

def extract_similarity_scores(text: str):
    """
    Look for lines like:
      - "The similarity score between S1 and S2 should be 4"
      - "The similarity score between S1 and S3 should be 3.5"
    Returns a list of [score_S1S2, score_S1S3] as floats, or None if not found.
    """
    # Regex to find "S1 and S2 should be XXX"
    pattern_s2 = re.compile(r"s1\s+and\s+s2\s+should\s+be\s+(\d+(\.\d+)?)", re.IGNORECASE)
    # Regex to find "S1 and S3 should be XXX"
    pattern_s3 = re.compile(r"s1\s+and\s+s3\s+should\s+be\s+(\d+(\.\d+)?)", re.IGNORECASE)
    
    match_s2 = pattern_s2.search(text)
    match_s3 = pattern_s3.search(text)
    
    if match_s2 and match_s3:
        score_s2 = float(match_s2.group(1))
        score_s3 = float(match_s3.group(1))
        return [score_s2, score_s3]
    
    return None

def clean_extracted_text(text: str) -> str:
    """
    Cleans the input text by:
      - Removing literal "\\n" substrings and standalone backslashes.
      - Replacing newline characters with a space.
      - Removing all leading non-alphabetic characters.
      - Collapsing multiple spaces into one.
      - Removing any trailing characters after the last alphabetic character,
        unless they are one of: period (.), question mark (?), exclamation mark (!), or closing parenthesis ())
      - Handling empty or invalid input by returning an empty string.
    """
    if not text:
        return ""
        
    # Remove literal "\n" substrings and any remaining standalone backslashes.
    text = text.replace("\\n", "")
    text = text.replace("\\", "")
    # Replace newline characters with a space.
    text = text.replace("\n", " ")
    # Remove all leading characters until the first alphabetic character.
    text = re.sub(r'^[^A-Za-z]+', '', text)
    # If the result is empty after removing leading non-alphabetic characters, return empty string.
    if not text:
        return ""
    # Collapse multiple spaces into a single space.
    text = re.sub(r"\s+", " ", text)
    
    # Remove trailing characters beyond the last alphabetic character,
    # while preserving a trailing allowed punctuation if present.
    m = re.match(r'^(.*[A-Za-z])([.?!)]*)[^A-Za-z]*$', text)
    if m:
        text = m.group(1) + m.group(2)
    
    return text.strip()


def extract_assigned_instruction(text: str) -> str:
    """
    Extracts the substring following:
      - "You have been assigned a (text classification|retrieval|text matching) task"
    and ending right before "Your mission...".
    Cleans the extracted text.
    """
    pattern = re.compile(
        r"You have been assigned a (?:text classification|retrieval|text matching) task\s*[:]*\s*(.*?)\s*(?=Your mission)",
        flags=re.IGNORECASE | re.DOTALL
    )
    
    match = pattern.search(text)
    if not match:
        return None
    
    extracted = match.group(1)
    extracted = clean_extracted_text(extracted)
    return extracted

def has_cjk_characters(text):
    # Unicode ranges for CJK characters:
    # Chinese, Hiragana, Katakana, CJK Unified Ideographs Extension A & B, and CJK Compatibility Ideographs.
    cjk_pattern = r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3400-\u4dbf\U00020000-\U0002A6DF\uf900-\ufaff]'
    return bool(re.search(cjk_pattern, text))

def is_valid_json(example):
    # If "response" isn't even a string, fail immediately
    if not isinstance(example["response"], str):
        return False
    
    #cleaned = example["response"].strip().removeprefix("```json").removesuffix("```").strip()
    json_sample = json_repair.repair_json(example["response"], return_objects=True)
    if json_sample is None or json_sample == {}:
        print(f"Error parsing example: {example['response']}")
        return False

    # Check for CJK
    if json_sample is None or json_sample == {} or json_sample == "":
        return False
    try:
        values = list(json_sample.values())
    except:
        print(f"Error list conversion on json_sample: {json_sample}")
        return False

    # Ensure length >= 2
    if len(values) < 2:
        return False

    for value in values:
        if not isinstance(value, str):
            return False
        if has_cjk_characters(value):
            #print(f"CJK characters found in {value}")
            return False

    # If it passes all checks, it’s valid
    return True

def transform_function(json_sample, dataset_name, prompt):
    # we assume is_valid_json has already guaranteed it's valid
    values = list(json_sample.values())
    #If dataset name contains text-mat do not include neg

    new_data = {}

    task_match = re.search(r'synthetic-from-(.+?)-tasks-(?:danish|swedish|norwegian)', dataset_name)
    new_data["task"] = task_match.group(1) if task_match else None

    #Fix typos in the task name
    if new_data["task"] == "text-mathing-short":
        new_data["task"] = "text-matching-short"

    #Extract the instruction, for unit-triple tasks, extract the similarity scores, for all other tasks, extract the assigned instruction
    if new_data["task"] == "unit-triple":
        new_data["instruction"] = "Retrieve semantically similar text"
        new_data["triple_label"] = extract_similarity_scores(prompt)
    else:
        new_data["instruction"] = extract_assigned_instruction(prompt)

    new_data["query"] = values[0]
    new_data["positive"] = values[1]
    new_data["language"] = dataset_name.split("-")[-1]
    if new_data['task'] == 'retrieval' or new_data['task'] == 'unit-triple':
        if len(values) > 2 and values[2] is not None:
            new_data["negative"] = values[2]
        else:
            new_data["negative"] = ""
            
    return new_data


# List of Hugging Face repository URLs.
hf_repos = [
    "ThatsGroes/synthetic-from-retrieval-tasks-danish",
    "ThatsGroes/synthetic-from-classification-tasks-danish",
    "ThatsGroes/synthetic-from-unit-triple-tasks-danish",
    "ThatsGroes/synthetic-from-text-matching-long-tasks-danish",
    "ThatsGroes/synthetic-from-text-matching-short-tasks-danish",
    "ThatsGroes/synthetic-from-classification-tasks-swedish",
    "ThatsGroes/synthetic-from-unit-triple-tasks-swedish",
    "ThatsGroes/synthetic-from-retrieval-tasks-norwegian",  
    "ThatsGroes/synthetic-from-classification-tasks-norwegian",
    "ThatsGroes/synthetic-from-unit-triple-tasks-norwegian",
    "ThatsGroes/synthetic-from-retrieval-tasks-swedish",
    "ThatsGroes/synthetic-from-text-mathing-short-tasks-swedish",
    "ThatsGroes/synthetic-from-text-matching-long-tasks-swedish",
    "ThatsGroes/synthetic-from-text-matching-long-tasks-norwegian",
    "ThatsGroes/synthetic-from-text-mathing-short-tasks-norwegian"
]

hf_retrieval_tasks = [
    "ThatsGroes/synthetic-from-retrieval-tasks-danish",
    "ThatsGroes/synthetic-from-retrieval-tasks-swedish",
    "ThatsGroes/synthetic-from-retrieval-tasks-norwegian"
]

hf_classification_tasks = [
    "ThatsGroes/synthetic-from-classification-tasks-danish",
    "ThatsGroes/synthetic-from-classification-tasks-swedish",
    "ThatsGroes/synthetic-from-classification-tasks-norwegian"
]


hf_unit_triple_tasks = [
    "ThatsGroes/synthetic-from-unit-triple-tasks-danish",
    "ThatsGroes/synthetic-from-unit-triple-tasks-swedish",
    "ThatsGroes/synthetic-from-unit-triple-tasks-norwegian"
]

hf_text_matching_tasks = [
    "ThatsGroes/synthetic-from-text-mathing-short-tasks-swedish",
    "ThatsGroes/synthetic-from-text-mathing-short-tasks-danish",
    "ThatsGroes/synthetic-from-text-matching-long-tasks-swedish",
    "ThatsGroes/synthetic-from-text-matching-long-tasks-norwegian"
]


transformed_datasets = []

# Loop over each repository URL.
for repo in hf_repos:
    # Extract the dataset name from the URL (e.g., "synthetic-from-retrieval-tasks-danish")
    dataset_name = repo.split("/")[-1]
    
    # Load the dataset (adjust the split name if necessary)
    ds = load_dataset(repo, split="train")
    
    #Check that the response and prompt columns are strings
    ds = ds.filter(lambda x: isinstance(x["response"], str) and isinstance(x["prompt"], list))
    #Apply the filter is_valid_json to the response column
    ds = ds.filter(is_valid_json)

    def map_fn(example):
        json_sample = json_repair.repair_json(example["response"], return_objects=True)
        return transform_function(json_sample, dataset_name, str(example["prompt"]))
    
    #Drop the model column
    ds = ds.remove_columns(["model"])

    ds_transformed = ds.map(map_fn)

    #Can you reorder the columns to be query, pos, neg, language, task, instruction, prompt, response.
    #Currently response and prompt are the first two columns
    if "negative" in ds_transformed.column_names:
        column_order = [
            "query", "positive", "negative", "language", "task", 
            "instruction", 'triple_label,' "prompt", "response"
        ]
    else:
        column_order = [
            "query", "positive", "language", "task", 
            "instruction", "prompt", "response"
        ]

    ds_transformed = ds_transformed.select_columns(column_order)

    transformed_datasets.append(ds_transformed)
    # Save the transformed dataset to disk
    ds_transformed.save_to_disk(f"synthetic-supervised-dataset-{dataset_name}")

Map:   0%|          | 0/92783 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/92783 [00:00<?, ? examples/s]

Map:   0%|          | 0/97745 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/97745 [00:00<?, ? examples/s]

Map:   0%|          | 0/98865 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/98865 [00:00<?, ? examples/s]

Map:   0%|          | 0/94734 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/94734 [00:00<?, ? examples/s]

Map:   0%|          | 0/99382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/99382 [00:00<?, ? examples/s]

Map:   0%|          | 0/48937 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/48937 [00:00<?, ? examples/s]

Map:   0%|          | 0/49362 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/49362 [00:00<?, ? examples/s]

Map:   0%|          | 0/46877 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/46877 [00:00<?, ? examples/s]

Map:   0%|          | 0/48895 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/48895 [00:00<?, ? examples/s]

Map:   0%|          | 0/49497 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/49497 [00:00<?, ? examples/s]

Map:   0%|          | 0/46668 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/46668 [00:00<?, ? examples/s]

Map:   0%|          | 0/49695 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/49695 [00:00<?, ? examples/s]

Map:   0%|          | 0/47458 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/47458 [00:00<?, ? examples/s]

Map:   0%|          | 0/47660 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/47660 [00:00<?, ? examples/s]

Map:   0%|          | 0/49691 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/49691 [00:00<?, ? examples/s]

In [None]:
# Optionally, concatenate all transformed datasets into one final dataset.
final_dataset = concatenate_datasets(transformed_datasets)
final_dataset

Dataset({
    features: ['query', 'positive', 'negative', 'language', 'task', 'instruction', 'prompt', 'response'],
    num_rows: 968249
})

In [12]:
final_dataset.save_to_disk("synthetic-supervised-dataset")

Saving the dataset (0/5 shards):   0%|          | 0/920589 [00:00<?, ? examples/s]

In [6]:
# save to disk
final_dataset.save_to_disk("synthetic-supervised-dataset-2")

Saving the dataset (0/6 shards):   0%|          | 0/968249 [00:00<?, ? examples/s]

In [8]:
from huggingface_hub import HfApi
import os

api = HfApi(token=os.getenv("HF_TOKEN"))
final_dataset.push_to_hub("DDSC/nordic-embedding-training-data")

Uploading the dataset shards:   0%|          | 0/6 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/162 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/162 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/162 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/162 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/162 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/162 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/610 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/DDSC/nordic-embedding-training-data/commit/5fda060ad9b0e446ba3240e515084ac8ef65fef0', commit_message='Upload dataset', commit_description='', oid='5fda060ad9b0e446ba3240e515084ac8ef65fef0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/DDSC/nordic-embedding-training-data', endpoint='https://huggingface.co', repo_type='dataset', repo_id='DDSC/nordic-embedding-training-data'), pr_revision=None, pr_num=None)