In [26]:
import pandas as pd
import logging
import random
import json
import re
import os


# Read a JSONL file containing the training samples with metadata
def read_jsonl_file(filepath):
    """
    Read a JSON Lines file into a list of parsed records.

    Blank lines are ignored. Lines that are not valid JSON are reported on
    stdout and skipped, so a single corrupt line does not abort the load.

    Parameters:
    filepath (str): Path to the .jsonl file.

    Returns:
    list: One parsed JSON value per valid line, in file order.
    """
    data = []
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which differs between operating systems.
    with open(filepath, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                # Empty / whitespace-only lines carry no record.
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON in file {filepath}: {line}")

    print(f"Read {len(data)} records from {filepath}")

    return data

def read_jsonl_folder(folder):
    """
    Load every `.jsonl` file in a folder and flatten the records for fine-tuning.

    Records are de-duplicated on their `id` field: when the same id occurs more
    than once, the record read last replaces the earlier one. Records without
    an `id` are ignored.

    Parameters:
    folder (str): Directory to scan (not recursive).

    Returns:
    list[dict]: One dict per unique id with the keys `system_message`,
        `prompt`, `response`, `category`, `subcategory` and `language`.
        Fields missing from a record are returned as None.
    """
    data = {}
    # os.listdir returns names in arbitrary order; sorting makes the outcome of
    # the id de-duplication independent of the filesystem.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith('.jsonl'):
            continue
        filepath = os.path.join(folder, filename)
        for record in read_jsonl_file(filepath):
            if not isinstance(record, dict):
                # A line holding a bare list/number/string has no fields to read.
                continue
            record_id = record.get('id')
            if record_id is None:
                continue
            # `meta` / `task_info` may be absent or explicitly null; `or {}`
            # covers both so the lookups below cannot hit None.
            task_info = (record.get('meta') or {}).get('task_info') or {}
            data[record_id] = {
                "system_message": record.get('system_message'),
                "prompt": record.get('prompt'),
                "response": record.get('response'),
                "category": task_info.get('category'),
                "subcategory": task_info.get('sub_category'),
                "language": task_info.get('language'),
            }
    return list(data.values())


Read 2528 records from data/llm_tasks\classification_agendapunt_openai_dataset_meta.jsonl
Read 17 records from data/llm_tasks\classification_bpmn_openai_dataset_meta.jsonl
Read 1429 records from data/llm_tasks\translate_agendapunten_openai_dataset_meta.jsonl
Read 3099 records from data/llm_tasks\translate_agendapunt_openai_dataset_meta.jsonl
Read 31 records from data/llm_tasks\translate_bpmn_openai_dataset_meta.jsonl


In [27]:
# Formatting helpers for training and inference

def format_message(header, message):
    """
    Wrap one message in the Llama header / end-of-turn special tokens.

    Parameters:
    header (str): Role placed between the header tokens (e.g., 'system', 'user', 'assistant').
    message (str): Text of the message.

    Returns:
    str: The header tokens, immediately followed by the message and the end-of-turn token.
    """
    pieces = (
        "<|start_header_id|>",
        f"{header}",
        "<|end_header_id|>",
        f"{message}",
        "<|eot_id|>",
    )
    return "".join(pieces)

def get_prompt(system_message, prompt):
    """
    Build the inference prompt: a system turn followed by a user turn.

    Parameters:
    system_message (str): Text of the system turn.
    prompt (str): Text of the user turn.

    Returns:
    str: Both turns formatted with format_message and concatenated, system first.
    """
    system_turn = format_message('system', system_message)
    user_turn = format_message('user', prompt)
    return system_turn + user_turn

def get_sample(system_message, prompt, output):
    """
    Build a full training sample: system turn, user turn, then assistant turn.

    Parameters:
    system_message (str): Text of the system turn.
    prompt (str): Text of the user turn.
    output (str): Text of the assistant turn.

    Returns:
    str: The result of get_prompt with the formatted assistant turn appended.
    """
    conversation_start = get_prompt(system_message, prompt)
    assistant_turn = format_message('assistant', output)
    return conversation_start + assistant_turn

In [28]:


def clean_json_string(json_string):
    """
    Remove a surrounding Markdown code fence from a JSON string.

    Accepts ```json ... ``` (any letter case for the tag) as well as a bare
    ``` ... ``` fence. Whitespace around the fence is tolerated. A string that
    is not fenced is returned with only its outer whitespace removed.

    Parameters:
    json_string (str): Raw text, possibly wrapped in a code fence.

    Returns:
    str: The payload without fence and without leading/trailing whitespace.
    """
    # Strip first: the fence has to span the whole string, and stray
    # whitespace before or after it would otherwise prevent the match.
    stripped = json_string.strip()
    fenced = re.fullmatch(
        r'```(?:json)?\s*(.*?)\s*```',
        stripped,
        flags=re.DOTALL | re.IGNORECASE,
    )
    if fenced is None:
        return stripped
    return fenced.group(1).strip()

def clean_tasks(tasks):
    """
    Keep only the tasks whose response is valid JSON, with code fences removed.

    Tasks with a missing or None response are dropped silently. Tasks whose
    response is not a string, or is not valid JSON after clean_json_string,
    are dropped with a message on stdout.

    Each kept task is a shallow copy whose `response` holds the fence-free
    text, so a later `json.loads(task['response'])` succeeds. The input dicts
    are not modified.

    Parameters:
    tasks (list[dict]): Task records, each optionally holding a `response`.

    Returns:
    list[dict]: The surviving tasks in their original order.
    """
    cleaned_tasks = []

    for task in tasks:
        if 'response' not in task or task['response'] is None:
            continue
        if not isinstance(task['response'], str):
            # clean_json_string applies a regex and needs text input.
            print(f"Skipping non-string response: {task['response']!r}")
            continue
        response = clean_json_string(task['response'])
        try:
            json.loads(response)
        except json.JSONDecodeError:
            print(f"Skipping invalid JSON in response: {response}")
            continue
        # Store the text that was actually validated, not the fenced original.
        cleaned_tasks.append({**task, 'response': response})

    return cleaned_tasks

def balance_tasks(tasks, max_per_freq=150):
    """
    Cap how many tasks are kept for each number of translations.

    Each task's `response` is parsed as JSON and grouped by the length of its
    `translations` entry. Tasks are taken in input order until a group holds
    `max_per_freq` of them; later tasks of that group are dropped. The kept
    count per group is printed, ordered by group size.

    Parameters:
    tasks (list[dict]): Tasks whose `response` is a JSON string with a `translations` entry.
    max_per_freq (int): Maximum number of tasks kept per translation count.

    Returns:
    list[dict]: The kept tasks, in input order.
    """
    kept = []
    counts = {}

    for entry in tasks:
        size = len(json.loads(entry['response'])['translations'])
        # Register the group even when nothing ends up being kept for it.
        seen = counts.setdefault(size, 0)
        if seen < max_per_freq:
            counts[size] = seen + 1
            kept.append(entry)

    print({size: counts[size] for size in sorted(counts)})

    return kept

def store_tasks(tasks, output_file):
    """
    Write tasks to a file in JSON Lines form.

    The file is opened for writing (existing content is replaced) and every
    task is serialized with json.dump followed by a newline.

    Parameters:
    tasks (iterable): JSON-serializable task records.
    output_file (str): Destination path.
    """
    with open(output_file, 'w') as sink:
        for record in tasks:
            json.dump(record, sink)
            sink.write('\n')

def clean_balance_and_store_tasks(input_file, output_file, max_per_freq=150, seed=None):
    """
    Read tasks from a JSONL file, shuffle, clean, balance and write them back out.

    Parameters:
    input_file (str): Path of the JSONL file to read.
    output_file (str): Path of the JSONL file to write.
    max_per_freq (int): Cap passed on to balance_tasks.
    seed (int, optional): When given, the shuffle uses its own random.Random(seed)
        so repeated runs select the same tasks. When None (the default), the
        module-level random state is used, as before.
    """
    tasks = read_jsonl_file(input_file)

    # balance_tasks keeps the first `max_per_freq` tasks it meets per group, so
    # the order produced here decides which tasks survive.
    if seed is None:
        random.shuffle(tasks)
    else:
        random.Random(seed).shuffle(tasks)

    cleaned_tasks = clean_tasks(tasks)
    balanced_tasks = balance_tasks(cleaned_tasks, max_per_freq)
    store_tasks(balanced_tasks, output_file)

# Usage
#clean_balance_and_store_tasks("data/llm_tasks/translate_agendapunten_openai_dataset.jsonl", "data/llm_tasks/translate_agendapunten_openai_dataset_cleaned.jsonl")


In [30]:
from abc import ABC, abstractmethod
from datasets import Dataset, DatasetDict


class InstructDataset(ABC):
    """
    Abstract base for building instruct datasets on top of a pandas DataFrame.

    Every transforming method of this class rebinds `self.dataset` to a new
    DataFrame rather than editing the current one in place, so the frame passed
    in by the caller is not modified by this class. Subclasses must implement
    `create_prompt`.
    """

    def __init__(self, dataset: pd.DataFrame):
        """
        Initialize the dataset
        :param dataset: The pandas DataFrame to wrap (stored by reference, not copied)
        """
        self.dataset = dataset

    def load_dataset(self, dataset: pd.DataFrame) -> None:
        """
        Replace the wrapped DataFrame with the given one
        :param dataset: The pandas DataFrame
        :return: None
        """
        self.dataset = dataset

    def rename_columns(self, columns: dict[str, str]) -> None:
        """
        Rename the columns of the dataset
        :param columns: A dictionary of the form {old_name: new_name}
        :return: None
        """
        self.dataset = self.dataset.rename(columns=columns)

    def drop_columns(self, columns: list[str]) -> None:
        """
        Drop the columns from the dataset; names that are not present are ignored
        :param columns: A list of column names to drop
        :return: None
        """
        # Filter first so that asking to drop an absent column is not an error.
        present = [col for col in columns if col in self.dataset.columns]
        self.dataset = self.dataset.drop(columns=present)

    def drop_bad_rows(self, columns: list[str]) -> None:
        """
        Drop rows with a missing value in any of the given columns, then rows
        that duplicate an earlier row on those columns
        :param columns: A list of columns to check for bad values
        :return: None
        """
        self.dataset = (
            self.dataset
            .dropna(subset=columns)
            .drop_duplicates(subset=columns)
        )

    def create_instruction(self, instruction: str) -> None:
        """
        Create an instruction column holding the same value on every row
        :param instruction: The instruction to add to the dataset
        :return: None
        """
        # assign() returns a new frame; plain item assignment would write the
        # column into the DataFrame object the caller passed in.
        self.dataset = self.dataset.assign(instruction=instruction)

    @abstractmethod
    def create_prompt(self) -> None:
        """
        Create the prompt column in the dataset
        :return: None
        """
        pass

    def get_dataset(self) -> pd.DataFrame:
        """
        Get the dataset
        :return: The wrapped DataFrame in its current state
        """
        return self.dataset

class Llama3InstructDataset(InstructDataset):
    """
    Instruct dataset that renders each row as one chat string built from
    Llama 3 style header / end-of-turn tokens.
    """

    def create_prompt(self):
        """
        Create the prompt column in the dataset.

        Each prompt combines the row's `instruction` (system turn), `input`
        (user turn) and `output` (assistant turn); those three columns must
        exist. The result is stored in a new DataFrame with a `prompt` column,
        so the frame previously held in `self.dataset` is left untouched.
        """
        frame = self.dataset
        # NOTE(review): the line break and the indentation in front of
        # "This is the question" sit inside the string literal, so they are
        # part of every generated prompt. They are kept exactly as they were
        # so existing data stays comparable - confirm whether this whitespace
        # is intended.
        prompts = [
            f"""<|start_header_id|>system<|end_header_id|> {instruction}<|eot_id|><|start_header_id|>user<|end_header_id|>
              This is the question: {question}<|eot_id|><|start_header_id|>assistant<|end_header_id|> {answer}<|eot_id|>"""
            for instruction, question, answer in zip(
                frame["instruction"], frame["input"], frame["output"]
            )
        ]
        self.dataset = frame.assign(prompt=prompts)


In [31]:
from huggingface_hub import login
# Authenticate against the Hugging Face Hub. Called without arguments; the
# saved cell output below shows the login widget it rendered in this notebook.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [35]:
import pandas as pd
import logging
from datasets import DatasetDict, Dataset

# Set up logging
# basicConfig attaches a handler to the root logger. Without any handler the
# INFO records emitted by process_dataset are discarded, because logging's
# last-resort handler only lets WARNING and above through.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Fixed seed so the shuffle below yields the same row order on every run
SEED = 42

# Define the folder containing the data
folder = "data/llm_tasks"

# Read the data from the folder
data = read_jsonl_folder(folder)

# Convert the data to a pandas DataFrame and shuffle it
finetuning_df = pd.DataFrame(data)
finetuning_df = finetuning_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Print the first few rows of the DataFrame
print(finetuning_df.head(5))

# Create a Llama3InstructDataset from the DataFrame
dataset_abb = Llama3InstructDataset(finetuning_df)

# Define the columns to remove and the new names for the remaining columns
REMOVE_COLUMNS = []
RENAME_COLUMNS = {"system_message": "instruction","prompt":"input", "response": "output"}

def process_dataset(dataset) -> pd.DataFrame:
    """
    Run the fixed preparation pipeline on an instruct dataset.

    Steps, in order: drop the columns in REMOVE_COLUMNS, rename columns per
    RENAME_COLUMNS, drop rows that are missing or duplicated on 'input' and
    'output', then build the prompt column. Completion of each step is logged
    at INFO level.

    :param dataset: The instruct dataset object to process.
    :return: The DataFrame held by the dataset after all steps have run.
    """
    # (method to call, positional arguments, message logged once it has run)
    pipeline = [
        (dataset.drop_columns, (REMOVE_COLUMNS,), "Columns removed!"),
        (dataset.rename_columns, (RENAME_COLUMNS,), "Columns renamed!"),
        (dataset.drop_bad_rows, (["input", "output"],), "Bad rows dropped!"),
        (dataset.create_prompt, (), "Prompt column created!"),
    ]

    for step, args, done_message in pipeline:
        step(*args)
        logger.info(done_message)

    return dataset.get_dataset()

def create_dataset_hf(dataset: pd.DataFrame, seed=None) -> DatasetDict:
    """
    Create a Hugging Face DatasetDict with train, test and validation splits
    from a pandas dataframe.

    10% of the rows become the test split; 10% of the remaining rows become
    the validation split; the rest is the train split. The input DataFrame is
    not modified.

    :param dataset: The pandas dataframe.
    :param seed: Optional integer seed applied to the shuffle and to both
        splits so the result is reproducible. None (the default) keeps the
        splits random.
    :return: The Hugging Face dataset.
    """
    # DataFrame.sample returns a new frame, so its result has to be kept for
    # the shuffle to take effect. Working on that copy also leaves the
    # caller's index alone.
    shuffled = dataset.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Convert the pandas DataFrame to a Hugging Face Dataset
    dataset_hf = Dataset.from_pandas(shuffled)

    # Split the dataset into train, test, and validation sets
    train_test_split = dataset_hf.train_test_split(test_size=0.1, seed=seed)
    train_val_split = train_test_split['train'].train_test_split(test_size=0.1, seed=seed)

    return DatasetDict({
        "train": train_val_split['train'],
        "test": train_test_split['test'],
        "validation": train_val_split['test']
    })

# Run the preparation pipeline on the wrapped dataset, then convert the
# resulting DataFrame into a Hugging Face DatasetDict (train/test/validation).
dataset = process_dataset(dataset_abb)
dataset_hf = create_dataset_hf(dataset)

# Uncomment the following line to push the dataset to the Hugging Face Hub.
# NOTE(review): `dataset_name` is not defined in the cells shown here; define it first.
# dataset_hf.push_to_hub(f"llama3_{dataset_name}_instruct_dataset")

Read 2528 records from data/llm_tasks\classification_agendapunt_openai_dataset_meta.jsonl
Read 17 records from data/llm_tasks\classification_bpmn_openai_dataset_meta.jsonl
Read 1429 records from data/llm_tasks\translate_agendapunten_openai_dataset_meta.jsonl
Read 3099 records from data/llm_tasks\translate_agendapunt_openai_dataset_meta.jsonl
Read 31 records from data/llm_tasks\translate_bpmn_openai_dataset_meta.jsonl
                                      system_message  \
0  Your task is to generate responses in JSON for...   
1  Your task is to generate responses in JSON for...   
2  Your task is to generate responses in JSON for...   
3  Your task is to generate responses in JSON for...   
4  Your task is to generate responses in JSON for...   

                                              prompt  \
0  ####\nContext: {"uri": "https://hoogstraten.me...   
1  ####\nContext: [{"uri": "http://data.lblod.inf...   
2  ####\nContext: {"uri": "http://data.lblod.info...   
3  ####\nContext: 