# Building training dataset from OASST2

# Preparation

## Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from src.convert_to_andalusian_spanish import AndalusianConversor
from transformers import AutoTokenizer
import plotly.express as px
import os

## Data

In [None]:
# Read the raw OASST2 message dump (one JSON object per line)
filename = os.path.join('data', 'raw', '2023-11-05_oasst2_all.messages.jsonl')
raw_df = pd.read_json(filename, lines=True)

# Keep only the rows tagged lang = 'es', then discard the columns that are not needed
to_drop = ['user_id', 'created_date', 'lang', 'emojis', 'model_name']
raw_df = raw_df.loc[raw_df['lang'] == 'es'].drop(columns=to_drop)

print(raw_df.shape)
raw_df.head()

# Design working dataframe

## General format

In [None]:
# Create base copy
filtered_df = raw_df.copy()

# Keep only rows where review_result is 1 (every other row is dropped)
filtered_df = filtered_df[filtered_df['review_result'] == 1]

# Keep needed columns
to_keep = ['message_id', 'text', 'role', 'message_tree_id', 'parent_id']
filtered_df = filtered_df[to_keep]

# Convert IDs to integers: each distinct message_id gets a consecutive integer
# starting at 1, so 0 is never assigned and can mean "no ID" below
id_dict = {k: v for v, k in enumerate(filtered_df['message_id'].unique(), start=1)}
to_map = ['message_id', 'message_tree_id', 'parent_id']
for col in to_map:
    # Values that are not keys of id_dict come out of map() as NaN
    filtered_df[col] = filtered_df[col].map(id_dict)

# Fill NaNs with 0
# NOTE(review): a parent_id / message_tree_id pointing at a message removed by the
# review filter above is not in id_dict either, so it also becomes 0 and the row
# then looks like a thread root - confirm this is intended.
to_fill = ['message_tree_id', 'parent_id']
filtered_df[to_fill] = filtered_df[to_fill].fillna(0).astype(int)

# Set message_id as index
filtered_df.set_index('message_id', inplace=True)

print(filtered_df.shape)
filtered_df.head()

In [None]:
# Save checkpoint
# Store an independent copy (as the 'trimmed_transcript' checkpoint does) so that
# later in-place edits of filtered_df cannot alter the saved state.
checkpoint = {'filtered_df': filtered_df.copy()}

## Obtain 2-element conversations

In [None]:
# Load checkpoint
# Work on a copy: the following cells add a 'path' column to temp_df, which would
# otherwise be written into the checkpointed frame as well.
temp_df = checkpoint['filtered_df'].copy()

In [None]:
# Helper that returns, for one row of temp_df, the IDs of the messages in the same thread
def get_path(row: pd.Series,
             df: pd.DataFrame = temp_df) -> list:
    """
    Return the chain of message IDs that leads from the top of the thread to this message.

    Args:
    row (pd.Series): A row from a DataFrame. Its label (row.name) is used as the
        message ID and row['parent_id'] as the ID of its parent (0 = no parent).
    df (pd.DataFrame): Frame indexed by message ID, used to look up the
        'parent_id' of every ancestor.

    Returns:
    list: Message IDs ordered from the oldest ancestor down to the message itself.
    """

    # Walk upwards, collecting ancestors from nearest to farthest
    ancestors = []
    current_id = row['parent_id']
    while current_id != 0:
        ancestors.append(current_id)
        current_id = df.loc[current_id, 'parent_id']

    # Oldest ancestor first, current message last
    return ancestors[::-1] + [row.name]

# Row-wise apply: every message gets the list of IDs returned by get_path,
# stored in a new 'path' column
temp_df['path'] = temp_df.apply(get_path, axis=1)
display(temp_df)

In [None]:
# Select the paths made of exactly two message IDs
mask = temp_df['path'].map(len) == 2
two_element_paths = temp_df.loc[mask, 'path'].tolist()
print('Two-element paths:', len(two_element_paths))

In [None]:
def build_conversation_dict(path: list,
                       df: pd.DataFrame = temp_df) -> dict:
    """
    Build a two-message conversation from a list of message IDs as a dictionary.

    Args:
    path (list): A list of message IDs; the first two entries are used.
    df (pd.DataFrame): A DataFrame with the messages, indexed by message ID
        and holding the message body in its 'text' column.

    Returns:
    dict: A dictionary with the keys 'input' (text of path[0]),
        'output' (text of path[1]) and 'path' (the list that was passed in).
    """

    conversation_dict = {
        'input': df.loc[path[0], 'text'],
        'output': df.loc[path[1], 'text'],
        'path': path,
    }

    return conversation_dict

# One dictionary per two-element path, collected into a DataFrame
conversation_dict_list = list(map(build_conversation_dict, two_element_paths))
conversation_df = pd.DataFrame(conversation_dict_list)
print(conversation_df.shape)
conversation_df.head()

In [None]:
# Save checkpoint
# Store an independent copy (as the 'trimmed_transcript' checkpoint does) so that
# later in-place edits of conversation_df cannot alter the saved state.
checkpoint['conversation_df'] = conversation_df.copy()

### Apply Andalusian transliteration

In [None]:
# Load checkpoint
# Work on a copy: the conversion cell overwrites 'input' and 'output' in place, so
# without the copy the checkpoint itself would hold converted text and re-running
# from here would convert it a second time.
conversation_df = checkpoint['conversation_df'].copy()

In [None]:
# Run both text columns through the Andalusian Spanish conversor
conversor = AndalusianConversor()
for column in ('input', 'output'):
    conversation_df[column] = conversation_df[column].apply(conversor.convert)
print(f'Final shape: {conversation_df.shape}')

In [None]:
# Test conversor
# Manual spot check on a single sentence; relies on the `conversor` instance
# created in the conversion cell
conversor.convert('Paula tiene un perro llamado Mushu, ¿a que todos amamos a Mushu? ')

In [None]:
# Report the (rows, columns) shape of the converted conversations
print(f'Final shape: {conversation_df.shape}')

In [None]:
# Show results
# Lift the column-width limit so the full text of each message is displayed
pd.set_option('display.max_colwidth', None)
# Random sample of 5 rows (no seed is set, so the rows differ between runs)
conversation_df.sample(5)

In [None]:
# Save checkpoint
# Store an independent copy (as the 'trimmed_transcript' checkpoint does) so that
# later in-place edits of conversation_df cannot alter the saved state.
checkpoint['andalusian_transcript'] = conversation_df.copy()

### Trim maximum token length

In [None]:
# Load checkpoint
# Work on a copy: a 'formatted' column is added to temp_df further down, which
# would otherwise be written into the checkpointed frame as well.
temp_df = checkpoint['andalusian_transcript'].copy()

In [None]:
# Load the tokenizer of the base model
base_model_id = "mistralai/Mistral-7B-v0.1"

# Left padding, with BOS and EOS tokens enabled
tokenizer_options = dict(
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
tokenizer = AutoTokenizer.from_pretrained(base_model_id, **tokenizer_options)

# Reuse the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

#### Input length distribution

In [None]:
# Number of tokens returned by tokenizer.tokenize for every input
input_lengths = temp_df['input'].map(lambda text: len(tokenizer.tokenize(text)))

# Histogram of the input lengths, with a centred title
fig = px.histogram(input_lengths, title='Input Length Distribution')
fig.update_layout(title_x=0.5)
fig.show()

In [None]:
# Cumulative histogram of the input lengths, normalised to probabilities
axis_labels = {
    'value': 'Input Length',
    'probability': 'Cumulative Probability',
    'variable': 'Frequency',
}
fig = px.histogram(
    input_lengths,
    title='Input Length Cumulative Distribution',
    cumulative=True,
    histnorm='probability',
    labels=axis_labels,
)
fig.update_layout(title_x=0.5)
fig.show()

In [None]:
# Share of inputs whose token count exceeds the threshold
max_length = 250
mask = input_lengths > max_length
long_inputs = input_lengths.loc[mask]
n_long, n_total = len(long_inputs), len(input_lengths)
percentage = n_long / n_total * 100
print(f'Percentage of inputs longer than {max_length} tokens: {percentage:.2f}%')

#### Output length distribution

In [None]:
# Number of tokens returned by tokenizer.tokenize for every output
output_lengths = temp_df['output'].map(lambda text: len(tokenizer.tokenize(text)))

# Histogram of the output lengths, with a centred title
fig = px.histogram(output_lengths, title='Output Length Distribution')
fig.update_layout(title_x=0.5)
fig.show()

In [None]:
# Cumulative histogram of the output lengths, normalised to probabilities
axis_labels = {
    'value': 'Output Length',
    'probability': 'Cumulative Probability',
    'variable': 'Frequency',
}
fig = px.histogram(
    output_lengths,
    title='Output Length Cumulative Distribution',
    cumulative=True,
    histnorm='probability',
    labels=axis_labels,
)
fig.update_layout(title_x=0.5)
fig.show()

In [None]:
# Share of outputs whose token count exceeds the threshold
max_length = 1600
mask = output_lengths > max_length
long_outputs = output_lengths.loc[mask]
n_long, n_total = len(long_outputs), len(output_lengths)
percentage = n_long / n_total * 100
print(f'Percentage of outputs longer than {max_length} tokens: {percentage:.2f}%')

#### Combined distribution

In [None]:
# Prompt template used to join one input/output pair into a single string
def formatting_func(example):
    """
    Render one example as a single prompt string.

    Args:
    example: Any mapping-like object (e.g. a dict or a DataFrame row) that
        provides the keys 'input' and 'output'.

    Returns:
    str: The input and the output, each preceded by its '###' header and
        separated by a newline followed by one space.
    """
    question = example['input']
    answer = example['output']
    return f"### Preƨunʌa: {question}\n ### Γeьpueьʌa: {answer}"

In [None]:
# Apply formatting function
# Row-wise apply: the string built from each row's 'input' and 'output' is stored
# in a new 'formatted' column
temp_df['formatted'] = temp_df.apply(formatting_func, axis=1)

In [None]:
# Number of tokens returned by tokenizer.tokenize for every formatted example
combined_lengths = temp_df['formatted'].map(lambda text: len(tokenizer.tokenize(text)))

# Histogram of the combined lengths, with a centred title
fig = px.histogram(combined_lengths, title='Combined Length Distribution')
fig.update_layout(title_x=0.5)
fig.show()

In [None]:
# Cumulative histogram of the combined lengths.
# Unlike the input and output plots, histnorm is not set here, so the y axis
# shows cumulative counts rather than probabilities.
axis_labels = {
    'value': 'Combined Length',
    'probability': 'Cumulative Probability',
    'variable': 'Frequency',
}
fig = px.histogram(
    combined_lengths,
    title='Combined Length Cumulative Distribution',
    cumulative=True,
    labels=axis_labels,
)
fig.update_layout(title_x=0.5)
fig.show()

In [None]:
# Share (and count) of formatted examples whose token count exceeds the threshold
max_length = 1250
mask = combined_lengths > max_length
long_combined = combined_lengths.loc[mask]
n_long, n_total = len(long_combined), len(combined_lengths)
percentage = n_long / n_total * 100
print(f'Percentage of combined lengths longer than {max_length} tokens: {percentage:.2f}% (n = {len(long_combined)})')

In [None]:
# Share (and count) of formatted examples whose token count is below the threshold
# (max_length is used as the cut-off here even though it acts as an upper bound for "short")
max_length = 125
mask = combined_lengths < max_length
short_combined = combined_lengths.loc[mask]
n_short, n_total = len(short_combined), len(combined_lengths)
percentage = n_short / n_total * 100
print(f'Percentage of combined lengths shorter than {max_length} tokens: {percentage:.2f}% (n = {len(short_combined)})')

#### Trim to selected length

In [None]:
# Keep only the examples whose combined token count does not exceed the limit
max_allowed_length = 1250
print(f'Number of examples before filtering: {len(temp_df)}')
within_limit = combined_lengths <= max_allowed_length
temp_df = temp_df.loc[within_limit]
print(f'Number of examples after filtering: {len(temp_df)}')
temp_df.head()

In [None]:
# Save checkpoint
# An independent copy is stored, so later edits of temp_df do not reach the checkpoint
checkpoint['trimmed_transcript'] = temp_df.copy()

### Save results

In [None]:
# Load checkpoint
# full_df refers to the checkpointed frame itself at this point (no copy is taken here)
full_df = checkpoint['trimmed_transcript']

In [None]:
# Reduce the frame to the two text columns, as an independent copy
to_keep = ['input', 'output']
full_df = full_df.loc[:, to_keep].copy()

#### Full dataset

One of the following modes must be set:
* **preprod**: used for hyperparameter tuning.
* **prod**: used for final model fine-tuning.

It only affects the final file naming, but ensures that the naming conventions are consistent across the repository.

In [None]:
# Set saving mode
mode = 'prod' # Select either 'preprod' or 'prod'

# Fail early on a typo: the mode becomes part of every output file name
if mode not in ('preprod', 'prod'):
    raise ValueError(f"mode must be 'preprod' or 'prod', got {mode!r}")

In [None]:
# Set saving settings
directory = os.path.join('data', 'processed')
filename = f'conversations_2E_ES_AND_{mode}_full.jsonl'
full_path = os.path.join(directory, filename)

# Create the output directory if it is missing, so that writing the files below
# does not fail on a fresh checkout
os.makedirs(directory, exist_ok=True)

In [None]:
# Shuffle dataset
# frac=1 samples every row once, i.e. a full shuffle; the fixed random_state makes
# the resulting order reproducible
full_df = full_df.sample(frac=1, random_state=33)

In [None]:
# Save as .jsonl
# orient='records' with lines=True writes one JSON object per row, one row per line
full_df.to_json(full_path, orient='records', lines=True)

#### Train and eval sets

If you plan to track the validation loss during training, it is recommended to set the number of evaluation instances to a fixed low number: since all samples are tested, keeping the usual 20 % of the data for evaluation results in longer training times.

In [None]:
# Set saving settings
# Number of rows taken from the end of the shuffled dataset for the validation set
n_eval_instances = 1000

In [None]:
# Split dataset
# The last n_eval_instances rows of the (already shuffled) frame form the validation
# set and everything before them the training set.
if n_eval_instances < 0:
    raise ValueError(f'n_eval_instances must be non-negative, got {n_eval_instances}')

# An explicit split point is used instead of negative slicing: with
# n_eval_instances = 0, full_df[:-0] would be empty and full_df[-0:] the whole frame.
split_at = max(len(full_df) - n_eval_instances, 0)
train_df = full_df.iloc[:split_at]
val_df = full_df.iloc[split_at:]
print(f'Training set shape: {train_df.shape}')
print(f'Validation set shape: {val_df.shape}')

In [None]:
# Save training set as .jsonl
filename = f'conversations_2E_ES_AND_{mode}_train.jsonl'
filepath = os.path.join(directory, filename)
# Write to the full path inside `directory`; passing the bare file name would put
# the file in the current working directory instead.
train_df.to_json(filepath, orient='records', lines=True)

In [None]:
# Save validation set as .jsonl
filename = f'conversations_2E_ES_AND_{mode}_val.jsonl'
filepath = os.path.join(directory, filename)
# Write to the full path inside `directory`; passing the bare file name would put
# the file in the current working directory instead.
val_df.to_json(filepath, orient='records', lines=True)