In [None]:
import warnings
# Silence every Python warning for the rest of the session: no category,
# message or module restriction is given, so nothing is exempt.
warnings.filterwarnings("ignore")

In [None]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# The base checkpoint is used instead of the -instruct one because it was the
# one given in the initial code.
model_name = "HuggingFaceTB/SmolLM-135M"

# Load the model and the tokenizer from the Hugging Face Hub
model = AutoModelForCausalLM.from_pretrained(model_name, device_map='cuda', use_cache=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, trust_remote_code=True,
                                          truncation=True, padding=True,
                                          return_tensors="pt")

# Check special tokens
print(f"EOS token --> {tokenizer.eos_token}")
print(f"BOS token --> {tokenizer.bos_token}")
print(f"PAD token --> {tokenizer.pad_token}")

# Use the EOS token as padding token. This matches the generate() calls in this
# notebook, which all pass pad_token_id=tokenizer.eos_token_id.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id
print(f"[UPDATE] PAD token --> {tokenizer.pad_token}")

In [None]:
# Display the model's `device` attribute to confirm where it was loaded.
model.device

In [None]:
# Quick test if your model works properly
def format_text(text: str) -> str:
    """Append the answer marker to a user prompt.

    Hook for whatever input formatting was adopted for training. The caller's
    prompt already carries the "Fix grammatically" instruction, so only the
    trailing marker is added here.

    Args:
        text: the user prompt, instruction included.

    Returns:
        ``text`` followed by the `` \\n ### Correct:`` marker.
    """
    answer_marker = " \n ### Correct:"
    return text + answer_marker


# Example of how to run inference on a single example
text = "Fix grammatically: I likes turtles"
# text = "Fix grammaticality: First of all, from you read just to found in the poems or novel what well-known critic have already found out, you looses the pleasures of reading something which is expecting to be a new experience to you."
inputs = tokenizer(format_text(text), return_tensors="pt", padding=True, truncation=True, max_length=128).to(model.device)
# Greedy decoding is requested explicitly with do_sample=False. A temperature of
# 0.0 is not a valid sampling temperature and is ignored when sampling is off.
outputs = model.generate(**inputs, max_new_tokens=128, do_sample=False,
                         pad_token_id=tokenizer.eos_token_id,
                         eos_token_id=tokenizer.eos_token_id,
                         )
# The decoded text contains the prompt followed by the model's continuation.
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(answer)

### Load dataset

In [None]:
from datasets import load_dataset

# Fetch the train and validation splits of CoEdIT
full_train_ds = load_dataset("grammarly/coedit", split="train")
full_test_ds = load_dataset("grammarly/coedit", split="validation")

# Inspect the schema and the set of tasks before filtering
print(f'--> Dataset strcutrue: \n {full_train_ds.features}\n')
print(f'--> Tasks in the dataset {set(full_train_ds["task"])}\n')

# Keep only the grammatical error correction (GEC) rows of each split
train_gec_ds = full_train_ds.filter(lambda row: row["task"] == "gec")
test_gec_ds = full_test_ds.filter(lambda row: row["task"] == "gec")

# Sanity-check the size of the filtered splits
assert len(train_gec_ds) == 19823, "Wrong number of train samples"
assert len(test_gec_ds) == 485, "Wrong number of test samples"

train_gec_ds, test_gec_ds

# A subset of 10 instances can be selected when compute is limited:
# toy_train_data = train_gec_ds.select(range(10))
# toy_test_data = test_gec_ds.select(range(10))

### Generate Preferences Data

#### Version 1 (slow)

In [None]:
from fast_edit_distance import edit_distance
import random
from tqdm.auto import tqdm

# TODO: Create preference optimization dataset

def generate_variants(model, tokenizer, input_text):
    """Generate two candidate completions for a single prompt.

    Variant 1 is decoded deterministically with 5-beam search. Variant 2 is
    sampled with top-k (k=50) at temperature 0.9.

    Args:
        model: causal language model exposing ``generate`` and ``device``.
        tokenizer: tokenizer used to encode the prompt and decode the outputs.
        input_text: the prompt string.

    Returns:
        Tuple ``(variant_1, variant_2)`` of decoded strings. Each contains only
        the newly generated text; the echoed prompt tokens are removed.
    """
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True,
                       max_length=128).to(model.device)
    # generate() on a decoder-only model returns the prompt tokens followed by
    # the continuation. Remember the prompt width so only the continuation is
    # decoded; otherwise the prompt leaks into the edit distance and into the
    # chosen/rejected texts.
    prompt_length = inputs["input_ids"].shape[1]

    # Variant 1: beam search decoding (deterministic, so no temperature is set).
    # NOTE(review): max_length counts the prompt tokens as well, so very long
    # prompts leave little or no room for new tokens - confirm this is intended.
    beam_output = model.generate(
        **inputs,
        max_length=128,
        num_beams=5,  # Use beam search with 5 beams
        do_sample=False,  # Deterministic output
        length_penalty=-1.0,  # Adjust length penalty
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id
    )
    variant_1 = tokenizer.decode(beam_output[0][prompt_length:], skip_special_tokens=True)

    # Variant 2: sampling with temperature
    sampling_output = model.generate(
        **inputs,
        max_length=128,
        temperature=0.9,  # Use temperature-based sampling
        top_k=50,  # Control diversity using top-k sampling
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    variant_2 = tokenizer.decode(sampling_output[0][prompt_length:], skip_special_tokens=True)

    return variant_1, variant_2

# Create a preference dataset by comparing edit distance
def create_preference_dataset(model, tokenizer, dataset):
    """Build preference pairs by ranking two generated variants per example.

    For every example the prompt ``"Fix grammatically: <src>"`` is sent to
    ``generate_variants``. The variant with the smaller edit distance to the
    reference correction (``tgt``) becomes ``chosen`` and the other ``rejected``.

    Note: a later cell of this notebook defines a batched function with the same
    name, which replaces this one once that cell has run.

    Args:
        model: causal language model forwarded to ``generate_variants``.
        tokenizer: tokenizer forwarded to ``generate_variants``.
        dataset: sized iterable of examples with ``'src'`` and ``'tgt'`` fields.

    Returns:
        List of dicts with keys ``input``, ``ground_truth``, ``variant_1``,
        ``variant_2``, ``chosen`` and ``rejected``.
    """
    variant_1_count = 0
    variant_2_count = 0
    preference_data = []

    # Loop over the training dataset
    for example in tqdm(dataset):
        # NOTE(review): this prompt is not passed through format_text(); confirm
        # it matches the format the generating model was trained on.
        input_text = f"Fix grammatically: {example['src']}"  # Input prompt
        ground_truth = example['tgt']  # Corrected sentence

        # Generate two variants for the input
        variant_1, variant_2 = generate_variants(model, tokenizer, input_text)

        # Measure edit distance between the variants and the ground truth
        dist_variant_1 = edit_distance(variant_1, ground_truth)
        dist_variant_2 = edit_distance(variant_2, ground_truth)

        # Label based on the smaller edit distance; ties go to variant 2
        if dist_variant_1 < dist_variant_2:
            chosen = variant_1
            rejected = variant_2
            variant_1_count += 1
        else:
            chosen = variant_2
            rejected = variant_1
            variant_2_count += 1

        # Add the comparison to the preference dataset
        preference_data.append({
            'input': input_text,
            'ground_truth': ground_truth,
            'variant_1': variant_1,
            'variant_2': variant_2,
            'chosen': chosen,
            'rejected': rejected
        })

    # Reporting statistics. The counts are scaled to real percentages (the value
    # is printed with a '%' sign), and an empty dataset must not divide by zero.
    total_examples = len(dataset)
    percent_per_example = 100 / total_examples if total_examples else 0.0
    norm_variant_1_count = variant_1_count * percent_per_example
    norm_variant_2_count = variant_2_count * percent_per_example
    print(f"Variant 1 count: {variant_1_count} ({norm_variant_1_count:.2f}%)")
    print(f"Variant 2 count: {variant_2_count} ({norm_variant_2_count:.2f}%)")

    return preference_data



In [None]:
import pandas as pd
# Smoke-test the preference generation on the first 10 GEC training examples.
toy_train_data = train_gec_ds.select(range(10))


preference_dataset = create_preference_dataset(model, tokenizer, toy_train_data)

# Put the preference records in a DataFrame; the parquet export is left disabled
df = pd.DataFrame(preference_dataset)
# df.to_parquet("dpo_preference_dataset.parquet")

#### Version 2 

In [None]:
import time

def timer(func):
    """Decorator that prints the execution time of every call to ``func``.

    Args:
        func: the callable to time.

    Returns:
        A wrapper with the same name, docstring and signature metadata as
        ``func``. It forwards all arguments, prints
        ``Execution time: <seconds> seconds`` and returns ``func``'s result.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(func)  # preserve __name__/__doc__ of the decorated function
    def wrapper(*args, **kwargs):
        # perf_counter is a monotonic clock intended for measuring intervals
        start_time = time.perf_counter()
        # call the decorated function
        result = func(*args, **kwargs)
        # compute the elapsed time and print it
        execution_time = time.perf_counter() - start_time
        print(f"Execution time: {execution_time} seconds")
        # return the result of the decorated function execution
        return result
    # return reference to the wrapper function
    return wrapper

In [None]:
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from fast_edit_distance import edit_distance


@timer
def generate_variants_batch(model, tokenizer, input_texts):
    """Generate two candidate completions for every prompt in a batch.

    Variant 1 is decoded deterministically with 5-beam search. Variant 2 is
    sampled with top-k (k=50) at temperature 0.9.

    Args:
        model: causal language model exposing ``generate`` and ``device``.
        tokenizer: tokenizer used to encode the prompts and decode the outputs.
        input_texts: list of prompt strings.

    Returns:
        Tuple ``(variants_1, variants_2)`` of lists of decoded strings, aligned
        with ``input_texts``. Each string contains only the newly generated
        text; the echoed prompt tokens are removed.
    """
    # Tokenize the batch of input texts
    inputs = tokenizer(
        input_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128
    ).to(model.device)
    # generate() on a decoder-only model returns the (padded) prompt tokens
    # followed by the continuation, so new tokens start at this column for
    # every row of the batch.
    prompt_length = inputs["input_ids"].shape[1]

    # Variant 1: beam search decoding (deterministic, so no temperature is set).
    # NOTE(review): max_length counts the prompt tokens as well, so very long
    # prompts leave little or no room for new tokens - confirm this is intended.
    beam_outputs = model.generate(
        **inputs,
        max_length=128,
        num_beams=5,  # Use beam search with 5 beams
        do_sample=False,  # Deterministic output
        length_penalty=-1.0,  # Adjust length penalty
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=1,  # Return only the best sequence
    )
    variants_1 = tokenizer.batch_decode(
        [sequence[prompt_length:] for sequence in beam_outputs],
        skip_special_tokens=True,
    )

    # Variant 2: sampling with temperature
    sampling_outputs = model.generate(
        **inputs,
        max_length=128,
        temperature=0.9,  # Use temperature-based sampling
        top_k=50,  # Control diversity using top-k sampling
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=1,
    )
    variants_2 = tokenizer.batch_decode(
        [sequence[prompt_length:] for sequence in sampling_outputs],
        skip_special_tokens=True,
    )

    return variants_1, variants_2


@timer
def create_preference_dataset(model, tokenizer, dataset, batch_size=5):
    """Build preference pairs in batches by ranking two generated variants.

    For every example the prompt ``"Fix grammatically: <src>"`` is sent to
    ``generate_variants_batch``. The variant with the smaller edit distance to
    the reference correction (``tgt``) becomes ``chosen`` and the other
    ``rejected``. The model is switched to evaluation mode.

    Note: this redefines the unbatched function of the same name from an
    earlier cell of this notebook.

    Args:
        model: causal language model forwarded to ``generate_variants_batch``.
        tokenizer: tokenizer forwarded to ``generate_variants_batch``.
        dataset: sized dataset whose batches expose ``'src'`` and ``'tgt'``.
        batch_size: number of examples generated per batch.

    Returns:
        List of dicts with keys ``input``, ``ground_truth``, ``variant_1``,
        ``variant_2``, ``chosen`` and ``rejected``.
    """
    variant_1_count = 0
    variant_2_count = 0
    preference_data = []

    # Create a DataLoader for batching
    dataloader = DataLoader(dataset, batch_size=batch_size)

    model.eval()  # Set the model to evaluation mode

    for batch in tqdm(dataloader):
        # NOTE(review): this prompt is not passed through format_text(); confirm
        # it matches the format the generating model was trained on.
        input_texts = [f"Fix grammatically: {src}" for src in batch['src']]
        ground_truths = batch['tgt']

        # Generate two variants for the batch of inputs
        variants_1, variants_2 = generate_variants_batch(model, tokenizer, input_texts)

        for input_text, ground_truth, variant_1, variant_2 in zip(
            input_texts, ground_truths, variants_1, variants_2
        ):
            # Measure edit distances between the variants and the ground truth
            dist_variant_1 = edit_distance(variant_1, ground_truth)
            dist_variant_2 = edit_distance(variant_2, ground_truth)

            # Label based on the smaller edit distance; ties go to variant 2
            if dist_variant_1 < dist_variant_2:
                chosen = variant_1
                rejected = variant_2
                variant_1_count += 1
            else:
                chosen = variant_2
                rejected = variant_1
                variant_2_count += 1

            # Add the comparison to the preference dataset
            preference_data.append({
                'input': input_text,
                'ground_truth': ground_truth,
                'variant_1': variant_1,
                'variant_2': variant_2,
                'chosen': chosen,
                'rejected': rejected
            })

    # Reporting statistics (an empty dataset must not divide by zero)
    total_examples = len(dataset)
    percent_per_example = 100 / total_examples if total_examples else 0.0
    norm_variant_1_count = variant_1_count * percent_per_example
    norm_variant_2_count = variant_2_count * percent_per_example
    print(f"Variant 1 chosen: {variant_1_count} ({norm_variant_1_count:.2f}%)")
    print(f"Variant 2 chosen: {variant_2_count} ({norm_variant_2_count:.2f}%)")

    return preference_data


In [None]:
import pandas as pd

from transformers import AutoModelForCausalLM, AutoTokenizer
# SFT checkpoint used to generate the preference candidates. The tokenizer is
# configured to pad on the left for batched generation.
_sft_checkpoint = "Huertas97/smollm-gec-sftt"
tokenizer = AutoTokenizer.from_pretrained(_sft_checkpoint, padding_side='left')
best_model = AutoModelForCausalLM.from_pretrained(
    _sft_checkpoint,
    device_map='auto',
    use_cache=True,
)

In [None]:


# Generate preference pairs for the first 1000 GEC training examples with the
# SFT model loaded above.
toy_train_data = train_gec_ds.select(range(1000))


preference_dataset = create_preference_dataset(best_model, tokenizer, toy_train_data)

# Put the preference records in a DataFrame; the parquet export is left disabled
df = pd.DataFrame(preference_dataset)
# df.to_parquet("dpo_preference_dataset.parquet")

In [None]:
# Persist the preference DataFrame as parquet in the working directory.
df.to_parquet("dpo_preference_dataset_1k.parquet")

### Train SFT+DPO

In [1]:
import os
from trl import DPOConfig, DPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset, load_dataset
import pandas as pd

# TODO: Run Direct Preference Optimization (DPO)


# Create DPO data with the required format with
# 3 entries: prompt, chosen, rejected
def return_prompt_and_responses(samples):
    """Map a batch of preference rows to prompt / chosen / rejected columns.

    Args:
        samples: batch (dict of lists) with ``input``, ``chosen`` and
            ``rejected`` entries.

    Returns:
        Dict with a ``prompt`` list, where each input is wrapped in the
        ``### Input`` / ``### Correct`` template, plus the ``chosen`` and
        ``rejected`` lists passed through unchanged.
    """
    prompts = []
    for source_text in samples["input"]:
        prompts.append(f"### Input: ```{source_text}```\n ### Correct: ")

    return {
        "prompt": prompts,
        "chosen": samples["chosen"],
        "rejected": samples["rejected"],
    }


# Read the generated preference pairs back from parquet
dpo_preference_dataset = load_dataset(
    "parquet",
    data_files={"train": "./dpo_preference_dataset_1k.parquet"},
)
original_columns = dpo_preference_dataset["train"].column_names

# Convert every split to the prompt/chosen/rejected layout, dropping the
# original columns, then keep the train split
_dpo_formatted = dpo_preference_dataset.map(
    return_prompt_and_responses,
    batched=True,
    remove_columns=original_columns,
)
dpo_train_dataset = _dpo_formatted["train"]

# Only the first 10 pairs are used for training
dpo_train_dataset = dpo_train_dataset.select(range(10))

In [None]:
# (Re)load the SFT checkpoint from the Hugging Face Hub in case it was not loaded
_sft_repo = "Huertas97/smollm-gec-sftt"
tokenizer = AutoTokenizer.from_pretrained(_sft_repo)
best_model = AutoModelForCausalLM.from_pretrained(
    _sft_repo, device_map='auto', use_cache=True
)

# Second, separately loaded copy of the same checkpoint
best_model_ref = AutoModelForCausalLM.from_pretrained(
    _sft_repo, device_map='auto', use_cache=True
)

### Sweep over hyperparameters

In [2]:
import wandb

def train_dpo():
    """Run one DPO training trial with hyperparameters from the active W&B sweep.

    Reads ``beta``, ``learning_rate``, ``epochs``, ``weight_decay``,
    ``lr_scheduler_type``, ``loss_type``, ``seed``,
    ``per_device_train_batch_size`` and (optionally)
    ``gradient_accumulation_steps`` from ``wandb.config``. It loads a fresh copy
    of the SFT checkpoint, trains it on the notebook-level ``dpo_train_dataset``
    and logs the resulting evaluation loss as ``final_eval_loss``, which is the
    metric the sweep optimises.

    Takes no arguments and returns nothing; it is meant to be passed to
    ``wandb.agent``.
    """
    # Initialize a new run for WandB
    wandb.init()

    # Access sweep-configured hyperparameters from WandB config
    config = wandb.config

    # Load the sftt trained model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained("Huertas97/smollm-gec-sftt", use_fast=True, trust_remote_code=True,
                                              truncation=True, padding=True,
                                              return_tensors="pt")
    best_model = AutoModelForCausalLM.from_pretrained("Huertas97/smollm-gec-sftt",
                                                      device_map='auto',
                                                      use_cache=True,
                                                      )

    # Configure DPO with hyperparameters from WandB config
    output_dir_dpo = "smollm-gec-sftt" + "-dpo"
    dpo_config = DPOConfig(
        output_dir = output_dir_dpo,
        beta=config.beta,
        learning_rate=config.learning_rate,
        num_train_epochs=config.epochs,
        weight_decay = config.weight_decay,
        lr_scheduler_type = config.lr_scheduler_type,
        loss_type=config.loss_type,
        seed=config.seed,
        per_device_train_batch_size=config.per_device_train_batch_size,
        # Swept hyperparameter; fall back to 1 when a sweep does not define it.
        gradient_accumulation_steps=config.get("gradient_accumulation_steps", 1),
        report_to="none",
    )
    # Initialize the DPOTrainer with the model, datasets, and DPO configuration
    dpo_trainer = DPOTrainer(
        model=best_model,
        # The reference must stay frozen, so it cannot be the very object being
        # optimised. With None the trainer builds its own reference copy.
        ref_model=None,
        args=dpo_config,
        train_dataset=dpo_train_dataset,
        # evaluate() below needs an evaluation set.
        # NOTE(review): no held-out preference split exists in this notebook, so
        # the metric is computed on the training pairs - consider a real split.
        eval_dataset=dpo_train_dataset,
        tokenizer=tokenizer,  # for visual language models, use tokenizer=processor instead
    )

    # Start training
    dpo_trainer.train()

    # Log the final metric that the sweep optimises
    wandb.log({"final_eval_loss": dpo_trainer.evaluate()["eval_loss"]})

    # Finish the WandB run
    wandb.finish()

In [3]:
import wandb
# Search space: keys with "values" are explored, keys with "value" are fixed.
_sweep_parameters = {
    # Candidate learning rates
    "learning_rate": {"values": [5e-5, 3e-5, 1e-4]},
    # Candidate weight decays
    "weight_decay": {"values": [0.0, 0.01, 0.1]},
    "epochs": {"values": [1]},
    # Gradient accumulation for smaller GPUs
    "gradient_accumulation_steps": {"values": [2, 4]},
    # Higher beta means less divergence from the initial policy.
    "beta": {"values": [0.1]},
    "loss_type": {"values": ["sigmoid", "robust"]},
    "lr_scheduler_type": {"values": ["linear", "cosine"]},
    "seed": {"value": 42},
    "per_device_train_batch_size": {"value": 1},
}

sweep_config = {
    "method": "bayes",  # 'grid' and 'random' are the other search strategies
    "metric": {"name": "final_eval_loss", "goal": "minimize"},
    "parameters": _sweep_parameters,
}

# Register the sweep, then run a single trial of it with train_dpo
sweep_id = wandb.sweep(sweep_config, project="C4AI-Challenge-smollm-sft-dpo")
wandb.agent(sweep_id, function=train_dpo, count=1)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: s2l1bgj5
Sweep URL: https://wandb.ai/huertas_97/C4AI-Challenge-smollm-sft-dpo/sweeps/s2l1bgj5


[34m[1mwandb[0m: Agent Starting Run: ljlo4sqj with config:
[34m[1mwandb[0m: 	beta: 0.1
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 3e-05
[34m[1mwandb[0m: 	loss_type: sigmoid
[34m[1mwandb[0m: 	lr_scheduler_type: linear
[34m[1mwandb[0m: 	per_device_train_batch_size: 1
[34m[1mwandb[0m: 	seed: 42
[34m[1mwandb[0m: 	weight_decay: 0.01
[34m[1mwandb[0m: Currently logged in as: [33mhuertas_97[0m. Use [1m`wandb login --relogin`[0m to force relogin




Tokenizing train dataset:   0%|          | 0/10 [00:00<?, ? examples/s]

Could not estimate the number of tokens of the input, floating-point operations will not be computed


VBox(children=(Label(value='0.012 MB of 0.012 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Run ljlo4sqj errored:
Traceback (most recent call last):
  File "/mnt/almacenamiento/miniconda3/envs/NLP_ENV/lib/python3.10/site-packages/wandb/agents/pyagent.py", line 306, in _run_job
    self._function()
  File "/tmp/ipykernel_405241/3470824841.py", line 49, in train_dpo
    dpo_trainer.train()
  File "/mnt/almacenamiento/miniconda3/envs/NLP_ENV/lib/python3.10/site-packages/transformers/trainer.py", line 1859, in train
    return inner_training_loop(
  File "/mnt/almacenamiento/miniconda3/envs/NLP_ENV/lib/python3.10/site-packages/transformers/trainer.py", line 2266, in _inner_training_loop
    self.optimizer.step()
  File "/mnt/almacenamiento/miniconda3/envs/NLP_ENV/lib/python3.10/site-packages/accelerate/optimizer.py", line 172, in step
    self.optimizer.step(closure)
  File "/mnt/almacenamiento/miniconda3/envs/NLP_ENV/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 75, in wrapper
    return wrapped(*args, **kwargs)
  File "/mnt/almacenamiento/miniconda3/envs/NLP_E

## mSTSB

In [12]:
import pandas as pd

# Load the multilingual STSb training frame and its cross-lingual v1 variant.
# NOTE(review): both files are unpickled from absolute, machine-specific paths.
# Unpickling can execute arbitrary code, so only load files from a trusted source.
df_mstsb_train = pd.read_pickle("/home/AI_projects/Multilingual-STSB/Data/Multi-STSB-train.pkl")
df_mstsb_train_v1 = pd.read_pickle("/home/AI_projects/Multilingual-STSB/Data/Cross-lingual/Train/mSTSb_train_crosslingual_v1.pkl")

In [11]:
# Per-language count of training pairs whose score is at least 0.7
_high_score_rows = df_mstsb_train["score"] >= 0.7
df_mstsb_train.loc[_high_score_rows].groupby("lang").count()

Unnamed: 0_level_0,stsb_train_1,stsb_train_2,score
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ar,2063,2063,2063
cs,2063,2063,2063
de,2063,2063,2063
en,2063,2063,2063
es,2063,2063,2063
fr,2063,2063,2063
hi,2063,2063,2063
it,2063,2063,2063
ja,2063,2063,2063
nl,2063,2063,2063


In [15]:
# Summarise columns, non-null counts, dtypes and memory use of the cross-lingual frame.
df_mstsb_train_v1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 178219 entries, 0 to 5748
Data columns (total 4 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   task                178219 non-null  object 
 1   stsb_train_1_lang1  178219 non-null  object 
 2   stsb_train_2_lang2  178219 non-null  object 
 3   score_lang2         178219 non-null  float64
dtypes: float64(1), object(3)
memory usage: 6.8+ MB


In [17]:
# Cross-lingual pairs whose score is at least 0.7
_high_score_rows_v1 = df_mstsb_train_v1["score_lang2"] >= 0.7
df_mstsb_train_v1.loc[_high_score_rows_v1]


Unnamed: 0,task,stsb_train_1_lang1,stsb_train_2_lang2,score_lang2
0,en;en,A plane is taking off.,An air plane is taking off.,1.00
1,en;en,A man is playing a large flute.,A man is playing a flute.,0.76
2,en;en,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,0.76
4,en;en,A man is playing the cello.,A man seated is playing the cello.,0.85
5,en;en,Some men are fighting.,Two men are fighting.,0.85
...,...,...,...,...
5668,en;zh-TW,Tokyo shares open higher over buoyant U.S. market,東京股市開盤走高追踪美國漲幅,0.80
5670,en;zh-TW,Hawaii preps for first hurricane in 22 years,夏威夷成為22年以來的第一場颶風,1.00
5673,en;zh-TW,Tokyo shares open higher on buoyant U.S. market,東京股市在美國強勁表現後開盤走高,0.80
5676,en;zh-TW,Lewis Hamilton Takes Pole for Russian GP,劉易斯·漢密爾頓（Lewis Hamilton）贏得首屆俄羅斯大獎賽,0.80


In [23]:
# Show full cell contents instead of truncating long sentences. This changes
# the pandas display option for the rest of the session.
pd.set_option('display.max_colwidth', None)
# Random 10-row preview; no random_state is given, so the rows differ between runs.
df_mstsb_train_v1.sample(10)

Unnamed: 0,task,stsb_train_1_lang1,stsb_train_2_lang2,score_lang2
469,ja;ja,男は椅子に座って見つめていました。,少女は髪をポニーテールに入れています。,0.0
1635,en;ar,A woman in a black dress smiles in front of a silver truck.,فتاة ترتدي قميصًا أسود تبتسم وشاحنة فضية في الخلفية.,0.72
104,en;es,A man is playing the piano.,El hombre toca el violín.,0.35
2990,en;en,Strayhorn said it was the first time in Texas history a comptroller had not certified the appropriations act.,"In a news release Thursday, Strayhorn said this was the first time a comptroller rejected a budget.",0.64
5045,tr;tr,"Çekimler, protestolar Tayland oylamasına gölge düşürdü",Taylandlı protestocular oylama sürecini aksattı,0.72
2989,ru;ru,"Лэй утверждал, что передача документов будет нарушением его прав по Пятой поправке против самооговора.","Лэй отказался передать документы, заявив о своем праве по Пятой поправке против самообвинения.",0.76
132,en;tr,A man and woman are driving down the street in a jeep.,Bir kadın ve bir erkek açık hava bir araçla yolda ilerliyor.,0.8
1043,en;nl,A women laying across two men sitting on a sofa.,Een man en twee vrouwen glimlachen naar de camera terwijl ze op een blauwe bank zitten.,0.44
4921,cs;cs,Americká stávka dronů zabila v Pákistánu 5 lidí,Pákistánská drone stávka zabije až šest,0.6
928,en;nl,A monkey is walking through the water.,Een man speelt op een trompet.,0.0
