In [None]:
%%capture
# %%capture suppresses the pip output of this cell.
# Install the released Unsloth package first.
!pip install unsloth
# Also get the latest nightly Unsloth!
# The release is then uninstalled and replaced by the GitHub main branch;
# --no-deps means pip does not (re)install dependencies for this second install.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
import os
import re

# Unsloth stays first among the third-party imports, as in the original cell
# (it reports patching the training stack on import).
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
from unsloth import PatchDPOTrainer

import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import TrainingArguments
from trl import SFTTrainer
from trl import DPOTrainer, DPOConfig
from trl import ORPOConfig, ORPOTrainer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
# Locations of the dev and training CSV files (relative to the notebook).
dev_file_path = './subtask_3_dev.csv'
train_data_path = './train_data_balanced.csv'

# Read both files, dev first and train second, matching the path order above.
df_dev, df_train = (
    pd.read_csv(csv_path) for csv_path in (dev_file_path, train_data_path)
)

In [None]:
def replace_abbreviations(df, mapping):
    """Expand abbreviations in the ``categories`` and ``subcategories`` columns.

    Each key of ``mapping`` is treated as a literal abbreviation and is only
    replaced where it stands as a whole token (not embedded in a longer word),
    so ``"CC"`` is expanded in ``"CC: Criticism"`` but left alone in
    ``"ACCESS"``.

    Args:
        df: DataFrame containing ``categories`` and ``subcategories`` columns.
        mapping: Dict of abbreviation -> expansion,
            e.g. ``{"CC": "Climate Change"}``.

    Returns:
        A new DataFrame with both columns expanded. ``df`` itself is left
        untouched, so the cell can be re-run without compounding edits.

    Raises:
        KeyError: If either column is missing from ``df``.
    """
    if not mapping:
        return df.copy()

    # Escape the keys (they are abbreviations, not regexes) and require a
    # non-word character or string edge on both sides of the match.
    patterns = {
        rf"(?<!\w){re.escape(abbreviation)}(?!\w)": expansion
        for abbreviation, expansion in mapping.items()
    }

    # assign() returns a copy and keeps the original column positions.
    return df.assign(
        categories=df['categories'].replace(patterns, regex=True),
        subcategories=df['subcategories'].replace(patterns, regex=True),
    )

# Abbreviation -> full topic name used in the category labels.
abbreviation_mapping = dict(
    URW="Ukraine-Russia War",
    CC="Climate Change",
)

# Expand the abbreviations in both splits (train first, then dev).
df_train, df_dev = (
    replace_abbreviations(df_train, abbreviation_mapping),
    replace_abbreviations(df_dev, abbreviation_mapping),
)

In [None]:
# Preview the first two training rows.
df_train.head(2)

Unnamed: 0,language,filename,categories,subcategories,paragraph,model,explanation,generated,precision,recall,f1
0,BG,BG_670.txt,Ukraine-Russia War: Blaming the war on others ...,Ukraine-Russia War: Blaming the war on others ...,Опитът на колективния Запад да „обезкърви Руси...,meta,"Обвинява се управлението на Запада,че е претър...","""Уилкерсън коментира ситуацията в Украйна и по...",0.861923,0.910642,0.885613
1,BG,BG_1509.txt,Climate Change: Criticism of climate policies,Climate Change: Criticism of climate policies:...,За скъпия ток и жертвите на студа\n\n-4 градус...,gemma,Коментират се негативните последици върху бълг...,"""За скъпия ток и жертвите на студа",0.898388,0.859557,0.878543


In [None]:
# Prompt template shared by SFT formatting, ORPO/DPO formatting and inference.
# Placeholders: {language_1}, {instruction}, {document}, {narrative},
# {language_2}. The template deliberately ends right after "### Response:" —
# the target explanation (if any) is appended by the caller.
ALPACA_PROMPT = """Below is an instruction describing a task, paired with input providing the article's content and its narrative. Write a detailed explanation (at least 80 words) in {language_1} that justifies why the given narrative applies to the article.

### Instruction:
{instruction}

### Input:
Document:
{document}

The main narrative of the article is "{narrative}".

Task:
Analyze the document and provide a detailed explanation (at least 80 words) in {language_2} showing how the narrative is reflected in the document.

### Response:
"""

# End-of-sequence marker appended to SFT targets and stripped from generations.
# It must equal `tokenizer.eos_token` of the model loaded later in the notebook
# (a Llama-3.2 base checkpoint, whose EOS token is "<|end_of_text|>"). The
# previous GPT-2 style "<|endoftext|>" is not a special token for that
# tokenizer, so it would be encoded as ordinary text.
EOS_TOKEN = "<|end_of_text|>"

In [None]:
def _pick_narrative(category, subcategory):
    """Return the subcategory when it is a usable label, otherwise the category."""
    # Missing values may arrive as None or float NaN; neither has .lower().
    if isinstance(subcategory, str) and subcategory and subcategory.lower() != 'none':
        return subcategory
    return category


def _build_prompt(document, narrative, language):
    """Render ALPACA_PROMPT for one article, without any response text."""
    instruction = f"Provide an explanation on why the article reflects the given narrative in {language}."
    return ALPACA_PROMPT.format(
        language_1=language,
        instruction=instruction,
        document=document,
        narrative=narrative,
        language_2=language,
    )


def formatting_prompts_func(examples, mode="sft"):
    """Turn a batch of rows into training fields for SFT or ORPO/DPO.

    Args:
        examples: Batch mapping (column name -> list of values) with at least
            ``language``, ``categories``, ``subcategories`` and ``paragraph``.
            ORPO/DPO mode additionally reads ``explanation`` (chosen) and
            ``generated`` (rejected); SFT mode reads ``explanation`` when
            present.
        mode: ``"sft"``, ``"orpo"`` or ``"dpo"``.

    Returns:
        * ``"orpo"`` / ``"dpo"``: ``examples`` itself, with ``prompt``,
          ``chosen`` and ``rejected`` added.
        * ``"sft"``: ``{"text": [...]}`` where each entry is
          prompt + explanation + ``EOS_TOKEN``.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    if mode not in ("sft", "orpo", "dpo"):
        raise ValueError("Invalid mode specified. Choose 'sft', 'orpo', or 'dpo'.")

    # Language mapping for better clarity in prompts; unknown codes fall back
    # to English.
    language_mapping = {
        "BG": "Bulgarian",
        "EN": "English",
        "HI": "Hindi",
        "PT": "Portuguese"
    }
    languages = [language_mapping.get(lang, "English") for lang in examples['language']]

    # The prompt is identical for every mode; only the target fields differ.
    prompts = [
        _build_prompt(document, _pick_narrative(category, subcategory), language)
        for document, category, subcategory, language in zip(
            examples['paragraph'],
            examples['categories'],
            examples['subcategories'],
            languages,
        )
    ]

    if mode in ("orpo", "dpo"):
        examples["prompt"] = prompts
        examples["chosen"] = examples['explanation']
        examples["rejected"] = examples['generated']
        return examples

    # SFT: the template has no {output} field, so the explanation has to be
    # appended explicitly. Passing it to str.format() would silently drop it
    # and leave the training text with no target.
    outputs = examples.get("explanation", [""] * len(prompts))
    texts = [
        prompt + (output if isinstance(output, str) else "") + EOS_TOKEN
        for prompt, output in zip(prompts, outputs)
    ]
    return {"text": texts}

In [None]:
# Build the SFT datasets: each batch is formatted in "sft" mode, which adds a
# "text" column to the rows converted from the DataFrames.
train_dataset = Dataset.from_pandas(df_train).map(
    lambda batch: formatting_prompts_func(batch, mode="sft"),
    batched=True,
)
dev_dataset = Dataset.from_pandas(df_dev).map(
    lambda batch: formatting_prompts_func(batch, mode="sft"),
    batched=True,
)

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Map:   0%|          | 0/112 [00:00<?, ? examples/s]

In [None]:
# Show the schema (feature names) and row count of both SFT datasets.
print(f"Train dataset (SFT): {train_dataset}")
print(f"Dev dataset (SFT): {dev_dataset}")

Train dataset (SFT): Dataset({
    features: ['language', 'filename', 'categories', 'subcategories', 'paragraph', 'model', 'explanation', 'generated', 'precision', 'recall', 'f1', 'text'],
    num_rows: 1012
})
Dev dataset (SFT): Dataset({
    features: ['language', 'filename', 'categories', 'subcategories', 'paragraph', 'explanation', 'text'],
    num_rows: 112
})


In [None]:
# Preference dataset for ORPO: "orpo" mode adds prompt / chosen / rejected
# columns to the training rows.
orpo_dataset = Dataset.from_pandas(df_train).map(
    lambda batch: formatting_prompts_func(batch, mode="orpo"),
    batched=True,
)

print(len(orpo_dataset))

print(f"ORPO dataset: {orpo_dataset}")

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

1012
ORPO dataset: Dataset({
    features: ['language', 'filename', 'categories', 'subcategories', 'paragraph', 'model', 'explanation', 'generated', 'precision', 'recall', 'f1', 'prompt', 'chosen', 'rejected'],
    num_rows: 1012
})


In [None]:
# orpo_dataset['prompt'][500]

In [None]:
# Sequence-length budget (in tokens) passed to model loading and to the trainers.
max_seq_length = 2048

In [None]:
# Load the 4-bit quantised Llama-3.2-3B base model and its tokenizer.
# The Hugging Face token is read from the HF_API_KEY_IRA environment variable
# instead of being written into the notebook; when the variable is unset,
# None is passed and the hub client falls back to its default credentials.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
    device_map = "auto",
    # llm_int8_enable_fp32_cpu_offload = True
    token = os.environ.get("HF_API_KEY_IRA"),
)

# Attach LoRA adapters (rank 16) to the attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

In [None]:
# Pick the mixed-precision mode once: bf16 when the GPU supports it, else fp16.
bf16_available = torch.cuda.is_bf16_supported()

# Optimisation settings for the supervised fine-tuning stage.
sft_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=60,
    learning_rate=2e-4,
    fp16=not bf16_available,
    bf16=bf16_available,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="sft_outputs",
    report_to="none",
)

# The trainer reads the pre-rendered "text" column of the SFT dataset.
trainer_sft = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    args=sft_args,
)

print("Starting SFT training...")
trainer_stats = trainer_sft.train()
trainer_sft.save_model("sft_outputs")

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Starting SFT training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,012 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1,2.1012
2,2.1189
3,1.5747
4,2.0123
5,2.0288
6,1.7933
7,1.6966
8,1.9793
9,1.4624
10,1.7738


In [None]:
# Interactive Hugging Face login: prompts for an access token in the cell output.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineG

In [None]:
# Just LoRA adapters
# Save the SFT adapters locally, then push them to the Hub.
# The directory and repo names no longer carry a trailing space, so the repo id
# matches the one loaded in the next stage. The token comes from the
# HF_API_KEY_IRA environment variable rather than a string literal.
model.save_pretrained_merged("sft_lora_model_adapters", tokenizer, save_method="lora")
model.push_to_hub_merged(
    "irasalsabila/sft_lora_model_adapters",
    tokenizer,
    save_method="lora",
    token=os.environ.get("HF_API_KEY_IRA"),
)

Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... Done.
Unsloth: Saving LoRA adapters. Please wait...


No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


Saved lora model to https://huggingface.co/irasalsabila/lora_model_sft_adapters


In [None]:
# Reload the SFT-tuned adapters from the Hub as the starting point for ORPO.
sft_adapter_repo = "irasalsabila/sft_lora_model_adapters"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=sft_adapter_repo,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
    device_map="auto",
)

# Same LoRA configuration as the SFT stage. The recorded output shows Unsloth
# skipping this step because the checkpoint already carries adapters.
sft_lora_kwargs = dict(
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)
model = FastLanguageModel.get_peft_model(model, **sft_lora_kwargs)

==((====))==  Unsloth 2024.12.1: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.9. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

Unsloth 2024.12.1 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.
Unsloth: Already have LoRA adapters! We shall skip this step.


In [None]:
# Apply Unsloth's trainer patch before the ORPO trainer is constructed.
PatchDPOTrainer()
max_seq_length = 2048

# Shared values: precision mode, and half the sequence budget each for the
# prompt and the completion.
bf16_ok = is_bfloat16_supported()
half_length = max_seq_length // 2

orpo_args = ORPOConfig(
    max_length=max_seq_length,
    max_prompt_length=half_length,
    max_completion_length=half_length,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    beta=0.1,
    logging_steps=1,
    optim="adamw_8bit",
    lr_scheduler_type="linear",
    max_steps=30,
    fp16=not bf16_ok,
    bf16=bf16_ok,
    output_dir="orpo_outputs",
    report_to="none",
)

# Preference optimisation on the prompt / chosen / rejected columns.
orpo_trainer = ORPOTrainer(
    model=model,
    train_dataset=orpo_dataset,
    tokenizer=tokenizer,
    args=orpo_args,
)

print("Starting ORPO training...")
orpo_trainer.train()
orpo_trainer.save_model("orpo_outputs")

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,012 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 30
 "-____-"     Number of trainable parameters = 24,313,856


Starting DPO training...


Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,1.9236,-0.277143,-0.153647,0.0,-0.123496,-1.536465,-2.771425,-0.237027,-0.19608
2,1.7969,-0.298349,-0.186013,0.125,-0.112336,-1.860131,-2.983494,-0.112138,-0.070419
3,1.7712,-0.248786,-0.244897,0.125,-0.003889,-2.448971,-2.487863,0.098191,0.215949
4,1.7506,-0.213126,-0.194136,0.25,-0.01899,-1.941361,-2.13126,0.167579,0.185837
5,1.8118,-0.226196,-0.136083,0.0,-0.090113,-1.360826,-2.261955,0.055904,0.050173
6,1.3237,-0.145827,-0.09327,0.125,-0.052557,-0.932697,-1.458271,0.529247,0.561109
7,1.7096,-0.265886,-0.184944,0.125,-0.080943,-1.849436,-2.658862,0.156296,0.14014
8,1.3711,-0.1431,-0.119266,0.375,-0.023834,-1.192655,-1.430995,0.547147,0.601549
9,1.4922,-0.175451,-0.180875,0.625,0.005423,-1.808746,-1.754515,-0.067116,-0.05086
10,1.6652,-0.265936,-0.162486,0.0,-0.10345,-1.624861,-2.659362,0.361968,0.327738


In [None]:
# Save the ORPO-tuned LoRA adapters locally, then push them to the Hub.
# The local directory name no longer carries a trailing space, and the token
# comes from the HF_API_KEY_IRA environment variable rather than a string
# literal.
model.save_pretrained_merged("orpo_lora_model_adapters", tokenizer, save_method="lora")
model.push_to_hub_merged(
    "irasalsabila/orpo_lora_model_adapters",
    tokenizer,
    save_method="lora",
    token=os.environ.get("HF_API_KEY_IRA"),
)

Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... Done.
Unsloth: Saving LoRA adapters. Please wait...


README.md:   0%|          | 0.00/583 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Saved lora model to https://huggingface.co/irasalsabila/orpo_lora_model_adapters


In [None]:
# Reload the ORPO-tuned adapters from the Hub for inference.
orpo_adapter_repo = "irasalsabila/orpo_lora_model_adapters"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=orpo_adapter_repo,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
    device_map="auto",
)

# Same LoRA configuration as during training. The recorded output shows Unsloth
# skipping this step because the checkpoint already carries adapters.
orpo_lora_kwargs = dict(
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)
model = FastLanguageModel.get_peft_model(model, **orpo_lora_kwargs)

==((====))==  Unsloth 2024.12.1: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.9. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

Unsloth: Already have LoRA adapters! We shall skip this step.


In [None]:
import re

def _narrative_for_sample(sample):
    """Return the subcategory of a record when usable, otherwise its category."""
    subcategory = sample.get('subcategories', '')
    category = sample.get('categories', '')
    # DataFrame records encode missing cells as float NaN, which has no
    # .lower(); treat every non-string value as "no label".
    if isinstance(subcategory, str) and subcategory and subcategory.lower() != 'none':
        return subcategory
    return category if isinstance(category, str) else ''


def _fallback_explanation(narrative):
    """Build the canned one-sentence explanation from up to three narrative segments."""
    segments = re.split(r'\s*:\s*', narrative)
    parts = [f"about {segments[0].lower()}"]
    if len(segments) > 1:
        parts.append(f"{segments[1].lower()}")
    if len(segments) > 2:
        parts.append(f"and {segments[2].lower()}")
    return f"The article is talking {', '.join(parts)}."


def generate_responses(dataset, model, tokenizer, output_file='./predictions.txt', max_retries=3,
                       max_new_tokens=128):
    """Generate one explanation per article and write them to a TSV file.

    Args:
        dataset: DataFrame or iterable of dict-like records with ``filename``
            and ``paragraph`` and, optionally, ``language``, ``categories``
            and ``subcategories``.
        model: Language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        output_file: Destination path; written tab-separated, without header
            or index, as ``article_id<TAB>generated``.
        max_retries: Maximum number of generation attempts per article. A
            response counts as valid once it has at least 20
            whitespace-separated words.
        max_new_tokens: Generation budget per attempt (default 128, the
            previously hard-coded value).

    Returns:
        DataFrame with columns ``article_id`` and ``generated``, or ``None``
        if any exception occurred (the error is printed, not raised).
    """
    # Language mapping for clarity
    language_mapping = {
        "EN": "English",
        "BG": "Bulgarian",
        "HI": "Hindi",
        "PT": "Portuguese"
    }

    responses = []

    try:
        # Convert dataset to a list of dictionaries if it's a DataFrame
        if isinstance(dataset, pd.DataFrame):
            dataset = dataset.to_dict('records')

        # Inference mode is a property of the model, not of a sample: set it once.
        FastLanguageModel.for_inference(model)

        for sample in dataset:
            # Map language code to full name
            language = language_mapping.get(sample.get('language', 'EN'), 'English')
            # Use the same instruction wording the model was fine-tuned on.
            instruction = f"Provide an explanation on why the article reflects the given narrative in {language}."
            narrative = _narrative_for_sample(sample)

            # The prompt does not change between retries, so build and
            # tokenize it once per article.
            prompt = ALPACA_PROMPT.format(
                language_1=language,
                instruction=instruction,
                document=sample['paragraph'],
                narrative=narrative,
                language_2=language
            )
            inputs = tokenizer(
                [prompt],
                return_tensors="pt",
            ).to("cuda")
            prompt_length = inputs["input_ids"].shape[1]

            retries = 0
            valid_response = False
            response = ""

            while not valid_response and retries < max_retries:
                retries += 1

                # do_sample=True is required for `temperature` to have any
                # effect; without it every retry repeats the same greedy
                # decode.
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    use_cache=True,
                    do_sample=True,
                    temperature=0.7 + (0.05 * retries),
                    eos_token_id=tokenizer.eos_token_id
                )

                # Decode only the newly generated tokens. Splitting the full
                # text on ':' cut off any response that itself contained a
                # colon (e.g. a time such as "10:00").
                new_tokens = outputs[0][prompt_length:]
                response = tokenizer.decode(new_tokens, skip_special_tokens=True)
                response = response.replace(EOS_TOKEN, '').strip()

                if len(response.split()) >= 20:
                    valid_response = True
                else:
                    print(f"Retry {retries} for article: {sample['filename']} - Response too short.")

            if not valid_response:
                response = _fallback_explanation(narrative)

            # Append the response for this article
            responses.append({"article_id": sample['filename'], "generated": response})

        # Save predictions to file
        df_predictions = pd.DataFrame(responses)
        df_predictions.to_csv(output_file, sep='\t', index=False, header=False, encoding='utf-8')
        print(f"Predictions for articles saved: {output_file}")
        return df_predictions

    except Exception as e:
        # Best-effort contract kept from the original: report and return None.
        print(f"An error occurred during evaluation: {e}")
        return None

In [None]:
# Generate explanations for the dev split with the ORPO-tuned model and write
# them to ./orpo_predictions.txt.
orpo_predictions = generate_responses(
    df_dev,
    model,
    tokenizer,
    output_file='./orpo_predictions.txt',
)

Retry 1 for article: A9_BG_4076.txt - Response too short.
Retry 2 for article: A9_BG_4076.txt - Response too short.
Retry 3 for article: A9_BG_4076.txt - Response too short.
Retry 1 for article: A9_BG_8427.txt - Response too short.
Retry 2 for article: A9_BG_8427.txt - Response too short.
Retry 3 for article: A9_BG_8427.txt - Response too short.
Retry 1 for article: A9_BG_3819.txt - Response too short.
Retry 2 for article: A9_BG_3819.txt - Response too short.
Retry 3 for article: A9_BG_3819.txt - Response too short.
Retry 1 for article: A9_BG_2592.txt - Response too short.
Retry 2 for article: A9_BG_2592.txt - Response too short.
Retry 3 for article: A9_BG_2592.txt - Response too short.
Retry 1 for article: A9_BG_2828.txt - Response too short.
Retry 2 for article: A9_BG_2828.txt - Response too short.
Retry 3 for article: A9_BG_2828.txt - Response too short.
Retry 1 for article: A9_BG_4093.txt - Response too short.
Retry 2 for article: A9_BG_4093.txt - Response too short.
Retry 3 for ar

In [None]:
# Display the predictions table (None if generate_responses hit an error).
orpo_predictions

Unnamed: 0,article_id,generated
0,A9_BG_3970.txt,00 по московскому времени в Москве откроется м...
1,A9_BG_4076.txt,The article is talking about ukraine-russia wa...
2,A9_BG_8427.txt,The article is talking about ukraine-russia wa...
3,A9_BG_3819.txt,The article is talking about ukraine-russia wa...
4,A9_BG_2592.txt,The article is talking about ukraine-russia wa...
...,...,...
107,PT_234.txt,"O que é a ""Amplificação de Medo Climático""?\nA..."
108,PT_207.txt,"O que é a ""sustentabilidade"" e como ela se rel..."
109,PT_217.txt,O que é a cúpula de calor? Entenda fenómeno qu...
110,PT_229.txt,"O que é a ""sensação de perigo"" e como ela pode..."
