In [1]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [2]:
from unsloth import FastLanguageModel
import torch
from transformers import TrainingArguments
from trl import SFTTrainer
from datasets import Dataset, DatasetDict
import pandas as pd
from transformers import TrainingArguments
from trl import DPOTrainer, DPOConfig
from unsloth import is_bfloat16_supported
from unsloth import PatchDPOTrainer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
dev_file_path = './subtask_3_dev.csv'
train_data_path = './train_data_balanced.csv'

df_dev = pd.read_csv(dev_file_path)
df_train = pd.read_csv(train_data_path)

In [4]:
def replace_abbreviations(df, mapping):
    df['categories'] = df['categories'].replace(mapping, regex=True)
    df['subcategories'] = df['subcategories'].replace(mapping, regex=True)

    return df

abbreviation_mapping = {
        "URW": "Ukraine-Russia War",
        "CC": "Climate Change"
    }


df_train = replace_abbreviations(df_train, abbreviation_mapping)
df_dev = replace_abbreviations(df_dev, abbreviation_mapping)

In [5]:
ALPACA_PROMPT = """Below is an instruction describing a task, paired with input providing the article's content and its narrative. Write a detailed explanation (at least 80 words) in {language_1} that justifies why the given narrative applies to the article.

### Instruction:
{instruction}

### Input:
Document:
{document}

The main narrative of the article is "{narrative}".

Task:
Analyze the document and provide a detailed explanation (at least 80 words) in {language_2} showing how the narrative is reflected in the document.

### Response:
"""

EOS_TOKEN = "<|endoftext|>"

In [6]:
def formatting_prompts_func(examples, mode="sft"):
    # Language mapping for better clarity in prompts
    language_mapping = {
        "BG": "Bulgarian",
        "EN": "English",
        "HI": "Hindi",
        "PT": "Portuguese"
    }

    # Map languages to full names
    languages = [language_mapping.get(lang, "English") for lang in examples['language']]
    categories = examples['categories']
    subcategories = examples['subcategories']
    documents = examples['paragraph']

    if mode in ["orpo", "dpo"]:
        # Specific to ORPO/DPO modes
        prompts = []
        for document, category, subcategory, language in zip(documents, categories, subcategories, languages):
            narrative = subcategory if subcategory and subcategory.lower() != 'none' else category
            instruction = f"Provide an explanation on why the article reflects the given narrative in {language}."
            prompts.append(
                ALPACA_PROMPT.format(
                    language_1=language,
                    instruction=instruction,
                    document=document,
                    narrative=narrative,
                    language_2=language
                )
            )
        examples["prompt"] = prompts
        examples["chosen"] = examples['explanation']
        examples["rejected"] = examples['generated']
        return examples

    elif mode == "sft":
        # Specific to SFT mode
        outputs = examples.get("explanation", [""] * len(documents))
        texts = []
        for document, category, subcategory, language, output in zip(documents, categories, subcategories, languages, outputs):
            narrative = subcategory if subcategory and subcategory.lower() != 'none' else category
            instruction = f"Provide an explanation on why the article reflects the given narrative in {language}."
            texts.append(
                ALPACA_PROMPT.format(
                    language_1=language,
                    instruction=instruction,
                    document=document,
                    narrative=narrative,
                    language_2=language,
                    output=output
                ) + EOS_TOKEN
            )
        return {"text": texts}

    else:
        raise ValueError("Invalid mode specified. Choose 'sft', 'orpo', or 'dpo'.")

In [7]:
train_dataset = Dataset.from_pandas(df_train).map(lambda x: formatting_prompts_func(x, mode="sft"), batched=True)
dev_dataset = Dataset.from_pandas(df_dev).map(lambda x: formatting_prompts_func(x, mode="sft"), batched=True)

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Map:   0%|          | 0/112 [00:00<?, ? examples/s]

In [8]:
print(f"Train dataset (SFT): {train_dataset}")
print(f"Dev dataset (SFT): {dev_dataset}")

Train dataset (SFT): Dataset({
    features: ['language', 'filename', 'categories', 'subcategories', 'paragraph', 'model', 'explanation', 'generated', 'precision', 'recall', 'f1', 'text'],
    num_rows: 1012
})
Dev dataset (SFT): Dataset({
    features: ['language', 'filename', 'categories', 'subcategories', 'paragraph', 'explanation', 'text'],
    num_rows: 112
})


In [9]:
dpo_dataset = Dataset.from_pandas(df_train).map(lambda x: formatting_prompts_func(x, mode="dpo"), batched=True)

print(len(dpo_dataset))

print(f"dpo dataset: {dpo_dataset}")

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

1012
dpo dataset: Dataset({
    features: ['language', 'filename', 'categories', 'subcategories', 'paragraph', 'model', 'explanation', 'generated', 'precision', 'recall', 'f1', 'prompt', 'chosen', 'rejected'],
    num_rows: 1012
})


In [10]:
max_seq_length = 2048

In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
    device_map = "auto",
    # llm_int8_enable_fp32_cpu_offload = True
    token = 'HF_API_KEY_IRA'
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

==((====))==  Unsloth 2024.11.10: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.9. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

In [None]:
trainer_sft = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="sft_outputs",
        report_to="none",
    ),
)

print("Starting SFT training...")
trainer_stats = trainer_sft.train()
trainer_sft.save_model("sft_outputs")

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Starting SFT training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,012 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1,2.105
2,2.1357
3,1.5966
4,2.0523
5,2.0607
6,1.8205
7,1.7113
8,1.9937
9,1.489
10,1.8031


In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="irasalsabila/sft_lora_model_adapters",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
    device_map="auto",
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

==((====))==  Unsloth 2024.12.1: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.9. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

Unsloth 2024.12.1 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.
Unsloth: Already have LoRA adapters! We shall skip this step.


In [None]:
PatchDPOTrainer()

dpo_trainer = DPOTrainer(
    model = model,
    ref_model = None,
    args = DPOConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_ratio = 0.1,
        num_train_epochs = 2,
        learning_rate = 5e-6,
        max_steps = 200,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.0,
        lr_scheduler_type = "linear",
        seed = 42,
        output_dir = "dpo_outputs",
        report_to = "none",
    ),
    beta = 0.1,
    train_dataset = dpo_dataset,
    tokenizer = tokenizer,
    max_length = 1024,
    max_prompt_length = 512,
)

print("Starting DPO training...")
dpo_trainer.train()
dpo_trainer.save_model("dpo_outputs")

Extracting prompt from train dataset:   0%|          | 0/1012 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/1012 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1012 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Starting DPO training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,012 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 200
 "-____-"     Number of trainable parameters = 24,313,856
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.9714,-1.641005,-1.25936,0.375,-0.381644,-105.20369,-116.936531,-0.047882,-0.168044
2,0.4667,-1.428959,-2.100052,0.875,0.671093,-104.510101,-126.508438,-0.039295,0.027726
3,0.8203,-1.429058,-1.484789,0.625,0.055731,-251.180176,-140.121292,0.106006,0.294256
4,0.4693,-1.080635,-1.75364,1.0,0.673006,-144.885864,-162.643097,0.288661,0.136359
5,0.9638,-1.326253,-0.932947,0.25,-0.393306,-98.395332,-130.468948,0.310458,0.051275
6,1.0015,-1.056353,-0.659392,0.25,-0.396961,-130.966766,-147.299026,0.917548,0.831791
7,0.837,-1.88469,-1.777731,0.5,-0.106959,-176.104706,-168.328629,-0.044241,0.27557
8,0.8071,-0.937725,-0.76361,0.375,-0.174115,-125.123718,-161.611267,0.852432,1.038551
9,0.8237,-1.304362,-1.250274,0.625,-0.054088,-154.106628,-134.506378,0.303803,0.404988
10,0.9575,-1.753181,-1.404653,0.5,-0.348529,-141.649826,-141.992264,0.595211,0.297057


In [None]:
if True: model.save_pretrained_merged("dpo_lora_model_adapters ", tokenizer, save_method="lora")
if True: model.push_to_hub_merged("irasalsabila/dpo_lora_model_adapters", tokenizer, save_method="lora", token = 'HF_API_KEY_IRA')

Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... Done.
Unsloth: Saving LoRA adapters. Please wait...


README.md:   0%|          | 0.00/583 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Saved lora model to https://huggingface.co/irasalsabila/dpo_lora_model_adapters


In [11]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="irasalsabila/dpo_lora_model_adapters",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
    device_map="auto",
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

==((====))==  Unsloth 2024.12.2: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

Unsloth 2024.12.2 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.
Unsloth: Already have LoRA adapters! We shall skip this step.


In [12]:
import re

def generate_responses(dataset, model, tokenizer, output_file='./predictions.txt', max_retries=3):
    # Language mapping for clarity
    language_mapping = {
        "EN": "English",
        "BG": "Bulgarian",
        "HI": "Hindi",
        "PT": "Portuguese"
    }

    responses = []

    try:
        # Convert dataset to a list of dictionaries if it's a DataFrame
        if isinstance(dataset, pd.DataFrame):
            dataset = dataset.to_dict('records')

        for sample in dataset:
            FastLanguageModel.for_inference(model)

            retries = 0
            valid_response = False

            # Map language code to full name
            language = language_mapping.get(sample.get('language', 'EN'), 'English')
            instruction = "Analyze the document and explain how the given narrative applies."

            # Proper narrative logic
            subcategory = sample.get('subcategories', '')
            category = sample.get('categories', '')
            narrative = subcategory if subcategory and subcategory.lower() != 'none' else category

            while not valid_response and retries < max_retries:
                retries += 1
                # Format the prompt
                prompt = ALPACA_PROMPT.format(
                    language_1=language,
                    instruction=instruction,
                    document=sample['paragraph'],
                    narrative=narrative,
                    language_2=language
                )

                # Tokenize the prompt
                inputs = tokenizer(
                    [prompt],
                    return_tensors="pt",
                ).to("cuda")

                # Generate response
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=128,
                    use_cache=True,
                    temperature=0.7 + (0.05 * retries),
                    eos_token_id=tokenizer.eos_token_id
                )

                response = tokenizer.batch_decode(outputs)[0].split(':')[-1].strip()
                response = response.replace(EOS_TOKEN, '').strip()

                if len(response.split()) >= 20:
                    valid_response = True
                else:
                    print(f"Retry {retries} for article: {sample['filename']} - Response too short.")

            if not valid_response:
                narrative_segments = re.split(r'\s*:\s*', narrative)

                explanation_parts = []
                if len(narrative_segments) > 0:
                    explanation_parts.append(f"about {narrative_segments[0].lower()}")
                if len(narrative_segments) > 1:
                    explanation_parts.append(f"{narrative_segments[1].lower()}")
                if len(narrative_segments) > 2:
                    explanation_parts.append(f"and {narrative_segments[2].lower()}")

                response = f"The article is talking {', '.join(explanation_parts)}."

            # Append the response for this article
            responses.append({"article_id": sample['filename'], "generated": response})

        # Save predictions to file
        df_predictions = pd.DataFrame(responses)
        df_predictions.to_csv(output_file, sep='\t', index=False, header=False, encoding='utf-8')
        print(f"Predictions for articles saved: {output_file}")
        return df_predictions

    except Exception as e:
        print(f"An error occurred during evaluation: {e}")
        return None

In [13]:
dpo_predictions = generate_responses(
    dataset=dev_dataset,
    model=model,
    tokenizer=tokenizer,
    output_file='./dpo_predictions.txt'
)

Retry 1 for article: A9_BG_2592.txt - Response too short.
Retry 2 for article: A9_BG_2592.txt - Response too short.
Retry 3 for article: A9_BG_2592.txt - Response too short.
Retry 1 for article: A9_BG_2828.txt - Response too short.
Retry 2 for article: A9_BG_2828.txt - Response too short.
Retry 3 for article: A9_BG_2828.txt - Response too short.
Retry 1 for article: A9_BG_2756.txt - Response too short.
Retry 2 for article: A9_BG_2756.txt - Response too short.
Retry 3 for article: A9_BG_2756.txt - Response too short.
Retry 1 for article: EN_CC_200070.txt - Response too short.
Retry 2 for article: EN_CC_200070.txt - Response too short.
Retry 3 for article: EN_CC_200070.txt - Response too short.
Retry 1 for article: EN_CC_200049.txt - Response too short.
Retry 2 for article: EN_CC_200049.txt - Response too short.
Retry 3 for article: EN_CC_200049.txt - Response too short.
Retry 1 for article: EN_CC_200036.txt - Response too short.
Retry 2 for article: EN_CC_200036.txt - Response too short

In [None]:
dpo_predictions