# Loading the data from Hugging Face

In [None]:
from datasets import load_dataset
dataset = load_dataset("nazimali/kurdish-english-opus-100")

# Shuffle with a fixed seed so the subset is reproducible, then keep at most
# 20,000 training pairs. The cap is clamped to the split size so that a smaller
# split does not make select() fail with an out-of-range index.
train_split = dataset["train"]
train_20k = train_split.shuffle(seed=42).select(
    range(min(20000, len(train_split)))
)



# Loading the base NLLB model

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub checkpoint id. The same id is reused later in the notebook to load the
# generation config, so tokenizer, model and generation defaults stay in sync.
model_name = "facebook/nllb-200-distilled-600M"

# Tokenizer and seq2seq model are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


# Tokenization function

In [None]:
max_input_length = 128
max_target_length = 128

# NLLB language codes. The target code follows the "ckb" used in the training
# output directory name of this notebook.
# NOTE(review): confirm the Kurdish variety/script of the dataset -- switch to
# "kmr_Latn" if the text is Northern Kurdish written in Latin script.
src_lang_code = "eng_Latn"
tgt_lang_code = "ckb_Arab"


def tokenize(batch):
    """Tokenize a batch of English -> Kurdish pairs for seq2seq training.

    Args:
        batch: Mapping with an "english" and a "kurdish" column, each a list
            of strings (the shape ``Dataset.map(..., batched=True)`` passes).

    Returns:
        The encoding of the English sentences with an extra "labels" entry
        holding the token ids of the Kurdish sentences.

    Sequences are truncated but deliberately NOT padded here. Padding to
    ``max_length`` would put real pad-token ids into "labels", and those would
    be counted in the loss. The seq2seq data collator pads each batch instead.
    """
    # NLLB marks every sequence with a language-code token. Without an explicit
    # target language the labels would be tagged with the source language.
    tokenizer.src_lang = src_lang_code
    tokenizer.tgt_lang = tgt_lang_code

    model_inputs = tokenizer(
        batch["english"],
        truncation=True,
        max_length=max_input_length,
    )

    # text_target= makes the tokenizer encode in target mode (tgt_lang code).
    labels = tokenizer(
        text_target=batch["kurdish"],
        truncation=True,
        max_length=max_target_length,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Applying the Tokenizing function

In [None]:
# Run `tokenize` over the subset in batches. Every original column is dropped,
# so only the fields returned by `tokenize` remain in the result.
raw_columns = train_20k.column_names
tokenized_dataset = train_20k.map(
    tokenize,
    remove_columns=raw_columns,
    batched=True,
)



# Setting up the LoRA configuration

In [None]:
from peft import LoraConfig, get_peft_model

# Names of the modules that receive LoRA adapters (the q/k/v/out projections).
lora_target_modules = ["q_proj", "k_proj", "v_proj", "out_proj"]

lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    target_modules=lora_target_modules,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters, then report how many parameters
# are trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


# Importing libraries for training

In [None]:
from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    GenerationConfig,
    DataCollatorForSeq2Seq
)


# Setting up the training arguments

In [None]:
# Generation settings loaded from the same checkpoint id as the model.
gen_config = GenerationConfig.from_pretrained(model_name)

training_args = Seq2SeqTrainingArguments(
    # Output and checkpointing: save every 500 steps, keep only the newest.
    output_dir="nllb_ckb_lora",
    save_steps=500,
    save_total_limit=1,
    # Schedule: 4 samples per device x 2 accumulation steps = 8 per update.
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    # Optimiser.
    learning_rate=2e-4,
    weight_decay=0.01,
    # Precision and logging.
    fp16=True,
    logging_steps=50,
    # Generation.
    generation_config=gen_config,
)

# Setting up the data collator and trainer

In [None]:
# Batch builder for seq2seq training; it is given the model as well as the
# tokenizer.
data_collator = DataCollatorForSeq2Seq(
    padding=True,
    model=model,
    tokenizer=tokenizer,
)

# Train-only setup: no evaluation dataset or metric function is passed.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

# Training the model

In [None]:
# Run the fine-tuning loop with the trainer configured in the previous cell.
trainer.train()


# Saving the fine-tuned model

In [None]:
# Write the PEFT-wrapped model to a local directory.
# NOTE(review): presumably this stores only the LoRA adapter weights and
# config, not the base checkpoint or the tokenizer -- confirm before relying
# on this directory alone for inference.
model.save_pretrained("nllb_kurdish_lora_adapter")
