In [None]:
import os

import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from transformers.trainer_utils import get_last_checkpoint
from trl import SFTTrainer

In [None]:
# Local path of the pretrained checkpoint that is loaded as the base model.
model_id = "/data3/ritika_project/TowerBase-7B-v0.1"

# Name used for the fine-tuned adapter.
new_model_name = "towerbase-7b-english-to-french"

# Root directory for training outputs; created up front if it is missing.
output_base_dir = "/data3/ritika_project/towerbase_training_output"
os.makedirs(output_base_dir, exist_ok=True)

In [None]:
# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

print(f"Loading base model from: {model_id}")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=False,
    trust_remote_code=True,
)
# Reuse the EOS token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
print("Base model loaded.")

In [None]:
# Prepare the quantized model for training with non-reentrant gradient checkpointing.
# The checkpointing kwargs are handed to prepare_model_for_kbit_training itself: with
# its default use_gradient_checkpointing=True the helper calls
# model.gradient_checkpointing_enable() on its own, which would replace a
# use_reentrant=False setting that was applied in a separate call beforehand.
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)

# LoRA configuration (these settings are generally model-agnostic).
# Adapters are attached to the attention projections and the MLP projections.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the prepared base model so that only the LoRA adapter weights are trained.
model = get_peft_model(model, lora_config)
print("Model prepared for QLoRA training.")

In [None]:
def load_translation_dataset(en_path, fr_path):
    """
    Build English/French training pairs from two line-aligned text files.

    Line i of ``en_path`` is paired with line i of ``fr_path``. Both files are
    read as UTF-8 and every line is stripped of surrounding whitespace. Pairs
    in which either side is empty after stripping are dropped.

    Args:
        en_path: Path to the English text file.
        fr_path: Path to the French text file.

    Returns:
        A list of dicts with the keys ``"instruction"`` (English line) and
        ``"response"`` (French line), in file order.

    Raises:
        FileNotFoundError: If either file does not exist.
        ValueError: If the two files contain a different number of lines.
    """
    sources = (("English", en_path), ("French", fr_path))

    # Validate both paths before reading anything.
    for language, path in sources:
        if not os.path.exists(path):
            raise FileNotFoundError(f"{language} data file not found at: {path}")

    columns = []
    for language, path in sources:
        print(f"Reading {language} file from {path}...")
        with open(path, 'r', encoding='utf-8') as handle:
            columns.append([raw.strip() for raw in handle])
    english, french = columns

    n_english, n_french = len(english), len(french)
    if n_english != n_french:
        raise ValueError(
            "The number of lines in the English and French files do not match. "
            f"English: {n_english}, French: {n_french}"
        )

    print(f"Creating {n_english} instruction pairs...")
    # Keep only the pairs where both sides are non-empty.
    return [
        {"instruction": source, "response": target}
        for source, target in zip(english, french)
        if source and target
    ]

In [None]:
def create_prompt_format(sample):
    """Render one sample as an ``English:`` / ``French:`` prompt string.

    Args:
        sample: Mapping with the keys ``'instruction'`` and ``'response'``.

    Returns:
        A single string made of the label ``English:``, the instruction, the
        label ``French:`` and the response, joined by newline characters.
    """
    english = sample['instruction']
    french = sample['response']
    return "\n".join(("English:", f"{english}", "French:", f"{french}"))

# Line-aligned English / French training files.
train_en_path = "/data3/ritika/data/raw/train_100h_txt/train/train.en"
train_fr_path = "/data3/ritika/data/raw/train_100h_txt/train/train.fr"

training_data = load_translation_dataset(train_en_path, train_fr_path)
print(f"Loaded {len(training_data)} samples.")

# Add a 'text' column holding the fully formatted prompt for every pair.
dataset = Dataset.from_list(training_data)
formatted_dataset = dataset.map(lambda row: dict(text=create_prompt_format(row)))


In [None]:
training_args = TrainingArguments(
    # Output, checkpointing and logging
    output_dir=f"{output_base_dir}/results_translation",
    save_steps=100,
    save_total_limit=2,
    logging_steps=10,
    # Batching
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # Optimizer and learning-rate schedule
    optim="paged_adamw_32bit",
    learning_rate=2e-5,
    weight_decay=0.001,
    max_grad_norm=0.3,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    num_train_epochs=3,
    # Mixed precision: bfloat16 on, float16 off
    bf16=True,
    fp16=False,
)

# Supervised fine-tuning on the 'text' column, one example per sequence (no packing).
trainer = SFTTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    peft_config=lora_config,
    train_dataset=formatted_dataset,
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
)

In [None]:
print("Starting training on the custom translation dataset...")

# Resume only when a checkpoint actually exists. Passing resume_from_checkpoint=True
# unconditionally makes Trainer.train raise ValueError on a first run, because no
# "checkpoint-<step>" directory is present in the output directory yet.
checkpoint_root = trainer.args.output_dir
last_checkpoint = get_last_checkpoint(checkpoint_root) if os.path.isdir(checkpoint_root) else None

if last_checkpoint is None:
    print(f"No checkpoint found in {checkpoint_root}; starting a fresh training run.")
else:
    print(f"Resuming training from checkpoint: {last_checkpoint}")

# A checkpoint path resumes from that checkpoint; None starts from the beginning.
trainer.train(resume_from_checkpoint=last_checkpoint)
print("Training complete.")

In [None]:
# Write the trained model held by the trainer to its own directory under the
# output root, using the safetensors serialization.
final_model_path = f"{output_base_dir}/{new_model_name}"
print(f"Saving fine-tuned model adapter to {final_model_path}")
trainer.model.save_pretrained(
    final_model_path,
    safe_serialization=True,
)
print("Model adapter saved.")