In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from datasets import Dataset
import os

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
model_id = "/data3/ritika_project/mistral-7b"

# The name for our new, fine-tuned model adapter
new_model_name = "mistral-7b-english-to-french"
output_base_dir = "/data3/ritika/mistral_training_output"
# Create the directory if it doesn't exist
os.makedirs(output_base_dir, exist_ok=True)

In [3]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

print(f"Loading base model from: {model_id}")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" 
print("Base model loaded.")

Loading base model from: /data3/ritika_project/mistral-7b


Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████| 2/2 [00:24<00:00, 12.30s/it]


Base model loaded.


In [4]:
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
print("Model prepared for QLoRA training.")

Model prepared for QLoRA training.


In [5]:
def load_translation_dataset(en_path, fr_path):
    """
    Loads parallel data assuming line-by-line alignment between the two files.
    This is the corrected version for files without utterance IDs.
    """
    if not os.path.exists(en_path):
        raise FileNotFoundError(f"English data file not found at: {en_path}")
    if not os.path.exists(fr_path):
        raise FileNotFoundError(f"French data file not found at: {fr_path}")

    print(f"Reading English file from {en_path}...")
    with open(en_path, 'r', encoding='utf-8') as f:
        en_lines = [line.strip() for line in f]

    print(f"Reading French file from {fr_path}...")
    with open(fr_path, 'r', encoding='utf-8') as f:
        fr_lines = [line.strip() for line in f]

    if len(en_lines) != len(fr_lines):
        raise ValueError(
            f"The number of lines in the English and French files do not match. "
            f"English: {len(en_lines)}, French: {len(fr_lines)}"
        )

    data = []
    print(f"Creating {len(en_lines)} instruction pairs...")
    for i in range(len(en_lines)):
        en_text = en_lines[i]
        fr_text = fr_lines[i]
        # Ensure that we don't process empty lines
        if en_text and fr_text:
            data.append({
                "instruction": en_text,
                "response": fr_text
            })
            
    return data

In [6]:
def create_prompt_format(sample):
    """Creates a formatted prompt string from a dataset sample."""
    return f"English:\n{sample['instruction']}\nFrench:\n{sample['response']}"

# *** MAKE SURE THESE PATHS ARE CORRECT FOR YOUR SYSTEM ***
train_en_path = "/data3/ritika/data/raw/train_100h_txt/train/train.en"
train_fr_path = "/data3/ritika/data/raw/train_100h_txt/train/train.fr"

# Load the data
training_data = load_translation_dataset(train_en_path, train_fr_path)

# For testing, you might want to use a smaller subset of your data
# training_data = training_data[:1000] # Uncomment to use only the first 1000 samples

print(f"Loaded {len(training_data)} samples.")

# Convert the list of dictionaries to a Hugging Face Dataset
dataset = Dataset.from_list(training_data)

# Create the final formatted dataset with a 'text' column
formatted_dataset = dataset.map(lambda sample: {'text': create_prompt_format(sample)})

Reading English file from /data3/ritika/data/raw/train_100h_txt/train/train.en...
Reading French file from /data3/ritika/data/raw/train_100h_txt/train/train.fr...
Creating 47271 instruction pairs...
Loaded 47271 samples.


Map: 100%|████████████████████████████████████████████████████████████████████| 47271/47271 [00:06<00:00, 7664.16 examples/s]


In [7]:
print(dataset[0])

{'instruction': 'ADIEU VALENTINE ADIEU', 'response': '--Adieu, Valentine, adieu!'}


In [8]:
print(formatted_dataset[0])

{'instruction': 'ADIEU VALENTINE ADIEU', 'response': '--Adieu, Valentine, adieu!', 'text': 'English:\nADIEU VALENTINE ADIEU\nFrench:\n--Adieu, Valentine, adieu!'}


In [9]:
training_args = TrainingArguments(
    output_dir=f"{output_base_dir}/results_translation",
    num_train_epochs=3, # Training for 1 full epoch is a good starting point
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1, # Accumulate gradients over 4 steps
    optim="paged_adamw_32bit",
    save_steps=100,
    save_total_limit=2,
    logging_steps=10, 
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=formatted_dataset,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=128, 
    tokenizer=tokenizer,
    args=training_args,
    packing=False,
)

Map: 100%|█████████████████████████████████████████████████████████████████████| 47271/47271 [01:47<00:00, 437.80 examples/s]


In [None]:
print("Starting training on the custom translation dataset...")
trainer.train(resume_from_checkpoint=True)
print("Training complete.")

Starting training on the custom translation dataset...


  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)


Step,Training Loss
2210,1.2627
2220,1.1867
2230,1.1811
2240,1.638
2250,1.3697
2260,1.2589
2270,1.1796
2280,1.1323
2290,1.6351
2300,1.359


In [None]:
final_model_path = f"{output_base_dir}/{new_model_name}"
print(f"Saving fine-tuned model adapter to {final_model_path}")
# *** BEST PRACTICE: Use safe_serialization to save in the better .safetensors format ***
trainer.model.save_pretrained(final_model_path, safe_serialization=True)
print("Model adapter saved.")