In [None]:
#v4 27 bleu

!pip install bitsandbytes

!pip install evaluate
# Install accelerate first (only needed once)
!pip install -U accelerate transformers datasets sacrebleu sentencepiece -q


In [None]:
#training code 27
import os
import torch
from accelerate import Accelerator
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    get_scheduler,
    DataCollatorForSeq2Seq,
)
from torch.optim import AdamW
from tqdm.auto import tqdm
import evaluate
import pandas as pd

# ==== Accelerator Setup ====
# One Accelerator for this cell, configured for fp16 mixed precision; the model,
# optimizer, loaders and scheduler are handed to accelerator.prepare() below.
accelerator = Accelerator(mixed_precision="fp16")

# ==== Paths ====
# Reuse HF_MODEL when an earlier cell already defined it; otherwise fall back to
# the checkpoint repo that the other cells of this notebook load. The previous
# self-assignment (`HF_MODEL = HF_MODEL`) raised NameError on a fresh kernel.
HF_MODEL = globals().get("HF_MODEL", "cloghost/nllb-kang2.2")
# ckpt_path: resume checkpoint written at the end of every epoch.
# best_path: directory that receives the model/tokenizer with the best validation BLEU.
ckpt_path = "/kaggle/working/nllb_final/checkpoint.pt"
best_path = "/kaggle/working/nllb_final/best_bleu"
# Creating best_path also creates its parent, the directory ckpt_path lives in.
os.makedirs(best_path, exist_ok=True)

# ==== Load tokenizer and model ====
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
model = AutoModelForSeq2SeqLM.from_pretrained(HF_MODEL)

# ==== BLEU metric ====
bleu = evaluate.load("sacrebleu")

# ==== Data Collator ====
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# ==== DataLoaders ====
# NOTE: the *_dataset_tokenized objects are created by the tokenization cell
# further down this notebook, so that cell has to be executed before this one.
# Only the training loader is shuffled.
train_loader = DataLoader(train_dataset_tokenized, collate_fn=collator, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset_tokenized, collate_fn=collator, batch_size=8)
test_loader = DataLoader(test_dataset_tokenized, collate_fn=collator, batch_size=8)

# ==== Optimizer & Scheduler ====
optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)
# The factor 3 is the epoch count; it must stay in sync with the upper bound
# of the `range(start_epoch, 3)` training loop below.
num_training_steps = len(train_loader) * 3
lr_scheduler = get_scheduler("cosine", optimizer=optimizer, num_warmup_steps=100, num_training_steps=num_training_steps)

# ==== Accelerator Preparation ====
# Everything that takes part in training/evaluation is wrapped by the accelerator;
# the raw model is recovered later with accelerator.unwrap_model(model).
model, optimizer, train_loader, val_loader, test_loader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_loader, val_loader, test_loader, lr_scheduler
)

# ==== Resume from checkpoint ====
# Defaults for a fresh run; overwritten below when a checkpoint file is present.
start_epoch = 0
best_bleu = 0.0
if os.path.exists(ckpt_path):
    # The checkpoint is the dict written at the end of each epoch by the training
    # loop: keys "model", "optimizer", "scheduler", "epoch", "best_bleu".
    ckpt = torch.load(ckpt_path, map_location="cpu")
    accelerator.unwrap_model(model).load_state_dict(ckpt["model"])
    optimizer.load_state_dict(ckpt["optimizer"])
    lr_scheduler.load_state_dict(ckpt["scheduler"])
    # "epoch" stores the last *completed* epoch index, so training continues with the next one.
    start_epoch = ckpt["epoch"] + 1
    best_bleu = ckpt["best_bleu"]
    print(f"Resumed from checkpoint at epoch {start_epoch}, BLEU={best_bleu:.2f}")

# ==== Training Progress CSV ====
# One row per epoch (epoch, train_loss, val_bleu); the file is rewritten after every epoch.
# NOTE(review): summary_rows starts empty on every execution, so a resumed run
# overwrites the CSV rows of earlier epochs — confirm this is acceptable.
summary_csv = "/kaggle/working/training_summary.csv"
summary_rows = []

# ==== Training Loop ====
# Runs epochs start_epoch..2; the loop body is skipped entirely when a
# checkpoint of a finished run (start_epoch >= 3) was restored.
for epoch in range(start_epoch, 3):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for batch in progress_bar:
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator so it can apply its fp16 handling.
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} Avg Loss: {avg_loss:.4f}")

    # ==== Validation ====
    model.eval()
    preds, labels = [], []
    for batch in tqdm(val_loader, desc="Validating"):
        with torch.no_grad():
            # NOTE(review): no forced_bos_token_id is passed here, whereas the
            # evaluation cell at the end of the notebook forces the target
            # language token — confirm the model emits it on its own.
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
                num_beams=6,
                no_repeat_ngram_size=3,
                repetition_penalty=1.2,
                length_penalty=1.0
            )
        # -100 marks ignored label positions; swap it for the pad id so the labels can be decoded.
        labels_cleaned = torch.where(batch["labels"] == -100, tokenizer.pad_token_id, batch["labels"])
        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels_cleaned, skip_special_tokens=True)
        preds.extend(decoded_preds)
        labels.extend(decoded_labels)
    # Each prediction is scored against a single reference, hence the one-element lists.
    bleu_score = bleu.compute(predictions=preds, references=[[l] for l in labels])["score"]
    print(f"Epoch {epoch + 1} BLEU: {bleu_score:.2f}")

    # Save progress to CSV
    summary_rows.append({"epoch": epoch+1, "train_loss": avg_loss, "val_bleu": bleu_score})
    pd.DataFrame(summary_rows).to_csv(summary_csv, index=False)

    # ==== Save best model ====
    # Only a strictly better validation BLEU replaces the saved best model.
    if bleu_score > best_bleu:
        best_bleu = bleu_score
        accelerator.unwrap_model(model).save_pretrained(best_path)
        tokenizer.save_pretrained(best_path)
        print(" New best model saved.")

    # ==== Save resume checkpoint ====
    # Written after every epoch (best or not) so an interrupted session can continue.
    torch.save({
        "model": accelerator.unwrap_model(model).state_dict(),
        "optimizer": optimizer.state_dict(),
        "scheduler": lr_scheduler.state_dict(),
        "epoch": epoch,
        "best_bleu": best_bleu,
    }, ckpt_path)

# ==== Final Test ====
# Scores the model currently in memory (the weights after the last epoch, or the
# restored checkpoint); the best-BLEU model saved under best_path is not reloaded here.
model.eval()
preds, labels = [], []
for batch in tqdm(test_loader, desc="Testing"):
    with torch.no_grad():
        # Same decoding settings as the validation pass above.
        generated_tokens = accelerator.unwrap_model(model).generate(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=128,
            num_beams=6,
            no_repeat_ngram_size=3,
            repetition_penalty=1.2,
            length_penalty=1.0
        )
    # Replace the -100 ignore marker with the pad id so the references can be decoded.
    labels_cleaned = torch.where(batch["labels"] == -100, tokenizer.pad_token_id, batch["labels"])
    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels_cleaned, skip_special_tokens=True)
    preds.extend(decoded_preds)
    labels.extend(decoded_labels)

# Corpus-level sacreBLEU with one reference per prediction.
test_bleu = bleu.compute(predictions=preds, references=[[l] for l in labels])["score"]
print(f" Final Test BLEU: {test_bleu:.2f}")


In [None]:
import pandas as pd
# Load the parallel corpus: a tab-separated file of which only the first two
# columns are read. The next cell accesses them as 'hindi' and 'kangri'.
dataset = pd.read_csv('/kaggle/input/hindi2kangri-v2/h2k_cleaned - Sheet1 (2).tsv', sep='\t', usecols=[0, 1])
print(dataset.shape)
print(dataset.columns)
# Last expression of the cell: shows ten random rows as a quick visual check.
dataset.sample(10)

In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Source (Hindi) and target (Kangri) sentences: two row-aligned columns of the
# corpus frame loaded in the previous cell.
hindi_lines = dataset['hindi']
kangri_lines = dataset['kangri']

# Defensive check: both columns must describe the same number of sentence pairs.
if len(hindi_lines) != len(kangri_lines):
    raise ValueError(f" ERROR: Hindi ({len(hindi_lines)}) and Kangri ({len(kangri_lines)}) files have different lengths!")

print(f" Successfully loaded {len(hindi_lines)} Hindi-Kangri sentence pairs!")
# .iloc[0] is positional, so the preview does not depend on the frame's index labels.
print(f" Hindi Example: {hindi_lines.iloc[0]}")
print(f" Kangri Example: {kangri_lines.iloc[0]}")

# Single frame holding both columns. It was previously built twice and also
# converted to a throwaway `train_dataset` that the split below replaced.
df = pd.DataFrame({"hindi": hindi_lines, "kangri": kangri_lines})

# Split data (80% train, 10% validation, 10% test); fixed random_state keeps
# the partition identical across runs.
X_train, X_temp, y_train, y_temp = train_test_split(
    df["hindi"], df["kangri"], test_size=0.2, random_state=42
)

# The held-out 20% is halved into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)
import pandas as pd
import numpy as np

def clean(series):
    """Return the values of a pandas Series as a list of strings.

    Missing entries become empty strings; every other value is converted with ``str``.
    """
    without_nulls = series.fillna("")
    as_text = without_nulls.astype(str)
    return as_text.tolist()

# Normalise every split to a plain list of strings. clean() relies on the
# pandas Series API, so this expects the Series produced by train_test_split.
X_train, y_train = clean(X_train), clean(y_train)
X_val, y_val = clean(X_val), clean(y_val)
X_test, y_test = clean(X_test), clean(y_test)

# Wrap each split in a Hugging Face Dataset with "hindi"/"kangri" columns.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_dict({"hindi": source_texts, "kangri": target_texts})
    for source_texts, target_texts in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

# Report how many pairs ended up in each split.
print(f" Train: {len(train_dataset)} | Validation: {len(val_dataset)} | Test: {len(test_dataset)}")


In [None]:
import os
import torch
from accelerate import Accelerator
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    get_scheduler,
    DataCollatorForSeq2Seq,
)
# Accelerator configured for fp16 mixed precision.
accelerator = Accelerator(mixed_precision="fp16")

# ==== Paths ====
# Single definition of the checkpoint repo. The cell used to assign
# "cloghost/nllb-hin-kangri-v2" first and overwrite it before any use, which
# made it look as if the v2 model were loaded.
HF_MODEL = "cloghost/nllb-kang2.2"  # your uploaded model
# ckpt_path: resume checkpoint file; best_path: directory for the best-BLEU model.
ckpt_path = "/kaggle/working/nllb_final/checkpoint.pt"
best_path = "/kaggle/working/nllb_final/best_bleu"
os.makedirs(best_path, exist_ok=True)

# Load model and tokenizer from the same repo so their vocabularies match.
model = AutoModelForSeq2SeqLM.from_pretrained(HF_MODEL)
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)

In [None]:
from huggingface_hub import hf_hub_download
import torch

# Downloads and caches the checkpoint file from your repo
# NOTE: this rebinds ckpt_path (set to the local /kaggle/working file by the
# previous cell) to the path returned by hf_hub_download.
ckpt_path = hf_hub_download(
    repo_id="cloghost/nllb-kang2.2",  # replace with your repo
    filename="checkpoint.pt",         # name used during upload
    repo_type="model"
)

# Load it into memory
# The resulting `checkpoint` object is read by the resume block of the later training cell.
checkpoint = torch.load(ckpt_path, map_location="cpu")


In [None]:
import torch
import numpy as np
import random
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    TrainingArguments, 
    Trainer,
    EarlyStoppingCallback,
    get_cosine_schedule_with_warmup
)
from datasets import Dataset, concatenate_datasets
import re
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# Seed every random number generator the notebook relies on.
def set_seed(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) with ``seed``."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

set_seed(42)

# ===============================
# 1. DATA AUGMENTATION
# ===============================

class DataAugmenter:
    """Word-level noise augmentation (replace / insert / delete) for sentence pairs.

    All randomness comes from the module-level ``random`` generator, so results
    are reproducible after seeding it.
    """

    # Replacement candidates, looked up per whitespace-delimited token.
    # NOTE(review): the key 'के लिए' contains a space, so it can never equal a
    # single token produced by str.split() and is currently inert.
    HINDI_SYNONYMS = {
        'और': ['तथा', 'एवं'],
        'में': ['के अंदर', 'के भीतर'],
        'के लिए': ['हेतु', 'के वास्ते'],
        'बहुत': ['अत्यधिक', 'काफी'],
        'अच्छा': ['बेहतर', 'उत्तम'],
        'बड़ा': ['विशाल', 'बृहत्'],
        'छोटा': ['लघु', 'सूक्ष्म'],
        'है': ['हैं', 'होता है'],
        'गया': ['गई', 'हुआ'],
        'किया': ['की', 'करना'],
    }

    # Filler words injected by random_insertion.
    COMMON_WORDS = ['भी', 'तो', 'ही', 'या', 'कि', 'जो', 'से', 'को']

    def __init__(self, tokenizer):
        """Store the tokenizer; the augmentation methods themselves do not use it."""
        self.tokenizer = tokenizer

    def synonym_replacement(self, text, prob=0.1):
        """Replace each token found in HINDI_SYNONYMS with probability ``prob``.

        The text is split on whitespace and re-joined with single spaces.
        """
        augmented_words = []
        for word in text.split():
            candidates = self.HINDI_SYNONYMS.get(word)
            # The random draw only happens for tokens that have candidates,
            # which keeps the RNG sequence identical to the original code.
            if candidates is not None and random.random() < prob:
                augmented_words.append(random.choice(candidates))
            else:
                augmented_words.append(word)

        return ' '.join(augmented_words)

    def random_insertion(self, text, prob=0.05):
        """Insert ``int(len(words) * prob)`` filler words at random positions.

        With the default ``prob`` this is a no-op for texts shorter than 20 words.
        """
        words = text.split()

        for _ in range(int(len(words) * prob)):
            if len(words) > 0:
                random_word = random.choice(self.COMMON_WORDS)
                # randint is inclusive, so the word may also be appended at the end.
                random_idx = random.randint(0, len(words))
                words.insert(random_idx, random_word)

        return ' '.join(words)

    def random_deletion(self, text, prob=0.05):
        """Drop each word with probability ``prob``.

        Texts of at most two words are returned untouched, as is the original
        text when every word would have been deleted.
        """
        words = text.split()
        if len(words) <= 2:
            return text

        new_words = [word for word in words if random.random() > prob]

        return ' '.join(new_words) if new_words else text

    def augment_text(self, text):
        """Apply replacement (30%), insertion (20%) and deletion (10%) in that order."""
        augmented = text

        # Each technique is gated by its own independent random draw.
        if random.random() < 0.3:
            augmented = self.synonym_replacement(augmented)
        if random.random() < 0.2:
            augmented = self.random_insertion(augmented)
        if random.random() < 0.1:
            augmented = self.random_deletion(augmented)

        return augmented

    def augment_dataset(self, dataset, augmentation_factor=1.5):
        """Grow ``dataset`` to ``int(len(dataset) * augmentation_factor)`` rows.

        Rows are sampled with replacement; their 'hindi' and 'kangri' texts are
        each passed through ``augment_text`` and appended to the original rows.
        The dataset is returned unchanged when it is empty or when the factor
        does not call for additional rows.
        """
        original_size = len(dataset)
        target_size = int(original_size * augmentation_factor)
        additional_samples = target_size - original_size

        # Nothing to sample from, or nothing to add: avoid randint(0, -1) and
        # concatenating with an empty, untyped dataset.
        if original_size == 0 or additional_samples <= 0:
            print("No augmentation needed; returning the dataset unchanged.")
            return dataset

        print(f"Augmenting dataset from {original_size} to {target_size} samples...")

        # Create augmented samples
        augmented_data = {
            'hindi': [],
            'kangri': []
        }

        for _ in range(additional_samples):
            # Randomly select a sample to augment
            idx = random.randint(0, original_size - 1)
            original_sample = dataset[idx]

            # Augment both source and target (each side independently)
            augmented_data['hindi'].append(self.augment_text(original_sample['hindi']))
            augmented_data['kangri'].append(self.augment_text(original_sample['kangri']))

        # Create augmented dataset
        augmented_dataset = Dataset.from_dict(augmented_data)

        # Combine original and augmented
        combined_dataset = concatenate_datasets([dataset, augmented_dataset])

        # Fixed: the f prefix used to sit inside the string and the size was
        # printed as a set literal ("fDataset ... {13}").
        print(f"Dataset augmentation complete! New size: {len(combined_dataset)}")
        return combined_dataset

# ===============================
# 2. OVERFITTING MONITORING
# ===============================

class OverfittingMonitor:
    """Collects loss / learning-rate history and flags stalled validation loss."""

    def __init__(self, patience=5, min_delta=0.001):
        # History buffers, one entry per update() call.
        self.training_losses = []
        self.validation_losses = []
        self.learning_rates = []
        # Patience bookkeeping: `wait` counts consecutive updates without a
        # validation improvement of more than `min_delta`.
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = float('inf')
        self.wait = 0

    def update(self, train_loss, val_loss, lr):
        """Record one measurement; return True once patience is exhausted."""
        recordings = (
            (self.training_losses, train_loss),
            (self.validation_losses, val_loss),
            (self.learning_rates, lr),
        )
        for history, value in recordings:
            history.append(value)

        # An improvement must beat the best loss by more than min_delta.
        improved = val_loss < self.best_loss - self.min_delta
        if improved:
            self.best_loss, self.wait = val_loss, 0
        else:
            self.wait += 1

        return self.wait >= self.patience

    def plot_metrics(self):
        """Draw a 2x2 overview figure, save it as PNG and show it."""
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        (loss_ax, lr_ax), (trend_ax, gap_ax) = axes

        # Top-left: training and validation loss curves.
        loss_ax.plot(self.training_losses, label='Training Loss', color='blue')
        loss_ax.plot(self.validation_losses, label='Validation Loss', color='red')
        loss_ax.set_title('Training vs Validation Loss')
        loss_ax.set_xlabel('Steps')
        loss_ax.set_ylabel('Loss')
        loss_ax.legend()
        loss_ax.grid(True)

        # Top-right: learning rate history.
        lr_ax.plot(self.learning_rates, label='Learning Rate', color='green')
        lr_ax.set_title('Learning Rate Schedule')
        lr_ax.set_xlabel('Steps')
        lr_ax.set_ylabel('Learning Rate')
        lr_ax.legend()
        lr_ax.grid(True)

        # Bottom-left: slope of a straight-line fit through each loss history;
        # only drawn once more than ten measurements exist.
        if len(self.training_losses) > 10:
            slopes = []
            for history in (self.training_losses, self.validation_losses):
                slopes.append(np.polyfit(range(len(history)), history, 1)[0])

            trend_ax.bar(['Training', 'Validation'], slopes,
                         color=['blue', 'red'], alpha=0.7)
            trend_ax.set_title('Loss Trends (Negative = Decreasing)')
            trend_ax.set_ylabel('Trend Slope')
            trend_ax.grid(True)

        # Bottom-right: validation loss minus training loss.
        difference = np.array(self.validation_losses) - np.array(self.training_losses)
        gap_ax.plot(difference, label='Val Loss - Train Loss', color='purple')
        gap_ax.set_title('Overfitting Gap')
        gap_ax.set_xlabel('Steps')
        gap_ax.set_ylabel('Loss Difference')
        gap_ax.legend()
        gap_ax.grid(True)

        plt.tight_layout()
        plt.savefig('/kaggle/working/training_metrics.png', dpi=300, bbox_inches='tight')
        plt.show()

    def get_overfitting_status(self):
        """Classify the mean train/validation gap over the last five updates."""
        if len(self.training_losses) < 10:
            return "Insufficient data for analysis"

        window = slice(-5, None)
        gap = np.mean(self.validation_losses[window]) - np.mean(self.training_losses[window])

        if gap > 0.1:
            return  "High overfitting detected!"
        if gap > 0.05:
            return  "Moderate overfitting"
        return  "No significant overfitting"

# ===============================
# 3. CUSTOM TRAINER WITH MONITORING
# ===============================
class MonitoredTrainer(Trainer):
    """Trainer that feeds every evaluation result into an OverfittingMonitor.

    Trainer reports training metrics ('loss', 'learning_rate') and evaluation
    metrics ('eval_loss') through separate log() calls. The most recent
    training values are therefore remembered and paired with the next
    evaluation loss; previously the monitor always received 0 for both.
    """

    def __init__(self, *args, overfitting_monitor=None, **kwargs):
        # Set before Trainer.__init__ so log() is safe at any point of construction.
        self.overfitting_monitor = overfitting_monitor
        # Latest values seen in training logs; 0 until the first one arrives,
        # which matches the fallback the monitor used to receive.
        self._last_train_loss = 0
        self._last_lr = 0
        super().__init__(*args, **kwargs)

    def log(self, logs, *args, **kwargs):
        """Forward to Trainer.log, then update the monitor on evaluation logs."""
        # Accept and pass along extra arguments so the override works across
        # Trainer versions with different log() signatures.
        super().log(logs, *args, **kwargs)

        # Remember the newest training metrics for the next evaluation.
        if 'loss' in logs:
            self._last_train_loss = logs['loss']
        if 'learning_rate' in logs:
            self._last_lr = logs['learning_rate']

        if self.overfitting_monitor and 'eval_loss' in logs:
            train_loss = logs.get('train_loss', logs.get('loss', self._last_train_loss))
            val_loss = logs['eval_loss']
            lr = logs.get('learning_rate', self._last_lr)

            is_overfitting = self.overfitting_monitor.update(train_loss, val_loss, lr)

            if is_overfitting:
                print(f"Overfitting detected at step {self.state.global_step}")
                print(f"Status: {self.overfitting_monitor.get_overfitting_status()}")



# ===============================
# 4. MAIN TRAINING SETUP
# ===============================

# Initialize tokenizer
# HF_MODEL must already be defined by an earlier cell of this notebook.
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)

# NLLB language codes
# NOTE(review): 'kang_Deva' is presumably a custom code added to this
# checkpoint's tokenizer — confirm it exists in the vocabulary.
NLLB_LANG_CODES = {
    'hindi': 'hin_Deva',
    'kangri': 'kang_Deva',
}

# Source/target language used by the tokenizer when encoding inputs and labels.
tokenizer.src_lang = NLLB_LANG_CODES['hindi']
tokenizer.tgt_lang = NLLB_LANG_CODES['kangri']

# Tokenization function
def tokenize_function(batch):
    """Tokenize a batch of Hindi sources and Kangri targets for seq2seq training.

    Args:
        batch: mapping with "hindi" and "kangri" lists of strings (the form
            ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The tokenizer output for the sources, with an added "labels" entry
        holding the target token ids; padding positions in the labels are
        replaced by -100.
    """
    # `text_target` encodes the targets in target-language mode in the same
    # call; it replaces the deprecated `as_target_tokenizer()` context manager.
    # Sources and targets share the same padding/truncation settings.
    model_inputs = tokenizer(
        batch["hindi"],
        text_target=batch["kangri"],
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors=None
    )

    # Mask padding so those positions carry the -100 ignore marker.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token == pad_id else token for token in label]
        for label in model_inputs["labels"]
    ]

    return model_inputs

# Data augmentation
print ("Starting data augmentation...")
augmenter = DataAugmenter(tokenizer)

# Apply augmentation only to training data
# (validation and test splits stay untouched so their scores remain comparable).
train_dataset_augmented = augmenter.augment_dataset(train_dataset, augmentation_factor=1.3)
print(f"{train_dataset_augmented}")
# Tokenize all datasets AFTER augmentation
print("Tokenizing datasets...")
train_dataset_tokenized = train_dataset_augmented.map(tokenize_function, batched=True)
val_dataset_tokenized = val_dataset.map(tokenize_function, batched=True)
test_dataset_tokenized = test_dataset.map(tokenize_function, batched=True)

# Remove original string fields (optional but safer)
# Only the columns produced by tokenize_function remain afterwards.
columns_to_remove = ['hindi', 'kangri']
train_dataset_tokenized = train_dataset_tokenized.remove_columns(columns_to_remove)
val_dataset_tokenized = val_dataset_tokenized.remove_columns(columns_to_remove)
test_dataset_tokenized = test_dataset_tokenized.remove_columns(columns_to_remove)


def mask_labels(example):
    """Replace pad token ids in ``example["labels"]`` with -100 and return the example."""
    pad_id = tokenizer.pad_token_id
    masked = []
    for token_id in example["labels"]:
        masked.append(-100 if token_id == pad_id else token_id)
    example["labels"] = masked
    return example

# Per-example pass that masks pad ids in the labels with -100.
# NOTE: tokenize_function already performs this replacement, so these three
# passes are expected to leave the labels unchanged.
train_dataset_tokenized = train_dataset_tokenized.map(mask_labels)
val_dataset_tokenized = val_dataset_tokenized.map(mask_labels)
test_dataset_tokenized = test_dataset_tokenized.map(mask_labels)


In [None]:
import os
import torch
from accelerate import Accelerator
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    get_scheduler,
    DataCollatorForSeq2Seq,
)
from torch.optim import AdamW
from tqdm.auto import tqdm
import evaluate


# Accelerator configured for fp16 mixed precision.
accelerator = Accelerator(mixed_precision="fp16")

# ==== Paths ====
# ckpt_path is reset to the local resume file here (the download cell had
# pointed it at the hub cache); best_path receives the best-BLEU model.
HF_MODEL = "cloghost/nllb-kang2.2"
ckpt_path = "/kaggle/working/nllb_final/checkpoint.pt"
best_path = "/kaggle/working/nllb_final/best_bleu"
os.makedirs(best_path, exist_ok=True)

# ==== Load tokenizer and model ====
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
model = AutoModelForSeq2SeqLM.from_pretrained(HF_MODEL)
# ==== BLEU metric ====
bleu = evaluate.load("sacrebleu")

# ==== Data Collator ====
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# ==== DataLoaders ====
# Uses the tokenized datasets built by the preceding cell; only training data is shuffled.
train_loader = DataLoader(train_dataset_tokenized, collate_fn=collator, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset_tokenized, collate_fn=collator, batch_size=8)
test_loader = DataLoader(test_dataset_tokenized, collate_fn=collator, batch_size=8)

# ==== Optimizer & Scheduler ====
optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)
# The factor 3 is the epoch count and must match `range(start_epoch, 3)` in the loop below.
num_training_steps = len(train_loader) * 3
lr_scheduler = get_scheduler("cosine", optimizer=optimizer, num_warmup_steps=100, num_training_steps=num_training_steps)

# ==== Accelerator Preparation ====
# The raw model is recovered later with accelerator.unwrap_model(model).
model, optimizer, train_loader, val_loader, test_loader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_loader, val_loader, test_loader, lr_scheduler
)

# ==== Resume from checkpoint ====
# Defaults for a fresh run; overwritten below when resuming.
start_epoch = 0
best_bleu = 0.0
if os.path.exists(ckpt_path):
    # Prefer the checkpoint the download cell already loaded into memory (the
    # original behaviour); when that cell was not run, read the file whose
    # existence was just tested instead of failing with NameError.
    # NOTE(review): the guard checks the local file while `checkpoint` comes
    # from the hub download — confirm which of the two should win.
    ckpt = globals().get("checkpoint")
    if ckpt is None:
        ckpt = torch.load(ckpt_path, map_location="cpu")
    accelerator.unwrap_model(model).load_state_dict(ckpt["model"])
    optimizer.load_state_dict(ckpt["optimizer"])
    lr_scheduler.load_state_dict(ckpt["scheduler"])
    # "epoch" holds the last completed epoch index; continue with the next one.
    start_epoch = ckpt["epoch"] + 1
    best_bleu = ckpt["best_bleu"]
    print(f"Resumed from checkpoint at epoch {start_epoch}, BLEU={best_bleu:.2f}")

# ==== Training Loop ====
# Runs epochs start_epoch..2; skipped entirely when a finished run was restored.
for epoch in range(start_epoch, 3):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for batch in progress_bar:
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator so it can apply its fp16 handling.
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    # Mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1} Avg Loss: {total_loss / len(train_loader):.4f}")

    # ==== Validation ====
    model.eval()
    preds, labels = [], []
    for batch in tqdm(val_loader, desc="Validating"):
        with torch.no_grad():
            # NOTE(review): no forced_bos_token_id is passed here, unlike the
            # evaluation cell at the end of the notebook — confirm the model
            # emits the target language token on its own.
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
                num_beams=4
            )
        # Swap the -100 ignore marker for the pad id so the labels can be decoded.
        labels_cleaned = torch.where(batch["labels"] == -100, tokenizer.pad_token_id, batch["labels"])
        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels_cleaned, skip_special_tokens=True)
        preds.extend(decoded_preds)
        labels.extend(decoded_labels)
    # One reference per prediction, hence the one-element lists.
    bleu_score = bleu.compute(predictions=preds, references=[[l] for l in labels])["score"]
    print(f"Epoch {epoch + 1} BLEU: {bleu_score:.2f}")

    # ==== Save best model ====
    # Only a strictly better validation BLEU replaces the saved best model.
    if bleu_score > best_bleu:
        best_bleu = bleu_score
        accelerator.unwrap_model(model).save_pretrained(best_path)
        tokenizer.save_pretrained(best_path)
        print("✅ New best model saved.")

    # ==== Save resume checkpoint ====
    # Written after every epoch so an interrupted session can continue.
    torch.save({
        "model": accelerator.unwrap_model(model).state_dict(),
        "optimizer": optimizer.state_dict(),
        "scheduler": lr_scheduler.state_dict(),
        "epoch": epoch,
        "best_bleu": best_bleu,
    }, ckpt_path)

# ==== Final Test ====
# Scores the model currently in memory; the best-BLEU model saved under
# best_path is not reloaded before this pass.
model.eval()
preds, labels = [], []
for batch in tqdm(test_loader, desc="Testing"):
    with torch.no_grad():
        # Same decoding settings as the validation pass of this cell.
        generated_tokens = accelerator.unwrap_model(model).generate(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=128,
            num_beams=4
        )
    # Swap the -100 ignore marker for the pad id so the references can be decoded.
    labels_cleaned = torch.where(batch["labels"] == -100, tokenizer.pad_token_id, batch["labels"])
    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels_cleaned, skip_special_tokens=True)
    preds.extend(decoded_preds)
    labels.extend(decoded_labels)

# Corpus-level sacreBLEU with one reference per prediction.
test_bleu = bleu.compute(predictions=preds, references=[[l] for l in labels])["score"]
print(f"✅ Final Test BLEU: {test_bleu:.2f}")


In [None]:
# Reset PyTorch's CUDA peak-memory statistics.
# NOTE(review): presumably requires an available CUDA device — confirm on CPU-only sessions.
torch.cuda.reset_peak_memory_stats()


In [None]:
# Scratch cell: prints a fixed string and touches no notebook state.
print("hello")

In [None]:

from huggingface_hub import login

import os

# Never store an access token in the notebook: read it from the HF_TOKEN
# environment variable instead. When the variable is unset, login(token=None)
# falls back to huggingface_hub's interactive prompt.
HF_TOKEN = os.environ.get("HF_TOKEN")
login(token=HF_TOKEN)


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Directory the training loop writes the best-BLEU model and tokenizer to.
FINAL_DIR = "/kaggle/working/nllb_final/best_bleu"
# NOTE: rebinds the global `model` / `tokenizer` to the objects loaded from disk.
model      = AutoModelForSeq2SeqLM.from_pretrained(FINAL_DIR)
tokenizer  = AutoTokenizer.from_pretrained(FINAL_DIR)

# Publish both to the same Hub repo (requires a prior successful login).
model.push_to_hub("cloghost/nllb-kang2.2")
tokenizer.push_to_hub("cloghost/nllb-kang2.2")


In [None]:
from huggingface_hub import HfApi

import os

# Upload the resume checkpoint next to the model files in the Hub repo.
api = HfApi()
api.upload_file(
    path_or_fileobj="/kaggle/working/nllb_final/checkpoint.pt",
    path_in_repo="checkpoint.pt",
    repo_id="cloghost/nllb-kang2.2",
    repo_type="model",
    # SECURITY: a literal access token used to be hardcoded here. Treat it as
    # leaked and revoke it. The token is now read from the environment; when
    # HF_TOKEN is unset (None) huggingface_hub uses the locally saved login.
    token=os.environ.get("HF_TOKEN"),
)


In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, Dataset
from tqdm import tqdm
import evaluate
import pandas as pd

# === Model IDs or Paths ===
# Maps a short label (used for CSV file names and table columns below) to a
# Hub repo id or a local directory.
model_versions = {
    "v1": "cloghost/nllb-hin-kangri-v1",
    "v2": "cloghost/nllb-hin-kangri-v2",
    "v3": "/kaggle/working/nllb_final/best_bleu",  # change if needed
}

# === Language Codes ===
# Defaults for evaluate_model: source language set on the tokenizer and the
# target token forced as the first generated token.
SRC_LANG = "hin_Deva"
TGT_LANG = "kang_Deva"

# === Sample Evaluation Data ===
# Replace this with your own test dataset if needed
# Five hand-written Hindi/Kangri pairs; BLEU over so few sentences is only a smoke test.
eval_data = [
    {"hindi": "भारत एक महान देश है।", "kangri": "भारत एक महान देस है।"},
    {"hindi": "मुझे खाना बनाना पसंद है।", "kangri": "मैंनू रंधण चंगा लगदा है।"},
    {"hindi": "आज मौसम बहुत सुहाना है।", "kangri": "आज दा मौसम बड्डा चंगा है।"},
    {"hindi": "आपका नाम क्या है?", "kangri": "तेरै नां की है?"},
    {"hindi": "मैं कल दिल्ली जाऊंगा।", "kangri": "मैं कल दिल्ली जावांगा।"},
]

# NOTE: rebinds the global `dataset`, which an earlier cell used for the pandas corpus frame.
dataset = Dataset.from_list(eval_data)
bleu = evaluate.load("sacrebleu")

# === Evaluation Function ===
def evaluate_model(model_id, dataset, src_lang=SRC_LANG, tgt_lang=TGT_LANG):
    """Translate every sample of ``dataset`` with one model and score it with sacreBLEU.

    Args:
        model_id: Hub repo id or local directory of the model and tokenizer.
        dataset: iterable of samples with "hindi" (source) and "kangri" (reference) keys.
        src_lang: language code assigned to ``tokenizer.src_lang``.
        tgt_lang: language code whose token id is forced as the first generated token.

    Returns:
        ``(bleu_score, frame)``: the corpus BLEU score and a DataFrame with the
        columns "input", "reference" and "prediction", one row per sample.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device)
    tokenizer.src_lang = src_lang
    model.eval()

    records = []
    for sample in tqdm(dataset, desc=f"Evaluating {model_id}"):
        source_text, reference_text = sample["hindi"], sample["kangri"]

        # Encode one sentence at a time and move it to the model's device.
        encoded = tokenizer(source_text, return_tensors="pt", padding=True, truncation=True).to(model.device)

        with torch.no_grad():
            output_ids = model.generate(
                **encoded,
                forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
                max_length=128,
                num_beams=4,
            )

        hypothesis = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
        records.append({
            "input": source_text,
            "reference": reference_text,
            "prediction": hypothesis,
        })

    # sacreBLEU expects a list of references for every prediction.
    predictions = [record["prediction"] for record in records]
    references = [[record["reference"]] for record in records]

    bleu_score = bleu.compute(predictions=predictions, references=references)["score"]
    print(f"\nBLEU score for {model_id}: {bleu_score:.2f}")
    return bleu_score, pd.DataFrame(records)

# === Compare All Models ===
# Evaluates each entry of model_versions and writes its per-sample predictions
# to /kaggle/working/<name>_eval.csv.
all_results = {}
for name, model_path in model_versions.items():
    score, df = evaluate_model(model_path, dataset)
    all_results[name] = {"bleu": score, "df": df}
    df.to_csv(f"/kaggle/working/{name}_eval.csv", index=False)

# === Show Comparison Summary ===
print("\n🟢 Model Comparison Summary:")
for name, res in all_results.items():
    print(f"{name}: BLEU = {res['bleu']:.2f}")

# === Optional: Combine into one table ===
# Start from the evaluation pairs and add one prediction column per model label.
comparison_table = dataset.to_pandas()
for name, res in all_results.items():
    comparison_table[name] = res["df"]["prediction"]

comparison_table.to_csv("/kaggle/working/comparison_table.csv", index=False)
print("\n📄 Full comparison saved to: /kaggle/working/comparison_table.csv")
