<a href="https://colab.research.google.com/github/jsl5710/greenland/blob/main/GREENLAND_Fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Setup & Installation

In [1]:
# Install and upgrade necessary libraries
!pip install --quiet --upgrade pip
!pip install --quiet --upgrade transformers
!pip install --quiet --upgrade datasets
!pip install --quiet --upgrade wandb
!pip install --quiet git+https://github.com/huggingface/peft.git peft

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m117.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m92.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.1/16.1 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for peft (pyproject.toml) ... [?25l[?25h

# Step 2: Import Libraries

In [2]:
import os
from getpass import getpass
from typing import Dict, Optional

import torch
import wandb
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from peft import (
    LoraConfig,
    PrefixTuningConfig,
    PromptTuningConfig,
    AdaLoraConfig,
    IA3Config,
    get_peft_model,
    PeftModel,
    TaskType,
    PeftConfig,
    AutoPeftModelForSequenceClassification,
)
from datasets import load_dataset, Dataset
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from google.colab import drive
from requests.exceptions import HTTPError

# Step 3: Define Model Checkpoints

In [3]:
# Registry of the checkpoints to fine-tune.
# Each key is a short model name; its value holds:
#   "path"       - the Hugging Face Hub identifier passed to from_pretrained()
#   "max_length" - the maximum sequence length handed to the tokenizer
# Entries that are commented out are disabled; uncomment one to include that
# model in the training loops.
model_checkpoints = {
    "MBERT_uncased": {
        "path": "google-bert/bert-base-multilingual-uncased",
        "max_length": 512
    },
    # "XLM_100": {
    #     "path": "FacebookAI/xlm-mlm-100-1280",
    #     "max_length": 512
    # },
    # "XLM_17": {
    #     "path": "FacebookAI/xlm-mlm-17-1280",
    #     "max_length": 512
    # },
    # "XLM-RoBERTa_xxl": {
    #     "path": "facebook/xlm-roberta-xxl",
    #     "max_length": 512
    # },
    # "mDeBERTa_v3_base": {
    #     "path": "microsoft/mdeberta-v3-base",
    #     "max_length": 512
    # },
    # "S-BERT_LaBSE": {
    #     "path": "sentence-transformers/LaBSE",
    #     "max_length": 512
    # },
    # "S-BERT_distiluse": {
    #     "path": "sentence-transformers/distiluse-base-multilingual-cased",
    #     "max_length": 512
    # },
    # "XLM-R_bernice": {
    #     "path": "jhu-clsp/bernice",
    #     "max_length": 512
    # },
    # "XLM-T_twitter": {
    #     "path": "cardiffnlp/twitter-xlm-roberta-base",
    #     "max_length": 512
    # },
    # "XLM-E_align": {
    #     "path": "microsoft/xlm-align-base",
    #     "max_length": 512
    # },
    # "XLM-E_infoxlm_large": {
    #     "path": "microsoft/infoxlm-large",
    #     "max_length": 512
    # },
    # "XLM-V_base": {
    #     "path": "facebook/xlm-v-base",
    #     "max_length": 512
    # }
}


# model_checkpoints = {
#     "MBERT_uncased": "google-bert/bert-base-multilingual-uncased",
#     # "MBERT_cased": "google-bert/bert-base-multilingual-cased",
#     "XLM_100": "FacebookAI/xlm-mlm-100-1280",
#     "XLM_17": "FacebookAI/xlm-mlm-17-1280",
#     # "XLM-RoBERTa_large": "FacebookAI/xlm-roberta-large",
#     # "XLM-RoBERTa_base": "FacebookAI/xlm-roberta-base",
#     # "XLM-RoBERTa_xl": "facebook/xlm-roberta-xl",
#     "XLM-RoBERTa_xxl": "facebook/xlm-roberta-xxl",
#     "mDeBERTa_v3_base": "microsoft/mdeberta-v3-base",
#     # "M-distilBERT": "distilbert/distilbert-base-multilingual-cased",
#     "S-BERT_LaBSE": "sentence-transformers/LaBSE",
#     "S-BERT_distiluse": "sentence-transformers/distiluse-base-multilingual-cased",
#     "XLM-R_bernice": "jhu-clsp/bernice",
#     "XLM-T_twitter": "cardiffnlp/twitter-xlm-roberta-base",
#     "XLM-E_align": "microsoft/xlm-align-base",
#     # "XLM-E_infoxlm_base": "microsoft/infoxlm-base",
#     "XLM-E_infoxlm_large": "microsoft/infoxlm-large",
#     "XLM-V_base": "facebook/xlm-v-base"
# }

# model_checkpoints = {
    # "MBERT_uncased": "google-bert/bert-base-multilingual-uncased",
    # "XLM_100": "FacebookAI/xlm-mlm-100-1280",
    # "XLM_17": "FacebookAI/xlm-mlm-17-1280",
    # "XLM-RoBERTa_xxl": "facebook/xlm-roberta-xxl",
    # "mDeBERTa_v3_base": "microsoft/mdeberta-v3-base",
    # "S-BERT_LaBSE": "sentence-transformers/LaBSE",
    # "S-BERT_distiluse": "sentence-transformers/distiluse-base-multilingual-cased",
    # "XLM-R_bernice": "jhu-clsp/bernice",
    # "XLM-T_twitter": "cardiffnlp/twitter-xlm-roberta-base",
    # "XLM-E_align": "microsoft/xlm-align-base",
    #     "XLM-E_infoxlm_large": "microsoft/infoxlm-large",
    # "XLM-V_base": "facebook/xlm-v-base"
# }



# Step 4: Authenticate and Initialize

In [4]:
# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Authenticate with Hugging Face
!huggingface-cli login --token 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

# Authenticate with W&B
wandb.login(key="1b5caf38a8b6ada0e6918798e9379b2ea764062d")
wandb.init(project="greenland")


Mounted at /content/drive
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `greenland` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `greenland`


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjasonsamlucas[0m ([33mpike[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Step 5: Define Save Paths and Ensure Directories Exist

In [5]:
# Output locations: a local scratch folder plus two Google Drive folders
local_save_path = "/content/sample_data/best_models/"
drive_save_path = "/content/drive/MyDrive/GREENLAND/Modeling/Best_models/"
results_dir = "/content/drive/MyDrive/GREENLAND/Results/"

# Create each output directory up front; existing directories are left untouched
for _output_dir in (local_save_path, drive_save_path, results_dir):
    os.makedirs(_output_dir, exist_ok=True)


# Step 6: Load and Process the Dataset

In [6]:
# # Load datasets from CSV files in Google Drive
# train_df = pd.read_csv('/content/drive/MyDrive/GREENLAND/Datasets/Consolidated_Data/Experiment_Training_Splits/train_data.csv')
# val_df = pd.read_csv('/content/drive/MyDrive/GREENLAND/Datasets/Consolidated_Data/Experiment_Training_Splits/val_data.csv')
# test_df = pd.read_csv('/content/drive/MyDrive/GREENLAND/Datasets/Consolidated_Data/Experiment_Training_Splits/test_data.csv')

# # Convert to Hugging Face Dataset format
# train_dataset = Dataset.from_pandas(train_df)
# val_dataset = Dataset.from_pandas(val_df)
# test_dataset = Dataset.from_pandas(test_df)


# # Combine datasets into a dictionary for easy access
# dataset = {
#     "train": train_dataset,
#     "validation": val_dataset,
#     "test": test_dataset
# }


# Load datasets from CSV files in Google Drive
train_df = pd.read_csv('/content/drive/MyDrive/GREENLAND/Datasets/Consolidated_Data/Experiment_Training_Splits/train_data.csv')
val_df = pd.read_csv('/content/drive/MyDrive/GREENLAND/Datasets/Consolidated_Data/Experiment_Training_Splits/val_data.csv')
test_df = pd.read_csv('/content/drive/MyDrive/GREENLAND/Datasets/Consolidated_Data/Experiment_Training_Splits/test_data.csv')

# Number of rows drawn from each split for quick experiments.
# Set to None to train and evaluate on the complete splits.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 42  # fixed seed so every run draws the same rows


def _subsample(frame, n=SAMPLE_SIZE, seed=SAMPLE_SEED):
    """Return at most ``n`` randomly drawn rows of ``frame`` with a fresh index.

    A split that has ``n`` rows or fewer is returned whole instead of making
    ``DataFrame.sample`` raise. The index is reset so the shuffled pandas index
    is not carried into the Hugging Face dataset as an extra
    ``__index_level_0__`` column.
    """
    if n is not None and n < len(frame):
        frame = frame.sample(n=n, random_state=seed)
    return frame.reset_index(drop=True)


train_df_sampled = _subsample(train_df)
val_df_sampled = _subsample(val_df)
test_df_sampled = _subsample(test_df)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df_sampled)
val_dataset = Dataset.from_pandas(val_df_sampled)
test_dataset = Dataset.from_pandas(test_df_sampled)

# Combine datasets into a dictionary for easy access
dataset = {
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
}

print("Dataset sizes after sampling:")
print(f"Train: {len(train_dataset)}")
print(f"Validation: {len(val_dataset)}")
print(f"Test: {len(test_dataset)}")

Dataset sizes after sampling:
Train: 1000
Validation: 1000
Test: 1000


# Step 7: Define Dataset Processing Functions

In [20]:
def verify_dataset(dataset):
    """Print a short diagnostic report for every split and return True.

    Args:
        dataset: mapping of split name to a dataset object that supports
            ``len()``, a ``features`` attribute and column access via
            ``["label"]``.

    Returns:
        Always ``True``; the function exists for its printed report.
    """
    print("\nDataset Verification:")
    for split_name, split_data in dataset.items():
        print(f"\n{split_name.capitalize()} set:")
        print("Number of examples:", len(split_data))
        print("Features:", split_data.features)

        label_column = split_data["label"]
        first_labels = label_column[:5]
        print("Sample labels:", first_labels)
        # Container type of the whole column, then the type of individual values
        print("Label type:", type(label_column))
        print("Sample label types:", [type(value) for value in first_labels])
    return True

def tokenize_datasets(model_name, dataset):
    """Tokenize every split of ``dataset`` for the given model.

    Args:
        model_name: key into ``model_checkpoints``; its entry supplies the
            tokenizer checkpoint ("path") and the truncation length
            ("max_length").
        dataset: mapping of split name to a dataset exposing ``map`` and
            ``column_names``, with "text" and "label" columns.

    Returns:
        dict mapping each split name to its tokenized dataset. The original
        columns are removed; the tokenizer outputs and an integer "labels"
        column remain.

    Raises:
        ValueError: (from the mapped function) when a string label cannot be
            interpreted as a binary class.
    """
    model_info = model_checkpoints[model_name]
    tokenizer = AutoTokenizer.from_pretrained(model_info["path"])
    max_length = model_info["max_length"]

    print(f"Using max_length={max_length} for model {model_name}")

    positive_strings = {"1", "1.0", "true", "t", "yes", "y"}
    negative_strings = {"0", "0.0", "false", "f", "no", "n"}

    def to_binary_label(label):
        """Map a raw label to 0 or 1.

        Strings need explicit parsing: ``bool("False")`` and ``bool("0")`` are
        both True, so plain truthiness would turn every textual label into the
        positive class.
        """
        if isinstance(label, str):
            normalized = label.strip().lower()
            if normalized in positive_strings:
                return 1
            if normalized in negative_strings:
                return 0
            raise ValueError(f"Cannot interpret label {label!r} as a binary class")
        return int(bool(label))

    def preprocess_function(examples):
        # Convert the batch of raw labels to integers
        labels = [to_binary_label(label) for label in examples["label"]]

        # Tokenize the text. padding=True pads to the longest sequence of the
        # batch handed to this call, so sequence length is only guaranteed to
        # be uniform within one map batch, not across the whole split.
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors=None
        )

        # Add converted labels to the tokenized output
        tokenized["labels"] = labels
        return tokenized

    # Print sample of data before tokenization (only when a train split exists)
    if "train" in dataset:
        print("\nBefore tokenization:")
        print("Sample of original labels:", dataset["train"]["label"][:5])
        print("Original label type:", type(dataset["train"]["label"][0]))

    tokenized_data = {
        split: data.map(
            preprocess_function,
            batched=True,
            batch_size=1000,
            num_proc=4,
            remove_columns=data.column_names,
            desc=f"Tokenizing {split} set"
        )
        for split, data in dataset.items()
    }

    # Verify the processed labels
    if "train" in tokenized_data:
        print("\nAfter tokenization:")
        print("Sample of processed labels:", tokenized_data["train"]["labels"][:5])
        print("Processed label type:", type(tokenized_data["train"]["labels"][0]))

    return tokenized_data

def analyze_text_lengths(dataset):
    """Report word- and character-length statistics of the training texts.

    No tokenizer is involved: word counts come from whitespace splitting and
    character counts from ``len``.

    Args:
        dataset: mapping with a "train" entry whose "text" column is an
            iterable of strings.

    Returns:
        dict with the word-based keys "average_length", "max_length",
        "median_length" (upper median), "95th_percentile",
        "length_distribution" (counts per bucket),
        "length_distribution_percent", and "char_stats" holding the same four
        summary values for character counts.

    Raises:
        ValueError: if the training split contains no texts.
    """
    # Materialize the column once; it is needed for both analyses.
    texts = list(dataset["train"]["text"])
    if not texts:
        raise ValueError("Cannot analyze text lengths: the training split contains no texts.")

    def summarize(values):
        """Return average, max, upper median and 95th percentile of ``values``."""
        ordered = sorted(values)  # sort once and reuse for every order statistic
        count = len(ordered)
        return {
            "average_length": sum(ordered)/count,
            "max_length": ordered[-1],
            "median_length": ordered[count//2],
            "95th_percentile": ordered[int(count*0.95)]
        }

    # Word-based analysis
    lengths = [len(text.split()) for text in texts]
    stats = summarize(lengths)
    stats["length_distribution"] = {
        "< 128 words": sum(1 for l in lengths if l < 128),
        "128-256 words": sum(1 for l in lengths if 128 <= l < 256),
        "256-512 words": sum(1 for l in lengths if 256 <= l < 512),
        "> 512 words": sum(1 for l in lengths if l >= 512)
    }

    # Calculate percentages for distribution
    total_samples = len(lengths)
    stats["length_distribution_percent"] = {
        k: (v/total_samples * 100) for k, v in stats["length_distribution"].items()
    }

    print("\nText Length Analysis (word-based):")
    print(f"Average length: {stats['average_length']:.1f} words")
    print(f"Median length: {stats['median_length']} words")
    print(f"Max length: {stats['max_length']} words")
    print(f"95th percentile: {stats['95th_percentile']} words")
    print("\nLength Distribution:")
    for category, count in stats["length_distribution"].items():
        percentage = stats["length_distribution_percent"][category]
        print(f"{category}: {count} texts ({percentage:.1f}%)")

    # Character-based analysis
    stats["char_stats"] = summarize([len(text) for text in texts])

    print("\nCharacter-based Analysis:")
    print(f"Average length: {stats['char_stats']['average_length']:.1f} characters")
    print(f"Median length: {stats['char_stats']['median_length']} characters")
    print(f"Max length: {stats['char_stats']['max_length']} characters")
    print(f"95th percentile: {stats['char_stats']['95th_percentile']} characters")

    return stats



# Step 8: Define Loss Functions

In [21]:
class WeightedFocalLoss(torch.nn.Module):
    """Focal loss on class logits, scaled by a constant factor ``alpha``.

    For each example the loss is ``alpha * (1 - p_t) ** gamma * (-log p_t)``,
    where ``p_t`` is the softmax probability of the true class.

    Args:
        alpha: constant multiplier applied to every example.
        gamma: focusing exponent; larger values down-weight easy examples.
    """

    def __init__(self, alpha=0.25, gamma=2):
        super(WeightedFocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, logits, labels):
        """Return the mean focal loss.

        Args:
            logits: tensor of shape (batch, num_classes).
            labels: tensor of class indices with ``batch`` elements.
        """
        # -log p_t computed through log-softmax. For two classes this equals
        # binary cross-entropy on the positive-class probability, but it does
        # not saturate when the softmax probability underflows to 0 or 1.
        ce_loss = torch.nn.functional.cross_entropy(
            logits.float(), labels.long().view(-1), reduction='none'
        )
        pt = torch.exp(-ce_loss)  # probability assigned to the true class
        F_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return F_loss.mean()

class SymmetricCrossEntropyLoss(torch.nn.Module):
    """Symmetric cross-entropy: ``alpha * CE + beta * reverse CE``.

    The reverse term swaps the roles of prediction and target,
    ``-sum_k p_k * log(q_k)``, where ``q`` is the one-hot label. Because
    ``log(0)`` is undefined, the one-hot target is clamped from below by
    ``label_floor`` before taking the logarithm.

    Args:
        alpha: weight of the standard cross-entropy term.
        beta: weight of the reverse cross-entropy term.
        label_floor: lower clamp for the one-hot target inside the logarithm.
    """

    def __init__(self, alpha=0.1, beta=1.0, label_floor=1e-4):
        super(SymmetricCrossEntropyLoss, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.label_floor = label_floor

    def forward(self, logits, labels):
        """Return the symmetric cross-entropy averaged over the batch.

        Args:
            logits: tensor of shape (batch, num_classes).
            labels: tensor of class indices with ``batch`` elements.
        """
        labels = labels.long().view(-1)
        ce_loss = torch.nn.functional.cross_entropy(logits, labels)

        probs = torch.softmax(logits, dim=1)
        labels_one_hot = torch.nn.functional.one_hot(
            labels, num_classes=logits.size(-1)
        ).to(probs.dtype)
        # log of the clamped target: 0 for the true class, log(label_floor) elsewhere
        log_targets = torch.log(labels_one_hot.clamp(min=self.label_floor))
        rce_loss = -(probs * log_targets).sum(dim=-1).mean()

        return self.alpha * ce_loss + self.beta * rce_loss


class ModifiedBCEWithLogitsLoss(torch.nn.Module):
    """Binary cross-entropy with logits on the positive-class logit.

    NOTE(review): only the column-1 logit receives a gradient from this loss,
    while ``compute_metrics`` predicts with argmax over all columns — confirm
    that this combination is intended.
    """

    def forward(self, logits, labels):
        """Return the mean BCE-with-logits loss.

        Args:
            logits: either (batch, num_classes) with num_classes >= 2, in which
                case column 1 is used as the binary logit, or a tensor holding
                a single logit per example ((batch,) or (batch, 1)).
            labels: tensor of 0/1 targets with ``batch`` elements.
        """
        labels = labels.float().view(-1)

        if logits.dim() > 1 and logits.shape[1] > 1:
            # Multi-column head: take the logit of the positive class
            pos_logits = logits[:, 1]
        else:
            # Single-logit head: flatten so it lines up with the labels
            pos_logits = logits.reshape(-1)

        return torch.nn.functional.binary_cross_entropy_with_logits(
            pos_logits, labels, reduction='mean'
        )

class ModifiedSquaredBCEWithLogitsLoss(torch.nn.Module):
    """Mean squared error between ``sigmoid(logit)`` and the 0/1 target.

    NOTE(review): with a two-column head only the column-1 logit is trained by
    this loss, while ``compute_metrics`` predicts with argmax over both
    columns — confirm that this combination is intended.
    """

    def forward(self, logits, labels):
        """Return the mean squared difference between probabilities and targets.

        Args:
            logits: (batch, 2) tensor, from which column 1 is used, or any
                other tensor, which is used unchanged.
            labels: targets; flattened and cast to float.
        """
        targets = labels.float().view(-1)

        # A two-column head contributes only its positive-class logit
        has_two_columns = logits.dim() > 1 and logits.shape[1] == 2
        scores = logits[:, 1] if has_two_columns else logits

        residual = torch.sigmoid(scores) - targets
        return torch.mean(residual ** 2)

class ModifiedWeightedBinaryCrossEntropy(torch.nn.Module):
    """BCE-with-logits in which positive examples are scaled by ``pos_weight``.

    NOTE(review): with a two-column head only the column-1 logit is trained by
    this loss, while ``compute_metrics`` predicts with argmax over both
    columns — confirm that this combination is intended.

    Args:
        pos_weight: tensor forwarded as ``pos_weight`` to
            ``binary_cross_entropy_with_logits``.
    """

    def __init__(self, pos_weight):
        super().__init__()
        self.pos_weight = pos_weight

    def forward(self, logits, labels):
        """Return the mean weighted BCE-with-logits loss.

        Args:
            logits: (batch, 2) tensor, from which column 1 is used, or any
                other tensor, which is used unchanged.
            labels: targets; flattened and cast to float.
        """
        targets = labels.float().view(-1)

        # A two-column head contributes only its positive-class logit
        has_two_columns = logits.dim() > 1 and logits.shape[1] == 2
        scores = logits[:, 1] if has_two_columns else logits

        weighted_bce = torch.nn.functional.binary_cross_entropy_with_logits
        return weighted_bce(scores, targets, pos_weight=self.pos_weight, reduction='mean')

class ModifiedSupervisedContrastiveCrossEntropyLoss(torch.nn.Module):
    """Cross-entropy blended with a supervised contrastive term on the logits.

    The contrastive term treats the L2-normalized logit vectors as embeddings:
    examples sharing a label are positives for each other. Each example is
    also counted as its own positive (the diagonal of the mask is kept).

    Args:
        temperature: divisor applied to the cosine similarities.
        lam: weight of the cross-entropy term; the contrastive term is
            weighted by ``1 - lam``.
    """

    def __init__(self, temperature=0.07, lam=0.5):
        super().__init__()
        self.temperature = temperature
        self.lam = lam
        self.ce_loss = torch.nn.CrossEntropyLoss()

    def forward(self, logits, labels):
        """Return ``lam * CE + (1 - lam) * contrastive``.

        Args:
            logits: tensor of shape (batch, num_classes).
            labels: tensor of class indices with ``batch`` elements.
        """
        # Standard cross-entropy loss
        ce_loss = self.ce_loss(logits, labels.long())

        # Pairwise cosine similarities, sharpened by the temperature
        normalized_logits = torch.nn.functional.normalize(logits, dim=1)
        similarity_matrix = torch.matmul(normalized_logits, normalized_logits.t()) / self.temperature

        # Mask of pairs that share a label (includes the diagonal)
        labels = labels.view(-1, 1)
        mask = (labels == labels.t()).float()

        # Row-wise log-softmax of the similarities. logsumexp keeps this finite
        # where exponentiating the scaled similarities directly would overflow
        # (small temperature or half precision).
        log_prob = similarity_matrix - torch.logsumexp(similarity_matrix, dim=1, keepdim=True)

        # Average log-probability over each example's positives
        mask_sum = torch.clamp(mask.sum(dim=1), min=1e-8)  # Avoid division by zero
        con_loss = (mask * log_prob).sum(dim=1) / mask_sum
        con_loss = -con_loss.mean()

        # Combine losses
        return self.lam * ce_loss + (1 - self.lam) * con_loss

# Step 9: Loss Functions Factory

In [22]:
def get_loss_functions(device=None):
    """Build the named loss modules used in the experiments.

    Args:
        device: torch device the modules are moved to. When omitted, CUDA is
            used if available, otherwise the CPU.

    Returns:
        dict mapping a loss name to its module, in a fixed order.
    """
    target_device = device
    if target_device is None:
        target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Name/module pairs in the order they should appear in the returned dict
    named_losses = [
        ("CrossEntropyLoss", torch.nn.CrossEntropyLoss()),
        ("BCEWithLogitsLoss", ModifiedBCEWithLogitsLoss()),
        ("SquaredBCEWithLogitsLoss", ModifiedSquaredBCEWithLogitsLoss()),
        (
            "WeightedBinaryCrossEntropy",
            ModifiedWeightedBinaryCrossEntropy(
                pos_weight=torch.tensor([3.0]).to(target_device)
            ),
        ),
        ("WeightedFocalLoss", WeightedFocalLoss(alpha=0.25, gamma=2)),
        ("SymmetricCrossEntropy", SymmetricCrossEntropyLoss(alpha=0.1, beta=1.0)),
        (
            "SupervisedContrastiveCrossEntropyLoss",
            ModifiedSupervisedContrastiveCrossEntropyLoss(temperature=0.07, lam=0.5),
        ),
    ]

    return {name: module.to(target_device) for name, module in named_losses}

# Step 10: Evaluation Metrics

In [23]:
def compute_metrics(pred):
    """Compute binary classification metrics for a Trainer evaluation pass.

    Args:
        pred: object exposing ``label_ids`` and ``predictions`` (class logits,
            or a tuple/list whose first element holds the logits).

    Returns:
        dict with "accuracy", "f1", "precision", "recall" and "roc_auc".
        "roc_auc" is 0 when it cannot be computed (e.g. only one class is
        present in the labels).
    """
    labels = pred.label_ids
    logits = pred.predictions
    if isinstance(logits, (tuple, list)):
        # Some models return extra outputs alongside the logits
        logits = logits[0]
    preds = logits.argmax(-1)

    # ROC AUC ranks examples by a continuous score; hard 0/1 predictions give
    # only a single operating point. For a two-column head the logit margin is
    # a monotonic function of the softmax probability of class 1.
    if logits.ndim == 2 and logits.shape[-1] == 2:
        scores = logits[:, 1] - logits[:, 0]
    else:
        scores = preds

    try:
        roc_auc = roc_auc_score(labels, scores)
    except ValueError:
        roc_auc = 0  # Handle cases where there might be only one class

    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='binary', zero_division=0),
        'precision': precision_score(labels, preds, average='binary', zero_division=0),
        'recall': recall_score(labels, preds, average='binary', zero_division=0),
        'roc_auc': roc_auc
    }


# Step 11: Custom Trainer

In [24]:
class CustomTrainer(Trainer):
    """Trainer that can swap the model's built-in loss for a custom module.

    Args:
        loss_func: optional callable ``loss_func(logits, labels)``. When it is
            None, the loss returned by the model itself is used.
        *args, **kwargs: forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, loss_func=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_func = loss_func

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the forward pass and return the loss (optionally with the outputs).

        Extra keyword arguments are accepted and ignored.
        """
        # Labels that arrive as plain Python data are turned into a tensor on
        # the training device
        if "labels" in inputs and not isinstance(inputs["labels"], torch.Tensor):
            inputs["labels"] = torch.tensor(inputs["labels"], device=self.args.device)

        outputs = model(**inputs)

        if self.loss_func is None:
            loss = outputs.loss
        else:
            targets = inputs["labels"]
            # These losses cast the labels themselves; every other loss is
            # handed integer class indices
            binary_logit_losses = (
                ModifiedSquaredBCEWithLogitsLoss,
                ModifiedWeightedBinaryCrossEntropy,
                ModifiedBCEWithLogitsLoss,
            )
            if not isinstance(self.loss_func, binary_logit_losses):
                targets = targets.long()
            loss = self.loss_func(outputs.logits, targets)

        if return_outputs:
            return (loss, outputs)
        return loss

# Step 12: Model Save/Load Functions

In [25]:
def save_model_with_fallback(trainer, model_name):
    """Save the trained model, trying progressively more local destinations.

    Order of attempts: Hugging Face Hub (repository ``jslai/<model_name>``),
    then Google Drive, then local storage. A failure of the last attempt
    propagates to the caller.

    Args:
        trainer: object exposing ``model.push_to_hub(repo_id)`` and
            ``save_model(path)``.
        model_name: name used for the Hub repository and the save directory.
    """
    repo_id = f"jslai/{model_name}"
    try:
        # Trainer.push_to_hub takes a commit message as its first positional
        # argument, so the repository id has to go through the model's own
        # push_to_hub, whose first argument is the repo id.
        trainer.model.push_to_hub(repo_id)
        print(f"Model saved to Hugging Face Hub as {repo_id}")
        return
    except Exception as hub_error:
        print(f"Failed to save to Hugging Face Hub: {hub_error}")

    try:
        drive_target = os.path.join(drive_save_path, model_name)
        trainer.save_model(drive_target)
        print(f"Model saved to Google Drive at {drive_target}")
        return
    except Exception as drive_error:
        print(f"Failed to save to Google Drive: {drive_error}")

    # Last resort: errors here are not swallowed so the caller learns that
    # nothing was saved
    local_target = os.path.join(local_save_path, model_name)
    trainer.save_model(local_target)
    print(f"Model saved locally at {local_target}")

def load_best_model(model_name):
    """Load a sequence-classification model from the first source that has it.

    Sources are tried in order: Hugging Face Hub (``jslai/<model_name>``),
    the Google Drive save directory, then the local save directory.

    Args:
        model_name: repository / directory name of the saved model.

    Returns:
        The loaded model.

    Raises:
        FileNotFoundError: if none of the three sources yields the model.
    """
    # 1) Hugging Face Hub
    try:
        print(f"Attempting to load {model_name} from Hugging Face Hub.")
        return AutoModelForSequenceClassification.from_pretrained(f"jslai/{model_name}")
    except (OSError, HTTPError) as hub_error:
        print(f"Failed to load {model_name} from Hugging Face Hub: {hub_error}")

    # 2) Google Drive
    try:
        google_drive_path = os.path.join(drive_save_path, model_name)
        if not os.path.isdir(google_drive_path):
            raise OSError(f"Directory {google_drive_path} does not exist on Google Drive.")
        print(f"Attempting to load {model_name} from Google Drive.")
        return AutoModelForSequenceClassification.from_pretrained(google_drive_path)
    except (OSError, HTTPError) as drive_error:
        print(f"Failed to load {model_name} from Google Drive: {drive_error}")

    # 3) Local storage; failing here means the model is nowhere to be found
    try:
        local_path = os.path.join(local_save_path, model_name)
        if not os.path.isdir(local_path):
            raise OSError(f"Directory {local_path} does not exist in local storage.")
        print(f"Attempting to load {model_name} from local storage.")
        return AutoModelForSequenceClassification.from_pretrained(local_path)
    except (OSError, HTTPError) as local_error:
        print(f"Failed to load {model_name} from local storage: {local_error}")
        raise FileNotFoundError(f"Model {model_name} could not be found in any location.")

# Step 13: Training Functions

In [45]:

def full_fine_tune_all_models(model_checkpoints, dataset, loss_functions=None):
    """Fully fine-tune every checkpoint once per loss function.

    A failure for one model or one model/loss combination is printed and the
    loop moves on to the next one.

    Args:
        model_checkpoints: mapping of model name to a dict with a "path" entry
            (checkpoint identifier).
        dataset: mapping of split name to dataset; "train" and "validation"
            are used.
        loss_functions: optional mapping of loss name to loss module. Defaults
            to ``get_loss_functions(device)``.

    Returns:
        None. Checkpoints are written by the Trainer below ``local_save_path``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Mixed precision is only requested when a GPU is present, so the function
    # also runs on the CPU fallback selected above.
    use_fp16 = device.type == "cuda"

    if loss_functions is None:
        loss_functions = get_loss_functions(device)

    for model_name, model_info in model_checkpoints.items():
        # Tokenize dataset for this model
        try:
            tokenized_data = tokenize_datasets(model_name, dataset)
            # Tokenization pads per map batch only, so examples of different
            # lengths can meet in one training batch; the collator pads each
            # batch to a common length.
            tokenizer = AutoTokenizer.from_pretrained(model_info["path"])
            data_collator = DataCollatorWithPadding(tokenizer)
        except Exception as e:
            print(f"Error processing model {model_name}: {str(e)}")
            continue

        for loss_fn_name, loss_fn in loss_functions.items():
            print(f"\nTraining {model_name} with {loss_fn_name}")
            print(f"Using device: {device}")

            model = None
            trainer = None
            try:
                # Clear CUDA cache
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

                # Initialize model
                model = AutoModelForSequenceClassification.from_pretrained(
                    model_info["path"],
                    num_labels=2,
                    problem_type="single_label_classification"
                ).to(device)

                # Training arguments
                training_args = TrainingArguments(
                    output_dir=f"{local_save_path}/{model_name}_{loss_fn_name}_full_ft",
                    eval_strategy="epoch",
                    save_strategy="epoch",
                    learning_rate=2e-5,
                    per_device_train_batch_size=8,
                    per_device_eval_batch_size=8,
                    num_train_epochs=3,
                    weight_decay=0.01,
                    load_best_model_at_end=True,
                    metric_for_best_model="f1",
                    report_to="wandb",
                    logging_steps=100,
                    fp16=use_fp16,
                    gradient_checkpointing=True,
                    gradient_accumulation_steps=2,
                    warmup_ratio=0.1,
                    dataloader_num_workers=4,
                    dataloader_pin_memory=True,
                    seed=42,
                    remove_unused_columns=False
                )

                # Initialize trainer
                trainer = CustomTrainer(
                    model=model,
                    args=training_args,
                    train_dataset=tokenized_data["train"],
                    eval_dataset=tokenized_data["validation"],
                    data_collator=data_collator,
                    compute_metrics=compute_metrics,
                    loss_func=loss_fn
                )

                # Train and save
                trainer.train()
                # save_model_with_fallback(trainer, f"{model_name}_{loss_fn_name}_full_ft")

            except Exception as e:
                print(f"Error training {model_name} with {loss_fn_name}: {str(e)}")

            finally:
                # Drop the references so the memory can be reclaimed before
                # the next combination starts
                trainer = None
                model = None
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

    print("\nFull Fine-Tuning completed!")
    return

def peft_fine_tune_all_models(model_checkpoints, dataset, loss_functions=None, peft_methods=None):
    """Fine-tune every checkpoint with every PEFT method and every loss function.

    A failure for one model or one combination is printed and the loop moves
    on to the next one.

    Args:
        model_checkpoints: mapping of model name to a dict with a "path" entry
            (checkpoint identifier).
        dataset: mapping of split name to dataset; "train" and "validation"
            are used.
        loss_functions: optional mapping of loss name to loss module. Defaults
            to ``get_loss_functions(device)``.
        peft_methods: optional mapping of method name to a factory
            ``factory(model_path, target_modules)`` returning a PEFT config.
            Defaults to LoRA, AdaLoRA, prefix tuning and (IA)^3.

    Returns:
        None. Checkpoints are written by the Trainer below ``local_save_path``.
    """
    # Initialize device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Mixed precision is only requested when a GPU is present, so the function
    # also runs on the CPU fallback selected above.
    use_fp16 = device.type == "cuda"

    # Get loss functions if not provided
    if loss_functions is None:
        loss_functions = get_loss_functions(device)

    def get_target_modules(model_path):
        """Return attention module-name suffixes to adapt, chosen from the path.

        PEFT matches each entry against the end of a module's qualified name,
        so suffixes cover every layer regardless of model depth.
        """
        name = model_path.lower()
        # "roberta" and "deberta" both contain the substring "bert", so the
        # more specific names have to be tested first.
        if "deberta" in name:
            return ["query_proj", "key_proj", "value_proj", "dense"]
        if "roberta" in name:
            # Hugging Face RoBERTa layers use the same module names as BERT
            return ["query", "key", "value", "output.dense", "intermediate.dense"]
        if "bert" in name:
            return ["query", "key", "value", "attention.output.dense"]
        return ["query", "key", "value", "dense"]

    def get_feedforward_modules(model_path):
        """Return feed-forward module-name suffixes, chosen from the path."""
        # The substring "bert" also covers roberta / deberta checkpoints,
        # whose feed-forward sublayers carry the same names.
        if "bert" in model_path.lower():
            return ["intermediate.dense", "output.dense"]
        return ["dense"]

    def build_ia3_config(path, modules):
        """Build an IA3Config whose feed-forward modules are also targets."""
        feedforward = get_feedforward_modules(path)
        # IA3Config requires feedforward_modules to be a subset of target_modules
        targets = list(dict.fromkeys(list(modules) + feedforward))
        return IA3Config(
            task_type=TaskType.SEQ_CLS,
            target_modules=targets,
            feedforward_modules=feedforward,
            inference_mode=False,
            modules_to_save=["classifier"]
        )

    # Define PEFT methods
    if peft_methods is None:
        peft_methods = {
            "lora": lambda path, modules: LoraConfig(
                task_type=TaskType.SEQ_CLS,
                r=16,
                lora_alpha=32,
                lora_dropout=0.1,
                bias="none",
                inference_mode=False,
                target_modules=modules,
                modules_to_save=["classifier"]
            ),
            # NOTE(review): AdaLoRA's rank schedule (tinit/tfinal) is sized for
            # runs of well over 1000 optimizer steps, and its rank allocation
            # presumably needs update_and_allocate() calls that the Trainer
            # does not make — confirm before relying on these results.
            "adalora": lambda path, modules: AdaLoraConfig(
                task_type=TaskType.SEQ_CLS,
                init_r=12,
                target_r=8,
                beta1=0.85,
                beta2=0.95,
                tinit=200,
                tfinal=1000,
                deltaT=10,
                lora_alpha=32,
                target_modules=modules,
                lora_dropout=0.1,
                inference_mode=False
            ),
            "prefix": lambda path, _: PrefixTuningConfig(
                task_type=TaskType.SEQ_CLS,
                num_virtual_tokens=20,
                prefix_projection=True,
                encoder_hidden_size=768  # Will be updated based on model
            ),
            "ia3": build_ia3_config
        }

    for model_name, model_info in model_checkpoints.items():
        try:
            # Get target modules for this model
            target_modules = get_target_modules(model_info["path"])

            # Tokenize dataset specific to model
            tokenized_data = tokenize_datasets(model_name, dataset)

            # The tokenizer does not depend on the PEFT method or loss, so it
            # is loaded once per model
            tokenizer = AutoTokenizer.from_pretrained(model_info["path"])
            data_collator = DataCollatorWithPadding(tokenizer)
        except Exception as e:
            print(f"Error processing model {model_name}: {str(e)}")
            continue

        for peft_name, peft_config_fn in peft_methods.items():
            for loss_fn_name, loss_fn in loss_functions.items():
                base_model = None
                model = None
                trainer = None
                try:
                    print(f"\nFine-tuning {model_name} with PEFT ({peft_name}) using {loss_fn_name}")
                    print(f"Using device: {device}")

                    # Initialize base model
                    base_model = AutoModelForSequenceClassification.from_pretrained(
                        model_info["path"],
                        num_labels=2,
                        problem_type="single_label_classification"
                    )

                    # Get PEFT configuration
                    peft_config = peft_config_fn(model_info["path"], target_modules)

                    # Update encoder_hidden_size for prefix tuning
                    is_prefix_tuning = isinstance(peft_config, PrefixTuningConfig)
                    if is_prefix_tuning:
                        peft_config.encoder_hidden_size = base_model.config.hidden_size

                    # NOTE(review): PEFT reports prefix tuning as incompatible
                    # with gradient checkpointing, so it is switched off for
                    # that method — confirm against the installed PEFT version.
                    use_gradient_checkpointing = not is_prefix_tuning
                    if use_gradient_checkpointing:
                        # With a frozen embedding layer the checkpointed blocks
                        # would otherwise receive inputs that do not require
                        # grad, which can cut the adapters off from the
                        # backward pass.
                        base_model.enable_input_require_grads()

                    # Print available modules for debugging
                    print(f"\nModel architecture for {model_name}:")
                    print("Available modules:", [name for name, _ in base_model.named_modules()])

                    # Get PEFT model
                    model = get_peft_model(base_model, peft_config)
                    print(f"\nTrainable parameters for {peft_name}:")
                    model.print_trainable_parameters()

                    # Move model to device
                    model = model.to(device)

                    # Ensure loss function is on correct device
                    loss_fn = loss_fn.to(device)

                    # Define training arguments
                    training_args = TrainingArguments(
                        output_dir=f"{local_save_path}/{model_name}_{loss_fn_name}_{peft_name}",
                        eval_strategy="epoch",
                        save_strategy="epoch",
                        learning_rate=2e-5,
                        per_device_train_batch_size=8,
                        per_device_eval_batch_size=8,
                        num_train_epochs=3,
                        weight_decay=0.01,
                        load_best_model_at_end=True,
                        metric_for_best_model="f1",
                        logging_dir="./logs",
                        report_to="wandb",
                        logging_steps=100,
                        fp16=use_fp16,
                        gradient_checkpointing=use_gradient_checkpointing,
                        gradient_accumulation_steps=4,
                        optim="adamw_torch",
                        warmup_ratio=0.1,
                        dataloader_num_workers=4,
                        dataloader_pin_memory=True,
                        seed=42
                    )

                    # Initialize trainer. Only the collator needs the
                    # tokenizer; it is not passed to the Trainer itself.
                    trainer = CustomTrainer(
                        model=model,
                        args=training_args,
                        train_dataset=tokenized_data["train"],
                        eval_dataset=tokenized_data["validation"],
                        data_collator=data_collator,
                        compute_metrics=compute_metrics,
                        loss_func=loss_fn
                    )

                    # Train the model
                    trainer.train()

                    # Save the model and adapter
                    # output_dir = f"{local_save_path}/{model_name}_{loss_fn_name}_{peft_name}"
                    # save_model_with_fallback(trainer, output_dir)
                    # model.save_pretrained(f"{output_dir}/adapter")

                except Exception as e:
                    print(f"Error training {model_name} with {peft_name} and {loss_fn_name}: {str(e)}")

                finally:
                    # Drop the references even after a failure so the memory
                    # can be reclaimed before the next combination starts
                    trainer = None
                    model = None
                    base_model = None
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

    print("\nPEFT fine-tuning completed!")
    return

# Step 14: Inference Function

In [27]:
def _predict_labels(model, encoded_inputs, num_rows, device, batch_size):
    """Run `model` over pre-tokenized inputs in mini-batches and return argmax class ids.

    Args:
        model: Model whose output exposes a `.logits` tensor.
        encoded_inputs: Mapping of tensor name -> tensor with `num_rows` rows, kept on CPU.
        num_rows: Number of examples contained in `encoded_inputs`.
        device: Device each mini-batch is moved to before the forward pass.
        batch_size: Maximum number of examples per forward pass.

    Returns:
        1-D numpy array with one predicted class index per example.
    """
    # Put the model in inference mode explicitly so dropout is disabled no
    # matter how the checkpoint was loaded.
    model.eval()

    batch_preds = []
    with torch.no_grad():
        for start in range(0, num_rows, batch_size):
            # Only the current slice lives on the device, which bounds peak memory.
            batch = {
                key: tensor[start:start + batch_size].to(device)
                for key, tensor in encoded_inputs.items()
            }
            logits = model(**batch).logits
            batch_preds.append(logits.argmax(dim=-1).cpu())

    if not batch_preds:
        return torch.empty(0, dtype=torch.long).numpy()
    return torch.cat(batch_preds).numpy()


def run_inference_and_save_results(model_checkpoints, test_df, results_dir, batch_size=32):
    """Predict labels for `test_df` with every trained model variant and save them as CSV.

    For each entry in `model_checkpoints` and each loss-function name returned by
    `get_loss_functions`, the fully fine-tuned model named
    `{model_name}_{loss_fn_name}_full_ft` is loaded through `load_best_model` and
    evaluated. If an adapter directory exists under
    `{local_save_path}/{model_name}_{loss_fn_name}_peft_lora/adapter`, the matching
    PEFT model is evaluated as well. `local_save_path` is read from notebook scope.

    A failure for one model/loss combination is reported and skipped; the remaining
    combinations still run. After all combinations have been processed, one file
    `{name}_predictions.csv` per evaluated model is written to `results_dir`.

    Args:
        model_checkpoints: Mapping of model name -> dict providing at least
            "path" (tokenizer checkpoint) and "max_length" (truncation length).
        test_df: DataFrame with a "text" column; a copy with an added
            "prediction" column is saved for every evaluated model.
        results_dir: Directory receiving the CSV files (created if missing).
        batch_size: Number of examples per forward pass. Defaults to 32.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    predictions_df_list = []

    # Only the loss-function names are used here: they are part of the saved model names.
    loss_functions = get_loss_functions(device)

    for model_name, model_info in model_checkpoints.items():
        # Tokenization depends only on the base model, so it is done lazily once
        # per model and reused for every loss function / training method.
        encoded_inputs = None
        num_rows = 0

        for loss_fn_name in loss_functions.keys():
            model = None
            try:
                # Full fine-tuning model
                full_ft_model_name = f"{model_name}_{loss_fn_name}_full_ft"
                model = load_best_model(full_ft_model_name).to(device)

                if encoded_inputs is None:
                    texts = list(test_df["text"])
                    tokenizer = AutoTokenizer.from_pretrained(model_info["path"])
                    # Keep the encoded tensors on CPU; mini-batches are moved to
                    # the device one at a time inside _predict_labels.
                    encoded_inputs = tokenizer(
                        texts,
                        truncation=True,
                        padding=True,
                        max_length=model_info["max_length"],
                        return_tensors="pt"
                    )
                    num_rows = len(texts)

                preds = _predict_labels(model, encoded_inputs, num_rows, device, batch_size)
                result_df = test_df.copy()
                result_df["prediction"] = preds
                predictions_df_list.append((full_ft_model_name, result_df))

                # PEFT model
                peft_model_name = f"{model_name}_{loss_fn_name}_peft_lora"
                peft_model_path = os.path.join(local_save_path, peft_model_name, "adapter")

                if os.path.exists(peft_model_path):
                    # Release the full fine-tuned model before loading the next
                    # one so both never occupy device memory at the same time.
                    model = None
                    torch.cuda.empty_cache()
                    model = load_best_model(peft_model_name).to(device)

                    preds = _predict_labels(model, encoded_inputs, num_rows, device, batch_size)
                    result_df = test_df.copy()
                    result_df["prediction"] = preds
                    predictions_df_list.append((peft_model_name, result_df))

            except Exception as e:
                print(f"Error during inference for {model_name} with {loss_fn_name}: {e}")

            finally:
                # Clear memory on success and on failure alike.
                model = None
                torch.cuda.empty_cache()

    # Save all predictions
    if predictions_df_list and results_dir:
        os.makedirs(results_dir, exist_ok=True)
    for saved_name, result_df in predictions_df_list:
        result_file_path = os.path.join(results_dir, f"{saved_name}_predictions.csv")
        result_df.to_csv(result_file_path, index=False)
        print(f"Saved predictions for {saved_name} to {result_file_path}")


# Step 15: Main Execution

In [46]:
if __name__ == "__main__":
    # Entry point for one experiment run. Steps that are not needed for the
    # current run (full fine-tuning, inference) are left commented out below.

    # Select the compute device: GPU when available, otherwise CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Initialize loss functions
    loss_functions = get_loss_functions(device)

    # Analyze dataset once
    # NOTE(review): dataset_stats is not read again in this cell; the call is
    # presumably kept for the length report it prints - confirm.
    print("Analyzing dataset text lengths...")
    dataset_stats = analyze_text_lengths(dataset)

    # Run full fine-tuning (currently disabled)
    # print("\nStarting Full Fine-Tuning with all models and loss functions...")
    # full_fine_tune_all_models(model_checkpoints, dataset, loss_functions)

    # Run PEFT fine-tuning
    print("\nStarting PEFT Fine-Tuning with LoRA on all models...")
    peft_fine_tune_all_models(model_checkpoints, dataset, loss_functions)

    # Run inference and save results (currently disabled)
    # print("\nRunning inference and saving predictions...")
    # run_inference_and_save_results(model_checkpoints, test_df, results_dir)

    # Report completion, then close the active wandb run
    print("\nExperiments completed!")
    wandb.finish()

Using device: cuda
Analyzing dataset text lengths...

Text Length Analysis (word-based):
Average length: 241.3 words
Median length: 137 words
Max length: 2611 words
95th percentile: 833 words

Length Distribution:
< 128 words: 488 texts (48.8%)
128-256 words: 170 texts (17.0%)
256-512 words: 199 texts (19.9%)
> 512 words: 143 texts (14.3%)

Character-based Analysis:
Average length: 1543.6 characters
Median length: 925 characters
Max length: 16013 characters
95th percentile: 5156 characters

Starting PEFT Fine-Tuning with LoRA on all models...
Using max_length=512 for model MBERT_uncased

Before tokenization:
Sample of original labels: [True, True, True, True, True]
Original label type: <class 'bool'>


Tokenizing train set (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing validation set (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing test set (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]


After tokenization:
Sample of processed labels: [1, 1, 1, 1, 1]
Processed label type: <class 'int'>

Fine-tuning MBERT_uncased with PEFT (lora) using CrossEntropyLoss
Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model architecture for MBERT_uncased:
Available modules: ['', 'bert', 'bert.embeddings', 'bert.embeddings.word_embeddings', 'bert.embeddings.position_embeddings', 'bert.embeddings.token_type_embeddings', 'bert.embeddings.LayerNorm', 'bert.embeddings.dropout', 'bert.encoder', 'bert.encoder.layer', 'bert.encoder.layer.0', 'bert.encoder.layer.0.attention', 'bert.encoder.layer.0.attention.self', 'bert.encoder.layer.0.attention.self.query', 'bert.encoder.layer.0.attention.self.key', 'bert.encoder.layer.0.attention.self.value', 'bert.encoder.layer.0.attention.self.dropout', 'bert.encoder.layer.0.attention.output', 'bert.encoder.layer.0.attention.output.dense', 'bert.encoder.layer.0.attention.output.LayerNorm', 'bert.encoder.layer.0.attention.output.dropout', 'bert.encoder.layer.0.intermediate', 'bert.encoder.layer.0.intermediate.dense', 'bert.encoder.layer.0.intermediate.intermediate_act_fn', 'bert.encoder.layer.0.output', 'bert.encoder.layer.0.output.dense', 'bert.encoder.layer.0.output.La

  super().__init__(*args, **kwargs)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# TODO - open questions for follow-up work:
# - Should any additional parameters be added to the configurations?
# - Should the configurations be modified for better performance?
# - Should more PEFT methods be added?
# - Are any other changes needed?