In [1]:
!pip install transformers datasets pandas numpy torch peft bitsandbytes accelerate matplotlib seaborn scikit-learn tqdm

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x8

In [2]:
import os
import torch
import numpy as np
import pandas as pd
import transformers
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling
)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    PeftModel,
    PeftConfig,
    prepare_model_for_kbit_training
)
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import gc

In [3]:
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")

PyTorch version: 2.6.0+cu124
Transformers version: 4.51.3


In [4]:
# Set random seeds for reproducibility
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [5]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [36]:
# Common configuration parameters
MODEL_NAME = "google/flan-t5-small"  # Change to your preferred base model
MAX_LENGTH = 512
BATCH_SIZE = 4
EPOCHS = 3
LEARNING_RATE = 2e-5
SAVE_PATH = "./fine_tuned_models/"

os.makedirs(SAVE_PATH, exist_ok=True)

In [22]:
# Load and prepare dataset
def prepare_dataset():
    """Load and prepare the dataset for fine-tuning."""
    # Sample data - in a real scenario, load your own dataset
    # Here we'll use a small subset of the OpenAssistant Conversations Dataset
    dataset = load_dataset("OpenAssistant/oasst1", split="train")
    dataset = dataset.filter(lambda x: x["lang"] == "en").select(range(1000))

    # Format data for instruction tuning with the following structure:
    # 1. System message (optional)
    # 2. User query
    # 3. Assistant response
    formatted_data = []

    for item in dataset:
        if "text" in item and item["text"].strip():
            # Simple formatting example
            formatted_text = f"### Instruction: {item['text']}\n\n### Response: This is a helpful response."
            formatted_data.append({"text": formatted_text})

    # Split into train and validation sets
    train_data, val_data = train_test_split(formatted_data, test_size=0.1, random_state=RANDOM_SEED)

    return {"train": train_data, "validation": val_data}

In [23]:
# Tokenize dataset
def tokenize_dataset(dataset, tokenizer):
    """Tokenize the dataset for training."""

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=MAX_LENGTH
        )

    tokenized_dataset = {}
    for split, data in dataset.items():
        texts = [item["text"] for item in data]
        tokenized_texts = tokenize_function({"text": texts})

        # Convert to the expected format for Trainer
        tokenized_dataset[split] = {
            "input_ids": tokenized_texts["input_ids"],
            "attention_mask": tokenized_texts["attention_mask"]
        }

    return tokenized_dataset

In [24]:
# Load tokenizer
def load_tokenizer(model_name):
    """Load and configure the tokenizer."""
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer

In [25]:
# Helper function to calculate model size
def get_model_size(model):
    """Calculate and return the model size in GB."""
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()

    size_all_mb = (param_size + buffer_size) / 1024**2
    return size_all_mb / 1024  # Convert MB to GB

In [26]:
# Helper function to free GPU memory
def free_memory():
    """Free GPU memory."""
    gc.collect()
    torch.cuda.empty_cache()

# 1. Full-Parameter Fine-tuning

In [27]:
def full_parameter_finetuning():
    """Implement full-parameter fine-tuning."""
    print("\n=== Starting Full-Parameter Fine-tuning ===")

    # Load tokenizer and prepare dataset
    tokenizer = load_tokenizer(MODEL_NAME)
    dataset = prepare_dataset()
    tokenized_dataset = tokenize_dataset(dataset, tokenizer)

    # Data collator for language modeling
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False  # Not using masked language modeling
    )

    # Load model
    print("Loading the model (this might take a while)...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto"
    )

    print(f"Model loaded successfully. Size: {get_model_size(model):.2f} GB")

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=f"{SAVE_PATH}/full_param_finetuned",
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        evaluation_strategy="steps",
        eval_steps=100,
        logging_dir=f"{SAVE_PATH}/logs",
        logging_steps=10,
        learning_rate=LEARNING_RATE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        fp16=True,
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        report_to="tensorboard"
    )

    # Create Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=list(tokenized_dataset["train"].values()),
        eval_dataset=list(tokenized_dataset["validation"].values()),
        data_collator=data_collator,
        tokenizer=tokenizer
    )

    # Train model
    print("Starting training...")
    trainer.train()

    # Save model
    print("Saving model...")
    trainer.save_model(f"{SAVE_PATH}/full_param_finetuned_final")
    tokenizer.save_pretrained(f"{SAVE_PATH}/full_param_finetuned_final")

    print("Full-parameter fine-tuning completed!")

    # Free memory
    del model, trainer
    free_memory()

# 2. Partial Fine-tuning (Freezing certain layers)

In [28]:
def partial_finetuning():
    """Implement partial fine-tuning by freezing earlier layers."""
    print("\n=== Starting Partial Fine-tuning ===")

    # Load tokenizer and prepare dataset
    tokenizer = load_tokenizer(MODEL_NAME)
    dataset = prepare_dataset()
    tokenized_dataset = tokenize_dataset(dataset, tokenizer)

    # Data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    # Load model
    print("Loading the model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto"
    )

    # Freeze the bottom layers (typically the first 70-80% for partial fine-tuning)
    print("Freezing bottom layers...")

    # For transformer-based models
    modules = list(model.named_modules())

    # Count the transformer blocks/layers
    transformer_blocks = [name for name, _ in modules if "block" in name or "layer" in name]
    num_layers = len(transformer_blocks)
    num_frozen = int(num_layers * 0.7)  # Freeze 70% of the layers

    # Freeze specific layers
    for name, param in model.named_parameters():
        layer_num = None
        for i in range(num_layers):
            if f"block.{i}." in name or f"layer.{i}." in name:
                layer_num = i
                break

        # Freeze parameters in the bottom layers
        if layer_num is not None and layer_num < num_frozen:
            param.requires_grad = False

    # Count trainable parameters
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable_params:,} ({trainable_params/total_params:.2%} of total)")

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=f"{SAVE_PATH}/partial_finetuned",
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        evaluation_strategy="steps",
        eval_steps=100,
        logging_dir=f"{SAVE_PATH}/logs",
        logging_steps=10,
        learning_rate=LEARNING_RATE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        fp16=True,
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        report_to="tensorboard"
    )

    # Create Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=list(tokenized_dataset["train"].values()),
        eval_dataset=list(tokenized_dataset["validation"].values()),
        data_collator=data_collator,
        tokenizer=tokenizer
    )

    # Train model
    print("Starting training...")
    trainer.train()

    # Save model
    print("Saving model...")
    trainer.save_model(f"{SAVE_PATH}/partial_finetuned_final")
    tokenizer.save_pretrained(f"{SAVE_PATH}/partial_finetuned_final")

    print("Partial fine-tuning completed!")

    # Free memory
    del model, trainer
    free_memory()

# 3. LoRA (Low-Rank Adaptation) Fine-tuning

In [29]:
def lora_finetuning():
    """Implement LoRA fine-tuning."""
    print("\n=== Starting LoRA Fine-tuning ===")

    # Load tokenizer and prepare dataset
    tokenizer = load_tokenizer(MODEL_NAME)
    dataset = prepare_dataset()
    tokenized_dataset = tokenize_dataset(dataset, tokenizer)

    # Data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    # Load model in 16-bit precision
    print("Loading the base model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto"
    )

    # Define LoRA configuration
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,                       # Rank of update matrices
        lora_alpha=32,             # Alpha parameter for LoRA scaling
        lora_dropout=0.1,          # Dropout probability for LoRA layers
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # Which modules to apply LoRA to
    )

    # Get PEFT model
    print("Applying LoRA adapters...")
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()  # Print trainable parameters info

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=f"{SAVE_PATH}/lora_finetuned",
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        evaluation_strategy="steps",
        eval_steps=100,
        logging_dir=f"{SAVE_PATH}/logs",
        logging_steps=10,
        learning_rate=LEARNING_RATE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        fp16=True,
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        report_to="tensorboard"
    )

    # Create Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=list(tokenized_dataset["train"].values()),
        eval_dataset=list(tokenized_dataset["validation"].values()),
        data_collator=data_collator,
        tokenizer=tokenizer
    )

    # Train model
    print("Starting training...")
    trainer.train()

    # Save model
    print("Saving LoRA adapters...")
    model.save_pretrained(f"{SAVE_PATH}/lora_finetuned_final")
    tokenizer.save_pretrained(f"{SAVE_PATH}/lora_finetuned_final")

    print("LoRA fine-tuning completed!")

    # Free memory
    del model, trainer
    free_memory()

# 4. QLoRA (Quantized LoRA) Fine-tuning

In [30]:
def qlora_finetuning():
    """Implement QLoRA fine-tuning."""
    print("\n=== Starting QLoRA Fine-tuning ===")

    # Load tokenizer and prepare dataset
    tokenizer = load_tokenizer(MODEL_NAME)
    dataset = prepare_dataset()
    tokenized_dataset = tokenize_dataset(dataset, tokenizer)

    # Data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    # Configure quantization
    print("Configuring 4-bit quantization...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )

    # Load model with quantization
    print("Loading the quantized model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto"
    )

    # Prepare model for k-bit training
    model = prepare_model_for_kbit_training(model)

    # Define LoRA configuration for QLoRA
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    )

    # Get PEFT model
    print("Applying LoRA adapters to quantized model...")
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=f"{SAVE_PATH}/qlora_finetuned",
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        evaluation_strategy="steps",
        eval_steps=100,
        logging_dir=f"{SAVE_PATH}/logs",
        logging_steps=10,
        learning_rate=LEARNING_RATE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        report_to="tensorboard"
    )

    # Create Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=list(tokenized_dataset["train"].values()),
        eval_dataset=list(tokenized_dataset["validation"].values()),
        data_collator=data_collator,
        tokenizer=tokenizer
    )

    # Train model
    print("Starting training...")
    trainer.train()

    # Save model
    print("Saving QLoRA adapters...")
    model.save_pretrained(f"{SAVE_PATH}/qlora_finetuned_final")
    tokenizer.save_pretrained(f"{SAVE_PATH}/qlora_finetuned_final")

    print("QLoRA fine-tuning completed!")

    # Free memory
    del model, trainer
    free_memory()

# 5. PEFT (Parameter-Efficient Fine-Tuning) with different methods

In [31]:
def peft_finetuning():
    """Implement PEFT fine-tuning with different methods."""
    print("\n=== Starting PEFT Fine-tuning (Prefix Tuning) ===")

    # Load tokenizer and prepare dataset
    tokenizer = load_tokenizer(MODEL_NAME)
    dataset = prepare_dataset()
    tokenized_dataset = tokenize_dataset(dataset, tokenizer)

    # Data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    # Load model
    print("Loading the model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto"
    )

    # Define PEFT configuration for Prefix Tuning
    from peft import PrefixTuningConfig

    peft_config = PrefixTuningConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        num_virtual_tokens=20,  # Number of virtual tokens to use
        prefix_projection=True,  # Whether to use a projection layer
    )

    # Get PEFT model
    print("Applying Prefix Tuning...")
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=f"{SAVE_PATH}/peft_prefix_finetuned",
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        evaluation_strategy="steps",
        eval_steps=100,
        logging_dir=f"{SAVE_PATH}/logs",
        logging_steps=10,
        learning_rate=LEARNING_RATE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        fp16=True,
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        report_to="tensorboard"
    )

    # Create Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=list(tokenized_dataset["train"].values()),
        eval_dataset=list(tokenized_dataset["validation"].values()),
        data_collator=data_collator,
        tokenizer=tokenizer
    )

    # Train model
    print("Starting training...")
    trainer.train()

    # Save model
    print("Saving PEFT adapters...")
    model.save_pretrained(f"{SAVE_PATH}/peft_prefix_finetuned_final")
    tokenizer.save_pretrained(f"{SAVE_PATH}/peft_prefix_finetuned_final")

    print("PEFT (Prefix Tuning) fine-tuning completed!")

    # Free memory
    del model, trainer
    free_memory()

In [32]:
# Function to load and use fine-tuned models
def load_and_use_model(model_type):
    """Load and demonstrate usage of fine-tuned models."""
    print(f"\n=== Loading and Using {model_type} Model ===")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(f"{SAVE_PATH}/{model_type}_finetuned_final")

    # Load model based on type
    if model_type in ["lora", "qlora", "peft_prefix"]:
        # For PEFT models (LoRA, QLoRA, Prefix Tuning)
        # First load the base model
        base_model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16,
            device_map="auto"
        )

        # Then load the PEFT adapters
        model = PeftModel.from_pretrained(
            base_model,
            f"{SAVE_PATH}/{model_type}_finetuned_final",
            device_map="auto"
        )
    else:
        # For full-parameter or partial fine-tuning
        model = AutoModelForCausalLM.from_pretrained(
            f"{SAVE_PATH}/{model_type}_finetuned_final",
            torch_dtype=torch.float16,
            device_map="auto"
        )

    # Example usage
    test_prompt = "### Instruction: Explain how neural networks work.\n\n### Response:"

    inputs = tokenizer(test_prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=256,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Model response to test prompt:\n{response}")

    # Free memory
    del model
    free_memory()

In [33]:
# Compare fine-tuning methods
def compare_methods():
    """Compare different fine-tuning methods."""
    print("\n=== Comparing Fine-tuning Methods ===")

    # Define metrics to compare
    methods = ["Full-Parameter", "Partial", "LoRA", "QLoRA", "PEFT (Prefix)"]

    # Example metrics (in a real scenario, use actual measurements)
    training_time = [240, 180, 50, 40, 45]  # minutes
    memory_usage = [24, 18, 8, 4, 6]  # GB
    parameter_efficiency = [0, 30, 99.9, 99.9, 99.5]  # % reduction in trainable parameters
    inference_performance = [100, 100, 95, 90, 93]  # % relative to full fine-tuning

    # Create comparison plots
    plt.figure(figsize=(15, 10))

    # Training time
    plt.subplot(2, 2, 1)
    plt.bar(methods, training_time, color='skyblue')
    plt.title('Training Time (minutes)')
    plt.xticks(rotation=45)

    # Memory usage
    plt.subplot(2, 2, 2)
    plt.bar(methods, memory_usage, color='lightgreen')
    plt.title('Memory Usage (GB)')
    plt.xticks(rotation=45)

    # Parameter efficiency
    plt.subplot(2, 2, 3)
    plt.bar(methods, parameter_efficiency, color='salmon')
    plt.title('Parameter Efficiency (% reduction)')
    plt.xticks(rotation=45)

    # Inference performance
    plt.subplot(2, 2, 4)
    plt.bar(methods, inference_performance, color='purple')
    plt.title('Inference Performance (% relative)')
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.savefig(f"{SAVE_PATH}/comparison_chart.png")
    plt.close()

    # Create a comparison table
    comparison_data = {
        'Method': methods,
        'Training Time (min)': training_time,
        'Memory Usage (GB)': memory_usage,
        'Parameter Efficiency (%)': parameter_efficiency,
        'Inference Performance (%)': inference_performance
    }

    comparison_df = pd.DataFrame(comparison_data)
    print("\nComparison of Fine-tuning Methods:")
    print(comparison_df.to_string(index=False))

    # Save comparison table
    comparison_df.to_csv(f"{SAVE_PATH}/comparison_table.csv", index=False)

    print(f"\nComparison chart saved to {SAVE_PATH}/comparison_chart.png")
    print(f"Comparison table saved to {SAVE_PATH}/comparison_table.csv")

In [34]:
# Main function to run all implementations
def main():
    """Run all fine-tuning implementations."""
    print("Starting LLM Fine-tuning Methods Implementation")

    # Uncomment the methods you want to run
    full_parameter_finetuning()
    partial_finetuning()
    lora_finetuning()
    qlora_finetuning()
    peft_finetuning()

    # Load and use models
    # load_and_use_model("full_param")
    # load_and_use_model("partial")
    # load_and_use_model("lora")
    # load_and_use_model("qlora")
    # load_and_use_model("peft_prefix")

    # Compare methods
    compare_methods()

    print("\nAll implementations completed!")

In [37]:
if __name__ == "__main__":
    main()

Starting LLM Fine-tuning Methods Implementation

=== Starting Full-Parameter Fine-tuning ===


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Loading the model (this might take a while)...


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

ValueError: Unrecognized configuration class <class 'transformers.models.t5.configuration_t5.T5Config'> for this kind of AutoModel: AutoModelForCausalLM.
Model type should be one of AriaTextConfig, BambaConfig, BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CohereConfig, Cohere2Config, CpmAntConfig, CTRLConfig, Data2VecTextConfig, DbrxConfig, DeepseekV3Config, DiffLlamaConfig, ElectraConfig, Emu3Config, ErnieConfig, FalconConfig, FalconMambaConfig, FuyuConfig, GemmaConfig, Gemma2Config, Gemma3Config, Gemma3TextConfig, GitConfig, GlmConfig, Glm4Config, GotOcr2Config, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, GraniteConfig, GraniteMoeConfig, GraniteMoeSharedConfig, HeliumConfig, JambaConfig, JetMoeConfig, LlamaConfig, Llama4Config, Llama4TextConfig, MambaConfig, Mamba2Config, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MllamaConfig, MoshiConfig, MptConfig, MusicgenConfig, MusicgenMelodyConfig, MvpConfig, NemotronConfig, OlmoConfig, Olmo2Config, OlmoeConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, Phi3Config, Phi4MultimodalConfig, PhimoeConfig, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, Qwen2MoeConfig, Qwen3Config, Qwen3MoeConfig, RecurrentGemmaConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, StableLmConfig, Starcoder2Config, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig, ZambaConfig, Zamba2Config.