In [None]:
!pip install git+https://github.com/yashpatel2010/mpdistil.git -q

  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
  Building wheel for mpdistil (pyproject.toml) ... done


## Method 1: From Lists of Texts

In [None]:
from mpdistil import MPDistil, create_simple_dataloader

# Example: sentiment analysis data built from plain Python lists.

# Settings shared by the train and validation loaders. They are defined once
# so the two loaders cannot drift apart (e.g. different tokenizers).
TOKENIZER_NAME = 'bert-base-uncased'
MAX_LENGTH = 128
BATCH_SIZE = 8

train_texts = [
    "I love this movie!",
    "This is terrible.",
    "Great film, highly recommended.",
    "Waste of time.",
]

train_labels = [1, 0, 1, 0]  # 1=positive, 0=negative

# One label per text; fail early if the two lists get out of sync.
assert len(train_texts) == len(train_labels), "train texts/labels length mismatch"

# Create the training DataLoader (shuffled)
train_loader = create_simple_dataloader(
    texts=train_texts,
    labels=train_labels,
    tokenizer_name=TOKENIZER_NAME,
    max_length=MAX_LENGTH,
    batch_size=BATCH_SIZE,
    shuffle=True
)

# Similarly for validation (not shuffled)
val_texts = ["Amazing!", "Not good."]
val_labels = [1, 0]

assert len(val_texts) == len(val_labels), "val texts/labels length mismatch"

val_loader = create_simple_dataloader(
    texts=val_texts,
    labels=val_labels,
    tokenizer_name=TOKENIZER_NAME,
    max_length=MAX_LENGTH,
    batch_size=BATCH_SIZE,
    shuffle=False
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Method 2: From Sentence Pairs

In [None]:
# Sentence-pair tasks (NLI, QA, paraphrase detection): each example is a
# (first_sentence, second_sentence) tuple with a single label.
train_pairs = list(zip(
    ["The cat sat on the mat.", "It's raining heavily."],  # first sentences
    ["A cat is on a mat.", "The sun is shining."],         # second sentences
))

# One label per pair: 1=entailment, 0=contradiction
train_labels = [1, 0]

# The pairs are passed through the same `texts` argument as single sentences.
train_loader = create_simple_dataloader(
    batch_size=8,
    max_length=128,
    tokenizer_name='bert-base-uncased',
    labels=train_labels,
    texts=train_pairs,
)

## Method 3: From Pandas DataFrame

In [None]:
import pandas as pd

# Stand-in for a table you would normally load from disk
# (e.g. with pd.read_csv or pd.read_excel).
df = pd.DataFrame(
    [
        ('Great product!', 1),
        ('Disappointed', 0),
        ('Excellent!', 1),
    ],
    columns=['text', 'label'],
)

# Positional split: the first two rows train, the remaining rows validate.
train_df = df.iloc[:2]
val_df = df.iloc[2:]

# Build one loader per split from plain Python lists of texts and labels.
train_loader = create_simple_dataloader(
    tokenizer_name='bert-base-uncased',
    texts=train_df['text'].to_list(),
    labels=train_df['label'].to_list(),
)

val_loader = create_simple_dataloader(
    tokenizer_name='bert-base-uncased',
    texts=val_df['text'].to_list(),
    labels=val_df['label'].to_list(),
)

## Method 4: Custom PyTorch Dataset (Advanced)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Every item is the 4-tuple
    ``(input_ids, attention_mask, token_type_ids, label)`` -- in exactly
    that order.

    Args:
        texts: Sized, indexable collection of inputs handed to the tokenizer.
        labels: Sized, indexable collection with one label per text.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=...,
            return_tensors='pt')`` that returns a mapping containing at least
            ``'input_ids'`` and ``'attention_mask'`` tensors with a leading
            batch axis of size 1.
        max_length: Length every sequence is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Fail at construction time instead of with an IndexError (or with
        # silently ignored labels) somewhere in the middle of training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors and label."""
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Remove only the leading batch axis. A bare squeeze() would also
        # collapse the sequence axis whenever max_length == 1.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        # Tokenizers for models without segment embeddings (e.g. DistilBERT)
        # do not return token_type_ids; use all zeros (single segment) so the
        # 4-tuple layout stays intact.
        if 'token_type_ids' in encoding:
            token_type_ids = encoding['token_type_ids'].squeeze(0)
        else:
            token_type_ids = torch.zeros_like(input_ids)

        # Return in this exact order
        return (
            input_ids,
            attention_mask,
            token_type_ids,
            torch.tensor(self.labels[idx])
        )

# Tokenizer that CustomDataset applies to each text when an item is fetched.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Two toy examples; max_length is left at the class default.
dataset = CustomDataset(["Example 1", "Example 2"], [0, 1], tokenizer)

# Wrap the dataset in a shuffled, batched loader.
train_loader = DataLoader(dataset=dataset, shuffle=True, batch_size=8)

## Train Model with Custom Data

In [None]:
# Import everything this cell uses so it also runs on its own, without
# relying on MPDistil having been imported by an earlier cell.
from mpdistil import MPDistil, create_simple_dataloader

# Training data
texts = [
    "This is great!",
    "This is terrible!",
    "I love this product",
    "Worst experience ever",
    "Highly recommended"
]

labels = [1, 0, 1, 0, 1]  # Must match length of texts

# Validation data
val_texts = [
    "Pretty good",
    "Not bad"
]

val_labels = [1, 0]

# Create dataloaders
train_loader = create_simple_dataloader(
    texts=texts,
    labels=labels,
    batch_size=2,
    max_length=128
)

val_loader = create_simple_dataloader(
    texts=val_texts,
    labels=val_labels,
    batch_size=2,
    max_length=128
)

# Diagnostic checks: number of batches vs. number of examples per loader
print(f"Train loader length: {len(train_loader)}")
print(f"Train dataset length: {len(train_loader.dataset)}")
print(f"Val loader length: {len(val_loader)}")
print(f"Val dataset length: {len(val_loader.dataset)}")

# Train model: fit() receives both loaders and the epoch counts for the
# teacher and student training phases.
model = MPDistil(task_name='MyCustomTask', num_labels=2, student_layers=6)
history = model.fit(
    train_loader=train_loader,
    val_loader=val_loader,
    teacher_epochs=2,
    student_epochs=2
)

# Save the distilled student model to disk
model.save_student('./my_custom_student')

Train loader length: 3
Train dataset length: 5
Val loader length: 1
Val dataset length: 2
Using device: cuda

Validating DataLoaders...

Preparing task loaders...
Tasks: ['MyCustomTask']
Label counts: {'mycustomtask': 2}

Initializing models...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


Model Sizes:
  Teacher: 109,484,547 parameters
  Student: 66,957,315 parameters (61.2% of teacher)
  Action:  769 parameters

Starting MPDistil Training

=== Phase 1: Teacher Fine-tuning ===


Phase 1 Epoch 1/2: 100%|██████████| 2/2 [00:01<00:00,  1.16it/s, loss=0.721]


Epoch 1: Train Loss=0.6746, Val Metrics={'acc': 1.0, 'val_loss': 0.6623160243034363, 'task': 'MyCustomTask'}


Phase 1 Epoch 2/2: 100%|██████████| 2/2 [00:00<00:00,  8.41it/s, loss=0.457]


Epoch 2: Train Loss=0.4930, Val Metrics={'acc': 0.5, 'val_loss': 0.6661292314529419, 'task': 'MyCustomTask'}

=== Phase 2: Student Knowledge Distillation ===


  pkd_loss = F.mse_loss(s_features, t_features, reduction="mean")
Phase 2 Epoch 1/2: 100%|██████████| 2/2 [00:00<00:00,  7.04it/s, total_loss=0.64, task_loss=0.752]


Epoch 1: Train Loss=0.6320, Task Loss=0.7366, Val Metrics={'acc': 0.5, 'val_loss': 0.6909781694412231, 'task': 'MyCustomTask'}


  pkd_loss = F.mse_loss(s_features, t_features, reduction="mean")
Phase 2 Epoch 2/2: 100%|██████████| 2/2 [00:00<00:00, 11.49it/s, total_loss=0.577, task_loss=0.637]


Epoch 2: Train Loss=0.5882, Task Loss=0.6577, Val Metrics={'acc': 0.5, 'val_loss': 0.6620499491691589, 'task': 'MyCustomTask'}

=== Phase 3: Meta-Teacher Learning ===


Phase 3: Meta-Teacher: 100%|██████████| 1/1 [00:00<00:00, 26.97it/s, meta_loss=0.758]

Phase 3: Meta Loss=0.7581, Val Metrics={'acc': 0.5, 'val_loss': 0.7517795562744141, 'task': 'MyCustomTask'}






Training Complete!

Saving student model to ./my_custom_student...
Model saved successfully!


## Tips for Custom Data

1. **Batch Format**: Ensure your DataLoader returns tuples in this order:
   - `(input_ids, attention_mask, token_type_ids, labels)`

2. **Tokenization**: Use the same tokenizer for train/val/test

3. **Max Length**: Choose based on your text length distribution

4. **Label Encoding**: Labels should be integers (0, 1, 2, ...)

5. **Data Split**: MPDistil auto-creates a held-out set from training data