In [None]:
!pip install git+https://github.com/YOUR_USERNAME/mpdistil.git -q

## Method 1: From Lists of Texts

In [None]:
from mpdistil import MPDistil, create_simple_dataloader

# Toy sentiment-analysis corpus: label 1 marks positive text, 0 negative.
train_texts = [
    "I love this movie!",
    "This is terrible.",
    "Great film, highly recommended.",
    "Waste of time.",
    # Add more examples...
]
train_labels = [1, 0, 1, 0]

# Tokenizer and batching options shared by the train and validation loaders.
loader_kwargs = dict(
    tokenizer_name='bert-base-uncased',
    max_length=128,
    batch_size=8,
)

# Training loader (shuffling enabled).
train_loader = create_simple_dataloader(
    texts=train_texts, labels=train_labels, shuffle=True, **loader_kwargs
)

# Validation data is built the same way, with shuffling disabled.
val_texts = ["Amazing!", "Not good."]
val_labels = [1, 0]
val_loader = create_simple_dataloader(
    texts=val_texts, labels=val_labels, shuffle=False, **loader_kwargs
)

## Method 2: From Sentence Pairs

In [None]:
# Sentence-pair tasks (NLI, QA, paraphrase detection): every example is a
# (first_sentence, second_sentence) tuple kept next to its label.
ENTAILMENT, CONTRADICTION = 1, 0
labelled_pairs = [
    (("The cat sat on the mat.", "A cat is on a mat."), ENTAILMENT),
    (("It's raining heavily.", "The sun is shining."), CONTRADICTION),
    # Add more pairs...
]

# Split the combined records back into the parallel lists the loader expects.
train_pairs = [pair for pair, _ in labelled_pairs]
train_labels = [label for _, label in labelled_pairs]

train_loader = create_simple_dataloader(
    texts=train_pairs,
    labels=train_labels,
    batch_size=8,
    max_length=128,
    tokenizer_name='bert-base-uncased',
)

## Method 3: From Pandas DataFrame

In [None]:
import pandas as pd

# Small in-memory example frame. For real data, load it from disk instead,
# e.g. df = pd.read_csv('your_file.csv') or df = pd.read_excel('your_file.xlsx').
df = pd.DataFrame({
    'text': ['Great product!', 'Disappointed', 'Excellent!'],
    'label': [1, 0, 1]
})

# Split into train/val by row position: the first two rows train, the rest
# validate. .iloc states the positional intent explicitly, so the split does
# not depend on what the index labels happen to be.
train_df = df.iloc[:2]
val_df = df.iloc[2:]

# Create loaders (options not passed here use create_simple_dataloader's defaults)
train_loader = create_simple_dataloader(
    texts=train_df['text'].tolist(),
    labels=train_df['label'].tolist(),
    tokenizer_name='bert-base-uncased'
)

val_loader = create_simple_dataloader(
    texts=val_df['text'].tolist(),
    labels=val_df['label'].tolist(),
    tokenizer_name='bert-base-uncased'
)

## Method 4: Custom PyTorch Dataset (Advanced)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

class CustomDataset(Dataset):
    """Dataset that tokenizes one text per item on access.

    Every item is the 4-tuple
    ``(input_ids, attention_mask, token_type_ids, label)``, in that order.

    Args:
        texts: Indexable, sized collection of inputs handed to ``tokenizer``.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable accepting ``(text, padding=..., truncation=...,
            max_length=..., return_tensors='pt')`` and returning a mapping
            with at least ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus the label."""
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # squeeze(0) removes only the leading batch dimension added by
        # return_tensors='pt'. A bare squeeze() would also collapse the
        # sequence dimension when max_length == 1.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        # Some tokenizers do not emit segment ids; fall back to all zeros so
        # the 4-tuple layout stays the same.
        if 'token_type_ids' in encoding:
            token_type_ids = encoding['token_type_ids'].squeeze(0)
        else:
            token_type_ids = torch.zeros_like(input_ids)

        # IMPORTANT: Return in this exact order
        return (
            input_ids,
            attention_mask,
            token_type_ids,
            torch.tensor(self.labels[idx])
        )

# Two placeholder examples and their labels
example_texts = ["Example 1", "Example 2"]
example_labels = [0, 1]

# Tokenizer, then the dataset that wraps it
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
dataset = CustomDataset(example_texts, example_labels, tokenizer)

# Wrap the dataset in a shuffling DataLoader
train_loader = DataLoader(dataset, shuffle=True, batch_size=8)

## Train Model with Custom Data

In [None]:
# Build the MPDistil model (num_labels=2 -> binary classification)
model = MPDistil(task_name='MyCustomTask', num_labels=2, student_layers=6)

# Epoch budget passed to fit(): 5 teacher epochs and 5 student epochs
epoch_config = dict(teacher_epochs=5, student_epochs=5)

# Run training on the loaders prepared in the cells above
history = model.fit(
    train_loader=train_loader,
    val_loader=val_loader,
    **epoch_config
)

# Persist the student model to disk
model.save_student('./my_custom_student')

## Tips for Custom Data

1. **Batch Format**: Each batch yielded by your DataLoader must be a tuple in this exact order:
   - `(input_ids, attention_mask, token_type_ids, labels)`

2. **Tokenization**: Use the same tokenizer for the train, validation, and test sets

3. **Max Length**: Choose `max_length` based on the length distribution of your texts

4. **Label Encoding**: Labels should be integers (0, 1, 2, ...)

5. **Data Split**: MPDistil auto-creates a held-out set from training data