### Step 1: Combine all the parquet files

In [None]:
import os

import pandas as pd
import torch
import torch.nn as nn
from transformers import XLMRobertaForMaskedLM, AutoTokenizer, XLMRobertaTokenizerFast

In [2]:
# Location of the training parquet data. The default is the original local
# checkout; set the SLM_DATA_PATH environment variable to run the notebook on
# another machine without editing this cell.
DATA_PATH = os.environ.get(
    "SLM_DATA_PATH",
    "/home/krrish/Desktop/Programming/slm-distill/dataset/hin/train",
)
# A single parquet shard inside DATA_PATH (presumably for quick experiments).
TEST_DATA = os.path.join(DATA_PATH, "data-0.parquet")
# Checkpoint identifier used for both the teacher model and its tokenizer.
TEACHER_PATH = "xlm-roberta-base"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
class TeacherModel(nn.Module):
    """Frozen XLM-RoBERTa masked-LM wrapper used as the distillation teacher.

    The checkpoint is loaded once, moved to ``device``, every parameter has
    ``requires_grad`` switched off, and the module is pinned to evaluation
    mode so its outputs stay deterministic.

    Args:
        model_path: Identifier or path handed to
            ``XLMRobertaForMaskedLM.from_pretrained``.
        device: Device the wrapped model is moved to; ``forward`` moves its
            inputs to the same device.
    """

    def __init__(
        self,
        model_path: str,
        device: torch.device = torch.device("cpu"),
    ):
        super().__init__()

        self.device = device
        self.model = XLMRobertaForMaskedLM.from_pretrained(model_path).to(device)

        # The teacher is never optimised: freeze every weight.
        for param in self.model.parameters():
            param.requires_grad = False

        self.model.eval()

    def train(self, mode: bool = True):
        """Keep the teacher in evaluation mode regardless of ``mode``.

        ``nn.Module.train`` recurses into child modules, so a parent module's
        ``.train()`` (or a stray ``teacher.train()``) would otherwise switch
        dropout back on inside the frozen teacher.

        Returns:
            ``self``, matching the ``nn.Module.train`` contract.
        """
        return super().train(False)

    def forward(self, input_ids, attention_mask=None, return_logits=True):
        """Run the teacher without tracking gradients.

        Args:
            input_ids: Token id tensor; moved to ``self.device``.
            attention_mask: Optional mask tensor; moved to ``self.device``
                when given.
            return_logits: When truthy return only ``outputs.logits``;
                otherwise return the full model output (with hidden states).

        Returns:
            The logits tensor, or the complete output object of the wrapped
            model when ``return_logits`` is falsy.
        """
        if attention_mask is not None:
            attention_mask = attention_mask.to(self.device)

        with torch.no_grad():
            outputs = self.model(
                input_ids=input_ids.to(self.device),
                attention_mask=attention_mask,
                # Hidden states are only reachable through the full output
                # object, so do not collect them when just logits are wanted.
                output_hidden_states=not return_logits,
            )

        if return_logits:
            return outputs.logits  # [batch_size, seq_len, vocab_size]
        return outputs

    def get_num_parameters(self):
        """Return the total number of parameter elements."""
        return sum(p.numel() for p in self.parameters())

    def get_num_trainable_parameters(self):
        """Return the number of parameter elements with ``requires_grad`` set."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

In [4]:
import torch
import torch.nn as nn
from transformers import XLMRobertaForMaskedLM, XLMRobertaConfig


class StudentModel(nn.Module):
    """Randomly initialised XLM-RoBERTa masked-LM built from an explicit config.

    All architecture arguments are forwarded unchanged to ``XLMRobertaConfig``;
    the resulting ``XLMRobertaForMaskedLM`` is moved to ``device``.

    Args:
        vocab_size: Vocabulary size of the embedding / output layer.
        hidden_size: Hidden width; must be divisible by ``num_attention_heads``.
        num_hidden_layers: Number of encoder layers.
        num_attention_heads: Attention heads per layer.
        intermediate_size: Width of the feed-forward layer.
        max_position_embeddings: Size of the position embedding table.
        hidden_dropout_prob: Dropout probability for hidden states.
        attention_probs_dropout_prob: Dropout probability for attention weights.
        pad_token_id: Padding token id stored in the config.
        device: Device the model is created on; ``forward`` moves inputs there.
        use_gradient_checkpointing: Enable gradient checkpointing on the model.

    NOTE(review): RoBERTa-style models offset position ids by
    ``pad_token_id + 1`` (xlm-roberta-base ships 514 positions for 512 tokens),
    so with the defaults the longest safe input is presumably 510 tokens.
    Confirm before training with sequences close to 512.
    """

    def __init__(
        self,
        vocab_size: int = 250002,
        hidden_size: int = 256,
        num_hidden_layers: int = 6,
        num_attention_heads: int = 8,
        intermediate_size: int = 1024,
        max_position_embeddings: int = 512,
        hidden_dropout_prob: float = 0.1,
        attention_probs_dropout_prob: float = 0.1,
        pad_token_id: int = 1,
        device: torch.device = torch.device("cpu"),
        use_gradient_checkpointing: bool = False,
    ):
        super().__init__()

        # Validate
        assert hidden_size % num_attention_heads == 0, \
            f"hidden_size ({hidden_size}) must be divisible by num_attention_heads ({num_attention_heads})"

        self.device = device

        # XLM-RoBERTa Config
        config = XLMRobertaConfig(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            intermediate_size=intermediate_size,
            max_position_embeddings=max_position_embeddings,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            pad_token_id=pad_token_id,
        )

        self.config = config
        self.model = XLMRobertaForMaskedLM(config).to(self.device)

        if use_gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

    def forward(
        self,
        input_ids,
        attention_mask=None,
        labels=None,
        return_logits=True
    ):
        """Run the student model.

        Args:
            input_ids: Token id tensor; moved to ``self.device``.
            attention_mask: Optional mask tensor; moved to ``self.device``.
            labels: Optional label tensor passed to the wrapped model; moved
                to ``self.device``.
            return_logits: When truthy return only ``outputs.logits``;
                otherwise return the full model output.

        Returns:
            The logits tensor, or the complete output object of the wrapped
            model when ``return_logits`` is falsy.
        """
        input_ids = input_ids.to(self.device)

        if attention_mask is not None:
            attention_mask = attention_mask.to(self.device)

        if labels is not None:
            labels = labels.to(self.device)

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            output_hidden_states=False,
        )

        if return_logits:
            return outputs.logits  # [batch_size, seq_len, vocab_size]
        return outputs

    def get_num_parameters(self):
        """Return the total number of parameter elements."""
        return sum(p.numel() for p in self.parameters())

    def get_num_trainable_parameters(self):
        """Return the number of parameter elements with ``requires_grad`` set.

        Named like ``TeacherModel.get_num_trainable_parameters`` so both
        models can be inspected through the same method name.
        """
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

    def get_trainable_parameters(self):
        """Backward-compatible alias of ``get_num_trainable_parameters``.

        Despite the name this returns a count, not the parameters themselves.
        """
        return self.get_num_trainable_parameters()

    def get_config(self):
        """Return the ``XLMRobertaConfig`` the model was built from."""
        return self.config
    


In [5]:
"""Simple dataset class for parquet data"""

import os
import pandas as pd
from datasets import Dataset, load_from_disk
from torch.utils.data import Dataset as TorchDataset


class NativeSLMData(TorchDataset):
    """Tokenized train/test split of a parquet text corpus.

    The parquet data is read with pandas, split with
    ``Dataset.train_test_split`` and its ``text`` column is tokenized
    (truncated and padded to ``max_length``). The result is exposed in
    ``torch`` format through ``__len__`` / ``__getitem__``.

    Args:
        data_path: Path passed to ``pd.read_parquet``.
        tokenizer: Callable tokenizer applied to batches of ``text``.
        max_length: Truncation / padding length.
        split: ``"train"`` selects the train portion; any other value selects
            the test portion.
        train_split: Fraction of rows assigned to the train portion.
        seed: Seed for the train/test split.
        cache_dir: When truthy, the tokenized split is loaded from (or saved
            to) a sub-directory of this directory.

    Note:
        The cache key covers the source name, split, ``max_length``,
        ``train_split`` and ``seed``. The tokenizer is NOT part of the key, so
        use a separate ``cache_dir`` per tokenizer. Caches written under the
        earlier, shorter key are not reused and are rebuilt once.
    """

    def __init__(
        self,
        data_path: str,
        tokenizer,
        max_length: int,
        split: str = "train",
        train_split: float = 0.95,
        seed: int = 42,
        cache_dir: str = None
    ):
        cache_path = None

        if cache_dir:
            cache_path = self._cache_path(
                cache_dir, data_path, split, max_length, train_split, seed
            )

            if os.path.exists(cache_path):
                self.dataset = load_from_disk(cache_path).with_format("torch")
                return

        df = pd.read_parquet(data_path)
        dataset = Dataset.from_pandas(df)

        split_dataset = dataset.train_test_split(train_size=train_split, seed=seed)
        dataset = split_dataset['train'] if split == 'train' else split_dataset['test']

        self.dataset = dataset.map(
            lambda x: tokenizer(
                x["text"],
                truncation=True,
                max_length=max_length,
                padding="max_length",
                return_special_tokens_mask=True,
            ),
            batched=True,
            batch_size=1024,
            # Keep only the tokenizer outputs; drop every original column.
            remove_columns=dataset.column_names,
        ).with_format("torch")

        # Save to cache
        if cache_path:
            os.makedirs(cache_dir, exist_ok=True)
            print(f"Saving to cache: {cache_path}")
            self.dataset.save_to_disk(cache_path)

    @staticmethod
    def _cache_path(cache_dir, data_path, split, max_length, train_split, seed) -> str:
        """Build the cache directory for one (source, split, settings) combination."""
        # normpath drops a trailing separator, which would otherwise make
        # basename return "" and collapse different sources onto one key.
        source_name = os.path.basename(os.path.normpath(data_path))
        # train_split and seed decide which rows land in the split, so they
        # must be part of the key to avoid serving a stale split.
        return os.path.join(
            cache_dir,
            f"{source_name}_{split}_ml{max_length}_ts{train_split}_seed{seed}",
        )

    def __len__(self):
        """Return the number of examples in the selected split."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the tokenized example(s) at ``idx`` from the underlying dataset."""
        return self.dataset[idx]

In [6]:
# Frozen teacher wrapper, placed on the device selected in the config cell.
teacher = TeacherModel(
    model_path=TEACHER_PATH,
    device=device,
)
# Tokenizer loaded from the same checkpoint identifier as the teacher.
tokenizer = AutoTokenizer.from_pretrained(
    TEACHER_PATH,
    use_fast=True,
)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# (total, trainable) parameter counts of the teacher; the trainable count is
# expected to be 0 because TeacherModel sets requires_grad = False on every
# parameter of the wrapped model.
teacher.get_num_parameters(), teacher.get_num_trainable_parameters()

(278295186, 0)

Okay, this confirms that the teacher doesn't have any trainable parameters. Check passed ✅

In [8]:
# NOTE(review): pad_token_type_id is the token-*type* id assigned to padding
# positions, not the id of the padding token itself. If the intent is to check
# StudentModel's pad_token_id default, tokenizer.pad_token_id is presumably the
# attribute to inspect — confirm.
tokenizer.pad_token_type_id

0

In [9]:
# Build the student on the same device as the teacher: both models move their
# inputs to their own device, so a CPU student next to a GPU teacher would
# yield logits on different devices. Vocabulary size and pad id are taken from
# the teacher / tokenizer so the two models' logits stay shape-compatible.
student = StudentModel(
    vocab_size=teacher.model.config.vocab_size,
    pad_token_id=tokenizer.pad_token_id,
    device=device,
)

In [None]:
# (total, trainable) parameter counts of the student; StudentModel does not
# freeze anything, so both numbers are expected to be equal.
student.get_num_parameters(), student.get_trainable_parameters()

(69187474, 69187474)

: 

In [None]:
# Tokenized train portion of the parquet data at DATA_PATH; every example is
# truncated / padded to 128 tokens by NativeSLMData.
data = NativeSLMData(data_path=DATA_PATH, tokenizer=tokenizer, max_length=128, split="train")