## Пресет дообучения LLM на задачу классификации

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [3]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Iterable of raw input strings.
        labels: Iterable of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``.
            It is expected to return a mapping of tensors whose leading
            (batch) axis has size 1; that axis is squeezed away per item.
        max_length: Length every example is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialise as lists so that __getitem__ is strictly positional.
        # Label-indexed containers (e.g. a pandas Series whose index is no
        # longer 0..n-1 after a train/test split) would otherwise raise
        # KeyError or return the wrong row for a given position.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus ``labels``."""
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop the batch axis added by return_tensors='pt' so that the
        # DataLoader's default collation can stack items into a batch.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(label, dtype=torch.long)

        return item

In [4]:
class Trainer:
    """Fine-tune, evaluate, persist and run a sequence-classification model.

    The tokenizer and model are loaded with ``AutoTokenizer`` and
    ``AutoModelForSequenceClassification``. Optimisation uses ``AdamW`` with
    the ``"linear"`` scheduler and no warm-up steps.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        train_dataset: Dataset to train on. ``None`` (or a dataset that is
            falsy, e.g. one whose ``__len__`` returns 0) disables training.
        val_dataset: Dataset to evaluate on after each epoch; same
            ``None``/falsy handling as ``train_dataset``.
        epochs: Number of passes over the training data.
        batch_size: Batch size for both data loaders.
        lr: Learning rate for ``AdamW``.
        device: Device to run on; defaults to ``'cuda'`` when available,
            otherwise ``'cpu'``.
        num_labels: Number of target classes. When ``None`` the argument is
            not forwarded and the library/checkpoint default applies.
    """

    def __init__(self, model_name, train_dataset=None, val_dataset=None,
                 epochs=3, batch_size=16, lr=5e-5, device=None, num_labels=None):

        self.device = device if device else ('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Forward num_labels only when the caller set it, so the default
        # construction path is exactly what it was before the parameter existed.
        model_kwargs = {} if num_labels is None else {'num_labels': num_labels}
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, **model_kwargs)
        self.model.to(self.device)

        if train_dataset:
            self.train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        else:
            self.train_loader = None

        if val_dataset:
            self.val_loader = DataLoader(val_dataset, batch_size=batch_size)
        else:
            self.val_loader = None

        self.optimizer = AdamW(self.model.parameters(), lr=lr)
        self.epochs = epochs

        if self.train_loader:
            # The scheduler is stepped once per batch, so it needs the total
            # number of optimizer steps across all epochs.
            num_training_steps = epochs * len(self.train_loader)
            self.lr_scheduler = get_scheduler(
                "linear",
                optimizer=self.optimizer,
                num_warmup_steps=0,
                num_training_steps=num_training_steps
            )
        else:
            self.lr_scheduler = None

    def train_epoch(self):
        """Run one pass over the training data and return the mean batch loss.

        Raises:
            RuntimeError: If the trainer has no training loader, optimizer or
                scheduler (for example when it was built by ``load_model``).
        """
        if self.train_loader is None or self.optimizer is None or self.lr_scheduler is None:
            # Fail with an actionable message instead of an opaque error from
            # iterating over None.
            raise RuntimeError(
                "train_epoch() needs training data, an optimizer and a scheduler; "
                "construct Trainer with a non-empty train_dataset."
            )

        self.model.train()
        losses = []
        for batch in tqdm(self.train_loader):
            batch = {k: v.to(self.device) for k, v in batch.items()}
            outputs = self.model(**batch)
            loss = outputs.loss

            loss.backward()
            self.optimizer.step()
            self.lr_scheduler.step()
            self.optimizer.zero_grad()

            losses.append(loss.item())
        return sum(losses) / len(losses)

    def evaluate(self):
        """Return accuracy on the validation loader, or ``None`` without one."""
        if not self.val_loader:
            return None
        self.model.eval()
        total, correct = 0, 0
        with torch.no_grad():
            for batch in self.val_loader:
                batch = {k: v.to(self.device) for k, v in batch.items()}
                outputs = self.model(**batch)
                logits = outputs.logits
                preds = torch.argmax(logits, dim=-1)
                total += batch['labels'].size(0)
                correct += (preds == batch['labels']).sum().item()
        return correct / total

    def train(self):
        """Train for ``self.epochs`` epochs, printing loss and accuracy each epoch.

        Prints a message and returns immediately when no training data is set.
        """
        if not self.train_loader:
            print("No training data provided!")
            return

        for epoch in range(1, self.epochs + 1):
            train_loss = self.train_epoch()
            val_acc = self.evaluate()
            print(f"Epoch {epoch} | Train loss: {train_loss:.4f} | Val Acc: {val_acc if val_acc is not None else 'N/A'}")

    def save_model(self, path):
        """Write the model and tokenizer to ``path`` via ``save_pretrained``."""
        self.model.save_pretrained(path)
        self.tokenizer.save_pretrained(path)
        print(f"Model and tokenizer saved to {path}")

    @classmethod
    def load_model(cls, path, device=None):
        """Build an inference-only trainer from a directory saved by ``save_model``.

        The returned instance has no data loaders, optimizer or scheduler, so
        only ``predict`` and ``save_model`` are usable on it.

        Args:
            path: Directory passed to ``from_pretrained``.
            device: Device to run on; defaults to ``'cuda'`` when available,
                otherwise ``'cpu'``.

        Returns:
            A new instance of the class this method was called on.
        """
        device = device if device else ('cuda' if torch.cuda.is_available() else 'cpu')
        tokenizer = AutoTokenizer.from_pretrained(path)
        model = AutoModelForSequenceClassification.from_pretrained(path).to(device)

        # Bypass __init__: it would reload weights by model name and create
        # an optimizer that inference does not need. Using cls keeps
        # subclasses returning their own type.
        trainer = cls.__new__(cls)
        trainer.device = device
        trainer.tokenizer = tokenizer
        trainer.model = model
        trainer.train_loader = None
        trainer.val_loader = None
        trainer.optimizer = None
        trainer.epochs = 0
        trainer.lr_scheduler = None
        return trainer

    def predict(self, texts, max_length=128, batch_size=None):
        """Return the predicted class id for each input text.

        Args:
            texts: A single string or an iterable of strings.
            max_length: Truncation length handed to the tokenizer.
            batch_size: Number of texts per forward pass. ``None`` runs all
                texts in a single pass.

        Returns:
            A list of integer class ids, one per input text (empty for empty
            input).

        Raises:
            ValueError: If ``batch_size`` is given and is smaller than 1.
        """
        # A bare string is one example, not an iterable of characters.
        texts = [texts] if isinstance(texts, str) else list(texts)
        if not texts:
            return []

        step = len(texts) if batch_size is None else batch_size
        if step < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        self.model.eval()
        predictions = []
        with torch.no_grad():
            # Chunking bounds peak memory when many texts are scored at once.
            for start in range(0, len(texts), step):
                chunk = texts[start:start + step]
                encoded_input = self.tokenizer(
                    chunk, padding=True, truncation=True,
                    max_length=max_length, return_tensors='pt'
                )
                encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
                logits = self.model(**encoded_input).logits
                predictions.extend(torch.argmax(logits, dim=-1).cpu().tolist())

        return predictions


In [5]:
# Toy corpus used to smoke-test the training pipeline end to end.
texts = [
    "I love programming.",
    "I hate bugs.",
    "Transformers are great!",
]

In [6]:
# One class id per entry of `texts`, in the same order.
# presumably 1 = positive and 0 = negative sentiment — confirm against the labelling scheme
labels = [1, 0, 1]

In [7]:
# Hold out about a third of the examples for validation; the fixed seed
# keeps the split reproducible across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.33,
    random_state=42,
)

In [8]:
# Checkpoint used for the tokenizer below and for the Trainer built later.
model_name = "bert-base-uncased"
# Tokenizer handed to the datasets; Trainer loads its own copy from the same name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [9]:
# Wrap each split so examples are tokenized on access (max_length left at its default of 128).
train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

In [10]:
# Build the trainer; a batch size of 2 suits the tiny toy training split.
trainer = Trainer(
    model_name,
    train_dataset,
    val_dataset,
    epochs=3,
    batch_size=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Run the fine-tuning loop; prints training loss and validation accuracy after each epoch.
trainer.train()

100%|██████████| 1/1 [00:00<00:00,  1.85it/s]


Epoch 1 | Train loss: 0.6995 | Val Acc: 1.0


100%|██████████| 1/1 [00:00<00:00,  5.48it/s]


Epoch 2 | Train loss: 0.6342 | Val Acc: 0.0


100%|██████████| 1/1 [00:00<00:00,  5.62it/s]


Epoch 3 | Train loss: 0.5532 | Val Acc: 0.0


In [12]:
# Persist the model and tokenizer together in one directory.
trainer.save_model("./saved_bert_model")

Model and tokenizer saved to ./saved_bert_model


In [13]:
# Restore an inference-only Trainer (no data loaders, optimizer or scheduler) from disk.
loaded_trainer = Trainer.load_model("./saved_bert_model")

In [14]:
# Score unseen sentences with the reloaded model.
# NOTE(review): this rebinds the global `texts` that held the training corpus;
# re-running the train/validation split cell afterwards would pair these
# 2 texts with the 3 entries of `labels`. Consider a distinct name.
texts = ["This is great!", "I hate this product."]
predictions = loaded_trainer.predict(texts)
print(predictions)

[1, 0]
