In [1]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification,\
AdamW, get_linear_schedule_with_warmup
import pandas as pd
from tqdm import tqdm
import torch
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Sampler

**Данные те же, что и в BertPreTraining**

In [None]:
# Read the three competition CSVs in one pass: the training reviews, the test
# reviews and the sample submission file.
train_df, test_df, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/sentiment-analysis-company-reviews/train.csv',
        '/kaggle/input/sentiment-analysis-company-reviews/test.csv',
        '/kaggle/input/sentiment-analysis-company-reviews/sample_submission.csv',
    )
)

In [None]:
# Pull out the columns used downstream: review text as model input and the
# rating as the training target (the test frame only contributes text).
train, y_train = train_df['Review'], train_df['Rating']
test = test_df['Review']


In [None]:
def convert_to_dataset_torch(data: pd.Series, max_length: int = 512,
                             model_name: str = 'distilbert-base-uncased'):
    """Tokenize every text in ``data`` into fixed-length DistilBERT inputs.

    Args:
        data: Sized iterable of raw text strings (the notebook passes the
            ``Review`` column as a ``pd.Series``). Note that iterating a
            ``pd.DataFrame`` yields its column names, not its rows, so pass a
            single column rather than a whole frame.
        max_length: Every sequence is padded or truncated to this many tokens.
        model_name: Name of the pretrained tokenizer checkpoint to load.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of two ``torch.long`` tensors,
        each of shape ``(len(data), max_length)``.
    """
    tokenizer = DistilBertTokenizer.from_pretrained(model_name)

    input_ids = []
    attention_masks = []
    for row in tqdm(data, total=len(data)):
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        encoded = tokenizer(
            row,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    # torch.cat refuses an empty list, so return correctly shaped empty tensors.
    if not input_ids:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    # Tensor.to() is not in-place: its result has to be kept for the cast to
    # take effect.
    input_ids = torch.cat(input_ids, dim=0).to(dtype=torch.long)
    attention_masks = torch.cat(attention_masks, dim=0).to(dtype=torch.long)
    return input_ids, attention_masks

In [None]:
def dataprep(data: pd.Series, batch_size: int, sampler: Sampler, labels=None,
             drop_last=None) -> DataLoader:
    """Tokenize ``data`` and wrap the resulting tensors in a ``DataLoader``.

    Args:
        data: Sized iterable of raw text strings (e.g. a ``pd.Series``).
        batch_size: Number of samples per batch.
        sampler: Sampler *class* (e.g. ``RandomSampler``); it is instantiated
            here with the freshly built dataset.
        labels: Optional integer class labels aligned with ``data`` (list,
            array or ``pd.Series``). When given, each batch carries a third
            tensor with the labels.
        drop_last: Whether to discard a final incomplete batch. ``None``
            (default) drops it only when ``labels`` are given, so a labelled
            loader behaves as before while an unlabelled one keeps every row
            and yields one prediction per input.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, attention_mask)`` batches, or
        ``(input_ids, attention_mask, labels)`` when labels are supplied.
    """
    inps, masks = convert_to_dataset_torch(data)

    # `if labels:` is ambiguous for pandas/NumPy containers and raises; test
    # for presence explicitly instead.
    has_labels = labels is not None
    if has_labels:
        # Go through NumPy so pandas Series (any index), lists and arrays all work.
        label_tensor = torch.as_tensor(np.asarray(labels), dtype=torch.long)
        encoded = TensorDataset(inps, masks, label_tensor)
    else:
        encoded = TensorDataset(inps, masks)

    if drop_last is None:
        drop_last = has_labels

    dataloader = DataLoader(
        encoded,
        sampler=sampler(encoded),
        batch_size=batch_size,
        num_workers=0,
        drop_last=drop_last,
    )
    return dataloader

In [None]:
# Training batches are drawn in random order. Ratings are shifted down by one
# so the labels start at 0 (assumes the ratings start at 1 - TODO confirm;
# evaluation adds the 1 back).
train_dataloader = dataprep(data=train, batch_size=16, sampler=RandomSampler, labels=y_train-1)

# Test batches are unlabelled and read sequentially.
test_dataloader = dataprep(data=test, batch_size=16, sampler=SequentialSampler)

In [None]:
# DistilBERT with a classification head sized for 5 labels.
model_to_train = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=5,
)

# `transformers.AdamW` is deprecated in favour of the PyTorch implementation.
# weight_decay=0.0 keeps the default of the transformers optimizer
# (torch.optim.AdamW would otherwise default to 0.01).
optimizer = torch.optim.AdamW(
    model_to_train.parameters(),
    lr=2e-5,
    eps=1e-8,
    weight_decay=0.0,
)

In [None]:
def trainloop(model, optimizer, dataloader, device, epochs):
    """Fine-tune ``model`` on ``dataloader`` for ``epochs`` passes.

    Args:
        model: Model whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with a
            ``.loss`` attribute.
        optimizer: Optimizer already bound to ``model``'s parameters.
        dataloader: Sized iterable of ``(input_ids, attention_mask, labels)``
            batches.
        device: Device the model and every batch are moved to.
        epochs: Number of passes over ``dataloader``.

    Returns:
        The same ``model`` object, trained in place and left in training mode.
    """
    model.to(device)

    steps_per_epoch = len(dataloader)
    # Linear decay of the learning rate to zero over the whole run, no warmup.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=steps_per_epoch * epochs,
    )

    for epoch_i in tqdm(range(epochs)):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        total_train_loss = 0.0
        losses = []  # rolling window, flushed to the log every 20 steps
        model.train()

        for batch in dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            optimizer.zero_grad()
            loss = model(input_ids=b_input_ids,
                         attention_mask=b_input_mask,
                         labels=b_labels).loss

            loss_value = loss.item()
            total_train_loss += loss_value
            losses.append(loss_value)

            loss.backward()
            # Clip the global gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            if len(losses) == 20:
                print("Loss: {}".format(sum(losses) / len(losses)))
                losses = []

        # Report per epoch, because the accumulator is reset every epoch.
        # Uses the `dataloader` argument rather than a notebook global, and
        # guards against an empty loader.
        avg_train_loss = total_train_loss / max(steps_per_epoch, 1)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))

    return model

In [None]:
# Three epochs of fine-tuning on the first CUDA device; trainloop hands back
# the model object it was given.
trained_model = trainloop(
    model=model_to_train, optimizer=optimizer, dataloader=train_dataloader,
    epochs=3, device='cuda:0',
)

In [None]:
# Persist the whole fine-tuned model object (not only its state_dict).
# NOTE(review): loading this file unpickles arbitrary objects - only load it from a trusted source.
torch.save(obj=trained_model, f='distilbert3eps.pth')

In [None]:
def evaluation(model, dataloader, device):
    """Predict a class label for every sample in ``dataloader``.

    Args:
        model: Model whose forward pass accepts ``input_ids`` and
            ``attention_mask`` and returns an object with a ``.logits``
            attribute of shape ``(batch, num_classes)``.
        dataloader: Iterable of batches whose first two items are the input
            ids and the attention mask.
        device: Device the model and every batch are moved to.

    Returns:
        1-D ``np.ndarray`` of predicted labels (argmax index + 1), one per
        sample, in dataloader order. Empty if the dataloader is empty.
    """
    model.to(device)
    # Switch dropout off: the model comes back from training in train mode.
    model.eval()

    preds = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for batch in tqdm(dataloader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            logits = model(input_ids=b_input_ids,
                           attention_mask=b_input_mask).logits
            # Softmax is monotonic, so argmax over the raw logits gives the
            # same class. +1 maps the class index back to the class label.
            preds.append(torch.argmax(logits, dim=1).cpu().numpy() + 1)

    if not preds:
        return np.empty(0, dtype=np.int64)
    # Concatenate instead of np.array(...).flatten(): the last batch may be
    # shorter than the others.
    return np.concatenate(preds)

In [None]:
# Run the fine-tuned model over the test batches on the first CUDA device.
predictions = evaluation(model=trained_model, dataloader=test_dataloader, device='cuda:0')

In [None]:
# Store the predictions in the Rating column of the submission frame and
# export it without the index.
submission['Rating'] = predictions
submission.to_csv(path_or_buf='distilbert3eps.csv', index=False)