In [2]:
import os

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification
from transformers import AutoModel, AutoTokenizer
import torch
from torch import FloatTensor
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

import progressbar as pb

In [3]:
# Глобальные константы, прописываем в отдельные преременные для удобства

RANDOM_SEED = 42
PATH_TO_RAW_DATA = os.path.join('data', 'raw', 'geo-reviews-dataset-2023.csv')

In [4]:
# Загружаем датасэт

raw_data_set = pd.read_csv(PATH_TO_RAW_DATA)
print(f'Объем сырого датасэта: {len(raw_data_set)}')
raw_data_set.head(5)

Объем сырого датасэта: 500000


Unnamed: 0,address,name_ru,rating,rubrics,text
0,"Екатеринбург, ул. Московская / ул. Волгоградск...",Московский квартал,3.0,Жилой комплекс,Московский квартал 2.\nШумно : летом по ночам ...
1,"Московская область, Электросталь, проспект Лен...",Продукты Ермолино,5.0,Магазин продуктов;Продукты глубокой заморозки;...,"Замечательная сеть магазинов в общем, хороший ..."
2,"Краснодар, Прикубанский внутригородской округ,...",LimeFit,1.0,Фитнес-клуб,"Не знаю смутят ли кого-то данные правила, но я..."
3,"Санкт-Петербург, проспект Энгельса, 111, корп. 1",Snow-Express,4.0,Пункт проката;Прокат велосипедов;Сапсёрфинг,Хорошие условия аренды. \nДружелюбный персонал...
4,"Тверь, Волоколамский проспект, 39",Студия Beauty Brow,5.0,"Салон красоты;Визажисты, стилисты;Салон бровей...",Топ мастер Ангелина топ во всех смыслах ) Немн...


In [None]:
labels = raw_data_set['rating'].unique() # возвращает объект типа np.ndarray

In [5]:
# Бьем на train и test, defaul shuffle = True

train_X, test_X, train_Y, test_Y = train_test_split(raw_data_set['text'], raw_data_set['rating'], train_size=0.3, test_size=0.1, random_state=RANDOM_SEED, stratify=raw_data_set['rating'])

In [None]:
# Токенизируем тексты
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
#tokenizer = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny')
train_tokens = tokenizer(list(train_X), padding = True, truncation=True)
test_tokens = tokenizer(list(test_X), padding = True, truncation=True)

print(train_tokens.keys())
print(train_tokens['input_ids'][0])
print(tokenizer.decode(train_tokens['input_ids'][0]))

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
[2, 301, 26923, 10361, 2554, 4781, 3689, 776, 9868, 16, 769, 327, 1580, 18946, 327, 2822, 1845, 18, 282, 27369, 1308, 14533, 4064, 17070, 18, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [7]:
class CustomDataSet(Dataset):
    def __init__(self,
                 tokenized_text,
                 attention_mask,
                 lables):
        self.tokenized_text = torch.tensor(tokenized_text)
        self.attention_mask = torch.tensor(attention_mask)
        self.lables = torch.tensor(lables.to_numpy())

    def __len__(self):
        return len(self.lables)

    def __getitem__(self, idx):
        return self.tokenized_text[idx], self.attention_mask[idx], self.lables[idx]

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [9]:
batch_size = 40
train_dataset = CustomDataSet(train_tokens['input_ids'], train_tokens['attention_mask'], train_Y)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

test_dataset = CustomDataSet(test_tokens['input_ids'], test_tokens['attention_mask'], test_Y)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-cased')

# model = AutoModel.from_pretrained("cointegrated/rubert-tiny") # Pre-trained model
optimizer = AdamW(model.parameters(), lr=1e-5) # Optimization function
loss_fn = torch.nn.CrossEntropyLoss() # Loss function

In [11]:
num_epochs = 3
model.to(device) # Transfer model to GPU if available

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(29564, 312, padding_idx=0)
    (position_embeddings): Embedding(512, 312)
    (token_type_embeddings): Embedding(2, 312)
    (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-2): 3 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=312, out_features=312, bias=True)
            (key): Linear(in_features=312, out_features=312, bias=True)
            (value): Linear(in_features=312, out_features=312, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=312, out_features=312, bias=True)
            (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


In [12]:
import tqdm

train_losses = []
val_losses = []

train_losses_per_epoch = []
val_losses_per_epoch = []

for epoch in tqdm.tqdm(range(num_epochs)):
    # TRAINING BLOCK STARTS
    model.train()
    for batch in train_loader:

        tokenized_text, attention_mask, lables = batch
        tokenized_text = tokenized_text.to(device)
        attention_mask = attention_mask.to(device)
        
        # Setting the gradients to zero
        optimizer.zero_grad()
        
        # Passing the data to the model
        outputs = model(input_ids = tokenized_text, attention_mask = attention_mask)
        
        # The logits will be used for measuring the loss
        print(outputs)
        print(type(outputs))
        pred = outputs.logits()
        loss = loss_fn(pred, labels)

        # Calculating the gradient for the loss function
        loss.backward()
        
        # Optimizing the parameters of the bert model
        optimizer.step()

        # Calculating the running loss for logging purposes
        train_losses_per_epoch.append(loss.item())

    train_losses.append(np.mean(train_losses_per_epoch))
    # TRAINING BLOCK ENDS 

    # TESTING BLOCK STARTS
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            tokenized_text, attention_mask, lables = batch
            tokenized_text = tokenized_text.to(device)
            attention_mask = attention_mask.to(device)
            
            # We don't need gradients for testing
            outputs = model(input_ids = tokenized_text, attention_mask = attention_mask)
            
            # Logits act as predictions
            logits = outputs.logits()
            
            # Calculating total batch loss using the logits and labels
            loss = loss_fn(logits, labels)
            val_losses_per_epoch.append(loss.item())

    val_losses.append(np.mean(val_losses_per_epoch))
    val_losses_per_epoch = []
    train_losses_per_epoch = []

  0%|          | 0/3 [00:00<?, ?it/s]

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.8006,  0.4384, -0.4541,  ...,  0.1432,  0.8869, -0.6016],
         [ 0.4312,  0.3154,  1.2680,  ...,  0.9073,  0.8899, -0.8920],
         [ 0.7814,  0.8540, -0.3996,  ...,  1.2783, -0.2659,  0.2536],
         ...,
         [ 1.6455,  0.7385, -0.7056,  ...,  0.1178,  0.8613, -0.9530],
         [ 1.8716,  0.7205, -1.0170,  ..., -0.1613,  0.4761, -0.7845],
         [ 2.1439,  0.5055, -1.1940,  ...,  0.2222,  0.4998, -1.1148]],

        [[ 0.8550,  0.0413, -0.3881,  ..., -0.6402,  0.6963, -1.1113],
         [ 0.3955,  0.3186, -1.5951,  ...,  1.0414,  1.6367, -0.5408],
         [ 1.7370, -0.1997, -1.2991,  ..., -0.8732,  1.3777, -2.5456],
         ...,
         [ 1.3235,  1.1098, -1.2342,  ...,  0.0195, -0.0293, -1.2398],
         [ 0.9693,  0.3221, -1.3393,  ..., -0.2275, -0.2551, -1.5850],
         [ 0.5231,  1.1316, -0.7996,  ...,  0.1560, -0.1005, -1.3076]],

        [[ 0.7217,  0.2962, -0.5099,  ...,  0.0921, -




AttributeError: 'BaseModelOutputWithPoolingAndCrossAttentions' object has no attribute 'logits'