In [None]:
import os

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch import FloatTensor
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Global constants, kept in dedicated variables so they are easy to find and tune

BATCH_SIZE = 40  # samples per DataLoader batch
NUM_LABLES = 6  # number of target classes given to the classification model
RANDOM_SEED = 42  # seed for the train/test split
# Location of the raw reviews CSV, relative to the working directory
PATH_TO_RAW_DATA = os.path.join('data', 'raw', 'geo-reviews-dataset-2023.csv')

In [3]:
# Load the raw dataset from the CSV file

raw_data_set = pd.read_csv(PATH_TO_RAW_DATA)

# Report the number of rows, then preview the first five (head() defaults to 5)
print(f'Объем сырого датасэта: {len(raw_data_set)}')
raw_data_set.head()

Объем сырого датасэта: 500000


Unnamed: 0,address,name_ru,rating,rubrics,text
0,"Екатеринбург, ул. Московская / ул. Волгоградск...",Московский квартал,3.0,Жилой комплекс,Московский квартал 2.\nШумно : летом по ночам ...
1,"Московская область, Электросталь, проспект Лен...",Продукты Ермолино,5.0,Магазин продуктов;Продукты глубокой заморозки;...,"Замечательная сеть магазинов в общем, хороший ..."
2,"Краснодар, Прикубанский внутригородской округ,...",LimeFit,1.0,Фитнес-клуб,"Не знаю смутят ли кого-то данные правила, но я..."
3,"Санкт-Петербург, проспект Энгельса, 111, корп. 1",Snow-Express,4.0,Пункт проката;Прокат велосипедов;Сапсёрфинг,Хорошие условия аренды. \nДружелюбный персонал...
4,"Тверь, Волоколамский проспект, 39",Студия Beauty Brow,5.0,"Салон красоты;Визажисты, стилисты;Салон бровей...",Топ мастер Ангелина топ во всех смыслах ) Немн...


In [5]:
# Distinct values of the `rating` column; unique() returns a np.ndarray
lables = raw_data_set['rating'].unique()
# Display the distinct ratings as the cell output
lables

array([3., 5., 1., 4., 2., 0.])

In [5]:
# Split into train and test subsets (train_test_split shuffles by default).
# Only 30% of the rows go to train and 10% to test; the split is stratified
# by rating and seeded with RANDOM_SEED.

train_X, test_X, train_Y, test_Y = train_test_split(
    raw_data_set['text'],
    raw_data_set['rating'],
    stratify=raw_data_set['rating'],
    random_state=RANDOM_SEED,
    train_size=0.3,
    test_size=0.1,
)

In [6]:
# Tokenize the review texts.
# Checkpoints tried earlier: "bert-base-cased", 'cointegrated/rubert-tiny'.

tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

# Both splits are tokenized with identical settings, train first and then test.
# NOTE(review): no max_length is passed, so the tokenizer's own limit applies.
# Confirm that is intended; very long padded sequences would slow training down.
train_tokens, test_tokens = (
    tokenizer(list(texts), padding=True, truncation=True)
    for texts in (train_X, test_X)
)

# Sanity check: available fields, the ids of the first text, and its decoded form
print(train_tokens.keys())
print(train_tokens['input_ids'][0])
print(tokenizer.decode(train_tokens['input_ids'][0]))

: 

In [None]:
# Wrapper that stores our data and hands it to the model

class CustomDataSet(Dataset):
    """Dataset of (token ids, attention mask, label) triples held as tensors."""

    def __init__(self, tokenized_text, attention_mask, lables):
        """Convert the inputs to tensors once, up front.

        Args:
            tokenized_text: token ids, in any form ``torch.tensor`` accepts.
            attention_mask: attention mask, in any form ``torch.tensor`` accepts.
            lables: labels exposing ``to_numpy()`` (e.g. a pandas Series).
        """
        self.tokenized_text = torch.tensor(tokenized_text)
        self.attention_mask = torch.tensor(attention_mask)
        label_array = lables.to_numpy()
        self.lables = torch.tensor(label_array)

    def __len__(self):
        """Number of samples, taken from the first dimension of the labels."""
        return self.lables.shape[0]

    def __getitem__(self, idx):
        """Return the ``(token ids, attention mask, label)`` triple at ``idx``."""
        sample = (
            self.tokenized_text[idx],
            self.attention_mask[idx],
            self.lables[idx],
        )
        return sample

In [None]:
# Pick the device to compute on: the GPU when CUDA is available, otherwise the CPU

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [None]:
# Build the two DataLoaders we need: one for training, one for evaluation

train_dataset = CustomDataSet(train_tokens['input_ids'], train_tokens['attention_mask'], train_Y)
# Training batches are drawn in shuffled order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)

test_dataset = CustomDataSet(test_tokens['input_ids'], test_tokens['attention_mask'], test_Y)
# Evaluation does not benefit from shuffling; a fixed order keeps it
# deterministic from run to run
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [None]:
# Load the model, the optimizer and the loss function.
# Checkpoints tried earlier: BertForSequenceClassification 'bert-base-cased',
# AutoModel "cointegrated/rubert-tiny".

# Seed torch's global RNG before the model is built so that anything drawn
# from it afterwards (newly initialised weights, DataLoader shuffling, dropout)
# is repeatable between runs.
torch.manual_seed(RANDOM_SEED)

model = AutoModelForSequenceClassification.from_pretrained('answerdotai/ModernBERT-base', num_labels=NUM_LABLES)

optimizer = AdamW(model.parameters(), lr=1e-5)  # Optimization function
loss_fn = torch.nn.CrossEntropyLoss()  # Loss function

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Find out what a token looks like inside the model

# Input embedding layer of the model.
# NOTE(review): no tensor operation runs inside this no_grad block, so the
# context manager probably has no effect here. Confirm before relying on it.
with torch.no_grad():
    test_embedding = model.get_input_embeddings()

# Token id at position 1 of the first tokenized training text
test_id = train_tokens['input_ids'][0][1]

test_embedding

Embedding(28996, 768, padding_idx=0)

In [23]:
# Row of the embedding weight matrix that corresponds to the token id `test_id`
test_embedding.weight[test_id]

tensor([-7.7718e-02, -8.0116e-03, -7.8941e-03,  9.6660e-03, -1.0245e-01,
         1.6066e-02,  2.1629e-02,  8.2826e-03,  6.0495e-03, -1.9429e-02,
        -2.5278e-02, -8.5322e-02, -5.2908e-02, -2.8096e-02,  8.1911e-03,
         2.0108e-02, -4.6732e-02, -6.1263e-02, -1.0914e-01, -2.9416e-02,
        -3.9170e-02, -2.9461e-02,  3.4286e-02, -1.2243e-01, -4.7996e-02,
        -7.2637e-03,  2.6350e-02, -2.3867e-02, -2.2668e-02, -7.4090e-04,
        -7.7993e-02,  9.0500e-02,  1.4340e-02, -4.4981e-02, -5.3229e-02,
        -1.6184e-02, -5.2363e-02, -4.3146e-02,  2.1146e-03, -2.4799e-02,
         6.8474e-02, -3.5734e-02, -5.0536e-02, -3.5180e-02, -1.8313e-02,
         4.4827e-02, -7.1537e-02, -3.1704e-02, -8.4373e-02, -3.4681e-02,
        -4.0438e-02, -5.6349e-02, -5.8097e-02, -1.4534e-02, -6.6772e-02,
         4.2977e-02, -1.5955e-02, -2.1121e-02, -7.8810e-02, -1.1700e-02,
        -7.2549e-02, -2.7447e-02, -5.3368e-02, -6.0594e-03,  3.4049e-02,
        -7.0013e-02, -7.1642e-04, -2.4598e-02,  9.5

In [12]:
# Number of passes over the training data
num_epochs = 3

# Transfer the model to the GPU if available, then show its architecture
model = model.to(device)
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# TRAINING LOOP


def _batch_to_device(batch, device):
    """Unpack one DataLoader batch and move its tensors to ``device``.

    Args:
        batch: ``(token ids, attention mask, labels)`` triple as yielded by
            the loaders built on ``CustomDataSet``.
        device: target ``torch.device``.

    Returns:
        The same three tensors on ``device``. Labels are cast to int64
        because the loss function is given them as class indices.
    """
    tokenized_text, attention_mask, batch_lables = batch
    return (
        tokenized_text.to(device),
        attention_mask.to(device),
        batch_lables.to(torch.long).to(device),
    )


train_losses = []  # mean training loss, one value per epoch
val_losses = []  # mean validation loss, one value per epoch

for epoch in tqdm.tqdm(range(num_epochs)):
    # Fresh per-batch loss buffers for every epoch
    train_losses_per_epoch = []
    val_losses_per_epoch = []

    # Put the model into training mode
    model.train()
    for batch in tqdm.tqdm(train_loader):
        # A separate name is used so the global `lables` array is not overwritten
        tokenized_text, attention_mask, batch_lables = _batch_to_device(batch, device)

        # Reset gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=tokenized_text, attention_mask=attention_mask)

        # Feed the logits to the loss function
        loss = loss_fn(outputs.logits, batch_lables)

        # Backpropagation
        loss.backward()

        # Optimizer step
        optimizer.step()

        # Keep the loss value for statistics
        train_losses_per_epoch.append(loss.item())

    train_losses.append(np.mean(train_losses_per_epoch))

    # Switch to evaluation mode; no gradients are needed for validation
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            tokenized_text, attention_mask, batch_lables = _batch_to_device(batch, device)

            # Forward pass
            outputs = model(input_ids=tokenized_text, attention_mask=attention_mask)

            # Fixed: the attribute is `logits`; the original `logitsbels`
            # raised AttributeError on the first validation batch
            logits = outputs.logits
            loss = loss_fn(logits, batch_lables)

            # Keep the loss value for statistics
            val_losses_per_epoch.append(loss.item())

    val_losses.append(np.mean(val_losses_per_epoch))

  0%|          | 0/3 [00:00<?, ?it/s]

*




*




*




*




*




*




*




*


  0%|          | 7/3750 [01:23<12:21:05, 11.88s/it]
  0%|          | 0/3 [01:23<?, ?it/s]


KeyboardInterrupt: 