In [11]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm
import pandas as pd

In [12]:
# Load the training data; the file is semicolon-delimited.
# NOTE(review): the absolute '/content/...' path looks Colab-specific — confirm before running elsewhere.
df = pd.read_csv('/content/train_dataset_train.csv', sep=";")

In [13]:
# Remove apostrophes from the incident text.
df['Текст инцидента'] = df['Текст инцидента'].str.replace("'", '')
# Keep only the last row for each distinct incident text (modifies df in place).
df.drop_duplicates(subset="Текст инцидента", keep="last", inplace=True)
# Drop rows whose incident text is missing (modifies df in place).
df.dropna(subset=['Текст инцидента'], inplace=True)


In [14]:
from sklearn.preprocessing import LabelEncoder
# Encoder that maps the 'Тема' (topic) labels to integer class ids.
label_encoder = LabelEncoder()

# The column is overwritten with the encoded ids; label_encoder.classes_ is what
# predict_category later uses to map a predicted id back to its label.
# NOTE(review): re-running this cell would refit the encoder on the already-encoded
# ids — confirm it is executed only once per kernel session.
df['Тема'] = label_encoder.fit_transform(df['Тема'])

In [15]:
# Hold out 20% of the rows for validation; random_state makes the split repeatable.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [16]:
# Load the pretrained tokenizer for the 'distilbert-base-multilingual-cased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

In [17]:
# Dataset that tokenizes one text per item on access.
class CustomDataset(Dataset):
    """Map-style dataset pairing raw texts with integer labels.

    Each item is tokenized lazily in ``__getitem__`` and padded/truncated to a
    fixed length so the default DataLoader collation can stack the tensors.

    Args:
        texts: Indexable collection of texts (each is converted with ``str``).
        labels: Indexable collection of class ids (each is converted with ``int``).
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: Fixed sequence length every item is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text, token ids, attention mask and label for sample ``idx``."""
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # `pad_to_max_length=True` is deprecated; request the same padding explicitly.
            padding='max_length',
            # Truncate explicitly instead of relying on the tokenizer's implicit
            # fallback (which logs a warning), so every item has exactly max_len tokens.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # The tokenizer returns a leading batch dimension of 1; flatten it away.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [18]:
# Token length passed to the tokenizer as max_length (via the datasets and predict_category).
MAX_LEN = 128
# Number of samples per batch for both DataLoaders.
BATCH_SIZE = 16

In [19]:
# Build the training dataset from the training split.
# .values passes plain arrays, which CustomDataset indexes with the integer idx it receives.
train_dataset = CustomDataset(
    texts=train_df['Текст инцидента'].values,
    labels=train_df['Тема'].values,
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

In [20]:
# Batches of training samples, reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [21]:
# Build the validation dataset from the held-out split, with the same tokenizer and length.
val_dataset = CustomDataset(
    texts=val_df['Текст инцидента'].values,
    labels=val_df['Тема'].values,
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

In [22]:
# Batches of validation samples in a fixed order (no shuffling).
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [23]:
class CustomClassifier(torch.nn.Module):
    """DistilBERT encoder followed by dropout and a linear classification head.

    Args:
        num_labels: Number of output classes (size of the logits' last dimension).
        model_name: Pretrained checkpoint the encoder is loaded from.
        dropout: Dropout probability applied before the linear head.
    """

    def __init__(self, num_labels, model_name='distilbert-base-multilingual-cased', dropout=0.1):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained(model_name)
        # Take the encoder width from its config instead of hard-coding 768, so the
        # head still fits if a checkpoint with a different hidden size is passed.
        hidden_size = self.distilbert.config.dim
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class logits for a batch of token ids and attention masks."""
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at the first token position as the sequence representation.
        first_token_state = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(first_token_state)
        return self.classifier(pooled_output)

In [24]:
# One output logit per distinct (encoded) topic id present in df.
model = CustomClassifier(num_labels=len(df['Тема'].unique()))

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

In [25]:
# Number of full passes over train_loader in the training loop.
EPOCHS = 3
# Learning rate handed to the AdamW optimizer.
LEARNING_RATE = 2e-5

In [26]:
# Cross-entropy over the raw logits returned by the model and the integer class labels.
criterion = torch.nn.CrossEntropyLoss()
# AdamW over every model parameter (encoder and classification head alike).
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [27]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model's weights to the chosen device (the cell also displays the module repr).
model.to(device)

CustomClassifier(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1):

In [28]:
# Inspection only: displays the DataLoader's repr; nothing is computed here.
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7c16e72c51e0>

In [29]:
for epoch in range(EPOCHS):
    # ---- Training pass: one optimizer step per batch ----
    model.train()
    train_loss = 0.0

    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}'):
        # Move the three tensors the model and the loss need onto the target device.
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
        )

        optimizer.zero_grad()
        loss = criterion(model(input_ids, attention_mask), labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Report the mean training loss per batch.
    print(f'Train Loss: {train_loss / len(train_loader)}')

    # ---- Validation pass: no gradient tracking, dropout disabled ----
    model.eval()
    val_loss = 0.0
    y_true, y_pred = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f'Epoch {epoch + 1} Validation'):
            input_ids, attention_mask, labels = (
                batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
            )

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # The predicted class is the index of the largest logit in each row.
            predicted = outputs.argmax(dim=1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    # Weighted F1 over the whole validation set.
    f1 = f1_score(y_true, y_pred, average='weighted')

    # Report the mean validation loss per batch and the F1 score.
    print(f'Validation Loss: {val_loss / len(val_loader)}')
    print(f'Validation F1-score: {f1}')


Epoch 1:   0%|          | 0/1133 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Train Loss: 3.244902909696365


Epoch 1 Validation:   0%|          | 0/284 [00:00<?, ?it/s]

Validation Loss: 2.447556090187019
Validation F1-score: 0.3593219271989012


Epoch 2:   0%|          | 0/1133 [00:00<?, ?it/s]



Train Loss: 2.2163626472190323


Epoch 2 Validation:   0%|          | 0/284 [00:00<?, ?it/s]

Validation Loss: 2.104770845930341
Validation F1-score: 0.41957264327671756


Epoch 3:   0%|          | 0/1133 [00:00<?, ?it/s]



Train Loss: 1.791587312139459


Epoch 3 Validation:   0%|          | 0/284 [00:00<?, ?it/s]

Validation Loss: 1.979444379957629
Validation F1-score: 0.4620072037843279


In [31]:
# Persist only the model's weights (its state_dict) to 'model.pth' in the working directory.
torch.save(model.state_dict(), 'model.pth')

In [40]:
from transformers import BertTokenizer
from torch.nn.functional import softmax

# Rebuild the architecture and restore the fine-tuned weights for inference.
model = CustomClassifier(num_labels=len(df['Тема'].unique()))
# map_location='cpu' lets a checkpoint saved from a CUDA model load on a machine
# without a GPU; the freshly constructed model lives on the CPU anyway.
model.load_state_dict(torch.load('model.pth', map_location='cpu'))  # path to the saved weights
model.eval()  # inference mode: disables dropout

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

def predict_category(text, model, tokenizer, max_len=None, classes=None):
    """Predict the topic label for a single incident text.

    Args:
        text: Raw incident text.
        model: Classifier called as ``model(input_ids, attention_mask)`` that returns
            one row of class logits per input. Put it in eval mode before calling;
            this function does not change the model's mode.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: Length the text is padded/truncated to. Defaults to the
            notebook-level ``MAX_LEN``.
        classes: Sequence mapping a class index to its label. Defaults to
            ``label_encoder.classes_``.

    Returns:
        The label whose logit is the largest.
    """
    if max_len is None:
        max_len = MAX_LEN
    if classes is None:
        classes = label_encoder.classes_

    # Mirror the training-time clean-up, which stripped apostrophes from the text.
    text = str(text).replace("'", '')

    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        # `pad_to_max_length=True` is deprecated; request the same padding explicitly.
        padding='max_length',
        # Truncate explicitly rather than relying on the implicit fallback that logs a warning.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    # Run on whatever device the model's weights live on, so a model that was moved
    # to the GPU does not fail with a device mismatch.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)
        attention_mask = attention_mask.to(first_param.device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask)

    # softmax is monotonic, so the arg-max of the logits equals the arg-max of the probabilities.
    predicted_category_index = logits.argmax(dim=1)[0].item()
    return classes[predicted_category_index]



# Smoke-test the restored model on a single hand-written complaint.
input_text = "Почему нет тепла по улице нейвинская до 10? Мы мерзнем уже две недели! Махонин, падла, верни деньги за отопление!"
predicted_category = predict_category(input_text, model, tokenizer)
print(f"Predicted Category: {predicted_category}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Predicted Category: Ненадлежащее качество или отсутствие отопления
