In [8]:
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, get_scheduler, AutoModelForSequenceClassification
from datasets import Dataset
import numpy as np
from evaluate import load as load_metric
import numpy as np
from transformers import pipeline
import pandas as pd
from sklearn.metrics import classification_report

from typing import List, Dict

In [9]:
tags = ['NON_FOOD_GOODS', 'SERVICE', 'FOOD_GOODS', 'LOAN', 'REALE_STATE',
       'BANK_SERVICE', 'NOT_CLASSIFIED', 'TAX', 'LEASING']

tag2ind = {}
for index, tag in enumerate(tags):
    tag2ind[tag] = index

ind2tag = {}
for key, value in tag2ind.items():
    ind2tag[value] = key

In [10]:
train = pd.read_csv('./flair_data/train.csv', sep='\t', header=None, names=["text", "label"])
dev = pd.read_csv('./flair_data/dev.csv', sep='\t', header=None, names=["text", "label"])
test = pd.read_csv('./flair_data/test.csv', sep='\t', header=None, names=["text", "label"])

train = pd.concat([train, dev]).reset_index(drop=True)

In [11]:
train["label"].replace(tag2ind, inplace=True)
test["label"].replace(tag2ind, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["label"].replace(tag2ind, inplace=True)
  train["label"].replace(tag2ind, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test["label"].replace(tag2ind, inplace=True)
  test["label"].replace(tag2ind, inplace=True)


In [12]:
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)

In [6]:
model_name = "deepvk/USER-bge-m3"
tokenizer = AutoTokenizer.from_pretrained(model_name, tag2ind=tag2ind, ind2tag=ind2tag)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=9)

You are using a model of type xlm-roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepvk/USER-bge-m3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def tokenize_function(examples):
    return tokenizer(examples["text"], max_length=256, padding="max_length", truncation=True)

In [8]:
tokenized_train = train.map(tokenize_function, batched=True)
tokenized_test = test.map(tokenize_function, batched=True)

Map:   0%|          | 0/4248 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [9]:
batch_size = 32
epochs = 10

In [10]:
training_args = TrainingArguments(
    output_dir=f"./checkpoint",
    learning_rate=5.0e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    lr_scheduler_type="linear",   # Specify the type of scheduler
    warmup_ratio=0.1   
)



In [11]:


metric = load_metric("f1", "micro")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average='micro')

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,No log,0.10421,0.954
2,No log,0.03067,0.982
3,No log,0.016562,0.996
4,0.187500,0.003918,0.998
5,0.187500,0.000931,1.0
6,0.187500,0.002284,0.998
7,0.187500,0.002655,0.998
8,0.019100,0.003154,0.998
9,0.019100,0.002328,0.998
10,0.019100,0.002542,0.998


TrainOutput(global_step=1330, training_loss=0.07999042227752226, metrics={'train_runtime': 1482.6054, 'train_samples_per_second': 28.652, 'train_steps_per_second': 0.897, 'total_flos': 1.979469032067072e+16, 'train_loss': 0.07999042227752226, 'epoch': 10.0})

In [19]:
predictions = trainer.predict(tokenized_test)

# Extract predicted labels and true labels
predicted_labels = np.argmax(predictions.predictions, axis=-1)
true_labels = predictions.label_ids

In [21]:
report = classification_report(true_labels, predicted_labels, target_names=ind2tag.values())

In [23]:
print(report)

                precision    recall  f1-score   support

NON_FOOD_GOODS       1.00      1.00      1.00        96
       SERVICE       1.00      1.00      1.00        88
    FOOD_GOODS       1.00      1.00      1.00        90
          LOAN       1.00      1.00      1.00        41
   REALE_STATE       1.00      1.00      1.00        27
  BANK_SERVICE       1.00      1.00      1.00        49
NOT_CLASSIFIED       1.00      1.00      1.00        23
           TAX       1.00      1.00      1.00        48
       LEASING       1.00      1.00      1.00        38

      accuracy                           1.00       500
     macro avg       1.00      1.00      1.00       500
  weighted avg       1.00      1.00      1.00       500



In [24]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.0009310277528129518, 'eval_f1': 1.0, 'eval_runtime': 5.2231, 'eval_samples_per_second': 95.728, 'eval_steps_per_second': 3.063, 'epoch': 10.0}


In [25]:
model.save_pretrained("./result")
tokenizer.save_pretrained("./result")

('./result/tokenizer_config.json',
 './result/special_tokens_map.json',
 './result/sentencepiece.bpe.model',
 './result/added_tokens.json',
 './result/tokenizer.json')

In [35]:
classifier = pipeline("text-classification", model="./result", tokenizer="./result", )

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [37]:
classifier(["Услуга", "Непонятно"])

[{'label': 'LABEL_1', 'score': 0.9960511326789856},
 {'label': 'LABEL_7', 'score': 0.45906907320022583}]

In [None]:
model_name = "your_model_name_or_directory"  # Replace with your model's path
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Move model to CUDA if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# Function to predict labels for a list of strings with batching
def predict(texts: List[str], batch_size: int = 16):
    model.eval()
    all_predictions = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]

        # Tokenize the batch
        inputs = tokenizer(batch_texts, padding='longest', truncation=True, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)

        all_predictions.extend(predictions.cpu().numpy())

    return all_predictions