In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m104.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [1]:
import glob
import json

files = glob.glob(r"/content/drive/MyDrive/Coursework/Q_A/*.json")

In [2]:
texts = []
labels = []

def concat_data(path):
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    for question, anss in data.items():
        for i in range(len(anss)):
            texts.append(question + '||||' + anss[i])
            labels.append(i)
            
for file in files[:50]:
    concat_data(file)

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

class TextLabelDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.labels = labels
        self.tokenizer = tokenizer
        self.encodings = tokenizer(texts, max_length=64, return_tensors="pt", padding=True, truncation=True)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item



tokenizer = BertTokenizerFast.from_pretrained("cointegrated/rubert-tiny")

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.25, random_state=42)

train_dataset = TextLabelDataset(train_texts, train_labels, tokenizer)
val_dataset = TextLabelDataset(val_texts, val_labels, tokenizer)
test_dataset = TextLabelDataset(test_texts, test_labels, tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/241k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/468k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

In [6]:
num_labels = len(set(labels))

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Set up the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch,Training Loss,Validation Loss
1,1.4749,1.462001
2,1.3885,1.462373
3,1.3864,1.456246


TrainOutput(global_step=1647, training_loss=1.4408716017648389, metrics={'train_runtime': 244.8289, 'train_samples_per_second': 53.793, 'train_steps_per_second': 6.727, 'total_flos': 433197132514560.0, 'train_loss': 1.4408716017648389, 'epoch': 3.0})

In [7]:
y_true = []
for i in test_dataset:
  y_true.append(i['labels'].detach().cpu().numpy())

In [8]:
import numpy
result = numpy.argmax(trainer.predict(test_dataset).predictions, axis=1)

In [9]:
from sklearn.metrics import f1_score, accuracy_score, recall_score

# Расчет метрик F1, accuracy и NDCG
f1 = f1_score(y_true, result, average='weighted')
accuracy = accuracy_score(y_true, result)
recall = recall_score(y_true, result, average='weighted')

print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
from transformers import BertTokenizerFast

F1 Score: 0.3071
Accuracy: 0.4761
Recall: 0.4761


In [10]:
from transformers import BertConfig

config = BertConfig.from_json_file("/content/drive/MyDrive/Coursework/config.json")
own_model = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/Coursework/pytorch_model.bin", config=config)

Some weights of the model checkpoint at /content/drive/MyDrive/Coursework/pytorch_model.bin were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpo

In [11]:
own_model.num_labels = num_labels
own_model.classifier = torch.nn.Linear(in_features=768, out_features=num_labels)

In [12]:
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)
from transformers import AdamW, get_linear_schedule_with_warmup

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
own_model.to(device)

# Set up the optimizer and scheduler
optimizer = AdamW(own_model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=len(train_loader) * 3)

# Training loop
num_epochs = 3

for epoch in range(num_epochs):
    own_model.train()
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = own_model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()


    # Validation loop
    own_model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = own_model(**batch)
            val_loss = outputs.loss
            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    print(f"Epoch: {epoch + 1}, Validation Loss: {avg_val_loss}")




Epoch: 1, Validation Loss: 1.5290464003229403
Epoch: 2, Validation Loss: 1.4562860163834577
Epoch: 3, Validation Loss: 1.4519833319825552


In [30]:
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

own_model.eval()
preds = []

with torch.no_grad():
  for batch in test_loader:
      batch = {k: v.to(device) for k, v in batch.items()}
      outputs = own_model(**batch)
      preds.append(torch.argmax(outputs.logits, dim=1).detach().cpu().numpy())

In [31]:
y_pred = []
for pred in preds:
  for pr in pred:
    y_pred.append(pr)

In [32]:
from sklearn.metrics import f1_score, accuracy_score, recall_score

# Расчет метрик F1, accuracy и recall
f1 = f1_score(y_true, y_pred, average='weighted')
accuracy = accuracy_score(y_true, y_pred)
recall = recall_score(y_true, y_pred, average='weighted')

print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"recall: {recall:.4f}")

F1 Score: 0.3071
Accuracy: 0.4761
recall: 0.4761


In [52]:
print(f'Fragment of text: {test_texts[3][1600:1800]}')
print(f'True label: {test_labels[3]}')
print(f'Own model label: {y_pred[3]}')
print(f'BERT model label: {result[3]}')

Fragment of text: срок исполнения обязательства определен моментом востребования, то неустойку начисляют по истечении семи дней с даты предъявления кредитором требования о его исполнении (п. 2 ст. 314 ГК РФ).Разумеется
True label: 0
Own model label: 0
BERT model label: 0
