In [2]:
!pip install transformers spacy datasets tokenizers accelerate > null

In [3]:
# импортируем библиотеки
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
import datasets
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from sklearn.metrics import f1_score

# 1. Подготовка данных

In [3]:
# Загрузим наши данные
dataset = datasets.load_dataset("sberquad")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 45328
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 5036
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 23936
    })
})

In [4]:
# Посмотрим на данные
dataset['train']['context'][0]

'В протерозойских отложениях органические остатки встречаются намного чаще, чем в архейских. Они представлены известковыми выделениями сине-зелёных водорослей, ходами червей, остатками кишечнополостных. Кроме известковых водорослей, к числу древнейших растительных остатков относятся скопления графито-углистого вещества, образовавшегося в результате разложения Corycium enigmaticum. В кремнистых сланцах железорудной формации Канады найдены нитевидные водоросли, грибные нити и формы, близкие современным кокколитофоридам. В железистых кварцитах Северной Америки и Сибири обнаружены железистые продукты жизнедеятельности бактерий.'

In [5]:
# Загрузим наш токенайзер
tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

In [6]:
# Определим функцию, которая предназначена для предобработки обучающих данных для модели
max_length = 356 # длина последовательности

def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]] #инициализация вопросов

    # токенезируем вопросы и контексты
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation=True,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # извлечение данных
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Находим начало и конец контекста
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # Если ответ не полностью в контексте, то меткой будет (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    #обновим входные данные
    inputs['input_ids'] = inputs['input_ids']
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [7]:
# Применим нашу функцию
train_dataset = dataset["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
print(len(dataset["train"]), len(train_dataset))

valid_dataset = dataset["validation"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=dataset["validation"].column_names,
)
print(len(dataset["validation"]), len(valid_dataset))

test_dataset = dataset["test"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=dataset["test"].column_names,
)
print(len(dataset["test"]), len(test_dataset))

45328 45620


Map:   0%|          | 0/5036 [00:00<?, ? examples/s]

5036 5071
23936 24079


In [8]:
# создадим объекты DataLoader
dataloaders = {
    'train': DataLoader(dataset=list(zip(
            train_dataset['input_ids'],
            train_dataset['attention_mask'],
            train_dataset['start_positions'],
            train_dataset['end_positions'])),
                        batch_size=32),
    'valid': DataLoader(dataset=list(zip(
            valid_dataset['input_ids'],
            valid_dataset['attention_mask'],
            valid_dataset['start_positions'],
            valid_dataset['end_positions'])),
                        batch_size=32),
    'test': DataLoader(dataset=list(zip(
            test_dataset['input_ids'],
            test_dataset['attention_mask'],
            test_dataset['start_positions'],
            test_dataset['end_positions'])),
                       batch_size=32)
}

In [9]:
# проведем итерацию по пакетам данных и преобразуем списки в тензоры
for input_ids, attention_mask, start_positions, end_positions in dataloaders['train']:
  input_ids = torch.stack(input_ids).T
  attention_mask = torch.stack(attention_mask).T
  print(input_ids.shape, attention_mask.shape, start_positions.shape, end_positions.shape)
  break

torch.Size([32, 356]) torch.Size([32, 356]) torch.Size([32]) torch.Size([32])


In [10]:
# определим модель
class LSTMTransformerQA(nn.Module):
    def __init__(self, backbone_name, hidden_dim, dropout=.1, freeze_backbone=True):
        super(LSTMTransformerQA, self).__init__()
        self.backbone = AutoModel.from_pretrained(backbone_name)
        self.lstm = nn.LSTM(768, hidden_dim, dropout=dropout, bidirectional=True)
        self.fc = nn.Linear(2*hidden_dim, 2)  # 2 for start and end position predictions
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)
        if freeze_backbone:
          for param in self.backbone.parameters():
            param.requires_grad = False

        print("{} learnable parameters".format(
            sum(p.numel() for p in self.parameters() if p.requires_grad))
        )

    def forward(self, input_ids, attention_mask):
        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)
        input_embs = self.backbone(input_ids, attention_mask=attention_mask).last_hidden_state
        lstm_out, _ = self.lstm(input_embs)
        logits = self.fc(lstm_out)

        return logits

In [11]:
# создадим экземпляр нашей модели
model = LSTMTransformerQA('DeepPavlov/rubert-base-cased', 256)
output = model(input_ids, attention_mask)
output.shape

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


2102274 learnable parameters


torch.Size([32, 356, 2])

# 2. Обучение модели

In [12]:
# определим параметры
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

In [13]:
# обучим модель
history = []
for epoch in range(3):
    epoch_metrics = {
        'train': {'loss': []},
        'valid': {'loss': []}
    }
    for phase in ['train', 'valid']:
      if phase == 'valid':
        model.eval()
      else:
        model.train()
      for input_ids, attention_mask, start_positions, end_positions in tqdm(dataloaders[phase]):
          input_ids = torch.stack(input_ids).T.to(model.device)
          attention_mask = torch.stack(attention_mask).T.to(model.device)
          start_positions, end_positions = start_positions.to(model.device), end_positions.to(model.device)
          optimizer.zero_grad()
          logits = model(input_ids, attention_mask)
          start_logits, end_logits = logits.split(1, dim=-1)
          start_logits = start_logits.squeeze(-1).contiguous()
          end_logits = end_logits.squeeze(-1).contiguous()

          start_loss = criterion(start_logits, start_positions)
          end_loss = criterion(end_logits, end_positions)
          loss = (start_loss + end_loss) / 2

          score_start = (start_positions == start_logits.argmax(1)).sum().detach().item()
          score_end = (end_positions == end_logits.argmax(1)).sum().detach().item()

          epoch_metrics[phase]['loss'].append(loss.item())
          if phase == 'train':
            loss.backward()
            optimizer.step()
      scheduler.step()
      epoch_metrics[phase]['loss'] = np.mean(epoch_metrics[phase]['loss'])
      print("Epoch {0}/{1}, Phase {2}, Loss: {3}, lr: {4}".format(
          epoch+1, 3, phase, epoch_metrics[phase]['loss'],
          scheduler.get_last_lr()[0]))

    history.append(epoch_metrics)


# сохраним модель
torch.save(model.state_dict(), 'lstm_qa_model.pth')

100%|██████████| 1426/1426 [18:41<00:00,  1.27it/s]


Epoch 1/3, Phase train, Loss: 3.807607651258418, lr: 0.009000000000000001


100%|██████████| 159/159 [01:58<00:00,  1.34it/s]


Epoch 1/3, Phase valid, Loss: 3.594967912577983, lr: 0.008100000000000001


100%|██████████| 1426/1426 [18:41<00:00,  1.27it/s]


Epoch 2/3, Phase train, Loss: 3.6500518082903612, lr: 0.007290000000000001


100%|██████████| 159/159 [01:58<00:00,  1.34it/s]


Epoch 2/3, Phase valid, Loss: 3.5364793981396176, lr: 0.006561000000000002


100%|██████████| 1426/1426 [18:43<00:00,  1.27it/s]


Epoch 3/3, Phase train, Loss: 3.5724818890425802, lr: 0.005904900000000002


100%|██████████| 159/159 [01:58<00:00,  1.34it/s]


Epoch 3/3, Phase valid, Loss: 3.4879960114101194, lr: 0.005314410000000002


In [14]:
# напишем функцию для оценки модели на метрике f1
def evaluate(model, dataloader):
  f1_scores = []
  for input_ids, attention_mask, start_positions, end_positions in tqdm(dataloader):
    input_ids = torch.stack(input_ids).T.to(model.device)
    attention_mask = torch.stack(attention_mask).T.to(model.device)
    start_positions, end_positions = start_positions.to(model.device), end_positions.to(model.device)
    logits = model(input_ids, attention_mask)
    start_logits, end_logits = logits.split(1, dim=-1)
    start_logits = start_logits.squeeze(-1).contiguous()
    end_logits = end_logits.squeeze(-1).contiguous()

    pred_start_positions = start_logits.argmax(1)
    pred_end_positions = end_logits.argmax(1)

    true_seqs = np.zeros_like(input_ids.detach().cpu())
    pred_seqs = np.zeros_like(input_ids.detach().cpu())

    for i in range(true_seqs.shape[0]):
      true_seqs[i][start_positions[i]:end_positions[i]+1] = 1
      pred_seqs[i][pred_start_positions[i]:pred_end_positions[i]+1] = 1

    f1_score_mean = np.mean([f1_score(
        true_seqs[i], pred_seqs[i]) for i in range(len(true_seqs))]
    )
    f1_scores.append(f1_score_mean)
  return np.mean(f1_scores)

In [15]:
# оценим модель
score = evaluate(model, dataloaders['valid'])
print(score)

100%|██████████| 159/159 [02:03<00:00,  1.29it/s]

0.1475757900352829





In [16]:
print(score)

0.1475757900352829


# 3. Попробуем еще один вариант

In [4]:
# Загрузим данные
dataset = datasets.load_dataset("sberquad")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 45328
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 5036
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 23936
    })
})

In [5]:
# # Загрузим наш токенайзер
tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

In [6]:
# Определим функцию, которая предназначена для предобработки обучающих данных для модели
max_length = 356

def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation=True,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs['input_ids'] = inputs['input_ids']
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [7]:
# Применим нашу функцию
train_dataset = dataset["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
print(len(dataset["train"]), len(train_dataset))

valid_dataset = dataset["validation"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=dataset["validation"].column_names,
)
print(len(dataset["validation"]), len(valid_dataset))

test_dataset = dataset["test"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=dataset["test"].column_names,
)
print(len(dataset["test"]), len(test_dataset))

45328 45620


Map:   0%|          | 0/5036 [00:00<?, ? examples/s]

5036 5071
23936 24079


In [8]:
# Создадим нашу модель
model = AutoModelForQuestionAnswering.from_pretrained('DeepPavlov/rubert-base-cased')
model.to('cuda')

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

# 4. Обучим нашу модель

In [11]:

training_args = TrainingArguments(
    output_dir="my_awesome_qa_model",
    evaluation_strategy="epoch",
    learning_rate= 0.0001,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=3,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,1.892,1.783962
2,1.5416,1.786696
3,0.7963,2.028631


TrainOutput(global_step=8556, training_loss=1.4517544424940683, metrics={'train_runtime': 10013.6472, 'train_samples_per_second': 13.667, 'train_steps_per_second': 0.854, 'total_flos': 2.486512185385248e+16, 'train_loss': 1.4517544424940683, 'epoch': 3.0})

In [12]:
# подключимся к аккаунту
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [19]:
# сохраним модель
model.push_to_hub("Mikhail1313/sberquad_rubert_basee")

pytorch_model.bin:   0%|          | 0.00/709M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Mikhail1313/sberquad_rubert_basee/commit/0eb4546d96419d992587ea6bedbd2881b605d9ab', commit_message='Upload BertForQuestionAnswering', commit_description='', oid='0eb4546d96419d992587ea6bedbd2881b605d9ab', pr_url=None, pr_revision=None, pr_num=None)

In [21]:
# загрузим модель
model = AutoModelForQuestionAnswering.from_pretrained("Mikhail1313/sberquad_rubert_basee")
model.to('cuda')

Downloading (…)lve/main/config.json:   0%|          | 0.00/864 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/709M [00:00<?, ?B/s]

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [22]:
# функция для вычисления метрики
def evaluate(model, dataset):
  f1_scores = []

  dataloader = DataLoader(dataset=list(
      zip(
            dataset['input_ids'],
            dataset['attention_mask'],
            dataset['start_positions'],
            dataset['end_positions']
            )
      ), batch_size=16
  )

  for input_ids, attention_mask, start_positions, end_positions in dataloader:
    input_ids = torch.stack(input_ids).T.to(model.device)
    attention_mask = torch.stack(attention_mask).T.to(model.device)
    start_positions, end_positions = start_positions.to(model.device), end_positions.to(model.device)
    logits = model(input_ids, attention_mask)

    pred_start_positions = logits.start_logits.argmax(1)
    pred_end_positions = logits.end_logits.argmax(1)

    true_seqs = np.zeros_like(input_ids.detach().cpu())
    pred_seqs = np.zeros_like(input_ids.detach().cpu())

    for i in range(true_seqs.shape[0]):
      true_seqs[i][start_positions[i]:end_positions[i]+1] = 1
      pred_seqs[i][pred_start_positions[i]:pred_end_positions[i]+1] = 1

    f1_score_mean = np.mean([f1_score(
        true_seqs[i], pred_seqs[i]) for i in range(len(true_seqs))]
    )
    f1_scores.append(f1_score_mean)
  return np.mean(f1_scores)

In [23]:
# посчитаем нашу метрику
score = evaluate(model, valid_dataset)
print(score)

0.11332677319313657


# 6. Вывод
В ходе выполнения задания мы применили 2 разных способа которые не дали ожидаемого результата. Из-за того что постоянно не хватает мощностей для вычисления, эксперементы по обучению прекращаем. Суть задания и построения моделей на основе QA ясна.
Были проведены десятки эксперементов, применено 6 различных, готовых моделей из библиотеки huggingface, при обучении постоянно нехватало памяти, уменьшение выборки так же не помогала, либо модель училась очень долго и кагл прекращал доступ