In [1]:
!pip install transformers --upgrade >> installations.log

[0m

In [8]:
from datasets import load_dataset, load_metric
squad_v2 = False
model_checkpoint = "distilbert-base-uncased"
batch_size = 16
datasets = load_dataset("squad_v2" if squad_v2 else "squad")


Downloading builder script:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.63 MiB, post-processed: Unknown size, total: 119.14 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict


class QADataset:
    def __init__(self, train_data, val_data):
        self.train_examples = self.create_qa_example(train_data)
        self.val_examples = self.create_qa_example(val_data)
        self.train_dataset = Dataset.from_pandas(pd.DataFrame(self.train_examples))
        self.val_dataset = Dataset.from_pandas(pd.DataFrame(self.val_examples))
        self.dataset_dict = DatasetDict({
            'train': self.train_dataset,
            'validation': self.val_dataset
        })

    def create_qa_example(self, data):
        examples = []
        for row in data:
            text = row['text']
            question = row['label']
            extracted_part = row.get('extracted_part', {})
            if extracted_part and 'text' in extracted_part:
                answer = extracted_part['text'][0].strip()
                answer_start = extracted_part['answer_start'][0]
                answer_end = extracted_part['answer_end'][0]
            else:
                answer = answer_start = answer_end = None

            example = {'context': text, 'question': question, 'answer': answer, 'answer_start': answer_start, 'answer_end': answer_end}
            examples.append(example)
        return examples


In [35]:
def prepare_train_features(examples):
    examples["question"] = [q.lstrip() for q in examples["question"]]
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=150,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        answer_start = examples["answer_start"][sample_index]
        if answer_start == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            start_char = answer_start
            end_char = examples["answer_end"][sample_index]
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1
                
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [38]:
model_name = "valhalla/longformer-base-4096-finetuned-squadv1"
max_length = 4000
pad_on_right = tokenizer.padding_side == "right"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized_dataset = qa_dataset.dataset_dict.map(prepare_train_features, batched=True, 
                                                 remove_columns=qa_dataset.dataset_dict["train"].column_names)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [45]:
len(tokenized_dataset['train']['attention_mask'][6])

4000

In [24]:
tokenized_datasets['validation']

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 10570
})

In [32]:
qa_dataset = QADataset(train_data, val_data)
qa_dataset.dataset_dict

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer', 'answer_start', 'answer_end'],
        num_rows: 1439
    })
    validation: Dataset({
        features: ['context', 'question', 'answer', 'answer_start', 'answer_end'],
        num_rows: 360
    })
})

In [28]:
train_data[0]

{'id': 711430708,
 'text': 'УТВЕРЖДАЮ: Председатель закупочной комиссии, заместитель генерального директора - по логистике и МТО АО «АТХ» ____________________ Т.Ю. Шустова «01» сентября 2022 г. ДОКУМЕНТАЦИЯ О КОНКУРЕНТНОЙ ЗАКУПКЕ ЗАПРОС ПРЕДЛОЖЕНИЙ В ЭЛЕКТРОННОЙ ФОРМЕ, УЧАСТНИКАМИ КОТОРОГО МОГУТ БЫТЬ ТОЛЬКО СУБЪЕКТЫ МАЛОГО И СРЕДНЕГО ПРЕДПРИНИМАТЕЛЬСТВА на право заключения Договора на выполнение работ по ремонту зданий и сооружений г. Киров 2022 год. Стр.2 СОДЕРЖАНИЕ СОДЕРЖАНИЕ 2 I. ОБЩИЕ УСЛОВИЯ ПРОВЕДЕНИЯ закупки 3 1. ОБЩИЕ ПОЛОЖЕНИЯ 3 1.1. Правовой статус документов 3 1.2. Заказчик, предмет и условия проведения закупки. 3 1.3. Начальная (максимальная) цена договора 4 1.4. Требования к участникам закупки 4 1.5. Участие в закупке коллективных участников (группы лиц) 5 1.6. Привлечение соисполнителей (субподрядчиков) к исполнению договора 6 1.7. Расходы на участие в закупке и при заключении договора 7 1.8. Предоставление приоритетов товаров российского происхождения, работ, услуг, выпо

In [2]:
!pip list | grep transformers

transformers                           4.27.4


In [5]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
from IPython.display import display, HTML
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW, get_linear_schedule_with_warmup, LongformerTokenizerFast
from transformers import BertTokenizer, BertForQuestionAnswering, TFAutoModelForQuestionAnswering, TFLongformerForQuestionAnswering
from transformers import BigBirdTokenizer, BigBirdForQuestionAnswering, BigBirdPegasusForQuestionAnswering, PegasusTokenizer, PegasusTokenizerFast
import torch
import time
import nltk
from torch.nn.utils.rnn import pad_sequence
import math
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW
import json

import tensorflow as tf
import re
from torch.nn.utils.rnn import pad_sequence
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import LongformerConfig

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, LongformerConfig
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator


class QATrainer:
    def __init__(self, model_name, train_dataset, val_dataset, batch_size=16, epochs=3):
        self.model_name = model_name
        self.config = LongformerConfig()
#         self.model = AutoModelForQuestionAnswering.from_pretrained(model_name)
        self.model = AutoModelForQuestionAnswering.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
#         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.batch_size = batch_size
        self.epochs = epochs
    
    def training(self):
        model_name = self.model_name.split("/")[-1]
        args = TrainingArguments(
            model_name,
            evaluation_strategy = "epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            num_train_epochs=self.epochs,
            weight_decay=0.01,
            push_to_hub=True,
        )

        data_collator = default_data_collator
        trainer = Trainer(
            self.model,
            args,
            train_dataset=self.train_dataset,
            eval_dataset=self.val_dataset,
            data_collator=data_collator,
            tokenizer=self.tokenizer,
        )
        
        trainer.train()
        trainer.save_model("QA-trained")
        
        return self.model


In [21]:
# загружаем данные для обучения
with open('/kaggle/input/nlp-test-task-2023/nlp_test_task_2023/dataset/train.json', 'r', encoding='utf-8') as file:
    train_data = json.load(file)

# загружаем данные для предсказания
with open('/kaggle/input/nlp-test-task-2023/nlp_test_task_2023/dataset/test.json', 'r', encoding='utf-8') as file:
    test_data = json.load(file)

# Разбиваем данные на обучающую и валидационную выборки
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

In [51]:
import pandas as pd

def create_dataframe(data, fields, subfields):
    main_df = pd.DataFrame(data)[fields]
    sub_df_list = []
    for subfield in subfields:
        sub_df = pd.DataFrame(list(main_df[subfield]))
        sub_df.columns = [f"{subfield}_{col}" for col in sub_df.columns]
        sub_df_list.append(sub_df)
    main_df = main_df.drop(columns=['extracted_part'])
    return pd.concat([main_df] + sub_df_list, axis=1)


In [8]:
train_df = create_dataframe(train_data, ['text', 'label', "extracted_part"], ['extracted_part'])
display(HTML(train_df[:2].to_html()))

Unnamed: 0,text,label,extracted_part_text,extracted_part_answer_start,extracted_part_answer_end
0,"УТВЕРЖДАЮ: Председатель закупочной комиссии, заместитель генерального директора - по логистике и МТО АО «АТХ» ____________________ Т.Ю. Шустова «01» сентября 2022 г. ДОКУМЕНТАЦИЯ О КОНКУРЕНТНОЙ ЗАКУПКЕ ЗАПРОС ПРЕДЛОЖЕНИЙ В ЭЛЕКТРОННОЙ ФОРМЕ, УЧАСТНИКАМИ КОТОРОГО МОГУТ БЫТЬ ТОЛЬКО СУБЪЕКТЫ МАЛОГО И СРЕДНЕГО ПРЕДПРИНИМАТЕЛЬСТВА на право заключения Договора на выполнение работ по ремонту зданий и сооружений г. Киров 2022 год. Стр.2 СОДЕРЖАНИЕ СОДЕРЖАНИЕ 2 I. ОБЩИЕ УСЛОВИЯ ПРОВЕДЕНИЯ закупки 3 1. ОБЩИЕ ПОЛОЖЕНИЯ 3 1.1. Правовой статус документов 3 1.2. Заказчик, предмет и условия проведения закупки. 3 1.3. Начальная (максимальная) цена договора 4 1.4. Требования к участникам закупки 4 1.5. Участие в закупке коллективных участников (группы лиц) 5 1.6. Привлечение соисполнителей (субподрядчиков) к исполнению договора 6 1.7. Расходы на участие в закупке и при заключении договора 7 1.8. Предоставление приоритетов товаров российского происхождения, работ, услуг, выполняемых, оказываемых российс 3.5.2, 5.6.4 Закупка по единичным расценкам Нет.",обеспечение гарантийных обязательств,[],[0],[0]
1,"УТВЕРЖДАЮ Начальник государственного бюджетного учреждения Волгоградской области «Волгоградская городская станция по борьбе с болезнями животных» __________________ В.Н. Рудников "" "" 2022 г. М.П. ИЗВЕЩЕНИЕ О ПРОВЕДЕНИИ ЗАПРОСА КОТИРОВОК В ЭЛЕКТРОННОЙ УТВЕРЖДАЮ Начальник государственного бюджетного учреждения Волгоградской области «Волгоградская городская станция по борьбе с болезнями животных» __________________ В.Н. Рудников "" "" 2022 г. М.П. ИЗВЕЩЕНИЕ О ПРОВЕДЕНИИ ЗАПРОСА КОТИРОВОК В ЭЛЕКТРОННОЙ ФОРМЕ, УЧАСТНИКАМИ КОТОРОГО МОГУТ БЫТЬ ТОЛЬКО СУБЪЕКТЫ МАЛОГО И СРЕДНЕГО ПРЕДПРИНИМАТЕЛЬСТВА на поставку расходных материалов № п/п Наименование Сведения 1 Используемый способ закупки Запрос котировок в электронной форме (далее – запрос котировок) 2 Информация о Заказчике (контактная информация) Наименование: Государственное бюджетное учреждение Волгоградской области «Волгоградская городская станция по борьбе с болезнями животных» (ГБУ ВО «Волгоградская горСББЖ») Почтовый адрес: 400107, Волгоградская обл., г. Волгоград, ул. Карла Либкнехта, 6 Адрес электронной почты: zakupki-vet@mail.ru Номер контактного телефона: +7 9610675527 Ответственное должностное лицо Заказчика: Шкитина Ольга Николаевна 3 Адрес электронной площадки в сети Интернет w 22",обеспечение гарантийных обязательств,[],[0],[0]


In [11]:
def create_qa_examples(data):
    examples = []
    for row in data:
        text = row['text']
        question = row['label']
        extracted_part = row.get('extracted_part', {})
        if extracted_part and 'text' in extracted_part:
            answer = extracted_part['text'][0].strip()
            answer_start = extracted_part['answer_start'][0]
            answer_end = extracted_part['answer_end'][0]
        else:
            answer = answer_start = answer_end = None

        example = {'context': text, 'question': question, 'answer': answer, 'answer_start': answer_start, 'answer_end': answer_end}
        examples.append(example)
    return examples

In [14]:
create_qa_examples(train_data)[0]['context']

'УТВЕРЖДАЮ: Председатель закупочной комиссии, заместитель генерального директора - по логистике и МТО АО «АТХ» ____________________ Т.Ю. Шустова «01» сентября 2022 г. ДОКУМЕНТАЦИЯ О КОНКУРЕНТНОЙ ЗАКУПКЕ ЗАПРОС ПРЕДЛОЖЕНИЙ В ЭЛЕКТРОННОЙ ФОРМЕ, УЧАСТНИКАМИ КОТОРОГО МОГУТ БЫТЬ ТОЛЬКО СУБЪЕКТЫ МАЛОГО И СРЕДНЕГО ПРЕДПРИНИМАТЕЛЬСТВА на право заключения Договора на выполнение работ по ремонту зданий и сооружений г. Киров 2022 год. Стр.2 СОДЕРЖАНИЕ СОДЕРЖАНИЕ 2 I. ОБЩИЕ УСЛОВИЯ ПРОВЕДЕНИЯ закупки 3 1. ОБЩИЕ ПОЛОЖЕНИЯ 3 1.1. Правовой статус документов 3 1.2. Заказчик, предмет и условия проведения закупки. 3 1.3. Начальная (максимальная) цена договора 4 1.4. Требования к участникам закупки 4 1.5. Участие в закупке коллективных участников (группы лиц) 5 1.6. Привлечение соисполнителей (субподрядчиков) к исполнению договора 6 1.7. Расходы на участие в закупке и при заключении договора 7 1.8. Предоставление приоритетов товаров российского происхождения, работ, услуг, выполняемых, оказываемых россий

In [98]:
from transformers import AutoTokenizer, RobertaTokenizer, LongformerTokenizer, LongformerForQuestionAnswering, LongformerTokenizer
import transformers
max_seq_length = 4000
model_name = "valhalla/longformer-base-4096-finetuned-squadv1"
# LongformerTokenizer.from_pretrained("valhalla/longformer-base-4096-finetuned-squadv1")
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = LongformerConfig.from_pretrained('valhalla/longformer-base-4096-finetuned-squadv1')
config.attention_mode = 'sliding_chunks'

num_epochs = 3
batch_size = 16
pad_on_right = tokenizer.padding_side == "right"
train_dataset = QADataset(train_data, model_name, max_seq_length)
val_dataset = QADataset(val_data, model_name, max_seq_length)
# train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)


In [99]:
train_dataset[0]

{'input_ids': tensor([[    0, 41613, 15389,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]]), 'start_positions': [0], 'end_positions': [0]}

In [69]:
tokenizer('<s>', '<s>')

{'input_ids': [0, 0, 2, 2, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [None]:
QAtrainer = QATrainer(
    model_name=model_name,
    train_dataset=tokenized_dataset['train'],
    val_dataset=tokenized_dataset['validation']
)

QAtrainer.training()

Downloading pytorch_model.bin:   0%|          | 0.00/595M [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Cloning https://huggingface.co/karnaksp/longformer-base-4096-finetuned-squadv1 into local empty directory.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:


from transformers import BertConfig, BertModel

# bert_config = BertConfig(
#     vocab_size=32000,
#     hidden_size=768,
#     num_hidden_layers=12,
#     num_attention_heads=12,
#     intermediate_size=3072,
#     hidden_dropout_prob=0.1,
#     attention_probs_dropout_prob=0.1,
#     max_position_embeddings=512,
#     type_vocab_size=2,
#     initializer_range=0.02,
#     layer_norm_eps=1e-12,
#     gradient_checkpointing=False,
#     position_embedding_type="absolute",
#     use_cache=True,
#     is_decoder=False,
#     pad_token_id=0,
#     bos_token_id=1,
#     eos_token_id=2
# )

tokenizer = AutoTokenizer.from_pretrained("valhalla/longformer-base-4096-finetuned-squadv1")
max_seq_length = 4000
batch_size = 16
epochs = eps = 1
     
train_dataset = QADataset(train_data, tokenizer, max_seq_length)
val_dataset = QADataset(val_data, tokenizer, max_seq_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# qa_model = QAModel(bert_config)
# qa_trainer = QATrainer(qa_model, train_dataloader, val_dataloader, lr=1e-12, eps=eps)
# train_losses, val_losses = qa_trainer.train(epochs)


In [None]:
def evaluate(model, val_loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for step, batch in enumerate(val_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            segment_ids = batch['segment_ids'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=segment_ids, start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss
            total_loss += loss.item()
        avg_loss = total_loss / len(val_loader)
        return avg_loss

def predict(model, test_loader):
    model.eval()
    predictions = []
    with torch.no_grad():
        for step, batch in enumerate(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            segment_ids = batch['segment_ids'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=segment_ids)
            start_logits, end_logits = outputs.start_logits, outputs.end_logits
            start_preds = torch.softmax(start_logits, dim=1).cpu().detach().numpy()
            end_preds = torch.softmax(end_logits, dim=1).cpu().detach().numpy()
            for i in range(len(start_preds)):
                start_pred = np.argmax(start_preds[i])
                end_pred = np.argmax(end_preds[i])
                if start_pred > end_pred:
                    answer = ""
                else:
                    answer = tokenizer.decode(input_ids[i][start_pred:end_pred+1], skip_special_tokens=True)
                predictions.append({
                    "context": batch['context'][i],
                    "question": batch['question'][i],
                    "extracted_part": answer
                })
    with open('predictions.json', 'w', encoding='utf-8') as f:
        json.dump(predictions, f, ensure_ascii=False, indent=4)


In [41]:
class QADataset(Dataset):
    def __init__(self, data, tokenizer, max_seq_length):
        self.examples = self.create_qa_examples(data)
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.skip = False

    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        example = self.examples[idx]
        context = example['context']
        question = example['question']
        answer = example['answer']
        answer_start = example['answer_start']
        answer_end = example['answer_end']
        assert answer_end <= len(example['context'])
        is_char_in_ans = [0] * len(context)
        for i in range(answer_start, answer_end):
            is_char_in_ans[i] = 1
        tokenized_context = self.tokenizer.encode_plus(context, add_special_tokens=False, return_offsets_mapping=True, return_tensors="tf")
        ans_token_idx = []
        is_ans_token = [0] * len(tokenized_context)

        for idx, token in enumerate(tokenized_context):
            token_start = tokenized_context.token_to_chars(idx)[0]
            token_end = tokenized_context.token_to_chars(idx)[1]
            if sum(is_char_in_ans[token_start:token_end]) > 0:
                ans_token_idx.append(idx)
                for i in range(token_start, token_end):
                    is_ans_token[i] = 1
        if sum(is_ans_token) == 0:
            start_token_idx, end_token_idx = 0, 0
        else:
            start_token_idx = ans_token_idx[0]
            end_token_idx = ans_token_idx[-1]
            while start_token_idx > 0 and is_ans_token[tokenized_context.token_to_chars(start_token_idx-1)[0]]:
                start_token_idx -= 1
            while end_token_idx < len(tokenized_context)-1 and is_ans_token[tokenized_context.token_to_chars(end_token_idx+1)[1]-1]:
                end_token_idx += 1
            

        tokenized_question = self.tokenizer.encode_plus(question, return_offsets_mapping=True, return_tensors="tf")
        tokens = ['<s>'] + tokenized_context.tokens() + ['</s>']+ ['</s>'] + tokenized_question.tokens() + ['</s>']
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        token_type_ids = [0] * (len(tokenized_context.tokens())+2) + [1] * (len(
            tokenized_question.tokens())+2)
        attention_mask = [1] * len(input_ids)
        padding_length = self.max_seq_length - len(input_ids)
        if padding_length > 0:  # pad
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
        elif padding_length < 0:  # skip
            self.skip = True
            return
        features = []
#         encoded_dict = self.tokenizer.encode_plus(
#             question,
#             context,
#             add_special_tokens=True,
#             truncation='longest_first',
#             max_length=self.max_seq_length,
#             return_tensors='pt'
#         )
#         input_ids = encoded_dict['input_ids'].squeeze()
#         attention_mask = encoded_dict['attention_mask'].squeeze()
#         input_ids = torch.nn.functional.pad(encoded_dict['input_ids'], (0, self.max_seq_length - encoded_dict['input_ids'].shape[1]), mode='constant', value=0)
#         attention_mask = torch.nn.functional.pad(encoded_dict['attention_mask'], (0, self.max_seq_length - encoded_dict['attention_mask'].shape[1]), mode='constant', value=0)
        
        features = {'input_ids': input_ids, 'attention_mask': attention_mask, 
                    'token_type_ids': token_type_ids, 'start_token_idx': start_token_idx, 'end_token_idx': end_token_idx}
#         max_len_dict = {}
#         for key, value in features.items():
#             if isinstance(value, (list, tuple)):
#                 max_len_dict[key] = max(len(seq) for seq in value)
#         for key, value in features.items():
#             if isinstance(value, (list, tuple)):
#                 max_len = max_len_dict[key]
#                 for i in range(len(value)):
#                     pad_len = max_len - len(value[i])
#                     value[i] = torch.cat([value[i], torch.zeros(pad_len, dtype=torch.long)])
#                 features[key] = torch.stack(value)

        return features
    
    def create_qa_examples(self, data):
        examples = []
        for row in data:
            text = row['text']
            question = row['label']
            extracted_part = row.get('extracted_part', {})
            if extracted_part and 'text' in extracted_part:
                answer = extracted_part['text'][0].strip()
                answer_start = extracted_part['answer_start'][0]
                answer_end = extracted_part['answer_end'][0]
            else:
                answer = answer_start = answer_end = None

            example = {'context': text, 'question': question, 'answer': answer, 'answer_start': answer_start, 'answer_end': answer_end}
            examples.append(example)
        return examples
    
    
    @staticmethod
    def prepare_test_data(data):
        examples = []
        for row in data:
            text = row['text']
            question = row['label']
            example = {'context': text, 'question': question}
            examples.append(example)
        return examples

def collate_fn(batch, device):
    input_ids = pad_sequence([torch.tensor(example['input_ids']) for example in batch], batch_first=True, padding_value=0).to(device)
    attention_mask = pad_sequence([torch.tensor(example['attention_mask']) for example in batch], batch_first=True, padding_value=0).to(device)
    token_type_ids = pad_sequence([torch.tensor(example['token_type_ids']) for example in batch], batch_first=True, padding_value=0).to(device)
    start_positions = torch.tensor([example['answer_start'] for example in batch]).to(device)
    end_positions = torch.tensor([example['answer_end'] for example in batch]).to(device)
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'token_type_ids': token_type_ids,
        'start_positions': start_positions,
        'end_positions': end_positions
    }


def create_inputs_targets(dataset):
    dataset_dict = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "start_token_idx": [],
        "end_token_idx": [],
    }
    for idx in range(len(dataset)):
        example = dataset[idx]
        for key in dataset_dict:
            if isinstance(example[key], torch.Tensor):
                value = example[key].numpy().tolist()
            else:
                value = example[key]
            dataset_dict[key].append(value)


    x = [
        dataset_dict["input_ids"],
        dataset_dict["token_type_ids"],
        dataset_dict["attention_mask"],
    ]
    y = [dataset_dict["start_token_idx"], dataset_dict["end_token_idx"]]
    return x, y

def x_y_split(model_name, train_data, validation_data, batch_size):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = TFAutoModelForQuestionAnswering.from_pretrained(model_name)
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_dataset = QADataset(train_data, tokenizer, max_seq_length)
#     train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True,
#                                    collate_fn=lambda batch: collate_fn(batch, device))
    x_train, y_train = create_inputs_targets(train_dataset)
    
    validation_dataset = QADataset(validation_data, tokenizer, max_seq_length)
#     validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, drop_last=True,
#                                         collate_fn=lambda batch: collate_fn(batch, device))
    x_val, y_val = create_inputs_targets(validation_dataset)

    return x_train, y_train, x_val, y_val



def create_model(model_name):
    ## BERT encoder
    encoder = TFLongformerForQuestionAnswering.from_pretrained(model_name)

    ## QA Model
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    embedding = encoder(
        input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )[0]

    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)

    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)

    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    optimizer = keras.optimizers.Adam(lr=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model

class ExactMatch(keras.callbacks.Callback):
    def __init__(self, x_eval, y_eval):
        super().__init__()
        self.x_eval = x_eval
        self.y_eval = y_eval

    def on_epoch_end(self, epoch, logs=None):
        pred_start, pred_end = self.model.predict(self.x_eval)
        count = 0
        for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
            offsets = squad_eg.context_token_to_char
            start = np.argmax(start)
            end = np.argmax(end)
            if start >= len(offsets):
                continue
            pred_char_start = offsets[start][0]
            if end < len(offsets):
                pred_char_end = offsets[end][1]
                pred_ans = squad_eg.context[pred_char_start:pred_char_end]
            else:
                pred_ans = squad_eg.context[pred_char_start:]

            if pred_ans in squad_eg.all_answers:
                count += 1
        acc = count / len(self.y_eval[0])
        print(f"\nepoch={epoch+1}, exact match score={acc:.2f}")


In [17]:
tokenizer = AutoTokenizer.from_pretrained("valhalla/longformer-base-4096-finetuned-squadv1")
examp = QADataset(train_data, tokenizer, max_seq_length)

In [27]:
QADataset(train_data, tokenizer, max_seq_length)

<__main__.QADataset at 0x707ab47127d0>

In [42]:
# загружаем данные для обучения
with open('/kaggle/input/nlp-test-task-2023/nlp_test_task_2023/dataset/train.json', 'r', encoding='utf-8') as file:
    train_data = json.load(file)

# загружаем данные для предсказания
with open('/kaggle/input/nlp-test-task-2023/nlp_test_task_2023/dataset/test.json', 'r', encoding='utf-8') as file:
    test_data = json.load(file)

# Разбиваем данные на обучающую и валидационную выборки
train_data, validation_data = train_test_split(train_data[:50], test_size=0.2, random_state=42)
max_seq_length = 4000
model_name = "valhalla/longformer-base-4096-finetuned-squadv1"
configuration = LongformerConfig()
num_epochs = 3

x_train, y_train, x_val, y_val = x_y_split(model_name = model_name, train_data = train_data, validation_data = validation_data, batch_size = 16)
max_len = len(x_train[0][0])

Downloading tf_model.h5:   0%|          | 0.00/595M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFLongformerForQuestionAnswering.

All the layers of TFLongformerForQuestionAnswering were initialized from the model checkpoint at valhalla/longformer-base-4096-finetuned-squadv1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFLongformerForQuestionAnswering for predictions without further training.


In [62]:
len(y_train[0])

40

In [None]:
x_train имеет структуру словаря, в котором есть 3 подсловаря - признака, в каждом из них набор примеров n-го количества, в каждом примере уже непосредственно находятся данные
в y_train 2 словаря, которые содержат n примеров, в каждом из которых находится таргет. 
какие модели обучения можно написать на таких данных, не пользуясь предобученными моделями и их ограничениями

In [8]:

tf.debugging.set_log_device_placement(True)

In [10]:
tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))

Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0



<tensorflow.python.client.session.Session at 0x79cc6d6283d0>

In [17]:
use_tpu = False  # Change to True if using TPU

if use_tpu:
    # Create distribution strategy
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    strategy = tf.distribute.TPUStrategy(tpu)

    # Create model
    with strategy.scope():
        print('TPU used')
        model = create_model(model_name)
else:
#     # Use GPU if TPU is not available
#     if tf.config.list_physical_devices('GPU'):
#         strategy = tf.distribute.MirroredStrategy()
        
#     else:
    strategy = tf.distribute.OneDeviceStrategy(device="/CPU:0")

    with strategy.scope():
        model = create_model(model_name)

model.summary()


All model checkpoint layers were used when initializing TFLongformerForQuestionAnswering.

All the layers of TFLongformerForQuestionAnswering were initialized from the model checkpoint at valhalla/longformer-base-4096-finetuned-squadv1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFLongformerForQuestionAnswering for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


There should be exactly three separator tokens: 2 in every sample for questions answering. You might also consider to set `global_attention_mask` manually in the forward function to avoid this. This is most likely an error. The global attention is disabled for this forward pass.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 4000)]       0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 4000)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 4000)]       0           []                               
                                                                                                  
 tf_longformer_for_question_ans  TFLongformerQuestio  148070402  ['input_1[0][0]',                
 wering_1 (TFLongformerForQuest  nAnsweringModelOutp              'input_3[0][0]',            

In [7]:
import tensorflow as tf
print(tf.__version__)
print(tf.test.is_built_with_cuda())
print(tf.test.is_gpu_available(cuda_only=False, min_cuda_compute_capability=None))


2.11.0
True
True


In [18]:

x_train_np = [np.array(x_train[0]), np.array(x_train[1]), np.array(x_train[2])]
y_train_np = [np.array(y_train[0]), np.array(y_train[1])]
x_val_np = [np.array(x_val[0]), np.array(x_val[1]), np.array(x_val[2])]
y_val_np = [np.array(y_val[0]), np.array(y_val[1])]
exact_match_callback = ExactMatch(x_val_np, y_val_np)
model.fit(
    x_train_np,
    y_train_np,
    validation_data=(x_val_np, y_val_np),
    epochs=1,
    verbose=2,
    batch_size=64,
#     callbacks=[exact_match_callback],
)


There should be exactly three separator tokens: 2 in every sample for questions answering. You might also consider to set `global_attention_mask` manually in the forward function to avoid this. This is most likely an error. The global attention is disabled for this forward pass.
There should be exactly three separator tokens: 2 in every sample for questions answering. You might also consider to set `global_attention_mask` manually in the forward function to avoid this. This is most likely an error. The global attention is disabled for this forward pass.


InvalidArgumentError: Graph execution error:

Detected at node 'model/tf_longformer_for_question_answering_1/longformer/embeddings/Gather_2' defined at (most recent call last):
    File "/opt/conda/lib/python3.7/runpy.py", line 193, in _run_module_as_main
      "__main__", mod_spec)
    File "/opt/conda/lib/python3.7/runpy.py", line 85, in _run_code
      exec(code, run_globals)
    File "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/opt/conda/lib/python3.7/site-packages/traitlets/config/application.py", line 1041, in launch_instance
      app.start()
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/opt/conda/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/opt/conda/lib/python3.7/asyncio/base_events.py", line 541, in run_forever
      self._run_once()
    File "/opt/conda/lib/python3.7/asyncio/base_events.py", line 1786, in _run_once
      handle._run()
    File "/opt/conda/lib/python3.7/asyncio/events.py", line 88, in _run
      self._context.run(self._callback, *self._args)
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/ipkernel.py", line 387, in do_execute
      cell_id=cell_id,
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2976, in run_cell
      raw_cell, store_history, silent, shell_futures, cell_id
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell
      return runner(coro)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner
      coro.send(None)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3258, in run_cell_async
      interactivity=interactivity, compiler=compiler, result=result)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_27/1655000812.py", line 12, in <module>
      batch_size=64,
    File "/opt/conda/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1650, in fit
      tmp_logs = self.train_function(iterator)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1249, in train_function
      return step_function(self, iterator)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1233, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1222, in run_step
      outputs = model.train_step(data)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1023, in train_step
      y_pred = self(x, training=True)
    File "/opt/conda/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 561, in __call__
      return super().__call__(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/functional.py", line 511, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/functional.py", line 668, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 561, in __call__
      return super().__call__(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/transformers/modeling_tf_utils.py", line 2224, in run_call_with_unpacked_inputs
      version=1,
    File "/opt/conda/lib/python3.7/site-packages/transformers/models/longformer/modeling_tf_longformer.py", line 2275, in call
      outputs = self.longformer(
    File "/opt/conda/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/transformers/modeling_tf_utils.py", line 2224, in run_call_with_unpacked_inputs
      version=1,
    File "/opt/conda/lib/python3.7/site-packages/transformers/models/longformer/modeling_tf_longformer.py", line 1764, in call
      embedding_output = self.embeddings(
    File "/opt/conda/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/transformers/models/longformer/modeling_tf_longformer.py", line 571, in call
      token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
Node: 'model/tf_longformer_for_question_answering_1/longformer/embeddings/Gather_2'
indices[37,2809] = 1 is not in [0, 1)
	 [[{{node model/tf_longformer_for_question_answering_1/longformer/embeddings/Gather_2}}]] [Op:__inference_train_function_195674]

In [39]:
train_dataset = QADataset(train_data, tokenizer, max_seq_length)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, drop_last=True,
                               collate_fn=lambda batch: collate_fn(batch, device))

In [57]:
train_dataloader.dataset[2]

{'input_ids': tensor([    2, 25751, 43691,  ...,     0,     0,     0]),
 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]),
 'token_type_ids': tensor([0, 0, 0,  ..., 0, 0, 0]),
 'answer_start': tensor(0),
 'answer_end': tensor(0)}

In [11]:
# tokenizer = AutoTokenizer.from_pretrained('cointegrated/LaBSE-en-ru')
# examples = QADataset.create_qa_examples(train_data, train_data)
# questions = [example['context'] for example in examples]
# question_tokens = [tokenizer.tokenize(question) for question in questions]
# import matplotlib.pyplot as plt

# question_lengths = [len(tokens) for tokens in question_tokens]
# plt.hist(question_lengths, bins=50)
# plt.xlabel('Length of question tokens')
# plt.ylabel('Frequency')
# plt.show()

In [5]:
# # Разбиваем данные на обучающую и валидационную выборки
# train_data, validation_data = train_test_split(train_data, test_size=0.2, random_state=42)
# max_seq_length = 3072
# model_name = "allenai/longformer-large-4096-finetuned-triviaqa"

# # Число эпох обучения
# num_epochs = 3
# output_dir = 'my_model'

In [8]:
# 'cointegrated/LaBSE-en-ru'


In [None]:
# from transformers import BigBirdTokenizer, BigBirdForQuestionAnswering

# tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
# model = BigBirdForQuestionAnswering.from_pretrained('google/bigbird-roberta-base')


In [None]:

# import tensorflow as tf

# train(model_name, train_data, validation_data, output_dir)