# Demo for using Transformers with TydiQA

## Dataset creation

In [1]:
from datasets import load_dataset

# Load the `copenlu/answerable_tydiqa` dataset and keep a handle on each of
# the two splits used in this notebook.
dataset = load_dataset("copenlu/answerable_tydiqa")
train_set, validation_set = dataset["train"], dataset["validation"]

Using custom data configuration copenlu--nlp_course_tydiqa-cceecfb5416d988a
Found cached dataset parquet (/home/gadelampe/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-cceecfb5416d988a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Convert straight from the loaded `dataset` instead of from the rebound
# `train_set` / `validation_set` names: once those names hold DataFrames they
# no longer have `.to_pandas()`, so the self-referential form fails when this
# cell is run a second time.
train_set = dataset["train"].to_pandas()
validation_set = dataset["validation"].to_pandas()

In [3]:
# Languages to keep in both splits. The original author also listed
# "japanese" and "finnish" as candidates to add to this list.
languages_chosen = ["english"]

train_set = train_set.loc[train_set['language'].isin(languages_chosen)].reset_index(drop=True)
validation_set = validation_set.loc[validation_set['language'].isin(languages_chosen)].reset_index(drop=True)

In [4]:
def get_answerability(annotations):
    """Derive a binary answerability flag for each annotation.

    Args:
        annotations: iterable of records, each exposing an 'answer_start'
            container.
    Returns:
        A list with one int per annotation: 0 when -1 occurs in that
        annotation's 'answer_start', otherwise 1.
    """
    return [0 if -1 in record['answer_start'] else 1 for record in annotations]


# Pull the annotation records of each split out of the frame as plain lists.
train_annotations = train_set['annotations'].tolist()
validation_annotations = validation_set['annotations'].tolist()


# Attach the 0/1 `answerable` column computed by get_answerability.
train_set['answerable'] = get_answerability(train_annotations)
validation_set['answerable'] = get_answerability(validation_annotations)
# Bare expression: shows the resulting column names as the cell output.
train_set.columns

Index(['question_text', 'document_title', 'language', 'annotations',
       'document_plaintext', 'document_url', 'answerable'],
      dtype='object')

In [5]:
import numpy as np

# Add an `answer_text` column of plain strings to both splits.
# The previous version filled an (n, 1) object array row by row, which only
# works when each annotation's 'answer_text' holds exactly one entry; this
# unwraps that single entry directly and drops the duplicated loop.
# NOTE(review): assumes 'answer_text' is a one-element sequence (like
# 'answer_start'), not a bare string — confirm against the dataset schema.
train_set['answer_text'] = [annot['answer_text'][0] for annot in train_annotations]
validation_set['answer_text'] = [annot['answer_text'][0] for annot in validation_annotations]

In [7]:
from nltk.tokenize import WordPunctTokenizer, TreebankWordTokenizer, RegexpTokenizer

# NOTE(review): `punct` and `tree` are not used in the cells visible here;
# they are kept in case other cells rely on them.
punct = WordPunctTokenizer()
tree = TreebankWordTokenizer()
# Raw string literal: the pattern itself is unchanged, but `\w`, `\d`, `\[`
# and `\-` are no longer invalid escape sequences of a normal string.
# It matches word characters optionally followed by one of ' - + and more
# word characters (e.g. "plant-eaters", "Nobel's"), or a bracketed number
# such as "[1]".
reg = RegexpTokenizer(r"\w+['\-+]?\w*|\[\d+\]")
# reg = RegexpTokenizer(r"\w+['\-+]?\w*|[1-9]\d{0,2}(,\d{3})*(\.\d+)?|\[\d+\]")

# Tokenize the answers and the passages of both splits with the same regex
# tokenizer, storing the token lists as new columns.
for frame in (train_set, validation_set):
    frame['answer_text_tokenized'] = [reg.tokenize(text) for text in frame['answer_text'].values]
    frame['doc_plaintext_tokenized_temp'] = [reg.tokenize(text) for text in frame['document_plaintext'].values]


# Spot-check one tokenized answer and one tokenized passage.
print(train_set['answer_text_tokenized'][4])
print(train_set['doc_plaintext_tokenized_temp'][1])

['Grasshoppers', 'are', 'plant-eaters', 'with', 'a', 'few', 'species', 'at', 'times', 'becoming', 'serious', 'pests', 'of', 'cereals', 'vegetables', 'and', 'pasture', 'especially', 'when', 'they', 'swarm', 'in', 'their', 'millions', 'as', 'locusts', 'and', 'destroy', 'crops', 'over', 'wide', 'areas']
['The', 'Nobel', 'Prize', 'in', 'Literature', 'Swedish', 'Nobelpriset', 'i', 'litteratur', 'is', 'awarded', 'annually', 'by', 'the', 'Swedish', 'Academy', 'to', 'authors', 'for', 'outstanding', 'contributions', 'in', 'the', 'field', 'of', 'literature', 'It', 'is', 'one', 'of', 'the', 'five', 'Nobel', 'Prizes', 'established', 'by', 'the', '1895', 'will', 'of', 'Alfred', 'Nobel', 'which', 'are', 'awarded', 'for', 'outstanding', 'contributions', 'in', 'chemistry', 'physics', 'literature', 'peace', 'and', 'physiology', 'or', 'medicine', '[1]', 'As', 'dictated', 'by', "Nobel's", 'will', 'the', 'award', 'is', 'administered', 'by', 'the', 'Nobel', 'Foundation', 'and', 'awarded', 'by', 'a', 'commi

In [18]:
def get_iob_tags(ans_text_token, doc_text_token):
    """Build per-document IOB tags marking where the answer span occurs.

    A document token is tagged only when it belongs to a contiguous run that
    matches the whole tokenized answer: 'B' for the first token of the run,
    'I' for the remaining ones, and 'O' everywhere else. Tagging every token
    that merely appears somewhere in the answer (the previous behaviour)
    marked common words such as "the" throughout the document.

    Every non-overlapping verbatim occurrence of the answer is tagged, because
    the token lists alone do not say which occurrence was annotated.

    Args:
        ans_text_token: list of token lists, one tokenized answer per example.
        doc_text_token: list of token lists, one tokenized document per
            example, aligned by position with ``ans_text_token``.
    Returns:
        A list with one tag list per document, each as long as that
        document's token list. Documents whose answer is empty, or does not
        occur verbatim, are tagged 'O' throughout.
    Raises:
        IndexError: if ``ans_text_token`` has fewer entries than
            ``doc_text_token``.
    """
    IOB = []
    for i, doc_tokens in enumerate(doc_text_token):
        answer_tokens = list(ans_text_token[i])
        doc_tokens = list(doc_tokens)
        span_len = len(answer_tokens)
        tags = ['O'] * len(doc_tokens)

        start = 0
        # An empty answer can never match, so the scan is skipped for it.
        while span_len and start + span_len <= len(doc_tokens):
            if doc_tokens[start:start + span_len] == answer_tokens:
                tags[start] = 'B'
                tags[start + 1:start + span_len] = ['I'] * (span_len - 1)
                start += span_len  # keep tagged occurrences non-overlapping
            else:
                start += 1
        IOB.append(tags)
    return IOB

# Tokenized answers of each split, as plain lists of token lists.
train_ans_token = train_set['answer_text_tokenized'].tolist()
validation_ans_token = validation_set['answer_text_tokenized'].tolist()

# Tokenized passages of each split, in the same row order as the answers.
train_doc_token = train_set['doc_plaintext_tokenized_temp'].tolist()
validation_doc_token = validation_set['doc_plaintext_tokenized_temp'].tolist()


# Store one IOB tag list per row, computed from the answer/passage token pair.
train_set['IOB'] = get_iob_tags(train_ans_token, train_doc_token)
validation_set['IOB'] = get_iob_tags(validation_ans_token, validation_doc_token)

In [5]:
# train_set = train_set.drop(columns=['annotations', 'document_url'])
# validation_set = validation_set.drop(columns=['annotations', 'document_url'])

## Utility Functions for combining Question and Context

It is important to note that combining the question and the context into a single text is not necessarily the best solution.

In [20]:
def question_parag_combine(questions, paragraphs):
    """
    Join each question with the paragraph at the same position.
    Args:
        questions: list of question strings
        paragraphs: list of paragraph strings, indexed like `questions`
    Returns:
        list with one "<question>\\n<paragraph>" string per question
    """
    return [
        questions[position] + "\n" + paragraphs[position]
        for position in range(len(questions))
    ]

def get_data_with_cond(data_set, cond, vectorizer):
    """
    Select the rows of `data_set` matching `cond` (e.g. one language) and
    turn them into model inputs.

    The question and passage of every selected row are merged with
    question_parag_combine, and the number of selected rows is printed.
    Args:
        data_set: pandas dataframe with 'question_text', 'document_plaintext'
            and 'answerable' columns
        cond: row selector applied as `data_set[cond]`
        vectorizer: object with a `transform` method (e.g. a fitted
            CountVectorizer or TfidfVectorizer), or None
    Returns:
        If `vectorizer` is None: the list of combined texts only (no labels).
        Otherwise: a tuple `(X, y)` of the vectorized texts and the list of
        'answerable' labels.
    """
    selected = data_set[cond]
    questions = selected['question_text'].tolist()
    passages = selected['document_plaintext'].tolist()
    combined = question_parag_combine(questions, passages)

    print(len(questions))
    if vectorizer is None:
        return combined

    return vectorizer.transform(combined), selected['answerable'].tolist()

#example use 
# cond_eng = validation_set['language'] == 'english'
# X_eng, y_eng = get_data_with_cond(validation_set, cond_eng, vectorizer)



## Preprocessing and Tokenization

In [21]:
from transformers import AutoTokenizer
from datasets import DatasetDict, Dataset
# Tokenizer loaded from the 'bert-base-cased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')


def tokenize_function(examples):
    """Tokenize the "text" field of `examples` with the module-level tokenizer.

    The tokenizer is called with padding="max_length" and truncation=True.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


# Training split: combined question/passage texts and their answerability labels.
d_q, d_p = train_set['question_text'].tolist(), train_set['document_plaintext'].tolist()
training_data = question_parag_combine(d_q, d_p)
training_labels = train_set['answerable'].tolist()

# Validation split: same construction (`d_q` / `d_p` are reused as scratch names).
d_q, d_p = validation_set['question_text'].tolist(), validation_set['document_plaintext'].tolist()
validation_data = question_parag_combine(d_q, d_p)
validation_labels = validation_set['answerable'].tolist()



# Wrap each split's texts and labels directly in a Dataset. `from_dict` takes
# the column lists as they are, so the element-by-element copy into a scratch
# dict is unnecessary, and `data_set` is only ever bound to the DatasetDict.
data_set = DatasetDict({
    'train': Dataset.from_dict({'text': training_data, 'label': training_labels}),
    'valid': Dataset.from_dict({'text': validation_data, 'label': validation_labels}),
})

# Tokenize every split in batches; the tokenizer's output fields are added as
# new columns next to `text` and `label`.
tokenized_datasets = data_set.map(tokenize_function, batched=True)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

# Training Using the HF native trainer


It is important to note that this is by no means the only way to train: one can also use native PyTorch, TensorFlow, or extensions such as PyTorch Lightning.

In [22]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7389
    })
    valid: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 990
    })
})

In [23]:
from transformers import AutoModelForSequenceClassification

# Sequence-classification model with two output labels (matching the 0/1
# `answerable` label), loaded from the same 'bert-base-cased' checkpoint as
# the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [24]:
from transformers import TrainingArguments, Trainer
import numpy as np
from datasets import load_metric
# F1 metric object used by compute_metrics.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
metric = load_metric('f1')

# One training epoch, evaluating at the end of each epoch; outputs and
# checkpoints go under "test_trainer".
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    num_train_epochs=1,
    do_train=True,
)

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `metric`.

    Args:
        eval_pred: a pair that unpacks into (logits, labels).
    Returns:
        The result of `metric.compute` for the arg-max class of each row of
        logits, compared against the labels.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    result = metric.compute(predictions=predicted_classes, references=gold_labels)
    return result

2022-10-20 13:51:54.600775: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-20 13:51:55.122169: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-20 13:51:55.122206: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-10-20 13:51:55.191367: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-20 13:51:56.229827: W tensorflow/stream_executor/platform/de

In [27]:
# NOTE(review): the 2000-example subsample is commented out on the
# `train_dataset` line, so this Trainer receives the full tokenized training
# split — confirm that this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],#.shuffle(seed=42).select(range(2000)),
    eval_dataset=tokenized_datasets['valid'],
    compute_metrics=compute_metrics,
)

***Notice*** that for demo purposes I am sampling only 2000 elements from the training dataset and completing 1 epoch of training; however, in order to complete a thorough training you can remove `.shuffle(seed=42).select(range(2000))` and change `num_train_epochs` to a different number.

Also, it is important to know that in the current mode all of BERT's layers will be retrained w.r.t. our dataset; however, it might be more reasonable to freeze the pretrained LM and tune only the last layer (the classification head).

In [28]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 7389
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 924


  0%|          | 0/924 [00:00<?, ?it/s]

Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json


{'loss': 0.4998, 'learning_rate': 2.2943722943722946e-05, 'epoch': 0.54}


Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin


KeyboardInterrupt: 

We see that training with only 2000 examples for one epoch is definitely not sufficient. Maybe it's the way we concatenate the context to the questions, or maybe just the training.


***Beating the deadline is left as an exercise for the reader.***