# Fine tuning a NLP model for question answering from a pretrained Hugging Face's transformer model

### Data dowloading

In [2]:
import requests
import os

if not os.path.exists('./data/benchmarks/squad'):
    os.makedirs('./data/benchmarks/squad')
    os.makedirs('./data/outputs')

url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'
files = ['train-v2.0.json', 'dev-v2.0.json']

for file in files:
    req = requests.get(f'{url}{file}')
    # write file
    with open(f'./data/benchmarks/squad/{file}', 'wb') as f:
        for chunk in req.iter_content(chunk_size=5):
            f.write(chunk)

### Data preprocessing

In [30]:
import json

def read_squad(path):
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    # initialize lists for contexts, questions, and answers
    contexts = []
    questions = []
    answers = []
    # iterate through all data in squad data
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa['answers']:
                    # append data to lists
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    # return formatted data lists
    return contexts, questions, answers

# apply function
train_contexts, train_questions, train_answers = read_squad('data/benchmarks/squad/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('data/benchmarks/squad/dev-v2.0.json')

### Tokenize

The idea is to use a custom dataset, so we need to make sure that it has the same structure as squad from Hugging Face dataset

In [31]:
def create_answers_dict(x):
    dict_ = {"text":[(x["answer"])], "answer_start":[(int(x["answer_start"]))]}
    return dict_

In [33]:
import pandas as pd

#train
contexts_df_train = pd.DataFrame(train_contexts, columns=['context'])
questions_df_train = pd.DataFrame(train_questions, columns=['question'])
answers_df_train = pd.DataFrame.from_records(train_answers)
df_train = contexts_df_train.copy()
df_train["question"] = questions_df_train["question"]
df_train["answer"] = answers_df_train["text"]
df_train["answer_start"] = answers_df_train["answer_start"]
df_train.reset_index(inplace=True, drop = False)
df_train.rename(columns={'index':'id'}, inplace=True)
df_train["answers"] = df_train.apply(lambda x: create_answers_dict(x), axis = 1)

#test
contexts_df_test = pd.DataFrame(val_contexts, columns=['context'])
questions_df_test = pd.DataFrame(val_questions, columns=['question'])
answers_df_test = pd.DataFrame.from_records(val_answers)
df_test = contexts_df_test.copy()
df_test["question"] = questions_df_test["question"]
df_test["answer"] = answers_df_test["text"]
df_test["answer_start"] = answers_df_test["answer_start"]
df_test.reset_index(inplace=True, drop = False)
df_test.rename(columns={'index':'id'}, inplace=True)
df_test["answers"] =  df_test.apply(lambda x: create_answers_dict(x), axis = 1)
df_test.tail()

Unnamed: 0,id,context,question,answer,answer_start,answers
20297,20297,"The pound-force has a metric counterpart, less...",What is the seldom used force unit equal to on...,sthène,665,"{'text': ['sthène'], 'answer_start': [665]}"
20298,20298,"The pound-force has a metric counterpart, less...",What is the seldom used force unit equal to on...,sthène,665,"{'text': ['sthène'], 'answer_start': [665]}"
20299,20299,"The pound-force has a metric counterpart, less...",What is the seldom used force unit equal to on...,sthène,665,"{'text': ['sthène'], 'answer_start': [665]}"
20300,20300,"The pound-force has a metric counterpart, less...",What is the seldom used force unit equal to on...,sthène,665,"{'text': ['sthène'], 'answer_start': [665]}"
20301,20301,"The pound-force has a metric counterpart, less...",What is the seldom used force unit equal to on...,sthène,665,"{'text': ['sthène'], 'answer_start': [665]}"


Saving dataframes to .csv

In [34]:
df_train.sample(frac = 0.5)[['id', 'context', 'question', 'answers']].to_csv('data/outputs/dataset_train.csv', index=False)
df_test.sample(frac = 0.5)[['id', 'context', 'question', 'answers']].to_csv('data/outputs/dataset_test.csv', index=False)

Loading .csv files from load_dataset

In [15]:
from datasets import load_dataset

data_files = {"train": "data/outputs/dataset_train.csv", "test": "data/outputs/dataset_test.csv"}
ds = load_dataset("csv", data_files=data_files)
ds

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 10000
    })
})

The column "answers" wasn't saved as a dict (python data structure), it's a string. We nee to change from that string containing the dict to a real dict

In [17]:
def convert_text(batch):
  aux_list = []
  for x, y in zip(batch["answers"], batch["answers"]):
    my_dict = {"text":eval(x)["text"], "answer_start":eval(x)["answer_start"]}
    aux_list.append(my_dict)

  return {"texts":aux_list}

prepared_ds = ds.map(convert_text, batched = True)
prepared_ds = prepared_ds.remove_columns("answers")
prepared_ds = prepared_ds.rename_column("texts", "answers")

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Start and end positions tokens

In [18]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    print(offset_mapping)
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
tokenized_squad = prepared_ds.map(preprocess_function, batched=True, remove_columns=ds["train"].column_names)

### Login to Hugging Face

In [21]:
from dotenv import load_dotenv
from huggingface_hub import login

load_dotenv('./secret/keys.env')
HUGGING_FACE_API_KEY = os.getenv("HUGGINGFACE_TOKEN")
login(token = HUGGING_FACE_API_KEY)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


### Training

Define training arguments and trainer

In [32]:
from transformers import DefaultDataCollator, AutoModelForQuestionAnswering, TrainingArguments, Trainer

data_collator = DefaultDataCollator()

model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

training_args = TrainingArguments(
    output_dir="qa_nlp_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.3546,1.391137
2,1.0227,1.311754
3,0.7796,1.359446


TrainOutput(global_step=4689, training_loss=1.2061929635641595, metrics={'train_runtime': 3015.4492, 'train_samples_per_second': 24.872, 'train_steps_per_second': 1.555, 'total_flos': 7349236876800000.0, 'train_loss': 1.2061929635641595, 'epoch': 3.0})

Save model

In [35]:
trainer.save_model()
metrics = trainer.evaluate(tokenized_squad["test"])

kwargs = {
    "finetuned_from": model.config._name_or_path,
    "tasks": "question-answering",
    "dataset": "squad",
    "tags":["question-answering", "nlp"]
}

Epoch,Training Loss,Validation Loss
0,No log,1.412295


Push to Hugging Face Hub

In [36]:
trainer.push_to_hub(commit_message = "model tuned", **kwargs)

'https://huggingface.co/jolual2747/qa_nlp_model/tree/main/'