In [1]:
import json

def read_json(path):
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    # initialize lists for contexts, questions, and answers
    contexts = []
    questions = []
    answers = []
    # iterate through all data in squad data
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa['answers']:
                    # append data to lists
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    # return formatted data lists
    return contexts, questions, answers

# apply function
train_contexts, train_questions, train_answers = read_json('/kaggle/input/mashqa-dataset/train_webmd_squad_v2_consec.json')
val_contexts, val_questions, val_answers = read_json('/kaggle/input/mashqa-dataset/val_webmd_squad_v2_consec.json')

### Tokenize

The idea is to use a custom dataset, so we need to make sure that it has the same structure as squad from Hugging Face dataset

In [2]:
def create_answers_dict(x):
    dict_ = {"text":[(x["answer"])], "answer_start":[(int(x["answer_start"]))]}
    return dict_

In [3]:
import pandas as pd

#train
contexts_df_train = pd.DataFrame(train_contexts, columns=['context'])
questions_df_train = pd.DataFrame(train_questions, columns=['question'])
answers_df_train = pd.DataFrame.from_records(train_answers)
df_train = contexts_df_train.copy()
df_train["question"] = questions_df_train["question"]
df_train["answer"] = answers_df_train["text"]
df_train["answer_start"] = answers_df_train["answer_start"]
df_train.reset_index(inplace=True, drop = False)
df_train.rename(columns={'index':'id'}, inplace=True)
df_train["answers"] = df_train.apply(lambda x: create_answers_dict(x), axis = 1)

#test
contexts_df_test = pd.DataFrame(val_contexts, columns=['context'])
questions_df_test = pd.DataFrame(val_questions, columns=['question'])
answers_df_test = pd.DataFrame.from_records(val_answers)
df_test = contexts_df_test.copy()
df_test["question"] = questions_df_test["question"]
df_test["answer"] = answers_df_test["text"]
df_test["answer_start"] = answers_df_test["answer_start"]
df_test.reset_index(inplace=True, drop = False)
df_test.rename(columns={'index':'id'}, inplace=True)
df_test["answers"] =  df_test.apply(lambda x: create_answers_dict(x), axis = 1)
df_test.head()

Unnamed: 0,id,context,question,answer,answer_start,answers
0,0,If it's temporary and only happens occasionall...,What are some conditions that can lead to erec...,Other options your doctor can help you explore...,714,{'text': ['Other options your doctor can help ...
1,1,If it's temporary and only happens occasionall...,"If I see a urologist for erectile dysfunction,...",The urologist will ask what happens when you h...,3816,{'text': ['The urologist will ask what happens...
2,2,If it's temporary and only happens occasionall...,What if I'm concerned about talking to my doct...,"The best approach is just to say, "" I think I ...",2565,"{'text': ['The best approach is just to say, ""..."
3,3,If it's temporary and only happens occasionall...,What questions might my doctor ask about my er...,The questions may include: Do you ever get an ...,3071,{'text': ['The questions may include: Do you e...
4,4,Prostate cancer is not often a cause of erecti...,What is the link between prostate cancer and e...,"However, treatments for the disease can cause ...",70,"{'text': ['However, treatments for the disease..."


Saving dataframes to .csv

In [4]:

df_train.sample(frac = 0.5)[['id', 'context', 'question', 'answers']].to_csv('/kaggle/working/dataset_train.csv', index=False)
df_test.sample(frac = 0.5)[['id', 'context', 'question', 'answers']].to_csv('/kaggle/working/dataset_test.csv', index=False)

In [5]:
from datasets import load_dataset

data_files = {"train": "/kaggle/working/dataset_train.csv", "test": "/kaggle/working/dataset_test.csv"}
ds = load_dataset("csv", data_files=data_files)
ds

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 9994
    })
    test: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 1343
    })
})

The column "answers" wasn't saved as a dict (python data structure), it's a string. We nee to change from that string containing the dict to a real dict

In [6]:
def convert_text(batch):
  aux_list = []
  for x, y in zip(batch["answers"], batch["answers"]):
    my_dict = {"text":eval(x)["text"], "answer_start":eval(x)["answer_start"]}
    aux_list.append(my_dict)

  return {"texts":aux_list}

prepared_ds = ds.map(convert_text, batched = True)
prepared_ds = prepared_ds.remove_columns("answers")
prepared_ds = prepared_ds.rename_column("texts", "answers")

Map:   0%|          | 0/9994 [00:00<?, ? examples/s]

Map:   0%|          | 0/1343 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    print(offset_mapping)
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
import sys
import io
from contextlib import redirect_stdout

def quiet_map(dataset, function, **kwargs):
    # Create a string buffer
    buffer = io.StringIO()

    # Redirect the standard output to the buffer
    with redirect_stdout(buffer):
        # Call the map function
        result = dataset.map(function, **kwargs)

    # Return the result
    return result

# Now you can call the quiet_map function instead of the map function
tokenized_squad = quiet_map(prepared_ds, preprocess_function, batched=True, remove_columns=ds["train"].column_names)

Map:   0%|          | 0/9994 [00:00<?, ? examples/s]

Map:   0%|          | 0/1343 [00:00<?, ? examples/s]

### Login to Hugging Face

In [9]:
from dotenv import load_dotenv
from huggingface_hub import login

HUGGING_FACE_API_KEY = ''
login(token = HUGGING_FACE_API_KEY)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


### Training
Define training arguments and trainer

In [12]:
from transformers import DefaultDataCollator, AutoModelForQuestionAnswering, TrainingArguments, Trainer

data_collator = DefaultDataCollator()

model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

training_args = TrainingArguments(
    output_dir="distilbert-qa",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,1.9288,1.466246
2,1.4787,1.341967
3,1.3272,1.326671
4,1.1133,1.372438


TrainOutput(global_step=2500, training_loss=1.4100805908203125, metrics={'train_runtime': 860.7079, 'train_samples_per_second': 46.445, 'train_steps_per_second': 2.905, 'total_flos': 3917241245159424.0, 'train_loss': 1.4100805908203125, 'epoch': 4.0})

In [13]:
trainer.save_model()
metrics = trainer.evaluate(tokenized_squad["test"])

kwargs = {
    "finetuned_from": model.config._name_or_path,
    "tasks": "question-answering",
    "dataset": "mashqa_dataset",
    "tags":["question-answering", "nlp"]
}

events.out.tfevents.1716765168.e0cf7a6de512.34.2:   0%|          | 0.00/6.97k [00:00<?, ?B/s]

In [14]:
trainer.push_to_hub(commit_message = "model tuned", **kwargs)

events.out.tfevents.1716766080.e0cf7a6de512.34.3:   0%|          | 0.00/359 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Eurosmart/distilbert-qa/commit/449449d38d6e324d9bfd63ae7a2ab15535052a42', commit_message='model tuned', commit_description='', oid='449449d38d6e324d9bfd63ae7a2ab15535052a42', pr_url=None, pr_revision=None, pr_num=None)