# Mini-Challenge 2 npr
## Q&A

In [None]:
import json
import numpy as np
import pandas as pd
from datasets import load_dataset, load_metric
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, default_data_collator
from utils import *

# Load the run configuration; load_yaml_file comes from the project's utils module.
config = load_yaml_file('distilbert_config.yaml')

# Settings read from the YAML config. MAX_LENGTH and DOC_STRIDE are forwarded
# to prepare_train_features and BATCH_SIZE to TrainingArguments further down.
# NOTE(review): GPU_USAGE, MAX_ANSWER_LENGTH, squad_v2 and N_BEST_SIZE are not
# referenced in the cells visible here -- presumably used later; confirm.
GPU_USAGE = config['GPU_USAGE']
BATCH_SIZE = config['BATCH_SIZE']
MAX_LENGTH = config['MAX_LENGTH']
DOC_STRIDE = config['DOC_STRIDE']
MAX_ANSWER_LENGTH = config['MAX_ANSWER_LENGTH']
squad_v2 = config['squad_v2']
N_BEST_SIZE = config['N_BEST_SIZE']

# One constant for the checkpoint id so tokenizer and model cannot drift apart.
MODEL_CHECKPOINT = "distilbert-base-uncased"

# Moving the model to "cuda:0" unconditionally raises on machines without
# CUDA; fall back to the CPU so the notebook still runs there.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_CHECKPOINT).to(DEVICE)

# True when the tokenizer pads on the right-hand side; passed on to
# prepare_train_features.
PAD_RIGHT = tokenizer.padding_side == "right"

In [None]:
# Sanity check: report whether PyTorch can use a CUDA device in this session.
torch.cuda.is_available()

# Load Data

In [None]:
# Load the "squad_v2" dataset through the datasets library.
data = load_dataset("squad_v2")
# Alternative kept for reference: read the raw JSON from a local file instead.
# with open('data/train-v2.0.json') as f:
#     data = json.load(f)

In [None]:
# Extra keyword arguments handed to prepare_train_features on every call.
feature_kwargs = {
    "tokenizer": tokenizer,
    "PAD_RIGHT": PAD_RIGHT,
    "MAX_LENGTH": MAX_LENGTH,
    "DOC_STRIDE": DOC_STRIDE,
}

# Column names of the raw train split, passed as remove_columns below.
raw_columns = data["train"].column_names

# Apply the feature preparation in batched mode to the loaded dataset.
tokenized_datasets = data.map(
    prepare_train_features,
    batched=True,
    remove_columns=raw_columns,
    fn_kwargs=feature_kwargs,
)

In [None]:
# Display the tokenized dataset object for a quick visual inspection.
tokenized_datasets

# Train Model

In [None]:
import os

# Set the flag that switches off Weights & Biases before the training objects
# are constructed.
os.environ["WANDB_DISABLED"] = "true"

# Last path component of the checkpoint id; used to name the output directory.
model_name = "distilbert-base-uncased".rsplit("/", 1)[-1]
output_dir = f"{model_name}-finetuned-squad"

# Training hyper-parameters; evaluation is configured with the "epoch" strategy.
args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

# Use the library's default collator for batching.
data_collator = default_data_collator

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

# Run the fine-tuning loop.
trainer.train()

In [None]:
# Save the trained model via the Trainer under "test-squad-trained".
trainer.save_model("test-squad-trained")