# Mini-Challenge 2 npr
## Question Answering (Q&A): fine-tuning ELECTRA-large on SQuAD v2

In [1]:
import json
import os

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, default_data_collator

from utils import *

# --- Configuration -----------------------------------------------------------
# Tunable values are read from the YAML file so they can be changed without
# editing the notebook.
config = load_yaml_file('electra_large.yaml')

# NOTE(review): GPU_USAGE is read here but not used by any cell visible in this
# notebook - confirm whether a later cell or `utils` relies on it.
GPU_USAGE = config['GPU_USAGE']
BATCH_SIZE = config['BATCH_SIZE']
MAX_LENGTH = config['MAX_LENGTH']
DOC_STRIDE = config['DOC_STRIDE']

# Single source of truth for the checkpoint id, so tokenizer and model can
# never drift apart.
MODEL_CHECKPOINT = "google/electra-large-discriminator"

# Use the first CUDA device when one is available; otherwise fall back to the
# CPU instead of raising on `.to("cuda:0")`.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# --- Tokenizer and model -----------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_CHECKPOINT).to(DEVICE)

# True when the tokenizer pads on the right-hand side; forwarded to
# `prepare_train_features` when the dataset is tokenized.
PAD_RIGHT = tokenizer.padding_side == "right"

Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['qa_outputs.bias', 'qa_outputs.

In [2]:
# Sanity check: display whether PyTorch can see a CUDA device before training.
torch.cuda.is_available()

True

# Load Data

In [3]:
# Load the "squad_v2" dataset through the Hugging Face `datasets` loader.
data = load_dataset("squad_v2")
# Alternative kept for reference: read a local copy of the raw JSON instead.
# with open('data/train-v2.0.json') as f:
#     data = json.load(f)

Reusing dataset squad_v2 (C:\Users\vince\.cache\huggingface\datasets\squad_v2\squad_v2\2.0.0\09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)
100%|██████████| 2/2 [00:00<00:00, 166.68it/s]


In [4]:
# Extra keyword arguments handed to `prepare_train_features` on every batch.
feature_kwargs = dict(
    tokenizer=tokenizer,
    PAD_RIGHT=PAD_RIGHT,
    MAX_LENGTH=MAX_LENGTH,
    DOC_STRIDE=DOC_STRIDE,
)

# The original columns are dropped so that only the columns produced by
# `prepare_train_features` remain in the mapped dataset.
raw_columns = data["train"].column_names

tokenized_datasets = data.map(
    prepare_train_features,
    fn_kwargs=feature_kwargs,
    batched=True,
    remove_columns=raw_columns,
)

100%|██████████| 131/131 [00:29<00:00,  4.44ba/s]
100%|██████████| 12/12 [00:02<00:00,  4.55ba/s]


# Train Model

In [5]:
# Output directory is named after the checkpoint without its organisation prefix.
model_name = "google/electra-large-discriminator".split("/")[-1]

# `report_to="none"` replaces the deprecated WANDB_DISABLED environment
# variable, as the deprecation warning emitted by the library recommends.
# Note that it switches off every logging integration, not only Weights & Biases.
#
# NOTE(review): the last recorded run of this cell stopped with a CUDA
# out-of-memory error while PyTorch itself held under 2 GiB of the 8 GiB card.
# Presumably something else was occupying the GPU - check that before lowering
# BATCH_SIZE or MAX_LENGTH in the YAML config.
args = TrainingArguments(
    f"{model_name}-finetuned-squad",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    report_to="none",
)

data_collator = default_data_collator

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running training *****
  Num examples = 136184
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 102138
  0%|          | 0/102138 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 8.00 GiB total capacity; 1.79 GiB already allocated; 18.41 MiB free; 1.81 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Save the trainer's model to a local directory.
# NOTE(review): the output recorded under this cell refers to a different
# directory (test-squad-trained_xlnet_RETRAINED), so it appears to come from
# an earlier run - re-run the cell to refresh it.
trainer.save_model("test-squad-trained_electra-large")

Saving model checkpoint to test-squad-trained_xlnet_RETRAINED
Configuration saved in test-squad-trained_xlnet_RETRAINED\config.json
Model weights saved in test-squad-trained_xlnet_RETRAINED\pytorch_model.bin
tokenizer config file saved in test-squad-trained_xlnet_RETRAINED\tokenizer_config.json
Special tokens file saved in test-squad-trained_xlnet_RETRAINED\special_tokens_map.json
