In [1]:
!pip install torch transformers[torch] datasets



In [2]:
from datasets import load_dataset

squad_dataset = load_dataset("squad")

Downloading builder script:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.63 MiB, post-processed: Unknown size, total: 119.14 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
squad_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [4]:
print(squad_dataset["train"][0])
print(squad_dataset["validation"][0])

{'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}
{'id': '56be4db0acb8001400a502ec', 'title': 'Super_Bow

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
    # Tokenize the questions and contexts together
    tokenized_examples = tokenizer(examples["question"], examples["context"], truncation=True, padding="max_length", max_length=384)

    # Initialize lists for the start and end positions
    start_positions = []
    end_positions = []

    for i, answer in enumerate(examples["answers"]):
        # Find the start and end of the answer in the context
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # Convert character position to token position
        start_token = tokenized_examples.char_to_token(i, start_char)
        end_token = tokenized_examples.char_to_token(i, end_char - 1)  # Subtract 1 because end_char is exclusive

        # If the answer cannot be found in the tokens (might be truncated), set to 0
        start_positions.append(start_token if start_token is not None else 0)
        end_positions.append(end_token if end_token is not None else 0)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

# Apply the preprocessing function to the dataset
tokenized_squad = squad_dataset.map(preprocess_data, batched=True, remove_columns=squad_dataset["train"].column_names)

  0%|          | 0/88 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

In [7]:
from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def check_data_integrity(dataset):
    for idx, sample in enumerate(dataset):
        # Check if any of the required fields are None
        if sample['input_ids'] is None or sample['attention_mask'] is None:
            print(f"Missing data at index {idx}: input_ids or attention_mask is None")
        if 'start_positions' in sample and sample['start_positions'] is None:
            print(f"Missing data at index {idx}: start_positions is None")
        if 'end_positions' in sample and sample['end_positions'] is None:
            print(f"Missing data at index {idx}: end_positions is None")

# Run this function for both training and validation datasets
check_data_integrity(tokenized_squad["train"])
check_data_integrity(tokenized_squad["validation"])

In [9]:
import warnings
warnings.filterwarnings('ignore')

In [10]:
from transformers import TrainingArguments, Trainer, default_data_collator

collator = default_data_collator

training_args = TrainingArguments(
    output_dir="./bert_squad/",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./bert_logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    data_collator=collator
)

trainer.train()

2024-04-01 07:59:12.505966: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-01 07:59:12.506110: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-01 07:59:12.773862: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
500,1.1554
1000,0.4698
1500,0.4319
2000,0.3803
2500,0.3784
3000,0.3477
3500,0.2917
4000,0.3055
4500,0.2912
5000,0.2883


TrainOutput(global_step=5476, training_loss=0.42086395196796245, metrics={'train_runtime': 7566.5875, 'train_samples_per_second': 23.154, 'train_steps_per_second': 0.724, 'total_flos': 3.4334001889975296e+16, 'train_loss': 0.42086395196796245, 'epoch': 2.0})

In [11]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.33282092213630676, 'eval_runtime': 145.4814, 'eval_samples_per_second': 72.655, 'eval_steps_per_second': 0.571, 'epoch': 2.0}


In [12]:
trainer.save_model("./bert_squad_final/")

In [18]:
import os
import zipfile

In [19]:
def zip_model(model_directory, output_filename):
    # Create a Zip file
    with zipfile.ZipFile(output_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Walk through the model directory
        for root, dirs, files in os.walk(model_directory):
            for file in files:
                # Create a relative path for files to keep the directory structure
                relative_path = os.path.relpath(os.path.join(root, file), os.path.join(model_directory, '..'))
                zipf.write(os.path.join(root, file), relative_path)

# Usage
model_directory = './bert_squad_final/'  # The directory where your model weights are stored
output_filename = '/kaggle/working/bert_squad_final.zip'  # Output zip file name and path
zip_model(model_directory, output_filename)