In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# About

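This notebook loads the previously scraped positive and negative instances from `pos_neg_instances/`, splits them into training and test sets, and fine-tunes a BERT token-classification model to tag citations.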

# Load and preprocess data

In [2]:
import ast
from dataclasses import dataclass
import json
import os
import pickle
import random
from typing import List, Union

from bs4 import BeautifulSoup
import requests
from internetarchive import search_items, get_item, Search
from torch.utils.data.dataloader import DataLoader
from transformers import BertForTokenClassification, AdamW, BertTokenizer, BertTokenizerFast, BatchEncoding, TrainingArguments, Trainer

from ner_pipeline.scrape_for_training import do_search, prepare_data, load_scraped_data, get_scraped_dataset_size
from ner_pipeline.containers import TraingingBatch
from ner_pipeline.dataset_ner import TrainingDataset
from ner_pipeline.dataset_ner import TrainingExample
from ner_pipeline.labelset import LabelSet

In [3]:
# Directly load the previously scraped pos/neg instances from saved text files.
# Each instance is a dict holding a "content" string and an "annotations" list of
# {"start": int, "end": int, "label": str} span dicts (see the sample printed
# a few cells further down).
NUM_EXAMPLES: int = 10  # 10000
POS_INSTANCES: list[dict[str, Union[str, list[dict[str, Union[str, int]]]]]] = load_scraped_data(
    f"pos_neg_instances/pos_instances_{NUM_EXAMPLES}.txt"
)
NEG_INSTANCES: list[dict[str, Union[str, list[dict[str, Union[str, int]]]]]] = load_scraped_data(
    f"pos_neg_instances/neg_instances_{NUM_EXAMPLES}.txt"
)

In [4]:
# Since the scrape did not find 10000 positive and 10000 negative instances,
# we take 5000 of each in this case.
# Slicing never raises: if a list holds fewer than NUM_FOR_TRAINING items,
# the slice simply returns what is available.
NUM_FOR_TRAINING: int = 10  # 5000
LABELED_DATA: list[dict[str, Union[str, list[dict[str, Union[str, int]]]]]] = (
    POS_INSTANCES[:NUM_FOR_TRAINING] + NEG_INSTANCES[:NUM_FOR_TRAINING]
)

In [5]:
# Sanity check: number of labeled instances (positives plus negatives) in the combined list.
len(LABELED_DATA)

20

In [6]:
# Peek at the first three instances to check the record structure.
print(LABELED_DATA[:3])

[{'content': 'I Megarians for Salamis, they quoted Iliad 2. 558, where ', 'annotations': [{'start': 37, 'end': 49, 'label': 'Citation'}]}, {'content': 'Megarians for Salamis, they quoted Iliad 2. 558, where ', 'annotations': [{'start': 35, 'end': 47, 'label': 'Citation'}]}, {'content': 'Megarians for Salamis, they quoted Iliad 2. 558, where ', 'annotations': [{'start': 35, 'end': 47, 'label': 'Citation'}]}]


In [7]:
# Shuffle in place with a dedicated, seeded RNG so that the train/test split
# derived from this order is identical on every Restart Kernel -> Run All.
# Using a private random.Random instance leaves the global `random` state untouched.
# Re-running only this cell reshuffles the already-shuffled list; re-run from the
# cell that builds LABELED_DATA to get the canonical order back.
# note: in future, consider shuffling pos/neg separately
SHUFFLE_SEED: int = 42
random.Random(SHUFFLE_SEED).shuffle(LABELED_DATA)

In [8]:
# Total number of labeled instances; the split indices and the pickle file names
# in the following cells are derived from it.
DATASET_SIZE: int = len(LABELED_DATA)
print(DATASET_SIZE)

20


In [9]:
# The first 85% of the shuffled instances (17/20, using integer arithmetic)
# form the training split.
DATASET_TRAIN: list[dict[str, Union[str, list[dict[str, Union[str, int]]]]]] = (
    LABELED_DATA[:DATASET_SIZE * 17 // 20]
)
DATASET_TRAIN_SIZE = len(DATASET_TRAIN)
print(DATASET_TRAIN_SIZE)

17


In [10]:
# Everything from the 85% mark onwards (the remaining 15%) is held out for testing.
DATASET_TEST = LABELED_DATA[DATASET_SIZE * 17 // 20:]
DATASET_TEST_SIZE = len(DATASET_TEST)

# Report the sizes of both splits side by side.
print(f"Number of instances for training: {DATASET_TRAIN_SIZE}")
print(f"Number of instances for testing: {DATASET_TEST_SIZE}")

Number of instances for training: 17
Number of instances for testing: 3


In [11]:
# Save instances for training (train and eval) and testing.
# The file names record the size of each split and of the whole dataset.
TRAIN_FP: str = f"labeled_data/train_{DATASET_TRAIN_SIZE}_of_{DATASET_SIZE}.pickle"
TEST_FP: str = f"labeled_data/test_{DATASET_TEST_SIZE}_of_{DATASET_SIZE}.pickle"

# open(..., "wb") does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there). Both files
# live in the same folder, so one call is enough.
os.makedirs(os.path.dirname(TRAIN_FP), exist_ok=True)

with open(TRAIN_FP, "wb") as DATASET_TRAIN_FILE:
    pickle.dump(DATASET_TRAIN, DATASET_TRAIN_FILE)

with open(TEST_FP, "wb") as DATASET_TEST_FILE:
    pickle.dump(DATASET_TEST, DATASET_TEST_FILE)

In [12]:
# Optionally reload the train/test splits from the pickle files at TRAIN_FP / TEST_FP
# instead of using the in-memory lists. With the flag set to False this cell does nothing.
# NOTE: pickle.load can execute arbitrary code, so only load files this notebook produced.
LOAD_TRAINING_PICKLES: bool = False
if LOAD_TRAINING_PICKLES:
    with open(TRAIN_FP, "rb") as DATASET_MODEL_FILE:
        DATASET_TRAIN = pickle.load(DATASET_MODEL_FILE)
    # The same handle name is reused for the second file.
    with open(TEST_FP, "rb") as DATASET_MODEL_FILE:
        DATASET_TEST = pickle.load(DATASET_MODEL_FILE)

# Configure BERT for training

In [13]:
# Load the pretrained tokenizer for the 'bert-base-cased' checkpoint, the same
# checkpoint name the model is initialised from later in this notebook.
TOKENIZER: BertTokenizerFast = BertTokenizerFast.from_pretrained('bert-base-cased')

In [14]:
# Only one entity label in this dataset. The label list printed further down
# (O, B-Citation, I-Citation, L-Citation, U-Citation) suggests LabelSet expands
# it into per-token tags -- confirm in ner_pipeline.labelset.
LABEL_SET: LabelSet = LabelSet(labels=["Citation"])

In [15]:
# TODO: Understand why more results are returned than sent
# (in the recorded run 17 instances go in and 23 examples come out; presumably
# texts longer than tokens_per_batch are split into several examples -- confirm
# in ner_pipeline.dataset_ner.TrainingDataset).

# Tokenize the training instances and align their labels with the tokens.
IL_OD_TRAINING_DATASET: TrainingDataset = TrainingDataset(
    data=DATASET_TRAIN, tokenizer=TOKENIZER, label_set=LABEL_SET, tokens_per_batch=16
)
print(len(IL_OD_TRAINING_DATASET))

23


In [16]:
# Inspect one example: it carries input_ids, attention_mask and labels. In the
# recorded output the padded positions have attention_mask 0 and label -100.
print(IL_OD_TRAINING_DATASET[3])

TrainingExample(input_ids=[117, 119, 119, 191, 132, 25550, 132, 178, 118, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], labels=[0, 0, 0, 0, 0, 0, 0, 0, 0, -100, -100, -100, -100, -100, -100, -100])


In [17]:
# Split the tokenized examples at the 85% mark (17/20, integer arithmetic):
# the head is used for training, the tail for evaluation during training.
IL_OD_NER_TRAIN: list[TrainingExample] = IL_OD_TRAINING_DATASET[
    : len(IL_OD_TRAINING_DATASET) * 17 // 20
]
IL_OD_NER_EVAL = IL_OD_TRAINING_DATASET[
    len(IL_OD_TRAINING_DATASET) * 17 // 20 :
]
print(f"Size of dataset for train: {len(IL_OD_NER_TRAIN)}")
print(f"Size of dataset for eval: {len(IL_OD_NER_EVAL)}")

Size of dataset for train: 19
Size of dataset for eval: 4


In [18]:
# Get the label list: the tag names stored as the values of the dataset's
# ids_to_label mapping.
print(IL_OD_TRAINING_DATASET.label_set.ids_to_label.values())

dict_values(['O', 'B-Citation', 'I-Citation', 'L-Citation', 'U-Citation'])


In [19]:
# Start from the pretrained cased BERT checkpoint and attach a token-classification
# head with one output class per tag in the dataset's label set.
MODEL: BertForTokenClassification = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(IL_OD_TRAINING_DATASET.label_set.ids_to_label),
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [20]:
# Use the library's default training settings; only the first positional argument
# ("test_trainer") is supplied. The recorded training log shows these defaults
# giving 3 epochs with a per-device batch size of 8.
TRAINING_ARGS: TrainingArguments = TrainingArguments("test_trainer")

In [21]:
# Wire the model, the training arguments and the train/eval example lists into a
# Hugging Face Trainer. No tokenizer or metrics function is passed here.
TRAINER: Trainer = Trainer(
    model=MODEL,
    args=TRAINING_ARGS,
    train_dataset=IL_OD_NER_TRAIN,
    eval_dataset=IL_OD_NER_EVAL
)

In [23]:
# Fine-tune the model. In the recorded run this took 9 optimization steps
# (3 epochs over 19 examples at batch size 8).
TRAINER.train()

***** Running training *****
  Num examples = 19
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=9, training_loss=0.8823407491048177, metrics={'train_runtime': 12.4709, 'train_samples_per_second': 4.571, 'train_steps_per_second': 0.722, 'total_flos': 589463128800.0, 'train_loss': 0.8823407491048177, 'epoch': 3.0})

In [24]:
# Save the fine-tuned model; the output directory name embeds NUM_EXAMPLES.
# The recorded log shows config.json and pytorch_model.bin being written.
# NOTE(review): the tokenizer was not passed to the Trainer, so presumably it is
# not saved alongside the model -- confirm before loading this directory elsewhere.
TRAINER.save_model(f"bert_ner_il_od-with-gpu-{NUM_EXAMPLES}.model")

Saving model checkpoint to bert_ner_il_od-with-gpu-10.model
Configuration saved in bert_ner_il_od-with-gpu-10.model/config.json
Model weights saved in bert_ner_il_od-with-gpu-10.model/pytorch_model.bin
