In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
import requests
import random
import pickle
from bs4 import BeautifulSoup
from internetarchive import search_items, get_item, Search

from torch.utils.data.dataloader import DataLoader
from transformers import BertForTokenClassification, AdamW, BertTokenizer, BertTokenizerFast, BatchEncoding, TrainingArguments, Trainer
from dataclasses import dataclass
from typing import List

In [4]:
from ner_pipeline.scrape_for_training import do_search, prepare_data
from ner_pipeline.containers import TraingingBatch
from ner_pipeline.dataset_ner import TrainingDataset
from ner_pipeline.labelset import LabelSet

In [5]:
# Do search
il_od: str = "iliad OR odyssey AND mediatype:texts"  # 771,646 with full_text_search, 6240 without
search_res: Search = do_search(keyword_string=il_od)

Search string: iliad OR odyssey AND mediatype:texts
Results: 539339


In [6]:
pattern = r'Iliad\s\d{1,2}\.\d{1,4}|Il\.*\s\d{1,2}\.\d{1,4}|Iliad\s.[ivxlcdm]*\.\s*\d{1,4}| \
            Il\.*\s.[ivxlcdm]*\.\s*\d{1,4}|book\s*.[ivxlcdm]\.\sline\s*\d{1,4}| \
            Odyssey\s\d{1,2}\.\d{1,4}|Od\.*\s\d{1,2}\.\d{1,4}|Odyssey\s.[ivxlcdm]*\.\s*\d{1,4}| \
            Od\.*\s.[ivxlcdm]*\.\s*\d{1,4}'

In [7]:
labeled_data = prepare_data(search_res, pattern, num_of_pos = 100, num_of_neg = 100)

Successfully got 100 positive data and 100 negative data by scraping 86 books!


In [8]:
print(len(labeled_data))

200


In [9]:
random.shuffle(labeled_data)

In [10]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
label_set = LabelSet(labels=["Citation"]) #Only one label in this dataset

In [11]:
il_od_ner_trainingData = TrainingDataset(
    data=labeled_data, tokenizer=tokenizer, label_set=label_set, tokens_per_batch=16
)

In [12]:
print(il_od_ner_trainingData[:3])

[TrainingExample(input_ids=[11345, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], labels=[0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]), TrainingExample(input_ids=[170, 2246, 1942, 3292, 12607, 3292, 4894, 172, 1643, 119, 152, 1181, 119, 1407, 119, 5539], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], labels=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 3]), TrainingExample(input_ids=[170, 1964, 2346, 1182, 1107, 12353, 117, 1290, 170, 1964, 1708, 23140, 10182, 1144, 1103, 175], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], labels=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])]


In [13]:
dataset_size = len(il_od_ner_trainingData)
dataset_train = il_od_ner_trainingData[:dataset_size*7//10]
dataset_train_size = len(dataset_train)
dataset_eval = il_od_ner_trainingData[dataset_size*7//10:dataset_size*7//10+dataset_size*3//20]
dataset_eval_size = len(dataset_eval)
dataset_test = il_od_ner_trainingData[dataset_size*7//10+dataset_size*3//20:]
dataset_test_size = len(dataset_test)

In [14]:
# Save train/eval/test
with open("labeled_data/train_" + str(dataset_train_size) + "_of_" + str(dataset_size) + ".pickle", "wb") as dataset_train_file:
    pickle.dump(dataset_train, dataset_train_file)

# # load the object
# with open("labeled_data/train_" + str(dataset_train_size) + "_of_" + str(dataset_size) + ".pickle", "rb") as dataset_train_file:
#     p2 = pickle.load(dataset_train_file)

with open("labeled_data/eval_" + str(dataset_eval_size) + "_of_" + str(dataset_size) + ".pickle", "wb") as dataset_eval_file:
    pickle.dump(dataset_eval, dataset_eval_file)

with open("labeled_data/test_" + str(dataset_test_size) + "_of_" + str(dataset_size) + ".pickle", "wb") as dataset_test_file:
    pickle.dump(dataset_test, dataset_test_file)

In [15]:
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased", num_labels=len(il_od_ner_trainingData.label_set.ids_to_label.values())
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [16]:
training_args = TrainingArguments("test_trainer")

In [17]:
trainer = Trainer(
    model=model, args=training_args, train_dataset=dataset_train, eval_dataset=dataset_eval
)

In [18]:
trainer.train()

***** Running training *****
  Num examples = 213
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 81


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=81, training_loss=0.27005383997787663, metrics={'train_runtime': 132.1997, 'train_samples_per_second': 4.834, 'train_steps_per_second': 0.613, 'total_flos': 6608191917600.0, 'train_loss': 0.27005383997787663, 'epoch': 3.0})

In [19]:
trainer.save_model('bert_ner_il_od-with-gpu-100.model')

Saving model checkpoint to bert_ner_il_od-with-gpu-100.model
Configuration saved in bert_ner_il_od-with-gpu-100.model/config.json
Model weights saved in bert_ner_il_od-with-gpu-100.model/pytorch_model.bin
