In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
import requests
import random
import pickle
from bs4 import BeautifulSoup
from internetarchive import search_items, get_item, Search

from torch.utils.data.dataloader import DataLoader
from transformers import BertForTokenClassification, AdamW, BertTokenizer, BertTokenizerFast, BatchEncoding, TrainingArguments, Trainer
from dataclasses import dataclass
from typing import List

In [4]:
from ner_pipeline.scrape_for_training import do_search, prepare_data
from ner_pipeline.containers import TraingingBatch
from ner_pipeline.dataset_ner import TrainingDataset
from ner_pipeline.labelset import LabelSet

In [5]:
# Do search
il_od: str = "iliad OR odyssey AND mediatype:texts"  # 771,646 with full_text_search, 6240 without
search_res: Search = do_search(keyword_string=il_od)

Search string: iliad OR odyssey AND mediatype:texts
Results: 539569


In [6]:
pattern = r'Iliad\s\d{1,2}\.\d{1,4}|Il\.*\s\d{1,2}\.\d{1,4}|Iliad\s.[ivxlcdm]*\.\s*\d{1,4}| \
            Il\.*\s.[ivxlcdm]*\.\s*\d{1,4}|book\s*.[ivxlcdm]\.\sline\s*\d{1,4}| \
            Odyssey\s\d{1,2}\.\d{1,4}|Od\.*\s\d{1,2}\.\d{1,4}|Odyssey\s.[ivxlcdm]*\.\s*\d{1,4}| \
            Od\.*\s.[ivxlcdm]*\.\s*\d{1,4}'

In [7]:
labeled_data = prepare_data(search_res, pattern, num_of_pos = 10000, num_of_neg = 10000)

Successfully got 100 positive data and 100 negative data by scraping 87 books!


In [8]:
print(len(labeled_data))

200


In [9]:
random.shuffle(labeled_data)

In [21]:
dataset_size = len(labeled_data)

dataset_train = labeled_data[:dataset_size*17//20]
dataset_train_size = len(dataset_train)

# dataset_eval = labeled_data[dataset_size*7//10+dataset_size*3//20:]
# dataset_eval_size = len(dataset_eval)

dataset_test = labeled_data[dataset_size*17//20:]
dataset_test_size = len(dataset_test)

print("Number of instances for training: " + str(dataset_train_size))
print("Number of instances for testing: " + str(dataset_test_size))

Number of instances for training: 170
Number of instances for testing: 30


In [23]:
# Save instances for training and testing
with open("labeled_data/train_" + str(dataset_train_size) + "_of_" + str(dataset_size) + ".pickle", "wb") as dataset_train_file:
    pickle.dump(dataset_train, dataset_train_file)

with open("labeled_data/test_" + str(dataset_test_size) + "_of_" + str(dataset_size) + ".pickle", "wb") as dataset_test_file:
    pickle.dump(dataset_test, dataset_test_file)

In [11]:
# # load the object
# with open("labeled_data/model_" + str(dataset_model_size) + "_of_" + str(dataset_size) + ".pickle", "rb") as dataset_model_file:
#     p2 = pickle.load(dataset_model_file)
# print(p2)

In [24]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
label_set = LabelSet(labels=["Citation"]) #Only one label in this dataset

loading file https://huggingface.co/bert-base-cased/resolve/main/vocab.txt from cache at /Users/ave/.cache/huggingface/transformers/6508e60ab3c1200bffa26c95f4b58ac6b6d95fba4db1f195f632fa3cd7bc64cc.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791
loading file https://huggingface.co/bert-base-cased/resolve/main/tokenizer.json from cache at /Users/ave/.cache/huggingface/transformers/226a307193a9f4344264cdc76a12988448a25345ba172f2c7421f3b6810fddad.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6d9a6
loading file https://huggingface.co/bert-base-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-cased/resolve/main/tokenizer_config.json from cache at /Users/ave/.cache/huggingface/transformers/ec84e86ee39bfe112543192cf981deebf7e6cbe8c91b8f7f8f63c9be44366158.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed

In [32]:
il_od_ner_trainingData = TrainingDataset(
    data=dataset_train, tokenizer=tokenizer, label_set=label_set, tokens_per_batch=16
)
# print(len(il_od_ner_trainingData))

In [33]:
print(il_od_ner_trainingData[:1])

[TrainingExample(input_ids=[155, 2036, 2559, 119, 8784, 2249, 11414, 9314, 14569, 17243, 11780, 117, 141, 119, 171, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], labels=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100])]


In [36]:
il_od_ner_train = il_od_ner_trainingData[:len(il_od_ner_trainingData)*7//10]
il_od_ner_eval = il_od_ner_trainingData[len(il_od_ner_trainingData)*7//10:]
print("Size of dataset for training: " + str(len(il_od_ner_train)))
print("Size of dataset for eval: " + str(len(il_od_ner_eval)))

Size of dataset for training: 179
Size of dataset for eval: 77


In [37]:
# # Get the label list
# print(il_od_ner_trainingData.label_set.ids_to_label.values())

In [38]:
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased", num_labels=len(il_od_ner_trainingData.label_set.ids_to_label.values())
)

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /Users/ave/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tran

In [30]:
training_args = TrainingArguments("test_trainer")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [39]:
trainer = Trainer(
    model=model, args=training_args, train_dataset=il_od_ner_train, eval_dataset=il_od_ner_eval
)

In [40]:
trainer.train()

***** Running training *****
  Num examples = 179
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 69


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=69, training_loss=0.3468679621599723, metrics={'train_runtime': 112.3489, 'train_samples_per_second': 4.78, 'train_steps_per_second': 0.614, 'total_flos': 5553363160800.0, 'train_loss': 0.3468679621599723, 'epoch': 3.0})

In [41]:
trainer.save_model('bert_ner_il_od-with-gpu-10000.model')

Saving model checkpoint to bert_ner_il_od-with-gpu-100.model
Configuration saved in bert_ner_il_od-with-gpu-100.model/config.json
Model weights saved in bert_ner_il_od-with-gpu-100.model/pytorch_model.bin
