In [1]:
# standard library and third-party imports
import sys
sys.path.append('../BERT_geoparser/')
import numpy as np
import pandas as pd
from sklearn.utils import class_weight
# local imports
from train_model import Trainer
from tokenizer import Tokenizer
from data import Data
from model import BertModel
from analysis import Results


# 1. Fine-tuning a BERT language model on NER data
In this notebook we use the `BERT_geoparser` package to build and fine-tune a BERT model to perform Named Entity Recognition (NER) tasks. This is the first step in a multi-step process to build and train a BERT model to identify target and incidental locations within text.

We use an NER dataset labelled using the B-I-O format, with 8 categories of words - location (`geo`), time (`tim`), organization (`org`), person (`per`), geo-political entity (`gpe`), art/culture (`art`), event (`eve`) or nature (`nat`). Each tag can indicate whether a word is the *beginning* of a related phrase (`B`) or *inside* a phrase (`I`). Words which do not belong to any category are given the *outside* tag (`O`). Special tokens indicating the start (`CLS`) and end (`SEP`) of a sentence are also added. For example, the phrase:

<p style="text-align: center;"><span style="color:red">Jane</span> visited <span style="color:green">Madison Square Garden</span> while in <span style="color:yellow">New York</span>.</p>

Would receive the tags:

<p style="text-align: center;"> [CLS] <span style="color:red"> [B-PER] </span> [O] <span style="color:green">[B-ORG] [I-ORG] [I-ORG]</span> [O] [O] <span style="color:yellow">[B-GEO] [I-GEO]</span> [SEP] </p>

The fine-tuned BERT model can then estimate the most likely sequence of tags for a given sentence, and can provide the confidence on the given tags.

## 1.1 Initial training on the CoNLL dataset
We will initially train the model on the CoNLL-2003 dataset, before retraining the same model on the WikiNeural dataset. This will provide the model with a large number of training examples, while ensuring it is optimized to handle Wikipedia-style data. This can also be done by running the script `train_model_conll2003.sh` in a Linux Torque environment.

Running this cell will train and test the model. The precision, recall and F1 score in each category are given in the `CoNLL_test_results.txt` file.


In [None]:
# Stage 1: fine-tune on the CoNLL training set. No previously saved
# checkpoint is supplied for this first stage (saved_model=False).
trainer = Trainer(data_path=r'../data/NB1/train_CoNLL_dataset.csv',
                  model_size='large',
                  cased=True,
                  learning_rate=2e-6,
                  max_len=80,
                  saved_model=False)

# The raw-string prefix must sit outside the quotes; inside them the path
# would start with a literal "r" and the checkpoint would not be written
# under ../models/.
trainer.train(save_as=r'../models/TopoBERT_CoNLL.hdf5',
              n_epochs=20,
              batch_size=4,
              val_split=0.1)

# Evaluate on the held-out CoNLL test set; the results file is the one the
# notebook text refers to as `CoNLL_test_results.txt`.
trainer.test(test_data=r'../data/NB1/test_CoNLL_dataset.csv',
             results_filename=r'../results/NB1/CoNLL_test_results.txt')

## 1.2 Retraining on the WikiNeural dataset
We can now retrain the saved model on Wikipedia data, using the WikiNeural dataset. Once again, this can be run using `train_model_wikineural.sh`. Test results are given in `wikineural_test_results.txt`.

In [3]:
# Stage 2: continue training on the WikiNeural data, starting from the
# CoNLL-trained checkpoint rather than from scratch.
trainer = Trainer(
    # Training data sits in the same ../data/NB1/ directory as the
    # WikiNeural test set used below.
    data_path=r'../data/NB1/wikineural_train_dataset.csv',
    # Same spelling of model_size as the CoNLL stage ('large').
    model_size='large',
    cased=True,
    learning_rate=2e-6,
    max_len=80,
    # Checkpoint of the CoNLL stage (section 1.1).
    # NOTE(review): confirm this file name matches what that stage wrote.
    saved_model=r'../models/TopoBERT_CoNLL.hdf5')

trainer.train(save_as=r'../models/TopoBERT_WikiNeural.hdf5',
              n_epochs=5,
              batch_size=2,
              val_split=0.1)

# Evaluate on the WikiNeural test set; the results file is the one the
# notebook text refers to as `wikineural_test_results.txt`.
trainer.test(test_data=r'../data/NB1/wikineural_test_dataset.csv',
             results_filename=r'../results/NB1/wikineural_test_results.txt')