#### After we separated the labels from GIANT text files using GIANTprocessor, we saved the text file as XXX_citation_text.txt and XXX_citation_label.txt. Then we created a bert_giant dirtectory where we would place all the processed GIANT files which contains only text. We are using those processed GIANT text files to create a BERTcitation-custom tokenizer.

In [45]:
from pathlib import Path
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer
from bs4 import BeautifulSoup
import glob
import pandas as pd

In [2]:
paths = [str(x) for x in Path('./bert_giant').glob('**/*.txt')]
paths

['bert_giant/300K_citation_text.txt',
 'bert_giant/10K_citation_text.txt',
 'bert_giant/1k_citation_text.txt',
 'bert_giant/200K_citation_text.txt',
 'bert_giant/100K_citation_text.txt',
 'bert_giant/22k_citation_text.txt',
 'bert_giant/1M_citation_text.txt',
 'bert_giant/500_citation_text.txt']

In [35]:
"""
Description:

## During initialization:
a) clean_text — cleans text by removing control characters and replacing all whitespace with spaces.
b) handle_chinese_chars — whether the tokenizer includes spaces around Chinese characters (if found in the dataset).
c) stripe_accents — whether we remove accents, when True this will make é → e, ô → o, etc.
d) lowercase — if True the tokenizer will view capital and lowercase characters as equal; A == a, B == b, etc.

## During train: 
a) vocab_size — the number of tokens in our tokenizer. During later tokenization of text, unknown words will be assigned as [UNK] token which is not ideal. 
We should try to minimize this when possible.

b) min_frequency — minimum frequency for a pair of tokens to be merged.
c) special_tokens — a list of the special tokens that BERT uses.
d) limit_alphabet — maximum number of different characters.
e) workpieces_prefix — the prefix added to pieces of words (like ##board in our earlier examples)

"""

# initialize
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    strip_accents=True,
    lowercase=True
)
# and train
tokenizer.train(files=paths, vocab_size=50_000_000, min_frequency=1,
                limit_alphabet=1000, wordpieces_prefix='##',
                special_tokens=[
                    '[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'])

In [36]:
tokenizer.save_model('./bert_giant', 'BERTcitation')

['./bert_giant/BERTcitation-vocab.txt']

## Test

In [8]:
dataset_path = sorted(glob.glob('/home/mchou001/500citation/*.csv'))

In [11]:
"""
The below block of code will concatanate all the csv files in the downsampled GIANT dataset directories. Each directory contains 219 csv files.
"""
newDataset = []
for file in dataset_path:
    df = pd.read_csv(file, encoding = 'utf-8')
    newDataset.append(df)

concat_files = pd.concat(newDataset, axis = 0 , ignore_index=True)
citationString = concat_files['citationStringAnnotated']
citationString.to_csv("500_citation.txt", header=False, index=None)
print(len(citationString))

505


In [12]:
citationList = []
for line in citationString:
    new_start_tag = "<citation> "
    new_end_tag = " </citation>"
    if line.startswith(""):
        citation_tag = new_start_tag + line + new_end_tag ## adding a new tag to each citation string to separate from each other
        citationList.append(citation_tag)

In [37]:
tokenizer = BertTokenizer.from_pretrained('./bert_giant/BERTcitation-vocab.txt')



In [38]:
soup = BeautifulSoup(str(citationList), "html.parser")

## Example

In [39]:
encoding1 = tokenizer.encode("1978 New Names in Volume 52 Journal of Helminthology 52 04 December 389 http://dx.doi.org/10.1017/s0022149x00017338 10.1017/s0022149x00017338")

In [40]:
print(tokenizer.convert_ids_to_tokens(encoding1))

['[CLS]', '1978', 'new', 'names', 'in', 'volume', '52', 'journal', 'of', 'helminthology', '52', '04', 'december', '389', 'http', ':', '/', '/', 'dx', '.', 'doi', '.', 'org', '/', '10', '.', '1017', '/', 's0022149x00017338', '10', '.', '1017', '/', 's0022149x00017338', '[SEP]']


In [41]:
encoding2 = tokenizer.encode("Matthew W. Veal., Scott A. Shearer., John P. Fulton. Improved Mass Flow Sensing for Yield Monitoring in Grain Combines 2004, Ottawa, Canada August 1 - 4, 2004 American Society of Agricultural and Biological Engineers 2004")

In [42]:
print(tokenizer.convert_ids_to_tokens(encoding2))

['[CLS]', 'matthew', 'w', '.', 'veal', '.', ',', 'scott', 'a', '.', 'shearer', '.', ',', 'john', 'p', '.', 'fulton', '.', 'improved', 'mass', 'flow', 'sensing', 'for', 'yield', 'monitoring', 'in', 'grain', 'combines', '2004', ',', 'ottawa', ',', 'canada', 'august', '1', '-', '4', ',', '2004', 'american', 'society', 'of', 'agricultural', 'and', 'biological', 'engineers', '2004', '[SEP]']


In [43]:
encoding3 = tokenizer.encode("McMillan, Donald C, Naveed Sattar, Dinesh Talwar, Denis St.J O’Reilly, and Colin S McArdle 2000 “Changes in Micronutrient Concentrations Following Anti-Inflammatory Treatment in Patients with Gastrointestinal Cancer.” Nutrition 16 6 Elsevier BV 425–428 10.1016/s0899-9007(00)00270-7")

In [44]:
print(tokenizer.convert_ids_to_tokens(encoding3))

['[CLS]', 'mcmillan', ',', 'donald', 'c', ',', 'naveed', 'sattar', ',', 'dinesh', 'talwar', ',', 'denis', 'st', '.', 'j', 'o', '’', 'reilly', ',', 'and', 'colin', 's', 'mcardle', '2000', '“', 'changes', 'in', 'micronutrient', 'concentrations', 'following', 'anti', '-', 'inflammatory', 'treatment', 'in', 'patients', 'with', 'gastrointestinal', 'cancer', '.', '”', 'nutrition', '16', '6', 'elsevier', 'bv', '425', '–', '428', '10', '.', '1016', '/', 's0899', '-', '9007', '(', '00', ')', '00270', '-', '7', '[SEP]']


## BERT Embedding Example

In [26]:
embedding = tokenizer("1978 New Names in Volume 52 Journal of Helminthology 52 04 December 389 http://dx.doi.org/10.1017/s0022149x00017338 10.1017/s0022149x00017338")
embedding

{'input_ids': [2, 2914, 1737, 6807, 1408, 2845, 2546, 1464, 1404, 95243, 2546, 2552, 3315, 6948, 1439, 30, 19, 19, 1441, 18, 1440, 18, 1433, 19, 1400, 18, 2565, 19, 491248, 1400, 18, 2565, 19, 491248, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [27]:
tokenizer.decode(embedding["input_ids"])

'[CLS] 1978 new names in volume 52 journal of helminthology 52 04 december 389 http : / / dx. doi. org / 10. 1017 / s0022149x00017338 10. 1017 / s0022149x00017338 [SEP]'