# Group 6 Question 6 - Huggingface BertWordPieceTokenizer
Clarissa Cheam, Kavya Jaganathan and Ayushi Mishra

In [None]:
!pip install tokenizers
!pip install transformers
from tokenizers import BertWordPieceTokenizer
import os
from transformers import BertTokenizer
import math
from collections import Counter
from tabulate import tabulate
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
from nltk.corpus import stopwords

Collecting tokenizers
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 4.3 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.11.4
Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 4.3 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 32.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 44.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |███████████████

# **Training Bert Model**

UPLOAD SOURCE TEXT AT ROOT

In [None]:
paths = ["/content/source_text.txt"]

tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False
)

tokenizer.train(files=paths, vocab_size=5000, min_frequency=3,
                limit_alphabet=1000, wordpieces_prefix="##",
                special_tokens=[
                  '[PAD]','[UNK]','[CLS]','[SEP]', '[MASK]'
                ])

os.mkdir('./bert-it')
tokenizer.save_model('./bert-it','bert-it')


['./bert-it/bert-it-vocab.txt']

Loading saved model

In [None]:
tokenizer = BertTokenizer.from_pretrained('/content/bert-it/bert-it-vocab.txt')



# Tokenize Source Text

In [None]:
with open('/content/source_text.txt', 'r') as file:
            lines = file.read().replace('\n', '')
corpus = tokenizer(lines)

# Split Data into train test and val

In [None]:
train_len = math.floor(0.80*len(corpus['input_ids']))
val_len = math.ceil(0.10*len(corpus['input_ids']))

corpus_train = dict()
corpus_val = dict()
corpus_test = dict()

ii = corpus['input_ids']
tti = corpus['token_type_ids']
am = corpus['attention_mask']

corpus_train['input_ids'] = ii[0:train_len]
corpus_train['token_type_ids'] = tti[0:train_len]
corpus_train['attention_mask'] = am[0:train_len]

corpus_val['input_ids'] = ii[train_len:train_len + val_len]
corpus_val['token_type_ids'] = tti[train_len:train_len + val_len]
corpus_val['attention_mask'] = am[train_len:train_len + val_len]

corpus_test['input_ids'] = ii[train_len + val_len:]
corpus_test['token_type_ids'] = tti[train_len + val_len:]
corpus_test['attention_mask'] = am[train_len + val_len:]


# Build Term Frequency Dictionary

In [None]:
with open('/content/bert-it/bert-it-vocab.txt', 'r') as fp:
    vocab = fp.read().split('\n')

In [None]:
train_vocabulary = []
for i in corpus_train['input_ids']:
  train_vocabulary.append(vocab[i])

val_vocabulary = []
for i in corpus_val['input_ids']:
  val_vocabulary.append(vocab[i])

test_vocabulary = []
for i in corpus_test['input_ids']:
  test_vocabulary.append(vocab[i])

In [None]:
train_count = dict()
val_count = dict()
test_count = dict()

In [None]:
for token in train_vocabulary:
    if token in train_count:
        train_count[token] = train_count[token] + 1
    else:
        train_count[token] = 1

for token in val_vocabulary:
    if token in val_count:
        val_count[token] = val_count[token] + 1
    else:
        val_count[token] = 1

for token in test_vocabulary:
    if token in test_count:
        test_count[token] = test_count[token] + 1
    else:
        test_count[token] = 1

# Summary Statistics

In [None]:
def summary_statistics(frequency_dict, vocabulary, train=False):
  to_remove = {}
  original_size = len(frequency_dict.keys())
  no_of_out_of_vocab = 5000 - original_size
  #for key, value in frequency_dict.items():
    #if value < 3:
        #tot_no_of_unk += value
        #no_of_out_of_vocab += 1
        #to_remove[key] = value
        #print(key, value)
  #for k,v in to_remove.items():
    #frequency_dict.pop(k)
    #frequency_dict['[UNK]'] += v
    
  if '[UNK]' not in frequency_dict.keys():
    tot_no_of_unk = 0
  else:
    tot_no_of_unk = frequency_dict['[UNK]']

  if not train:
    no_of_types_unk = 0
    for token in vocabulary:
        if token not in frequency_dict.keys():
            no_of_types_unk += 1

  tokens_tag = pos_tag(vocabulary)
  counts = Counter(tag for word, tag in tokens_tag)

  text = ' '.join(vocabulary)
  sentences = text.split(".")
  words = text.split(" ")
  if (sentences[len(sentences) - 1] == ""):
      avg = len(words) / len(sentences) - 1
  else:
      avg = len(words) / len(sentences)

  no_of_stop_words = 0
  stop_words = set(stopwords.words('english'))
  for words in vocabulary:
      if words in stop_words:
          no_of_stop_words += 1

  data = []
  data.append(len(vocabulary))
  # Vocabulary Size
  data.append(original_size)
  # Num of UNK tokens
  data.append(tot_no_of_unk)
  # Number of OOV
  data.append(no_of_out_of_vocab)
  # Number of types mapped to UNK
  if train:
      data.append('x')
  else:
      data.append(no_of_types_unk)
  # Number of Stop Words
  data.append(no_of_stop_words)
  # Avg sentence length
  data.append(avg)
  # POS Tagging
  data.append(counts)

  return data

In [None]:
d_train = summary_statistics(train_count, train_vocabulary, train=True)

In [None]:
d_val = summary_statistics(val_count, val_vocabulary, train=False)

In [None]:
d_test = summary_statistics(test_count, test_vocabulary, train=False)

In [None]:
def print_stats(d_train, d_val, d_test):
  table = [['data', 'i', 'ii', 'iii', 'iv', 'v', 'vi', 'Custom Metric 1- Average Sentence Length'],
            ['Train'] + d_train[0:7], ['Validation'] + d_val[0:7],
            ['Test'] + d_test[0:7]]
  print(tabulate(table, headers='firstrow', tablefmt='grid'))
  keys_train = []
  val_train = []
  keys_val = []
  val_val = []
  keys_test = []
  val_test = []
  for k, v in d_train[7].items():
      keys_train.append(k)
      val_train.append(v)
  for k, v in d_val[7].items():
      keys_val.append(k)
      val_val.append(v)
  for k, v in d_test[7].items():
      keys_test.append(k)
      val_test.append(v)
  table_train = [keys_train[0:10], val_train[0:10]]
  table_val = [keys_val[0:10], val_val[0:10]]
  table_test = [keys_test[0:10], val_test[0:10]]
  print('vii - Train')
  print(tabulate(table_train, headers='firstrow', tablefmt='grid'))
  print('vii - Validation')
  print(tabulate(table_val, headers='firstrow', tablefmt='grid'))
  print('vii - Test')
  print(tabulate(table_test, headers='firstrow', tablefmt='grid'))

In [None]:
print_stats(d_train, d_val, d_test)

+------------+--------+------+-------+------+-----+--------+--------------------------------------------+
| data       |      i |   ii |   iii |   iv | v   |     vi |   Custom Metric 1- Average Sentence Length |
| Train      | 699822 | 3359 |     6 | 1641 | x   | 164965 |                                    28.3742 |
+------------+--------+------+-------+------+-----+--------+--------------------------------------------+
| Validation |  87478 | 2905 |     0 | 2095 | 0   |  20933 |                                    31.3991 |
+------------+--------+------+-------+------+-----+--------+--------------------------------------------+
| Test       |  87478 | 2936 |     0 | 2064 | 0   |  21622 |                                    32.0902 |
+------------+--------+------+-------+------+-----+--------+--------------------------------------------+
vii - Train
+-------+--------+--------+-------+------+------+-------+------+-------+-------+
|    JJ |    NNP |     NN |    IN |    ( |    : |     , |  