In [None]:
from tensorflow.python.client import device_lib

# Show the description string of every local device that TensorFlow reports as a GPU.
[
    device.physical_device_desc
    for device in device_lib.list_local_devices()
    if device.device_type == 'GPU'
]

['device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0',
 'device: 1, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:05.0, compute capability: 6.0']

In [2]:
# Set GOOGLE_APPLICATION_CREDENTIALS to the key file path for this kernel.
# NOTE(review): storage.Client() in the upload cell is created without explicit
# credentials, so it presumably reads the key from this variable -- confirm.
%env GOOGLE_APPLICATION_CREDENTIALS=/home/key.json

env: GOOGLE_APPLICATION_CREDENTIALS=/home/key.json


In [None]:
# Pin the versions this notebook was last run with (see the install log below)
# so a fresh runtime resolves the same packages. %pip installs into the
# environment of the running kernel, which !pip does not guarantee.
%pip install transformers==4.17.0 tokenizers==0.11.6

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 8.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 55.7 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 58.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 46.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [None]:
import time
import argparse
import os

from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

from transformers import BertConfig, BertForMaskedLM
from transformers import RobertaTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

import torch
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [None]:
def create_tokenizer(files, vocab_size, min_freq, max_len, save_path):
    """Train a byte-level BPE tokenizer and write its files into ``save_path``.

    Args:
        files: Training text file(s), forwarded to ``ByteLevelBPETokenizer.train``.
        vocab_size: Target vocabulary size.
        min_freq: Minimum frequency, forwarded as ``min_frequency``.
        max_len: Maximum sequence length used for truncation.
        save_path: Directory that receives ``vocab.json``, ``merges.txt`` and
            ``tokenizer.json``. A trailing path separator is optional.

    Returns:
        None. The tokenizer is only persisted to disk.
    """
    special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(
        files=files,
        vocab_size=vocab_size,
        min_frequency=min_freq,
        special_tokens=special_tokens,
    )
    tokenizer.save_model(save_path)

    # Reload from the files just written. os.path.join keeps this correct
    # whether or not save_path ends with a separator (plain string
    # concatenation produced e.g. "dirvocab.json" without one).
    tokenizer = ByteLevelBPETokenizer(
        os.path.join(save_path, "vocab.json"),
        os.path.join(save_path, "merges.txt"),
    )
    # Wrap every encoded sequence as <s> ... </s>.
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=max_len)

    tokenizer.save(os.path.join(save_path, "tokenizer.json"))

In [None]:
class LegalDataset(Dataset):
    """Dataset over already-tokenized text.

    The object passed as ``text`` is stored unchanged in ``self.encodings``
    and is read through ``.iloc``, so it must support positional ``.iloc``
    indexing (``process_text`` passes a pandas Series of token-id lists).
    """

    def __init__(self, text):
        self.encodings = text

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, index):
        # Only "input_ids" is provided for each example.
        token_ids = self.encodings.iloc[index]
        return {"input_ids": torch.tensor(token_ids)}


def process_text(filename, name, map_tokenize, encoding):
    """Tokenize a text file line by line and save it as a ``LegalDataset``.

    Args:
        filename: Path of the text file to read; each line becomes one example.
        name: Base name (without extension) of the dataset file to write.
        map_tokenize: Callable applied to every line to encode it.
        encoding: Text encoding used to open ``filename``.

    Returns:
        Path of the saved dataset: ``<directory of filename>/<name>.pt``.
    """
    print("Opening file...")
    # The context manager closes the file even if reading raises.
    with open(filename, "r", encoding=encoding) as file:
        lines = file.readlines()  # line endings are kept, as before

    tqdm.pandas(desc="Tokenizing")
    encoded = pd.Series(lines).progress_map(map_tokenize)

    # Write the dataset next to the input file. os.path yields the same path
    # as the previous manual split on "/" and also handles a bare file name.
    dataset_path = os.path.join(os.path.dirname(filename), name + ".pt")
    torch.save(LegalDataset(encoded), dataset_path)
    return dataset_path

In [None]:
from types import SimpleNamespace

# All run settings in one place; later cells read them as attributes of `args`.
args = SimpleNamespace(**{
    # Tokenizer and raw corpus (tokenizer=None makes a later cell train a new one)
    "tokenizer": '/content/model/tokenizer/',
    "files": '/content/text/all_in_one.txt',
    "encoding": 'utf8',
    "vocab_size": 32000,
    "min_freq": 2,
    "sequence_len": 512,
    # Model output and tokenized dataset (dataset=None makes a later cell re-tokenize `files`)
    "model_path": '/content/model',
    "dataset": '/content/text/new_dataset.pt',
    "dataset_name": 'new_dataset',
    "mlm_prob": 0.15,
    # Model architecture
    "hidden_layers": 12,
    "hidden_size": 768,
    "attention_heads": 12,
    # Optimization
    "epochs": 40,
    "batch_size": 8,
    "max_steps": 0,
    "lrate": 1e-4,
    "b1": 0.9,
    "b2": 0.99,
    "wdecay": 0.01,
    "scheduler": 'linear',
    "warmup_steps": 10_000,
    # Checkpointing (resume=None makes the training cell start from scratch)
    "checkpoint": '/content/checkpoints',
    "save_steps": 10_000,
    "save_limit": 5,
    "resume": '/content/checkpoints/checkpoint-480000',
})

In [None]:
# Create directories
# exist_ok=True tolerates an already existing directory, and missing parents
# are created too. Any other failure (e.g. permissions) now surfaces here
# instead of being silently ignored.
os.makedirs(args.model_path, exist_ok=True)

In [None]:
# Create Tokenizer
# Train a new tokenizer only when no existing tokenizer directory is configured.
if args.tokenizer is None:
    print("Creating new tokenizer")
    # The empty last component keeps the trailing separator, so code that
    # appends file names directly to this path keeps working.
    tokenizer_path = os.path.join(args.model_path, "tokenizer", "")
    os.makedirs(tokenizer_path, exist_ok=True)
    create_tokenizer(args.files, args.vocab_size, args.min_freq, args.sequence_len, tokenizer_path)
else:
    tokenizer_path = args.tokenizer
    print("Using tokenizer from", tokenizer_path)

Using tokenizer from /content/model/tokenizer/


In [None]:
# Load Tokenizer
# model_max_length is the documented keyword for the maximum input length;
# `max_len` is only accepted as a legacy alias for it.
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path, model_max_length=args.sequence_len)

In [None]:
# Tokenizing function applied to each line of text
def map_tokenize(text):
    """Encode ``text`` with the notebook-level ``tokenizer``, truncated to ``args.sequence_len``."""
    encode_options = {"max_length": args.sequence_len, "truncation": True}
    return tokenizer.encode(text, **encode_options)

In [None]:
# Process Text
# Tokenize the raw corpus only when no pre-built dataset file is configured.
if args.dataset is None:
    print("Processing text")
    dataset_path = process_text(args.files, args.dataset_name, map_tokenize, args.encoding)
else:
    dataset_path = args.dataset
    print("Using dataset from", dataset_path)

# Load Dataset
# NOTE: the .pt file is a pickled LegalDataset written by torch.save, so
# torch.load unpickles it. Only load files you trust, and keep the
# LegalDataset class defined before this cell runs.
dataset = torch.load(dataset_path)

Using dataset from /content/text/new_dataset.pt


In [None]:
# Create the data collator for masked-language-model training (mlm=True),
# using args.mlm_prob as the masking probability
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=args.mlm_prob,
)
data_collator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizer(name_or_path='/content/model/tokenizer/', vocab_size=32000, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)}), mlm=True, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_ten

In [None]:
# Model configuration: a BERT encoder sized from `args`.
config = BertConfig(
    vocab_size=args.vocab_size,
    max_position_embeddings=args.sequence_len,
    num_hidden_layers=args.hidden_layers,    #L
    hidden_size=args.hidden_size,        #H
    num_attention_heads=args.attention_heads,  #A
    type_vocab_size=1,
    # BertConfig defaults pad_token_id to 0, but the tokenizer's vocabulary is
    # RoBERTa-style ("<s>" is listed first, "<pad>" second). Without this the
    # "<s>" embedding is used as the padding row and never receives gradient.
    # NOTE(review): this changes which embedding row is frozen when resuming
    # from a checkpoint trained with the old default -- confirm that is wanted.
    pad_token_id=tokenizer.pad_token_id,
)


# Freshly initialised masked-language model built from the configuration above.
model = BertForMaskedLM(config=config)

In [None]:
# Training configuration, grouped by concern; every value comes from `args`.
training_args = TrainingArguments(
    # Run length and batch size
    num_train_epochs=args.epochs,
    max_steps=args.max_steps,
    per_device_train_batch_size=args.batch_size,
    # Optimizer and learning-rate schedule
    learning_rate=args.lrate,
    adam_beta1=args.b1,
    adam_beta2=args.b2,
    weight_decay=args.wdecay,
    lr_scheduler_type=args.scheduler,
    warmup_steps=args.warmup_steps,
    # Checkpointing
    output_dir=args.checkpoint,
    overwrite_output_dir=True,
    save_steps=args.save_steps,
    save_total_limit=args.save_limit,
    # Evaluation output
    prediction_loss_only=True,
)

In [None]:
# Bundle the model, training configuration, dataset and collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [None]:
# Train
# Start from scratch unless a checkpoint directory to resume from is configured.
if args.resume is None:
    print("Pre-training BERT model")
    trainer.train()
else:
    print("Pre-training BERT model from checkpoint", args.resume)
    trainer.train(resume_from_checkpoint=args.resume)

# Save model
print("Saving model at", args.model_path)
trainer.save_model(args.model_path)

Loading model from /content/checkpoints/checkpoint-480000).


Pre-training BERT model from checkpoint /content/checkpoints/checkpoint-480000


***** Running training *****
  Num examples = 214315
  Num Epochs = 40
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 535800
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 35
  Continuing training from global step 480000
  Will skip the first 35 epochs then the first 11175 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/11175 [00:00<?, ?it/s]



Step,Training Loss
480500,1.9681
481000,1.955
481500,1.9738
482000,1.9642
482500,1.948
483000,1.9583
483500,1.955
484000,1.9436
484500,1.9602
485000,1.9504


Saving model checkpoint to /content/checkpoints/checkpoint-490000
Configuration saved in /content/checkpoints/checkpoint-490000/config.json
Model weights saved in /content/checkpoints/checkpoint-490000/pytorch_model.bin
Deleting older checkpoint [/content/checkpoints/checkpoint-440000] due to args.save_total_limit
Saving model checkpoint to /content/checkpoints/checkpoint-500000
Configuration saved in /content/checkpoints/checkpoint-500000/config.json
Model weights saved in /content/checkpoints/checkpoint-500000/pytorch_model.bin
Deleting older checkpoint [/content/checkpoints/checkpoint-450000] due to args.save_total_limit
Saving model checkpoint to /content/checkpoints/checkpoint-510000
Configuration saved in /content/checkpoints/checkpoint-510000/config.json
Model weights saved in /content/checkpoints/checkpoint-510000/pytorch_model.bin
Deleting older checkpoint [/content/checkpoints/checkpoint-460000] due to args.save_total_limit
Saving model checkpoint to /content/checkpoints/chec

Saving model at /content/model


Model weights saved in /content/model/pytorch_model.bin


In [4]:
from google.cloud import storage

client = storage.Client()
input_bucket = client.get_bucket('oceanic-ner-model')

# (destination blob name, local file) pairs to upload.
# The tokenizer writes its merges as merges.txt; the previous local path
# ".../merges.json" does not match any file the notebook produces.
uploads = [
    ('bert/pytorch_model.bin', '/content/model/pytorch_model.bin'),
    ('bert/training_args.bin', '/content/model/training_args.bin'),
    ('bert/config.json', '/content/model/config.json'),
    ('bert/tokenizer/merges.txt', '/content/model/tokenizer/merges.txt'),
    ('bert/tokenizer/tokenizer.json', '/content/model/tokenizer/tokenizer.json'),
    ('bert/tokenizer/vocab.json', '/content/model/tokenizer/vocab.json'),
]

for blob_name, local_path in uploads:
    input_bucket.blob(blob_name).upload_from_filename(local_path)