In [1]:
import sys
sys.executable
import os

In [2]:
os.environ["JAVA_HOME"] = "C:\Java\jdk1.8.0_221"

In [3]:
# !{sys.executable} -m pip install --upgrade --use-feature=2020-resolver torch 
# !{sys.executable} -m pip install --upgrade --use-feature=2020-resolver tensorflow
# !{sys.executable} -m pip install --upgrade --use-feature=2020-resolver transformers 
# !{sys.executable} -m pip install --use-feature=2020-resolver tokenizers
# !{sys.executable} -m pip install --upgrade --use-feature=2020-resolver datasets
# !{sys.executable} -m pip install --upgrade nltk
# !{sys.executable} -m pip install bert_score
# !{sys.executable} -m pip list

In [4]:
import numpy

import glob
from pathlib import Path
import csv 

import torch 

from transformers import pipeline
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import DistilBertForMaskedLM, DistilBertConfig, DistilBertTokenizer

from tokenizers import BertWordPieceTokenizer

from datasets import  list_datasets, load_dataset, list_metrics, load_metric 

from nltk.tokenize import sent_tokenize # for splitting paragraphs into sentences

In [5]:
# PANDAS WORKAROUND FOR ISSUES WITH LOAD_DATASET
import pandas as pd

read_file = pd.read_csv(
    './Desktop/telemed/data for Huggingface/huggingface_training_data.csv', 
    sep='\n'
)

# convert text datatype to string
read_file['text'] = read_file['text'].astype('string')

# split the pandas data into separate sentences (from paragraphs)
read_file['text'] = read_file['text'].apply(sent_tokenize)

# then explode that dataframe so each sentence is in its own row
read_file = read_file.explode('text')

# convert text datatype to string
read_file['text'] = read_file['text'].astype('string')

# need to drop index or it created issues later
read_file = read_file.reset_index(drop=True)

# remove short strings
read_file = read_file[read_file.text.str.len() > 1]

# DELETE THIS - IT BAD
read_file = read_file[0:100_000]

In [6]:
from sklearn.model_selection import train_test_split

train, evaluate = train_test_split(
    read_file, 
    train_size=0.8, 
    random_state = 123
)

print(train.shape)
print(evaluate.shape)

# write training data and evaluation data to csv
train.to_csv(
    './telemed_Data_Train/sentence_data.csv', 
    index = False, 
    columns = ['text'],
    header = False,
    quoting = csv.QUOTE_NONE,
    escapechar = '\\'
)

evaluate.to_csv(
    './telemed_Data_Evaluate/sentence_data.csv', 
    index = False, 
    columns = ['text'],
    header = False,
    quoting = csv.QUOTE_NONE,
    escapechar = '\\'
)

(80000, 1)
(20000, 1)


In [7]:
# write the file with each sentence on a separate line to csv
read_file.to_csv(
    './telemed_Data/sentence_data.csv', 
    index = False, 
    columns = ['text'],
    header = False,
    quoting = csv.QUOTE_NONE,
    escapechar = '\\'
#     quoting = csv.QUOTE_ALL
)

In [8]:
# create vocab file....
# paths = [str(x) for x in Path("./Desktop/telemed/data for Huggingface/").glob("**/*.csv")]
paths = [str(x) for x in Path("./telemed_Data/").glob("**/*.csv")]
tokenizer =  BertWordPieceTokenizer()

tokenizer.enable_truncation(max_length = 512)
tokenizer.train(files = paths, 
#                 vocab_size = 6000, # too big and we get weird results, too small and we get word pieces...what's just right
                min_frequency = 2,
                special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']
               )
tokenizer.save_model("./telemed_Vocab/", name = 'telemed-bert-wordpiece')

['./telemed_Vocab/telemed-bert-wordpiece-vocab.txt']

In [5]:
tokenizer = DistilBertTokenizer('./telemed_Vocab/telemed-bert-wordpiece-vocab.txt')

In [10]:
# tell pytorch what kind of processor is available to it
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [11]:
# Is there a simpler way to build training dataset?
datasetKMG = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./telemed_Data/sentence_data.csv",
    block_size=128, # HOLY COW is the training fast! ... and block_size is max length of vector...so maybe dial up to 512? (but sentence len > 512? Not likely here.)
)

print(type(datasetKMG))
# https://github.com/huggingface/transformers/blob/762cba3bdaf70104dc17cc7ff0f8ce13ba23d558/src/transformers/data/datasets/language_modeling.py

<class 'transformers.data.datasets.language_modeling.LineByLineTextDataset'>


In [12]:
# build training dataset
datasetTrain = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./telemed_Data_Train/sentence_data.csv",
    block_size=128, # block_size is max length of vector...so maybe dial up to 512? (but sentence len > 512? Not likely here.)
)

# build evaluation dataset
datasetEvaluate = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./telemed_Data_Evaluate/sentence_data.csv",
    block_size=128, # block_size is max length of vector...so maybe dial up to 512? (but sentence len > 512? Not likely here.)
)

In [8]:
# explicitly create data collator instead of letting it get implicitly created by trainer
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer, 
    mlm = True, 
    mlm_probability = 0.15
)

In [14]:
print(datasetKMG.examples[0])
print(datasetKMG.examples[1])
print(datasetKMG.examples[2]) 
print(datasetKMG.examples[50003])

[2, 302, 28, 244, 4099, 35, 14, 321, 28, 21, 349, 134, 409, 16, 3]
[2, 285, 28, 152, 122, 1570, 1316, 534, 364, 597, 682, 140, 981, 1214, 16, 3]
[2, 109, 4422, 140, 3948, 1197, 2518, 134, 351, 1141, 4336, 140, 351, 712, 37, 838, 5262, 1641, 16, 3]
[2, 109, 2088, 510, 140, 2400, 878, 120, 109, 841, 814, 35, 14, 1377, 140, 1693, 488, 16, 3]


In [15]:
print(tokenizer.decode(datasetKMG.examples[0]))

[CLS] abdomen : may 31 \, 2020 : 3 images are provided. [SEP]


In [9]:
# We might need to use Trainer and TrainingArguments to train FROM SCRATCH!
# https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb

config = DistilBertConfig(
    vocab_size = tokenizer.vocab_size, # this is the only default I'm changing
    max_position_embeddings=512, 
    sinusoidal_pos_embds=False, 
    n_layers=6, 
    n_heads=12, 
    dim=768, 
    hidden_dim=3072, 
    dropout=0.1, 
    attention_dropout=0.1, 
    activation='gelu', 
    initializer_range=0.02, 
    qa_dropout=0.1, 
    seq_classif_dropout=0.2, 
    pad_token_id=0
)

model = DistilBertForMaskedLM(config)

# https://github.com/huggingface/transformers/blob/a75c64d80c76c3dc71f735d9197a4a601847e0cd/src/transformers/training_args.py
training_args = TrainingArguments(
    output_dir = './results',          # output directory
    overwrite_output_dir = True,       # whether the contents in the output dir should be overwritten
    do_train = True,                   # whether training should be run
    do_eval = True,                    # whether evaluation should be run
#     evaluate_during_training = True,   # run eval at each logging step - this throws errors
    num_train_epochs = 2,              # total number of training epochs (2-4, rec)
    per_device_train_batch_size = 8,   # batch size per device during training
    per_device_eval_batch_size = 8,    # batch size per devive during eval
    warmup_steps = 0,                  # number of warmup steps for learning rate scheduler
    weight_decay = 0.01,               # strength of weight decay
    logging_steps = 5000,              # controls how frequently progress is logged; default = 500
    save_steps = 5000,                 # controls how frequently checkpoints are logged; default = 500
    logging_dir = './logs',            # directory for storing logs
    seed = 123                         # random seed
)

# https://github.com/huggingface/transformers/blob/a75c64d80c76c3dc71f735d9197a4a601847e0cd/src/transformers/trainer.py
trainer = Trainer(
    args = training_args,              # training arguments, defined above
    tokenizer = tokenizer,
    data_collator = data_collator,
    train_dataset = datasetTrain, #datasetKMG, 
    eval_dataset = datasetEvaluate, # TrainOutput(global_step=60000, training_loss=1.7492684895833333)
#     compute_metrics = compute_metrics,
    model = model     
)

NameError: name 'datasetTrain' is not defined

In [17]:
# so the model trainer knows what the mask token is, good.
print(trainer.tokenizer.mask_token)

[MASK]


In [18]:
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=2.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=10000.0, style=ProgressStyle(description_…

{'loss': 4.042169921875, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.5, 'total_flos': 433336842705888, 'step': 5000}
{'loss': 2.4209765625, 'learning_rate': 2.5e-05, 'epoch': 1.0, 'total_flos': 867413188698624, 'step': 10000}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=10000.0, style=ProgressStyle(description_…

{'loss': 1.898679296875, 'learning_rate': 1.25e-05, 'epoch': 1.5, 'total_flos': 1303323792844032, 'step': 15000}
{'loss': 1.69612890625, 'learning_rate': 0.0, 'epoch': 2.0, 'total_flos': 1735940465682336, 'step': 20000}




TrainOutput(global_step=20000, training_loss=2.514488671875)

In [19]:
# # this does a pretty print-out of model architecture
# # TODO: confirm that nothing is being left out of this print-out
# params = list(model.named_parameters())

# print('The BERT model has {:} different named parameters.\n'.format(len(params)))
# print('==== Embedding Layer ====\n')
# for p in params[0:5]:
#     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
# print('\n==== First Transformer ====\n')
# for p in params[5:21]:
#     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
# print('\n==== Output Layer ====\n')
# for p in params[-4:]:
#     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

In [20]:
model.save_pretrained('./telemed_Model/') 

In [6]:
# to use the fill-mask pipeline (and get reproducible results) we need to take model out of train mode and put in 
# evaluation model. The only way I can find to do this is by using the "from_pretrained" method
modelTrained = model.from_pretrained('./telemed_Model/')

# https://huggingface.co/transformers/migration.html
# Models are now set in evaluation mode by default when instantiated with the from_pretrained() method. 
# To train them don’t forget to set them back in training mode (model.train()) to activate the dropout modules.

NameError: name 'model' is not defined

In [22]:
from transformers import pipeline, FillMaskPipeline

fill_mask = FillMaskPipeline(
    model = modelTrained, 
    tokenizer = tokenizer,
    topk = 10
)

fill_mask.save_pretrained('./telemed_Fill_Mask_Pipeline/') 

# print(fill_mask.tokenizer.mask_token) ## [MASK]
# print(tokenizer.mask_token) ## [MASK]