In [1]:
import os
import sys

# Show which Python interpreter backs this kernel. Jupyter only echoes the
# value of a cell's final expression, so this has to be the last line of the
# cell for the path to be displayed.
sys.executable

In [2]:
# Set JAVA_HOME for this kernel process (which downstream tool needs it is not
# visible in this notebook). A raw string keeps the Windows backslashes
# literal: in a normal string "\J" and "\j" are invalid escape sequences that
# Python currently passes through but warns about.
# NOTE(review): machine-specific absolute path -- adjust per workstation.
os.environ["JAVA_HOME"] = r"C:\Java\jdk1.8.0_221"

In [3]:
# !{sys.executable} -m pip install --upgrade --use-feature=2020-resolver torch 
# !{sys.executable} -m pip install --upgrade --use-feature=2020-resolver tensorflow
# !{sys.executable} -m pip install --upgrade --use-feature=2020-resolver transformers 
# !{sys.executable} -m pip install --use-feature=2020-resolver tokenizers==0.8.1rc2
# !{sys.executable} -m pip install --upgrade --use-feature=2020-resolver datasets
# !{sys.executable} -m pip install --upgrade nltk
# !{sys.executable} -m pip install bert_score
# !{sys.executable} -m pip list

In [4]:
import numpy

import glob
from pathlib import Path
import csv 

import torch 

from transformers import pipeline
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import DistilBertForMaskedLM, DistilBertConfig, DistilBertTokenizer

from tokenizers import BertWordPieceTokenizer

from datasets import  list_datasets, load_dataset, list_metrics, load_metric 

from nltk.tokenize import sent_tokenize # for splitting paragraphs into sentences

In [5]:
# PANDAS WORKAROUND FOR ISSUES WITH LOAD_DATASET
import pandas as pd

# Build the sentence-level corpus in one pipeline.
# sep='\n' is presumably meant to make every line of the file a single field
# (one paragraph per row under a 'text' header).
# NOTE(review): newer pandas releases are understood to reject '\n' as a
# separator -- confirm against the installed version.
read_file = (
    pd.read_csv(
        './Desktop/telemed/data for Huggingface/huggingface_training_data.csv',
        sep='\n'
    )
    # cast to string dtype, then split each paragraph into a list of sentences
    .assign(text=lambda frame: frame['text'].astype('string').apply(sent_tokenize))
    # one sentence per row
    .explode('text')
    # cast the exploded column back to string dtype
    .assign(text=lambda frame: frame['text'].astype('string'))
    # drop the old index (the original author notes that keeping it caused
    # issues later)
    .reset_index(drop=True)
)

# keep only sentences longer than one character
read_file = read_file[read_file.text.str.len() > 1]

# Development shortcut: cap the corpus at the first 300,000 sentences.
# The original author flagged this line for deletion before a full run.
read_file = read_file[0:300_000]

In [6]:
from sklearn.model_selection import train_test_split

# Split the sentences 80/20 into training and evaluation sets; the fixed
# random_state keeps the split identical between runs.
train, evaluate = train_test_split(
    read_file, 
    train_size=0.8, 
    random_state = 123
)

print(train.shape)
print(evaluate.shape)

# Write each split with one sentence per line (no header, no index).
# QUOTE_NONE with a backslash escapechar means embedded commas are written
# as "\,".
# NOTE(review): those backslashes become part of the text that is later read
# line by line -- consider whether plain-text output would be cleaner.
for split_frame, split_path in (
    (train, './telemed_Data_Train/sentence_data.csv'),
    (evaluate, './telemed_Data_Evaluate/sentence_data.csv'),
):
    # to_csv does not create missing folders, so a fresh checkout would fail
    # here without this.
    Path(split_path).parent.mkdir(parents=True, exist_ok=True)
    split_frame.to_csv(
        split_path, 
        index = False, 
        columns = ['text'],
        header = False,
        quoting = csv.QUOTE_NONE,
        escapechar = '\\'
    )

(240000, 1)
(60000, 1)


In [7]:
# Write the complete corpus with each sentence on its own line (no header, no
# index). QUOTE_NONE with a backslash escapechar means embedded commas are
# written as "\,".
# to_csv does not create missing folders, so make sure the target exists
# first; otherwise this cell fails on a fresh checkout.
Path('./telemed_Data/sentence_data.csv').parent.mkdir(parents=True, exist_ok=True)
read_file.to_csv(
    './telemed_Data/sentence_data.csv', 
    index = False, 
    columns = ['text'],
    header = False,
    quoting = csv.QUOTE_NONE,
    escapechar = '\\'
)

In [8]:
# Train a WordPiece vocabulary on every CSV found under ./telemed_Data/ and
# save it to ./telemed_Vocab2/.
# paths = [str(x) for x in Path("./Desktop/telemed/data for Huggingface/").glob("**/*.csv")]
paths = [str(x) for x in Path("./telemed_Data/").glob("**/*.csv")]
if not paths:
    # Training on an empty file list would silently yield a vocabulary made
    # of nothing but the special tokens, so fail loudly instead.
    raise FileNotFoundError("No CSV files found under ./telemed_Data/ to train the tokenizer on.")

tokenizer =  BertWordPieceTokenizer()

tokenizer.enable_truncation(max_length = 512)
tokenizer.train(files = paths, 
                vocab_size = 4000, 
                min_frequency = 2,
                # listed in this order so that, presumably, [PAD] gets id 0
                # (the model config elsewhere uses pad_token_id=0) -- TODO confirm
                special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']
               )

# save_model is assumed not to create the output folder itself, so create it
# up front (harmless if it already exists).
Path("./telemed_Vocab2/").mkdir(parents=True, exist_ok=True)
tokenizer.save_model("./telemed_Vocab2/", name = 'telemed-bert-wordpiece')

['./telemed_Vocab2/telemed-bert-wordpiece-vocab.txt']

In [9]:
# Rebind `tokenizer` to a transformers DistilBertTokenizer built from the
# saved vocabulary file; this is the object the later dataset, collator and
# Trainer cells receive.
tokenizer = DistilBertTokenizer(
    vocab_file='./telemed_Vocab2/telemed-bert-wordpiece-vocab.txt'
)

In [10]:
# Choose the processor PyTorch should use: the GPU when CUDA is usable,
# otherwise the CPU, and report the choice.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

if not use_gpu:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

No GPU available, using the CPU instead.


In [11]:
# Tokenise the complete sentence file, one example per line.
# (Open question from the original author: is there a simpler way to build
# the training dataset?)
# block_size is the maximum length of each encoded example. 128 trains
# quickly; 512 would be the alternative, though sentences that long are
# unlikely in this corpus.
datasetKMG = LineByLineTextDataset(
    file_path="./telemed_Data/sentence_data.csv",
    tokenizer=tokenizer,
    block_size=128,
)

# Implementation reference for LineByLineTextDataset:
# https://github.com/huggingface/transformers/blob/762cba3bdaf70104dc17cc7ff0f8ce13ba23d558/src/transformers/data/datasets/language_modeling.py
print(type(datasetKMG))

<class 'transformers.data.datasets.language_modeling.LineByLineTextDataset'>


In [12]:
# Build the training and evaluation datasets with identical settings; only
# the input file differs. block_size is the maximum length of each encoded
# example (512 is the alternative, though sentences that long are unlikely
# in this corpus).
datasetTrain, datasetEvaluate = (
    LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=split_file,
        block_size=128,
    )
    for split_file in (
        "./telemed_Data_Train/sentence_data.csv",     # training split
        "./telemed_Data_Evaluate/sentence_data.csv",  # evaluation split
    )
)

In [13]:
# Create the masked-language-modelling collator by hand instead of leaving it
# to the Trainer to create one implicitly. mlm_probability is set to 0.15.
data_collator = DataCollatorForLanguageModeling(
    mlm_probability = 0.15,
    mlm = True,
    tokenizer = tokenizer,
)

In [14]:
# Peek at a handful of encoded examples. They are stored without padding
# (their lengths differ); per the original note, they can still be run
# through training.
for sample_position in (0, 1, 2, 50003):
    print(datasetKMG.examples[sample_position])

[2, 308, 28, 270, 21, 96, 36, 14, 352, 28, 21, 451, 137, 455, 16, 3]
[2, 304, 28, 168, 123, 1692, 1493, 687, 461, 530, 839, 133, 959, 1021, 16, 3]
[2, 111, 3022, 128, 133, 3390, 128, 1137, 1657, 137, 320, 1581, 3825, 124, 133, 320, 768, 38, 1079, 942, 3162, 1417, 16, 3]
[2, 111, 1993, 465, 133, 2820, 885, 122, 111, 896, 766, 36, 14, 1560, 133, 1721, 525, 16, 3]


In [15]:
# Round-trip check: turn the first example's token ids back into text.
first_example_ids = datasetKMG.examples[0]
print(tokenizer.decode(first_example_ids))

[CLS] abdomen : may 31 \, 2020 : 3 images are provided. [SEP]


In [16]:
# user can define their own custom metrics 
# UNDER DEVELOPMENT - NOT SURE IF THIS WILL WORK AND WHAT THE LABELS ARE IN THIS CONTEXT
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred, ignore_index=-100):
    """Score masked-token predictions; usable as ``Trainer(compute_metrics=...)``.

    The previous version passed the raw label/prediction arrays to
    scikit-learn with ``average='binary'``. Token ids form a multiclass
    target, which scikit-learn rejects under that setting, and positions that
    were never masked would have been scored as well.

    Parameters
    ----------
    pred : object exposing ``label_ids`` and ``predictions``
        ``predictions`` is reduced with ``argmax(-1)``, so its last axis is
        assumed to hold per-token vocabulary scores -- TODO confirm.
    ignore_index : int, default -100
        Label value marking positions that must not be scored. -100 is
        assumed to be what the language-modelling collator writes for
        unmasked tokens -- TODO confirm against the installed transformers
        version.

    Returns
    -------
    dict
        ``accuracy``, ``f1``, ``precision`` and ``recall`` over the scored
        positions only; precision/recall/F1 are support-weighted averages
        across token classes.
    """
    labels = numpy.asarray(pred.label_ids)
    preds = numpy.asarray(pred.predictions).argmax(-1)

    # Boolean indexing both drops the ignored positions and flattens the
    # arrays to 1-D.
    scored = labels != ignore_index
    labels = labels[scored]
    preds = preds[scored]

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted'
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [22]:
# Train DistilBERT FROM SCRATCH with Trainer and TrainingArguments, following
# https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb

# One seed shared by weight initialisation and the training loop.
SEED = 123

config = DistilBertConfig(
    vocab_size = tokenizer.vocab_size, # sized to the custom vocabulary; per the original author this is the only default being changed
    max_position_embeddings=512, 
    sinusoidal_pos_embds=False, 
    n_layers=6, 
    n_heads=12, 
    dim=768, 
    hidden_dim=3072, 
    dropout=0.1, 
    attention_dropout=0.1, 
    activation='gelu', 
    initializer_range=0.02, 
    qa_dropout=0.1, 
    seq_classif_dropout=0.2, 
    pad_token_id=0
)

# Seed PyTorch BEFORE the model is built. The model starts from random
# weights, and TrainingArguments.seed is only applied by the Trainer, which
# is constructed after the model, so without this line the initial weights
# (and therefore the whole run) differ between executions.
torch.manual_seed(SEED)
model = DistilBertForMaskedLM(config)

# https://github.com/huggingface/transformers/blob/a75c64d80c76c3dc71f735d9197a4a601847e0cd/src/transformers/training_args.py
training_args = TrainingArguments(
    output_dir = './results',          # output directory
    overwrite_output_dir = True,       # whether the contents in the output dir should be overwritten
    do_train = True,                   # whether training should be run
    do_eval = True,                    # whether evaluation should be run
#     evaluate_during_training = True,   # run eval at each logging step - this throws errors
    num_train_epochs = 2,              # total number of training epochs (2-4, rec)
    per_device_train_batch_size = 8,   # batch size per device during training
    per_device_eval_batch_size = 8,    # batch size per device during eval
    warmup_steps = 0,                  # number of warmup steps for learning rate scheduler
    weight_decay = 0.01,               # strength of weight decay
    logging_dir = './logs',            # directory for storing logs
    seed = SEED                        # random seed
)

# https://github.com/huggingface/transformers/blob/a75c64d80c76c3dc71f735d9197a4a601847e0cd/src/transformers/trainer.py
# NOTE(review): no trainer.evaluate() call is visible in this notebook and
# evaluate_during_training is disabled -- confirm that eval_dataset is
# actually consumed somewhere.
trainer = Trainer(
    args = training_args,              # training arguments, defined above
    tokenizer = tokenizer,
    data_collator = data_collator,
    train_dataset = datasetTrain, #datasetKMG, 
    eval_dataset = datasetEvaluate,
#     compute_metrics = compute_metrics,
    model = model     
)

In [18]:
# Sanity check: print the mask token held by the tokenizer attached to the
# trainer, to confirm the trainer knows what the mask token is.
print(trainer.tokenizer.mask_token)

[MASK]


In [23]:
# Run the training loop on the trainer built earlier. As the cell's final
# expression, the value returned by train() is echoed after the progress logs.
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=2.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=30000.0, style=ProgressStyle(description_…

{'loss': 5.91733349609375, 'learning_rate': 4.958333333333334e-05, 'epoch': 0.016666666666666666, 'step': 500}
{'loss': 5.24743212890625, 'learning_rate': 4.9166666666666665e-05, 'epoch': 0.03333333333333333, 'step': 1000}
{'loss': 4.8485107421875, 'learning_rate': 4.875e-05, 'epoch': 0.05, 'step': 1500}
{'loss': 4.5343349609375, 'learning_rate': 4.8333333333333334e-05, 'epoch': 0.06666666666666667, 'step': 2000}
{'loss': 4.22444140625, 'learning_rate': 4.791666666666667e-05, 'epoch': 0.08333333333333333, 'step': 2500}
{'loss': 3.9406640625, 'learning_rate': 4.75e-05, 'epoch': 0.1, 'step': 3000}
{'loss': 3.637568359375, 'learning_rate': 4.708333333333334e-05, 'epoch': 0.11666666666666667, 'step': 3500}
{'loss': 3.45575, 'learning_rate': 4.666666666666667e-05, 'epoch': 0.13333333333333333, 'step': 4000}
{'loss': 3.2852890625, 'learning_rate': 4.6250000000000006e-05, 'epoch': 0.15, 'step': 4500}
{'loss': 3.150328125, 'learning_rate': 4.5833333333333334e-05, 'epoch': 0.16666666666666666, 

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=30000.0, style=ProgressStyle(description_…

{'loss': 1.397015625, 'learning_rate': 2.4583333333333332e-05, 'epoch': 1.0166666666666666, 'step': 30500}
{'loss': 1.358109375, 'learning_rate': 2.4166666666666667e-05, 'epoch': 1.0333333333333334, 'step': 31000}
{'loss': 1.358234375, 'learning_rate': 2.375e-05, 'epoch': 1.05, 'step': 31500}
{'loss': 1.37621875, 'learning_rate': 2.3333333333333336e-05, 'epoch': 1.0666666666666667, 'step': 32000}
{'loss': 1.343390625, 'learning_rate': 2.2916666666666667e-05, 'epoch': 1.0833333333333333, 'step': 32500}
{'loss': 1.35740625, 'learning_rate': 2.25e-05, 'epoch': 1.1, 'step': 33000}
{'loss': 1.32590625, 'learning_rate': 2.2083333333333333e-05, 'epoch': 1.1166666666666667, 'step': 33500}
{'loss': 1.3435, 'learning_rate': 2.1666666666666667e-05, 'epoch': 1.1333333333333333, 'step': 34000}
{'loss': 1.304296875, 'learning_rate': 2.125e-05, 'epoch': 1.15, 'step': 34500}
{'loss': 1.350984375, 'learning_rate': 2.0833333333333336e-05, 'epoch': 1.1666666666666667, 'step': 35000}
{'loss': 1.311734375,

TrainOutput(global_step=60000, training_loss=1.7492684895833333)

In [None]:
# Pretty-print the model's named parameters, grouped by name prefix.
# The earlier version used fixed slice positions ([0:5], [5:21], [-4:]) that
# match a BERT layout. DistilBERT is understood to have one embedding
# parameter fewer (no token-type embeddings) and a differently sized output
# head, so those slices were shifted by one and dropped a head parameter.
# TODO: confirm the prefixes below (`distilbert.embeddings.`,
# `distilbert.transformer.layer.0.`) against the installed transformers
# version, and that nothing is left out of this print-out.
params = list(model.named_parameters())


def _print_parameter_group(heading, named_params):
    """Print ``heading``, then one "name  shape" line per (name, tensor) pair."""
    print(heading)
    for name, tensor in named_params:
        print("{:<55} {:>12}".format(name, str(tuple(tensor.size()))))


print('The BERT model has {:} different named parameters.\n'.format(len(params)))
_print_parameter_group(
    '==== Embedding Layer ====\n',
    [p for p in params if p[0].startswith('distilbert.embeddings.')],
)
_print_parameter_group(
    '\n==== First Transformer ====\n',
    [p for p in params if p[0].startswith('distilbert.transformer.layer.0.')],
)
# Everything that lives outside the `distilbert.` encoder is treated as the
# masked-LM output head.
_print_parameter_group(
    '\n==== Output Layer ====\n',
    [p for p in params if not p[0].startswith('distilbert.')],
)

In [24]:
# Persist the trained model to the ./telemed_Model/ directory.
model.save_pretrained('./telemed_Model/') 

In [None]:
# To use the fill-mask pipeline (and get reproducible results) the model must
# be in evaluation mode rather than training mode. Reloading the checkpoint
# achieves that; per https://huggingface.co/transformers/migration.html,
# models are set in evaluation mode by default when instantiated with
# from_pretrained(). To train them again, switch back with model.train() so
# the dropout modules are active.
#
# Load from './telemed_Model/', the directory model.save_pretrained() writes
# to. This cell previously read './telemed_Model2/', which nothing in this
# notebook creates. Calling the classmethod on the class (not on the `model`
# instance) also lets this cell run in a fresh kernel without retraining.
modelTrained = DistilBertForMaskedLM.from_pretrained('./telemed_Model/')

In [None]:
from transformers import pipeline, FillMaskPipeline

# Wrap the reloaded model in a fill-mask pipeline configured with topk = 10.
fill_mask = FillMaskPipeline(
    tokenizer = tokenizer,
    model = modelTrained,
    topk = 10
)

# Per the original author's notes, both fill_mask.tokenizer.mask_token and
# tokenizer.mask_token are "[MASK]".

# Reference sentence; the word "calculi" is the one being masked:
# There are no appreciable mineral opaque calculi within the urinary bladder on the available study.
sequence = f"There are no appreciable mineral opaque {tokenizer.mask_token} within the urinary bladder on the available study."
top_k = fill_mask(sequence)

header_lines = (
    "Complete sentence: There are no appreciable mineral opaque calculi within the urinary bladder on the available study.",
    "\n",
    "Sentence with exactly one token masked:",
    sequence,
    "\n",
    "k sentences with masked token filled and likelihood (probability?) of that token:",
)
for line in header_lines:
    print(line)

# Each candidate is a dict; per the original notes 'token' and 'token_str'
# are available too, but only the filled sentence and its score are shown.
for candidate in top_k:
    print(candidate['sequence'])
    print(candidate['score'])

In [None]:
# Second fill-mask probe, on a different sentence.
sequence = f"There is a mild diffuse {tokenizer.mask_token} she will long pattern, the interstitial component of which is accentuated by expiratory phase of respiration."
top_k = fill_mask(sequence)

header_lines = (
    "Complete sentence: There is a mild diffuse broncho-interspace she will long pattern, the interstitial component of which is accentuated by expiratory phase of respiration.",
    "\n",
    "Sentence with exactly one token masked:",
    sequence,
    "\n",
    "k sentences with masked token filled and likelihood (probability?) of that token:",
)
for line in header_lines:
    print(line)

for candidate in top_k:
    print(candidate['sequence'])
    print(candidate['score'])

# Original author's observation on the results:
# "parenchymal" is an adjective, whereas "parenchyma" is a noun -- the model
# knows a noun belongs in that slot.

In [None]:
# Which performance metrics come built in? Uncomment to list them:
# print(list_metrics())

# BERTScore leverages the pre-trained contextual embeddings from BERT and
# matches words in candidate and reference sentences by cosine similarity.
# It has been shown to correlate with human judgment on sentence-level and
# system-level evaluation, and it reports precision, recall and F1, which is
# useful when evaluating different language generation tasks.
# How the score is calculated: https://arxiv.org/pdf/1904.09675.pdf
metric = load_metric("bertscore")

# Example of typical usage: push one sentence (the [1:2] slice of the corpus)
# through the trained model and dump every intermediate result.
# NOTE(review): the forward pass runs with autograd enabled; wrapping it in
# torch.no_grad() would be the usual choice for inference.
for sentence in read_file['text'][1:2]:
    token_ids = tokenizer.encode(sentence, add_special_tokens = True)
    inputs = torch.tensor(token_ids).unsqueeze(0)
    predictions = modelTrained(input_ids = inputs)[0]
    predicted_index = predictions[0].argmax(dim = 1)
    predicted_text = tokenizer.decode(predicted_index)

    # (title, values) pairs, printed in order with a blank gap between them
    report_sections = (
        ("the original text:", (sentence,)),
        ("input_ids for the original text:", (inputs,)),
        ("m x n predictions matrix: m is number of tokens in sentence; n is dimension of vocab",
         (predictions, predictions.shape)),
        ("these are the most likely tokens (ids) for each position in the sentence",
         (predicted_index,)),
        ("these are the most likely token ids converted to text", (predicted_text,)),
    )
    for section_number, (title, values) in enumerate(report_sections):
        if section_number:
            print('\n')
        print(title)
        for value in values:
            print(value)
#     metric.add_batch(predictions=["I like to take long walks."], references=["I like to take lengthy walks."])


In [None]:
# Disabled example: score two candidate sentences against their references.
# score = metric.compute(
#     predictions = ["I like to take lengthy walks.", "I live in a horse"], 
#     references = ["I like to take long walks.", "I live in a house."],
#     lang = 'en'
# )
# print(score)

# Print the loaded metric object itself (whatever it reports about itself).
print(metric)

In [None]:
# Interactive lookup of the collator's built-in documentation.
# NOTE(review): exploratory leftover -- consider dropping it from a finished
# notebook.
help(DataCollatorForLanguageModeling)