# Setup
**Install Hugging Face**

The [transformers](https://github.com/huggingface/transformers) package from Hugging Face provides the PyTorch interface used in this notebook for working with BERT-style models.

In [1]:
%%capture
!pip install transformers

In [2]:
import os
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#-----------------------------------------------------#
from transformers import AutoTokenizer
#-----------------------------------------------------#

In [3]:
# universal (for all BERT models)
summer2018 = "C:/Users/stuar/Desktop/NLP/datasets/Summer2018-smoke-alcohol-last-removed/"

# universal (for all BERT models)
text_dataset_folder = "C:/Users/stuar/Desktop/NLP/models/BERT/text_datasets_for_BERT/"

# specific (just for BioELECTRA)
dataloader_folder = "C:/Users/stuar/Desktop/NLP/models/BioELECTRA/dataloaders_for_BioELECTRA/"

# Prepare Functions

**binary splitting**

In [4]:
def binary_data_split(text_dataset, binary_label, seed_val): # using test_size of 20% atm
    """Split documents and their binary labels into train and test partitions.

    20% of the samples are held out for the test partition.

    Args:
        text_dataset: Sequence of documents.
        binary_label: Sequence of labels aligned index-by-index with
            ``text_dataset``.
        seed_val: Forwarded to ``train_test_split`` as ``random_state`` so the
            split is reproducible.

    Returns:
        Tuple ``(train_dataset, test_dataset, train_labels, test_labels)``,
        each converted to a NumPy array.
    """
    # Split on the labels handed in by the caller (the `binary_label`
    # argument), never on a notebook-level variable, so the function gives the
    # right answer no matter what globals happen to exist in the kernel.
    train_dataset, test_dataset, train_labels, test_labels = train_test_split(
        text_dataset, binary_label, test_size=0.20, random_state=seed_val)

    return (np.array(train_dataset), np.array(test_dataset),
            np.array(train_labels), np.array(test_labels))

**tokenization**

In [5]:
# Create the module-level `tokenizer` for the BioELECTRA checkpoint.
# The helper functions defined below (tokenize_documents, pad_sequences) read
# this global, so this cell must run before they are called.
print('Loading BioELECTRA tokenizer...')
tokenizer = AutoTokenizer.from_pretrained("kamalkraj/bioelectra-base-discriminator-pubmed")
print("Done", "\n")

Loading BioELECTRA tokenizer...
Done 



In [6]:
def tokenize_documents(text_dataset):
    """Encode every document with the module-level ``tokenizer`` and print length statistics.

    Each document is passed to ``tokenizer.encode`` with
    ``add_special_tokens=True`` (adds ``[CLS]`` / ``[SEP]``). No ``max_length``
    is given, so nothing is truncated here: the recorded lengths are the full
    encoded lengths, which is what makes the "<= N tokens" report below useful
    for choosing a padding length afterwards.

    Args:
        text_dataset: Iterable of document strings.

    Returns:
        Tuple ``(input_ids, lengths)``: ``input_ids`` is a list holding one
        list of token IDs per document, and ``lengths[i] == len(input_ids[i])``.
        Both lists are empty (and no statistics are printed) when
        ``text_dataset`` is empty.
    """
    input_ids = []  # Token IDs of every encoded document.
    lengths = []    # Untruncated encoded length of every document.
    print('Tokenizing comments...')

    for document in text_dataset:
        if len(input_ids) % 500 == 0:  # Report progress every 500 documents.
            print('  Read {:,} comments.'.format(len(input_ids)))

        # `encode` tokenizes the document, adds the special tokens and maps
        # the tokens to their IDs.
        encoded_doc = tokenizer.encode(document, add_special_tokens=True)

        input_ids.append(encoded_doc)
        lengths.append(len(encoded_doc))

    print('DONE.')
    print('{:>10,} comments tokenized'.format(len(input_ids)), "\n")

    # min()/max() and the percentage below are undefined for an empty dataset,
    # so skip the statistics instead of raising.
    if not lengths:
        return input_ids, lengths

    total = len(lengths)
    print('Min length: {:,} tokens'.format(min(lengths)))
    print('Max length: {:,} tokens'.format(max(lengths)))
    print('Median length: {:,} tokens'.format(int(np.median(lengths))))
    print('Mean length: {:,} tokens'.format(np.sum(lengths)//total))

    # How many documents would fit, untruncated, into each candidate length.
    for limit in (512, 448, 384, 320, 256, 192, 64):
        num_within = sum(1 for length in lengths if length <= limit)
        print('{:,} of {:,} documents ({:.2%}) in this dataset are less than or equal to {} tokens.'
              .format(num_within, total, float(num_within) / float(total), limit))

    return input_ids, lengths # lengths for potential data exploration

**padding**

In [7]:
def pad_sequences(input_ids, MAX_LEN): # Set the required sequence length.
    """Pad or truncate every token-ID sequence to exactly ``MAX_LEN`` entries.

    Args:
        input_ids: List of token-ID lists (one per document).
        MAX_LEN: Target length of every returned row.

    Returns:
        The array produced by Keras ``sequence.pad_sequences`` with one row of
        ``MAX_LEN`` 64-bit integer IDs per document.
    """
    print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)
    # Printed so the reader can confirm the tokenizer's pad ID matches the
    # fill value 0 used below (add_attention_masks treats ID 0 as padding).
    print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

    # dtype is spelled "int64" rather than "long": NumPy resolves "long" to the
    # platform's C long, which is only 32 bits wide on Windows, whereas the
    # tensors built from this array are meant to be torch.long (64-bit).
    #
    # truncating="post" --> sequences longer than MAX_LEN lose values at their END
    #                       (NOTE: this also drops the trailing [SEP] of those sequences)
    # padding="post"    --> shorter sequences are filled with 0 AFTER their last token
    input_ids = sequence.pad_sequences(input_ids, maxlen=MAX_LEN, dtype="int64",
                                       value=0, truncating="post", padding="post")
    print('\nDone.')

    return input_ids

**attention masks**

In [8]:
def add_attention_masks(input_ids):
    """Build one 0/1 attention mask per encoded document.

    For every position the mask holds 1 when the token ID is greater than 0
    (a real token) and 0 otherwise (ID 0 is treated as padding).

    Args:
        input_ids: Iterable of token-ID sequences.

    Returns:
        List with one mask (a list of ints) per input sequence.
    """
    return [
        [int(tok > 0) for tok in doc_ids]  # 1 = real token, 0 = padding
        for doc_ids in input_ids
    ]

# DataLoader Creation
- **Set `name` in the next two cells to the text dataset and to its matching binary labels, then run the remaining cells in order.**

In [31]:
# Load the pickled text dataset called `name` from `text_dataset_folder`.
name = "JessAble_drug_35b165a"
# `with` closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(f"{text_dataset_folder}{name}", "rb") as f:
    text_dataset = pickle.load(f)
len(text_dataset)

2220

In [32]:
# Load the pickled binary labels called `name` from `text_dataset_folder`.
# The displayed length should equal the length of the text dataset loaded above.
name = "JessAble_labels_drug"
# `with` closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(f"{text_dataset_folder}{name}", "rb") as f:
    binary_labels = pickle.load(f)
len(binary_labels)

2220

**train & validation sets**

In [33]:
# CHANGE ACCORDINGLY
#--------------------------#
# Seed for both the train/test split and the train/validation split;
# it is also used as a sub-folder name when the DataLoaders are saved.
SEED = 2001 # set splitting
#--------------------------#
# Batch size given to every DataLoader created below.
BATCH_SIZE = 32 
# Length every token-ID sequence is padded/truncated to.
MAX_LEN = 64

In [34]:
# Split once up front to inspect the class balance of this seed's split.
train_dataset, test_dataset, train_labels, test_labels = binary_data_split(text_dataset, binary_labels, SEED) 
# Number of samples with label 1 in the (train, test) partitions.
list(train_labels).count(1), list(test_labels).count(1)

(41, 8)

In [35]:
# Build the training and validation DataLoaders:
# split -> tokenize -> pad -> attention masks -> train/validation split -> tensors -> DataLoaders.
print("TRAINING PREPROCESSING COMMENCING...")

#------------------------------------------------------------#
print("-------------------------------------")
# Re-split at the top of this cell so it can be re-run safely: `train_labels`
# is rebound further down to the post-validation-split subset, and a re-run
# must start again from the full training labels.
train_dataset, test_dataset, train_labels, test_labels = binary_data_split(text_dataset, binary_labels, SEED)
print("Data split for binary classification.")
#------------------------------------------------------------#

#------------------------------------------------------------#
print("-------------------------------------")
input_ids, lengths = tokenize_documents(train_dataset)
#------------------------------------------------------------#

#------------------------------------------------------------#
print("-------------------------------------")
input_ids = pad_sequences(input_ids, MAX_LEN)
#------------------------------------------------------------#

#------------------------------------------------------------#
print("-------------------------------------")
attention_masks = add_attention_masks(input_ids)
print("Attention masks created.")
#------------------------------------------------------------#

#------------------------------------------------------------#
print("-------------------------------------")
# One call splits inputs, masks and labels with the very same indices, so the
# three stay aligned by construction (10% of the training data becomes the
# validation set).
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(input_ids, attention_masks, train_labels,
                                                     random_state=SEED, test_size=0.1)
print("Validation data created.")
#------------------------------------------------------------#

#------------------------------------------------------------#
print("-------------------------------------")
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_labels = torch.as_tensor(train_labels, dtype=torch.long)
validation_labels = torch.as_tensor(validation_labels, dtype=torch.long)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Every input row needs a mask of identical shape and exactly one label.
assert train_inputs.shape == train_masks.shape and len(train_inputs) == len(train_labels)
assert validation_inputs.shape == validation_masks.shape and len(validation_inputs) == len(validation_labels)
print("Train & Validation data converted to PyTorch tensors.")
#------------------------------------------------------------#

#------------------------------------------------------------#
print("-------------------------------------")
batch_size = BATCH_SIZE # The DataLoader needs to know our batch size for training

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches are read sequentially.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
print("DataLoaders created.")
print("DONE.")

TRAINING PREPROCESSING COMMENCING...
-------------------------------------
Data split for binary classification.
-------------------------------------
Tokenizing comments...
  Read 0 comments.
  Read 500 comments.
  Read 1,000 comments.
  Read 1,500 comments.
DONE.
     1,776 comments tokenized 

Min length: 25 tokens
Max length: 91 tokens
Median length: 43 tokens
Mean length: 44 tokens
1,776 of 1,776 documents (100.00%) in this dataset are less than or equal to 512 tokens.
1,776 of 1,776 documents (100.00%) in this dataset are less than or equal to 448 tokens.
1,776 of 1,776 documents (100.00%) in this dataset are less than or equal to 384 tokens.
1,776 of 1,776 documents (100.00%) in this dataset are less than or equal to 320 tokens.
1,776 of 1,776 documents (100.00%) in this dataset are less than or equal to 256 tokens.
1,776 of 1,776 documents (100.00%) in this dataset are less than or equal to 192 tokens.
1,766 of 1,776 documents (99.44%) in this dataset are less than or equal to 

In [36]:
# CHANGE ACCORDINGLY
# Sub-folder of `dataloader_folder` that the pickled DataLoaders are written
# into (followed by a SEED sub-folder).
#---------------------------#
SUBSTANCE_FOLDER = "drug"     
#---------------------------#

In [37]:
# Persist the training DataLoader; `with` closes the file even if pickling raises.
# The target folder <dataloader_folder>/<SUBSTANCE_FOLDER>/<SEED>/ must already exist.
# NOTE(review): the "_32_64" suffix appears to mirror BATCH_SIZE and MAX_LEN — confirm and keep in sync.
with open(f"{dataloader_folder}{SUBSTANCE_FOLDER}/{SEED}/train_DL_JA_35b165a_32_64", "wb") as f:
    pickle.dump(train_dataloader, f)

In [38]:
# Persist the validation DataLoader; `with` closes the file even if pickling raises.
# The target folder <dataloader_folder>/<SUBSTANCE_FOLDER>/<SEED>/ must already exist.
# NOTE(review): the "_32_64" suffix appears to mirror BATCH_SIZE and MAX_LEN — confirm and keep in sync.
with open(f"{dataloader_folder}{SUBSTANCE_FOLDER}/{SEED}/valid_DL_JA_35b165a_32_64", "wb") as f:
    pickle.dump(validation_dataloader, f)

**test set**

In [39]:
# Build the test DataLoader from the held-out 20% with the same steps as the
# training data: tokenize -> pad -> attention masks -> tensors -> DataLoader.
# Note: `lengths` is overwritten with the test-set lengths, and `test_labels`
# is rebound from a NumPy array to a tensor.
test_input_ids, lengths = tokenize_documents(test_dataset)
#------------------------------------------------------------#

#------------------------------------------------------------#
# Same MAX_LEN as the training data.
test_input_ids = pad_sequences(test_input_ids, MAX_LEN)
#------------------------------------------------------------#

#------------------------------------------------------------#
test_attention_masks = add_attention_masks(test_input_ids)
#------------------------------------------------------------#

#------------------------------------------------------------#
# Convert inputs, labels and masks to PyTorch tensors.
test_inputs = torch.tensor(test_input_ids)
test_labels = torch.as_tensor(test_labels, dtype=torch.long)
test_masks = torch.tensor(test_attention_masks)
#------------------------------------------------------------#

#------------------------------------------------------------#
batch_size = BATCH_SIZE # Set the batch size. 

# Create the DataLoader; test batches are read sequentially.
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

Tokenizing comments...
  Read 0 comments.
DONE.
       444 comments tokenized 

Min length: 29 tokens
Max length: 91 tokens
Median length: 43 tokens
Mean length: 44 tokens
444 of 444 documents (100.00%) in this dataset are less than or equal to 512 tokens.
444 of 444 documents (100.00%) in this dataset are less than or equal to 448 tokens.
444 of 444 documents (100.00%) in this dataset are less than or equal to 384 tokens.
444 of 444 documents (100.00%) in this dataset are less than or equal to 320 tokens.
444 of 444 documents (100.00%) in this dataset are less than or equal to 256 tokens.
444 of 444 documents (100.00%) in this dataset are less than or equal to 192 tokens.
442 of 444 documents (99.55%) in this dataset are less than or equal to 64 tokens.

Padding/truncating all sentences to 64 values...

Padding token: "[PAD]", ID: 0

Done.


In [40]:
# ISSUE FIXED
# Sanity check: show the shapes of the train, validation and test mask tensors.
# NOTE(review): the original issue is not described here — presumably a shape
# mismatch between the masks; confirm or drop the "ISSUE FIXED" marker.
train_masks.shape, validation_masks.shape, test_masks.shape

(torch.Size([1598, 64]), torch.Size([178, 64]), torch.Size([444, 64]))

In [41]:
# Persist the test DataLoader; `with` closes the file even if pickling raises.
# The target folder <dataloader_folder>/<SUBSTANCE_FOLDER>/<SEED>/ must already exist.
# NOTE(review): the "_32_64" suffix appears to mirror BATCH_SIZE and MAX_LEN — confirm and keep in sync.
with open(f"{dataloader_folder}{SUBSTANCE_FOLDER}/{SEED}/test_DL_JA_35b165a_32_64", "wb") as f:
    pickle.dump(test_dataloader, f)