# Procedural

Mount Google Drive and create a folder for the data if it doesn't already exist

In [1]:
# Make Google Drive reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Ensure the project's data directory is present on the mounted drive,
# and report whether it had to be created
import os
data_dir = '/content/drive/MyDrive/MastersProject/data/'
if os.path.exists(data_dir):
    print("Folder already existed!")
else:
    os.makedirs(data_dir)
    print("Created the folder!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Folder already existed!


In [2]:
# Install the transformers package, which is imported in the next cell
!pip install transformers



In [3]:
import transformers
import pandas as pd
import torch
import numpy as np
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [4]:
# Passed as random_state to both train_test_split calls so the train/valid/test split is reproducible
RANDOM_SEED = 42

# Prepare the data for BERT

Set the hyperparameters needed for data preparation

In [5]:
# Name of the pretrained checkpoint; used to load both the tokeniser and the BERT model
MODEL_NAME = "bert-base-cased"
# Every text is truncated or padded to exactly this many tokens
MAX_LEN = 20
# Fraction of the rows held out from training; the hold-out is then split in half into validation and test
VALID_TEST_PROPORTION = 0.2
# Number of samples per DataLoader batch
BATCH_SIZE = 16
# Dataframe column whose text is fed to the tokeniser
TEXT_CHOSEN = "title"     # in {body, title, text}

Read the dataset as a pandas dataframe

In [6]:
# Load the cleaned dataset from the mounted drive. The path is absolute, like every
# other drive path in this notebook, so the cell does not depend on the working directory.
df = pd.read_csv('/content/drive/MyDrive/MastersProject/data/aita_clean.csv')
# Combined "title body" column. Missing values on either side are treated as empty
# strings so that a missing title cannot turn the whole combined text into NaN.
df['text'] = df["title"].fillna("") + " " + df["body"].fillna("")

Specify the BERT dataset class

In [7]:
class AITADataset(Dataset):
    """Map-style dataset that tokenises one text sample per lookup.

    Args:
        texts: Indexable collection of text samples (indexed by integer position).
        targets: Indexable collection of labels aligned with ``texts``.
        tokeniser: Callable used to encode a single string. It is called with the
            keyword arguments ``truncation``, ``max_length``, ``add_special_tokens``,
            ``padding``, ``return_attention_mask``, ``return_token_type_ids`` and
            ``return_tensors`` and must return a mapping holding ``'input_ids'``
            and ``'attention_mask'`` tensors.
        max_len: Length every sample is truncated/padded to.
    """

    # Upon object instance creation, you feed the text samples, their targets, the tokeniser and the max length.
    def __init__(self, texts, targets, tokeniser, max_len):
        self.texts = texts
        self.targets = targets
        self.tokeniser = tokeniser
        self.max_len = max_len

    def __len__(self):
        """Return the number of text samples."""
        return len(self.texts)

    # This method is called when a batch is created. "item" is the index of each sample to be in batch.
    def __getitem__(self, item):
        """Encode the sample at position ``item``.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (flattened
            tensors), ``'targets'`` (scalar ``torch.long`` tensor) and
            ``'sample_text'`` (the text as a ``str``).
        """
        # Normally it is already a string; str() guards against non-string entries
        text = str(self.texts[item])

        # Encode with the tokeniser given to this instance (not a module-level global),
        # so the dataset works regardless of which names exist in the notebook namespace.
        encoding = self.tokeniser(
            text,
            truncation=True,
            max_length=self.max_len,
            add_special_tokens=True,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt')

        # With return_tensors='pt' the encoding of a single text is expected to carry a
        # leading batch dimension; flattening yields 1-D tensors so that the DataLoader
        # stacks them into (batch_size, max_len) batches.
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        # Bundle the encoding with the corresponding target and the original text
        return {'input_ids': input_ids,
                'attention_mask': attention_mask,
                'targets': torch.tensor(self.targets[item], dtype=torch.long),
                'sample_text': text}

Make a function that creates a dataloader for BERT

In [8]:
def create_data_loader(df, tokeniser, max_len, batch_size, text_chosen):
    '''
    Wrap one text column of the given dataframe in an AITADataset and return a
    DataLoader over it.

    df:          dataframe holding the text columns and the "is_asshole" label column
    tokeniser:   tokeniser handed on to the dataset
    max_len:     token length handed on to the dataset
    batch_size:  number of samples per batch
    text_chosen: which column to use, one of "title", "body" or "text"

    Raises ValueError when text_chosen is none of the three accepted values.
    '''
    if text_chosen not in ("title", "body", "text"):
        raise ValueError("Invalid TEXT_CHOSEN!")

    # Attribute-style column access, looked up by name
    selected_texts = getattr(df, text_chosen).to_numpy()
    labels = df.is_asshole.to_numpy()

    dataset = AITADataset(
        texts=selected_texts,
        targets=labels,
        tokeniser=tokeniser,
        max_len=max_len)

    # No shuffling is requested, so batches follow the row order of df
    return DataLoader(dataset, batch_size=batch_size, num_workers=2)

Split the dataframes

In [9]:
# First hold out VALID_TEST_PROPORTION of the rows, then cut the hold-out in half
# to obtain equally sized validation and test sets. Both splits use RANDOM_SEED.
df_train, df_test_valid = train_test_split(
    df,
    test_size=VALID_TEST_PROPORTION,
    random_state=RANDOM_SEED,
)
df_valid, df_test = train_test_split(
    df_test_valid,
    test_size=0.5,
    random_state=RANDOM_SEED,
)

# Report (rows, columns) of each split
for heading, split_df in (("Train dataset:", df_train),
                          ("Valid dataset:", df_valid),
                          ("Test dataset:", df_test)):
    print(heading, split_df.shape)

Train dataset: (78102, 10)
Valid dataset: (9763, 10)
Test dataset: (9763, 10)


Save the targets of the train, validation and test splits created above

In [10]:
# Sanity check: turn the training labels into a tensor and look at the first 15
train_label_column = df_train['is_asshole']
y_train = torch.tensor(train_label_column.values)
print(y_train[:15])

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1])


In [11]:
# Persist the labels of every split as y_<split>.pt so they can be loaded later
# without re-reading and re-splitting the dataframe.
bert_outputs_dir = '/content/drive/MyDrive/MastersProject/BERT_outputs'
# torch.save does not create missing parent directories, and nothing earlier in the
# notebook creates this folder, so make sure it exists first.
os.makedirs(bert_outputs_dir, exist_ok=True)

for split_name, split_df in (("train", df_train), ("valid", df_valid), ("test", df_test)):
    y_split = torch.tensor(split_df['is_asshole'].values)
    torch.save(y_split, '{}/y_{}.pt'.format(bert_outputs_dir, split_name))

# The label tensors are not needed again in this notebook
del y_split, split_df

Initialise the BERT tokeniser based on the chosen model name

In [12]:
# Load the tokeniser belonging to the pretrained checkpoint named by MODEL_NAME
# (the same name is used further down to load the BERT model itself)
tokeniser = transformers.BertTokenizer.from_pretrained(MODEL_NAME)

Create dataloaders for the train, validation and test dataframes

In [13]:
# One loader per split, all built with the same tokeniser and hyperparameters
train_loader, valid_loader, test_loader = (
    create_data_loader(split_frame, tokeniser, MAX_LEN, BATCH_SIZE, TEXT_CHOSEN)
    for split_frame in (df_train, df_valid, df_test)
)

Inspect a batch from the dataloader

In [14]:
# Pull a single batch from the training loader and show its structure as a sanity check
num_tokens_to_print_per_text = 100
num_texts_to_print = 5
train_data_batch = next(iter(train_loader))
loader_keys = train_data_batch.keys()
print("Each dataloader batch is like a dictionary with keys:", loader_keys)
print(100*"-")
print("Shapes:")
print()
print("input_ids:         ", train_data_batch['input_ids'].shape)
print("attention_mask:    ", train_data_batch['attention_mask'].shape)
print("targets:           ", train_data_batch['targets'].shape)
# sample_text holds the raw strings of the batch (not a tensor), so report its length
print("sample_text:       ", len(train_data_batch['sample_text']))
print(100*"-")

# Never claim (or index) more tokens/texts than the batch actually contains
texts_in_batch, tokens_per_text = train_data_batch['input_ids'].shape
tokens_shown = min(num_tokens_to_print_per_text, tokens_per_text)
texts_shown = min(num_texts_to_print, texts_in_batch)
print("Here are the first {} tokens of the first {} tokenised texts in the batch:".format(tokens_shown, texts_shown))
print()
for i in range(texts_shown):
    current_sample_token_ids = train_data_batch['input_ids'][i, :tokens_shown]
    current_sample_tokens = tokeniser.convert_ids_to_tokens(current_sample_token_ids)
    print(current_sample_tokens)

Each dataloader batch is like a dictionary with keys: dict_keys(['input_ids', 'attention_mask', 'targets', 'sample_text'])
----------------------------------------------------------------------------------------------------
Shapes:

input_ids:          torch.Size([16, 20])
attention_mask:     torch.Size([16, 20])
targets:            torch.Size([16])
sample_text:        torch.Size([16])
----------------------------------------------------------------------------------------------------
Here are the first 100 tokens of the first 5 tokenised texts in the batch:

['[CLS]', 'AI', '##TA', 'for', 'declining', 'to', 'attend', 'and', 'handle', 'my', 'half', 'sisters', 'funeral', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
['[CLS]', 'AI', '##TA', 'Food', 'Safety', 'in', 'Austin', ',', 'Texas', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
['[CLS]', 'AI', '##TA', 'for', 'wanting', 'to', 'have', 'our', 'wedding', '2', 'months'

# Create BERT and send the primary data through it

In [15]:
# Which BERT output to collect: "pooled" selects 'pooler_output', "full" selects 'last_hidden_state'
BERT_OUT_CHOSEN = "full"      # in {pooled, full}

Use GPU if available

In [16]:
# Run on the first CUDA device when one is visible, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# Release cached GPU memory before the model is loaded
torch.cuda.empty_cache()
print(device)

cuda:0


Instantiate BERT using a custom configuration and freeze it

In [17]:
# Architecture settings handed to from_pretrained together with the checkpoint name
bert_config = transformers.BertConfig(
    vocab_size=28996,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    max_position_embeddings=512,
)

bert_model = transformers.BertModel.from_pretrained(MODEL_NAME, config=bert_config)
# BERT serves as a fixed feature extractor here: switch off gradient tracking for all of
# its parameters so the pretrained weights stay untouched and no gradients are kept
# in memory when samples are passed through it.
bert_model.requires_grad_(False)
bert_model = bert_model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Get BERT's output (pooled or full, as selected by `BERT_OUT_CHOSEN`) for the training/validation/test data

In [18]:
# Which split's loader is passed through BERT in the next cell; also part of the saved file name
data_to_get = "test"    # In {train, test, valid}

In [19]:
# Choose whether to feed the batches of the train, valid or test loader to BERT.
# The loaders are kept alive (not deleted) so this cell can be re-run, including
# with a different data_to_get, without rebuilding them.
loaders_by_split = {"train": train_loader, "valid": valid_loader, "test": test_loader}
if data_to_get not in loaders_by_split:
    raise ValueError("Invalid data_to_get selected")
loader = loaders_by_split[data_to_get]

# Resolve once, before any forward pass, which BERT output is collected
if BERT_OUT_CHOSEN == "pooled":
    bert_output_key = 'pooler_output'
elif BERT_OUT_CHOSEN == "full":
    bert_output_key = 'last_hidden_state'
else:
    raise ValueError("Invalid BERT_OUT_CHOSEN")

# Feature extraction only: make sure dropout is disabled and no autograd graph is built
bert_model.eval()
batch_outputs = []
with torch.no_grad():
    for i, data_batch in enumerate(loader):
        bert_output = bert_model(
            input_ids=data_batch['input_ids'].to(device),
            attention_mask=data_batch['attention_mask'].to(device))

        # Collect per-batch results and concatenate once at the end; concatenating
        # inside the loop would re-copy everything gathered so far on every batch.
        batch_outputs.append(bert_output[bert_output_key])

        # Drop the reference to the outputs that are not being kept
        del bert_output

        if (i + 1) % 10 == 0:
            print("Batch #{} through!".format(i + 1))

# Stack the batches along the sample dimension, in loader order
X = torch.cat(batch_outputs, 0)
del batch_outputs
torch.cuda.empty_cache()

print(X.shape)

Batch #10 through!
Batch #20 through!
Batch #30 through!
Batch #40 through!
Batch #50 through!
Batch #60 through!
Batch #70 through!
Batch #80 through!
Batch #90 through!
Batch #100 through!
Batch #110 through!
Batch #120 through!
Batch #130 through!
Batch #140 through!
Batch #150 through!
Batch #160 through!
Batch #170 through!
Batch #180 through!
Batch #190 through!
Batch #200 through!
Batch #210 through!
Batch #220 through!
Batch #230 through!
Batch #240 through!
Batch #250 through!
Batch #260 through!
Batch #270 through!
Batch #280 through!
Batch #290 through!
Batch #300 through!
Batch #310 through!
Batch #320 through!
Batch #330 through!
Batch #340 through!
Batch #350 through!
Batch #360 through!
Batch #370 through!
Batch #380 through!
Batch #390 through!
Batch #400 through!
Batch #410 through!
Batch #420 through!
Batch #430 through!
Batch #440 through!
Batch #450 through!
Batch #460 through!
Batch #470 through!
Batch #480 through!
Batch #490 through!
Batch #500 through!
Batch #51

In [20]:
# File name encodes the split, the text column and the BERT output type
x_path_template = '/content/drive/MyDrive/MastersProject/BERT_outputs/X_{}_{}_{}.pt'
x_path = x_path_template.format(data_to_get, TEXT_CHOSEN, BERT_OUT_CHOSEN)
torch.save(X, x_path)
# To load it again later: X = torch.load(x_path, map_location=torch.device('cpu'))