# Procedural

Mount my drive and create a folder for the data if it doesn't already exist

In [1]:
# Make Google Drive available under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Ensure the project's data folder exists on the mounted drive and report which case applied
import os
data_dir = '/content/drive/MyDrive/MastersProject/data/'
if os.path.exists(data_dir):
    print("Folder already existed!")
else:
    os.makedirs(data_dir)
    print("Created the folder!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Folder already existed!


In [2]:
# Install the `transformers` package into the runtime (no version is pinned here)
!pip install transformers



In [3]:
import transformers
import pandas as pd
import torch
import numpy as np
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [4]:
# Seed passed as `random_state` to every train_test_split call below so the splits are reproducible
RANDOM_SEED = 42

# Prepare the data for BERT

Set the hyperparameters needed for data preparation

In [5]:
# Name of the pretrained checkpoint used for both the tokeniser and the BERT model
MODEL_NAME = "bert-base-cased"
# Passed as the tokeniser's max_length (texts are truncated/padded to it) and as BERT's max_position_embeddings
MAX_LEN = 512
# Fraction of the rows held out from training; the held-out part is later halved into validation and test
VALID_TEST_PROPORTION = 0.2
# Each training part has 13017 rows (see the split output) = 3 x 4339, so a part yields exactly three batches;
# the value also appears in the names of the saved feature files
BATCH_SIZE = 4339

Read the dataset as a pandas dataframe

In [6]:
# Read the cleaned dataset. The path is absolute, like every other Drive path in this notebook,
# so the cell does not depend on the current working directory.
df = pd.read_csv('/content/drive/MyDrive/MastersProject/data/aita_clean.csv')

# Build the model input as "<title> <body>". Missing titles/bodies are replaced by empty strings first,
# because string concatenation with NaN would otherwise turn the whole text into NaN.
df['text'] = df["title"].fillna("") + " " + df["body"].fillna("")

Specify the BERT dataset class

In [7]:
class AITADataset(Dataset):
    """Dataset that pairs each text with its target and tokenises the text on access.

    Args:
        texts: Sequence of texts indexable by integer position (the notebook passes a NumPy array).
        targets: Sequence of integer targets aligned with ``texts``.
        tokeniser: Callable used to encode one text; it is called with the keyword
            arguments shown in ``__getitem__`` and must return a mapping holding
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every text is truncated/padded to.
    """

    # Upon object instance creation, you feed the text samples, their targets, the tokeniser and the max length.
    def __init__(self, texts, targets, tokeniser, max_len):
        self.texts = texts
        self.targets = targets
        self.tokeniser = tokeniser
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    # This method is called when a batch is created. "item" is the index of each sample to be in batch.
    def __getitem__(self, item):
        """Encode the sample at position ``item``.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (flattened tensors),
            ``'targets'`` (scalar ``torch.long`` tensor) and ``'sample_text'`` (the raw string).
        """
        # Normally it is already a string
        text = str(self.texts[item])

        # Encode the current text with the tokeniser given to THIS dataset
        # (not with whatever global happens to be called `tokeniser`).
        encoding = self.tokeniser(
            text,
            truncation=True,
            max_length=self.max_len,
            add_special_tokens=True,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt')

        # With return_tensors='pt' the tokeniser is expected to return tensors with a leading
        # dimension of size 1; flattening drops it so that the DataLoader stacks samples
        # into (batch, max_len) instead of (batch, 1, max_len).
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        # Bundle the encoding with the target corresponding to it and the actual text
        dic_out = {'input_ids': input_ids,
                   'attention_mask': attention_mask,
                   'targets': torch.tensor(self.targets[item], dtype=torch.long),
                   'sample_text': text}

        return dic_out

Make a function that creates a dataloader for BERT

In [8]:
def create_data_loader(df, tokeniser, max_len, batch_size):
    """
    Wrap the `text` and `is_asshole` columns of `df` in an AITADataset and
    return a DataLoader that serves it in batches of `batch_size`
    (two worker processes, default ordering).
    """
    dataset = AITADataset(df.text.to_numpy(), df.is_asshole.to_numpy(), tokeniser, max_len)
    return DataLoader(dataset, batch_size=batch_size, num_workers=2)

Split the dataframes

In [9]:
def _split_in_half(frame):
    """Split `frame` into two equally sized random halves using the notebook-wide seed."""
    return train_test_split(frame, test_size=0.5, random_state=RANDOM_SEED)


# Hold out VALID_TEST_PROPORTION of the rows, then halve the held-out part into validation and test
df_train_big, df_test_valid = train_test_split(
    df, test_size=VALID_TEST_PROPORTION, random_state=RANDOM_SEED)
df_valid, df_test = _split_in_half(df_test_valid)

# Cut the training rows into six parts: first 2/3 vs 1/3, then repeated halving
df_train_1_2_3_4, df_train_5_6 = train_test_split(
    df_train_big, test_size=1/3, random_state=RANDOM_SEED)
df_train_1_2, df_train_3_4 = _split_in_half(df_train_1_2_3_4)
df_train_5, df_train_6 = _split_in_half(df_train_5_6)
df_train_1, df_train_2 = _split_in_half(df_train_1_2)
df_train_3, df_train_4 = _split_in_half(df_train_3_4)

# Report the shape of every split
split_report = [
    ("Train dataset big:", df_train_big),
    ("Train dataset 1:", df_train_1),
    ("Train dataset 2:", df_train_2),
    ("Train dataset 3:", df_train_3),
    ("Train dataset 4:", df_train_4),
    ("Train dataset 5:", df_train_5),
    ("Train dataset 6:", df_train_6),
    ("Valid dataset:", df_valid),
    ("Test dataset:", df_test),
]
for caption, frame in split_report:
    print(caption, frame.shape)

Train dataset big: (78102, 10)
Train dataset 1: (13017, 10)
Train dataset 2: (13017, 10)
Train dataset 3: (13017, 10)
Train dataset 4: (13017, 10)
Train dataset 5: (13017, 10)
Train dataset 6: (13017, 10)
Valid dataset: (9763, 10)
Test dataset: (9763, 10)


Save the targets of each dataset split created above

In [10]:
# Folder that receives one target tensor per split; create it if it is missing so torch.save cannot fail on the path
targets_dir = '/content/drive/MyDrive/MastersProject/BERT_outputs/split_in_6/'
os.makedirs(targets_dir, exist_ok=True)

# File stem -> dataframe whose `is_asshole` column is saved under that stem.
# The validation targets are saved as 'y_valid.pt', following the same 'y_<split>.pt' pattern as the others.
target_frames = {
    'y_train_1': df_train_1,
    'y_train_2': df_train_2,
    'y_train_3': df_train_3,
    'y_train_4': df_train_4,
    'y_train_5': df_train_5,
    'y_train_6': df_train_6,
    'y_valid': df_valid,
    'y_test': df_test,
}

for target_name, split_df in target_frames.items():
    # Targets are stored as 16-bit integers
    y_split = torch.tensor(split_df['is_asshole'].values, dtype=torch.short)
    torch.save(y_split, targets_dir + '{}.pt'.format(target_name))

# Do not leave the temporaries behind in the notebook namespace
del target_frames, target_name, split_df, y_split

Initialise the BERT tokeniser based on the chosen model name

In [11]:
# Load the pretrained tokeniser that belongs to the MODEL_NAME checkpoint; it is handed to create_data_loader below
tokeniser = transformers.BertTokenizer.from_pretrained(MODEL_NAME)

Create a dataloader for every split: the full training set, each of the six training parts, the validation set and the test set

In [12]:
# One loader per split. Every training part gets the loader built from ITS OWN dataframe
# (parts 4, 5 and 6 must not reuse df_train_3).
train_loader_big = create_data_loader(df_train_big, tokeniser, MAX_LEN, BATCH_SIZE)
train_loader_1 = create_data_loader(df_train_1, tokeniser, MAX_LEN, BATCH_SIZE)
train_loader_2 = create_data_loader(df_train_2, tokeniser, MAX_LEN, BATCH_SIZE)
train_loader_3 = create_data_loader(df_train_3, tokeniser, MAX_LEN, BATCH_SIZE)
train_loader_4 = create_data_loader(df_train_4, tokeniser, MAX_LEN, BATCH_SIZE)
train_loader_5 = create_data_loader(df_train_5, tokeniser, MAX_LEN, BATCH_SIZE)
train_loader_6 = create_data_loader(df_train_6, tokeniser, MAX_LEN, BATCH_SIZE)
valid_loader = create_data_loader(df_valid, tokeniser, MAX_LEN, BATCH_SIZE)
test_loader = create_data_loader(df_test, tokeniser, MAX_LEN, BATCH_SIZE)

Inspect a batch from the dataloader

In [13]:
# Pull the first batch of the first training part and show what a batch looks like
num_tokens_to_print_per_text = 100
train_data_batch = next(iter(train_loader_1))
loader_keys = train_data_batch.keys()
print("Each dataloader batch is like a dictionary with keys:", loader_keys)
print(100*"-")
print("Shapes:")
print()
print("input_ids:         ", train_data_batch['input_ids'].shape)
print("attention_mask:    ", train_data_batch['attention_mask'].shape)
print("targets:           ", train_data_batch['targets'].shape)
# 'sample_text' holds the raw strings of the batch, so report how many there are
# (previously this line printed the shape of 'targets' by mistake)
print("sample_text:       ", len(train_data_batch['sample_text']))
print(100*"-")
print("Here are the first {} tokens of the first 5 tokenised texts in the batch:".format(num_tokens_to_print_per_text))
print()
for i in range(0, 5):
    # Map the leading token ids of sample i back to their token strings
    current_sample_token_ids = train_data_batch['input_ids'][i,0:num_tokens_to_print_per_text]
    current_sample_tokens = tokeniser.convert_ids_to_tokens(current_sample_token_ids)
    print(current_sample_tokens)

Each dataloader batch is like a dictionary with keys: dict_keys(['input_ids', 'attention_mask', 'targets', 'sample_text'])
----------------------------------------------------------------------------------------------------
Shapes:

input_ids:          torch.Size([4339, 512])
attention_mask:     torch.Size([4339, 512])
targets:            torch.Size([4339])
sample_text:        torch.Size([4339])
----------------------------------------------------------------------------------------------------
Here are the first 100 tokens of the first 5 tokenised texts in the batch:

['[CLS]', 'AI', '##TA', 'for', 'not', 'letting', 'my', 'neighbor', 'from', 'the', 'building', 'next', 'door', 'climb', 'M', '##Y', 'roof', 'T', '##LD', '##R', 'at', 'bottom', 'I', '’', 'm', 'on', 'mobile', 'and', 'I', '’', 'm', 'also', 'angry', 'typing', 'So', 'I', 'live', 'in', 'a', 'big', 'city', 'in', 'a', 'very', 'lively', 'neighborhood', '.', 'I', 'live', 'across', 'from', 'bars', ',', 'strip', 'clubs', ',', 'live',

# Create BERT and send the primary data through it

Use GPU if available

In [14]:
# Run on the first CUDA device when PyTorch can see one, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Release any cached GPU memory before the model is loaded, then show the chosen device
torch.cuda.empty_cache()
print(device)

cuda:0


Instantiate BERT using a custom configuration and freeze it

In [15]:
# Explicit architecture description handed to from_pretrained.
# NOTE(review): these sizes presumably mirror the MODEL_NAME checkpoint; max_position_embeddings follows MAX_LEN — confirm they stay compatible if MAX_LEN changes.
bert_config = transformers.BertConfig(
    vocab_size=28996,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    max_position_embeddings=MAX_LEN,
)

# Load the pretrained weights into a model built from that configuration
bert_model = transformers.BertModel.from_pretrained(MODEL_NAME, config=bert_config)

# Freeze every weight: BERT is only used as a fixed feature extractor here, so nothing is fine-tuned
# and no gradients need to be kept for its parameters
for frozen_param in bert_model.parameters():
    frozen_param.requires_grad = False

# Move the frozen model to the selected device
bert_model = bert_model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Get BERT's pooled or unpooled output for the chosen training/validation/test split

In [16]:
# Switches for the extraction cell below: which split to push through BERT and which BERT output to keep
data_to_get = "train_3"             # In {train_big, train_1, train_2, train_3, train_4, train_5, train_6, test, valid}
pooled_or_unpooled = "unpooled"     # In {pooled, unpooled}

In [17]:
# Delete these directories from the runtime's filesystem.
# NOTE(review): presumably done to free disk space on the Colab VM — confirm nothing in this notebook needs them.
!rm -rf /usr/local/lib/python2.7
!rm -rf /swift
!rm -rf /tensorflow-2.0.0

In [18]:
# Choose whether to feed the batches of the train, valid or test loader to BERT
if data_to_get == "test":
    loader = test_loader
elif data_to_get == "train_big":
    loader = train_loader_big
elif data_to_get == "train_1":
    loader = train_loader_1
elif data_to_get == "train_2":
    loader = train_loader_2
elif data_to_get == "train_3":
    loader = train_loader_3
elif data_to_get == "train_4":
    loader = train_loader_4
elif data_to_get == "train_5":
    loader = train_loader_5
elif data_to_get == "train_6":
    loader = train_loader_6
elif data_to_get == "valid":
    loader = valid_loader
else:
    raise ValueError ("Invalid data_to_get selected")

del test_loader
del train_loader_big
del train_loader_1
del train_loader_2
del train_loader_3
del train_loader_4
del train_loader_5
del train_loader_6
del valid_loader

for i, data_batch in enumerate(loader):
    bert_output = bert_model(data_batch['input_ids'].to(device), data_batch['attention_mask'].to(device))

    if 'X' not in globals():
        if pooled_or_unpooled == "pooled":
            X = bert_output['pooler_output']
        elif pooled_or_unpooled == "unpooled":
            X = bert_output['last_hidden_state']
        else:
            raise ValueError
    else:
        print("yei!")
        if pooled_or_unpooled == "pooled":
            X = torch.cat((X, bert_output['pooler_output']), 0)
        elif pooled_or_unpooled == "unpooled":
            X = torch.cat((X, bert_output['last_hidden_state']), 0)
        else:
            raise ValueError

    del bert_output
    torch.cuda.empty_cache()

    print("Batch #{} through!".format(i + 1))

    torch.save(X.half(), '/content/drive/MyDrive/MastersProject/BERT_outputs/split_in_6/X_{}_{}_{}th4339.pt'.format(data_to_get, pooled_or_unpooled, i+1))
    del X
    torch.cuda.empty_cache()

RuntimeError: ignored

In [None]:
# torch.save(X, '/content/drive/MyDrive/MastersProject/BERT_outputs/X_{}_{}.pt'.format(data_to_get, pooled_or_unpooled))
# to load: X_train_pooled = torch.load('/content/drive/MyDrive/MastersProject/BERT_outputs/X_train_pooled.pt', map_location=torch.device('cpu'))