# Training With MLM and NSP

In [32]:
from transformers import BertTokenizer, BertForPreTraining, AdamW
import torch
import pandas as pd
import random
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Length (in tokens) that custom_tokenizer pads shorter sequences up to.
max_seq_length = 128

## Train Dataset

In [22]:
# Normalize the train dataset and discretize it into the integer range
# [1, 1000] so every value can be used directly as a BERT token id.
# Token id 0 is used for padding and 1001-1003 for CLS/SEP/MASK further
# down in this file, so real data must never map to 0.

df_train = pd.read_csv('./data_matrix.csv', header=0)
# feature_range starts at 0.001 (not the default 0) so that, after scaling by
# 1000 and rounding, each column's minimum becomes 1 rather than the padding id 0.
scaler = preprocessing.MinMaxScaler(feature_range=(0.001, 1.0))
names = df_train.columns

d_train = scaler.fit_transform(df_train)
d_rounded_train = np.round(d_train * 1000)
df1_train = pd.DataFrame(d_rounded_train, columns=names)

## reverse
# d_scaled_train = df1_train/1000
# d_reversed_train = scaler.inverse_transform(d_scaled_train)

In [23]:
# 50/50 NSP Training Data
#
# A random half of the rows keeps its original (wind, discover) pairing and is
# labelled 0; the other half has its discover columns shuffled among themselves
# and is labelled 1.

# .copy() so the label / shuffle assignments below write to an independent frame
df2_train = df1_train.sample(frac=0.5).copy()
df1_train = df1_train.drop(index=df2_train.index)

df1_train['label'] = 0
df2_train['label'] = 1

# .values drops the index so the shuffled rows are assigned by position
df2_train[['discover_v', 'discover_d', 'discover_t']] = \
    df2_train[['discover_v', 'discover_d', 'discover_t']].sample(frac=1).values

# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
df1_train = (
    pd.concat([df1_train, df2_train])
    .sample(frac=1)
    .reset_index(drop=True)
)

df1_train.to_csv('output_train.csv', index=False)

## Test Dataset

In [None]:
# Normalize the test dataset and discretize it into the integer range
# [1, 1000] so every value can be used directly as a BERT token id.
# Token id 0 is used for padding and 1001-1003 for CLS/SEP/MASK further
# down in this file, so real data must never map to 0.
#
# NOTE(review): this rebinds `scaler` and `names` and fits a fresh scaler on
# the test data, so a given token id may stand for different raw values in
# train and test -- confirm whether the train scaler should be reused instead.

df_test = pd.read_csv('./data_test.csv', header=0)
# feature_range starts at 0.001 (not the default 0) so that, after scaling by
# 1000 and rounding, each column's minimum becomes 1 rather than the padding id 0.
scaler = preprocessing.MinMaxScaler(feature_range=(0.001, 1.0))
names = df_test.columns

d_test = scaler.fit_transform(df_test)
d_rounded_test = np.round(d_test * 1000)
df1_test = pd.DataFrame(d_rounded_test, columns=names)

## reverse
# Map the discretized values back to (approximately) the original units.
d_scaled_test = df1_test / 1000
d_reversed_test = scaler.inverse_transform(d_scaled_test)

In [None]:
# 50/50 NSP Test Data
#
# A random half of the rows keeps its original (wind, discover) pairing and is
# labelled 0; the other half has its discover columns shuffled among themselves
# and is labelled 1.

# .copy() so the label / shuffle assignments below write to an independent frame
df2_test = df1_test.sample(frac=0.5).copy()
df1_test = df1_test.drop(index=df2_test.index)

df1_test['label'] = 0
df2_test['label'] = 1

# .values drops the index so the shuffled rows are assigned by position
df2_test[['discover_v', 'discover_d', 'discover_t']] = \
    df2_test[['discover_v', 'discover_d', 'discover_t']].sample(frac=1).values

# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
df1_test = (
    pd.concat([df1_test, df2_test])
    .sample(frac=1)
    .reset_index(drop=True)
)

# Written to its own file: the original wrote to 'output_train.csv', which
# overwrote the training output produced earlier in this file.
df1_test.to_csv('output_test.csv', index=False)

## Custom Tokenizer

In [24]:
# Module-level buffers filled by custom_tokenizer. They are emptied at the
# start of every call, so after a call they describe that call only.
labels = []
input_ids = []
token_type_ids = []
attention_mask = []
tokens = []
segment_ids = []

## Examples: Dict in the form {'seq_wind', 'seq_discover', 'label'}
# 'seq_wind': [(first_triplet), (second_triplet), (third_triplet)]
# 'seq_discover': [(first_triplet), (second_triplet), (third_triplet)]
# 'label': [(0,0,0), (1,1,1), (0,0,0)]

def custom_tokenizer(examples, max_len=None):
    """Turn a dict of numeric triplets into a single BERT-style sequence.

    All triplets of ``examples['seq_wind']`` are flattened into segment 0 as
    ``CLS(1001) ... SEP(1002)`` and all triplets of
    ``examples['seq_discover']`` into segment 1 as ``... SEP(1002)``. Keys are
    processed in the dict's iteration order; unknown keys are ignored. The
    entries of ``examples['label']`` are copied into the module-level
    ``labels`` list.

    The module-level buffers are cleared on entry. The original version never
    reset them, so a second call also contained everything from earlier calls.

    Args:
        examples: dict with any of the keys 'seq_wind', 'seq_discover',
            'label' (see the comment above this function).
        max_len: length the sequence is padded up to with token id 0. Longer
            sequences are left untouched (never truncated). Defaults to the
            module-level ``max_seq_length``.

    Returns:
        dict with 'input_ids' and 'token_type_ids' (int64) and
        'attention_mask' (float), each a tensor with a single row.
    """
    if max_len is None:
        max_len = max_seq_length

    cls_id, sep_id, pad_id = 1001, 1002, 0

    # Start every call from empty buffers (in place, so outside references to
    # these lists -- e.g. `labels` -- keep pointing at the live objects).
    for buffer in (labels, input_ids, token_type_ids, attention_mask,
                   tokens, segment_ids):
        buffer.clear()

    for key, value in examples.items():

        if key == "label":
            labels.extend(value)

        elif key == "seq_wind":
            tokens.append(cls_id)
            segment_ids.append(0)
            for triplet in value:
                for el in triplet:
                    tokens.append(int(el))
                    segment_ids.append(0)
            tokens.append(sep_id)
            segment_ids.append(0)

        elif key == "seq_discover":
            for triplet in value:
                for el in triplet:
                    tokens.append(int(el))
                    segment_ids.append(1)
            tokens.append(sep_id)
            segment_ids.append(1)

    # Real tokens get attention 1, padding (added below) gets 0.
    input_mask = [1] * len(tokens)

    missing = max_len - len(tokens)
    if missing > 0:
        tokens.extend([pad_id] * missing)
        input_mask.extend([0] * missing)
        segment_ids.extend([0] * missing)

    # Store copies so the row lists are not aliases of the working buffers.
    input_ids.append(list(tokens))
    token_type_ids.append(list(segment_ids))
    attention_mask.append(input_mask)

    items = {'input_ids': torch.tensor(input_ids, dtype=torch.int64)}
    items['token_type_ids'] = torch.tensor(token_type_ids, dtype=torch.int64)
    items['attention_mask'] = torch.tensor(attention_mask, dtype=torch.float)

    return items

In [25]:
# Collect the train columns in the dict layout custom_tokenizer expects.
# Key order matters: the tokenizer emits segments in dict iteration order.
tr_dataset = {}
tr_dataset['seq_wind'] = df1_train[['wind_v', 'wind_d', 'wind_t']].values.tolist()
tr_dataset['seq_discover'] = df1_train[['discover_v', 'discover_d', 'discover_t']].values.tolist()
tr_dataset['label'] = df1_train[['label']].values.tolist()

train_inputs = custom_tokenizer(tr_dataset)
# `labels` is the module-level list custom_tokenizer fills with one [label]
# entry per row; flatten it to a 1-D tensor. reshape(-1) replaces the former
# `.T.squeeze()`, because `.T` on a 3-D tensor is deprecated in PyTorch.
train_inputs['next_sentence_label'] = torch.tensor(labels, dtype=torch.long).reshape(-1)
# MLM targets: an untouched copy of the ids taken before masking.
train_inputs['labels'] = train_inputs['input_ids'].detach().clone()


## Masking tokens in the input_ids tensor using the 15% probability for MLM
rand = torch.rand(train_inputs['input_ids'].shape)
# Never mask CLS (1001), SEP (1002) or padding (0).
mask_arr = (rand < 0.15) & (train_inputs['input_ids'] != 1001) & \
           (train_inputs['input_ids'] != 1002) & (train_inputs['input_ids'] != 0)

# Replace every selected position with the MASK id (1003) in one step.
train_inputs['input_ids'][mask_arr] = 1003

In [None]:
# Collect the test columns in the dict layout custom_tokenizer expects.
# Key order matters: the tokenizer emits segments in dict iteration order.
te_dataset = {}
te_dataset['seq_wind'] = df1_test[['wind_v', 'wind_d', 'wind_t']].values.tolist()
te_dataset['seq_discover'] = df1_test[['discover_v', 'discover_d', 'discover_t']].values.tolist()
te_dataset['label'] = df1_test[['label']].values.tolist()

# custom_tokenizer writes into module-level lists; start from empty buffers so
# the test encodings and labels cannot inherit anything from an earlier call.
for _buffer in (labels, input_ids, token_type_ids, attention_mask, tokens, segment_ids):
    _buffer.clear()

# Tokenize the *test* data (the original passed tr_dataset here by mistake).
test_inputs = custom_tokenizer(te_dataset)
# `labels` now holds one [label] entry per test row; flatten it to 1-D.
# reshape(-1) replaces the former `.T.squeeze()`, because `.T` on a 3-D tensor
# is deprecated in PyTorch.
test_inputs['next_sentence_label'] = torch.tensor(labels, dtype=torch.long).reshape(-1)
# MLM targets: an untouched copy of the ids taken before masking.
test_inputs['labels'] = test_inputs['input_ids'].detach().clone()


## Masking tokens in the input_ids tensor using the 15% probability for MLM
rand = torch.rand(test_inputs['input_ids'].shape)
# Never mask CLS (1001), SEP (1002) or padding (0).
mask_arr = (rand < 0.15) & (test_inputs['input_ids'] != 1001) & \
           (test_inputs['input_ids'] != 1002) & (test_inputs['input_ids'] != 0)

# Replace every selected position with the MASK id (1003) in one step.
test_inputs['input_ids'][mask_arr] = 1003

## Create Datasets

In [26]:
class NASADataset(torch.utils.data.Dataset):
    """Fixed-length chunks cut from the first row of each encoding tensor.

    Every entry of ``encodings`` except 'next_sentence_label' is expected to
    be a 2-D tensor. Only its first row is used: the row is truncated to a
    multiple of ``batch_size`` and reshaped to ``(size, batch_size)``, where
    ``size`` is derived from 'input_ids'. 'next_sentence_label' is stored
    unchanged and indexed with the same chunk index.

    NOTE(review): nothing here aligns 'next_sentence_label' with the chunks;
    item ``i`` simply receives label ``i`` -- confirm this pairing is intended.

    Args:
        encodings: dict of tensors; must contain 'input_ids'.
        batch_size: number of tokens per chunk (despite the name, this is a
            sequence length, not a DataLoader batch size). The original code
            ignored this argument and always used 510, which is still the
            default.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """

    def __init__(self, encodings, batch_size=510):
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        self.counter = 0  # not used by this class; kept for compatibility
        chunk_len = int(batch_size)
        # Number of complete chunks that fit into the input_ids row.
        self.size = encodings['input_ids'].shape[1] // chunk_len
        self.encodings = {}

        for key, value in encodings.items():
            if key != 'next_sentence_label':
                # Drop the trailing remainder so the row reshapes evenly.
                usable = (value.shape[1] // chunk_len) * chunk_len
                self.encodings[key] = value[0][:usable].reshape(self.size, chunk_len)
            else:
                self.encodings[key] = value

    def __getitem__(self, idx):
        """Return item ``idx`` as a dict of independent (detached) tensor copies."""
        # clone().detach() is the copy PyTorch recommends over torch.tensor(tensor)
        return {key: val[idx].clone().detach() for key, val in self.encodings.items()}

    def __len__(self):
        """Number of chunks (rows of the reshaped 'input_ids')."""
        return len(self.encodings['input_ids'])

In [27]:
# Wrap the training encodings in a dataset and serve them in shuffled batches.
train_dataset = NASADataset(train_inputs)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=512,
    shuffle=True,
)

# test_dataset = NASADataset(test_inputs)
# test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1)

## BERT Model 

In [35]:
# Load the pretrained 'bert-base-uncased' checkpoint into the pre-training
# model class, then move it onto the selected device.
model = BertForPreTraining.from_pretrained('bert-base-uncased')
model = model.to(device)

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# Create the optimizer over all model parameters, then put the model into
# training mode.
optim = AdamW(params=model.parameters(), lr=5e-5)
model.train()

In [None]:
from tqdm.notebook import tqdm

# Pre-train for a fixed number of epochs, recording the summed batch loss of
# each epoch in `train_loss` and plotting it at the end.
epochs = 100
train_loss = []
test_loss = []
for epoch in range(epochs):

    train_loop = tqdm(train_loader, leave=True)
    # test_loop = tqdm(test_loader, leave=True)

    # Reset the accumulators once per epoch. The original reset them inside
    # the batch loop, so only the last batch's loss was ever recorded.
    train_temp_loss = 0
    test_temp_loss = 0

    for batch in train_loop:

        optim.zero_grad()

        input_ids_batch = batch['input_ids'].to(device)
        token_type_ids_batch = batch['token_type_ids'].to(device)
        attention_mask_batch = batch['attention_mask'].to(device)
        next_sentence_label_batch = batch['next_sentence_label'].to(device)
        labels_batch = batch['labels'].to(device)

        # ---------- FORWARD ----------
        outputs = model(input_ids_batch, attention_mask=attention_mask_batch,
                        token_type_ids=token_type_ids_batch,
                        next_sentence_label=next_sentence_label_batch,
                        labels=labels_batch)
        loss = outputs.loss

        # ---------- BACKWARD ----------
        loss.backward()
        optim.step()

        # ---------- LOG ----------
        batch_loss = loss.item()
        train_loop.set_description(f'Epoch {epoch+1}')
        train_loop.set_postfix(loss=batch_loss)

        train_temp_loss += batch_loss


    # ## Evaluate on Test Set
    # with torch.no_grad():

    #     for batch in test_loop:

    #         input_ids_batch = batch['input_ids'].to(device)
    #         token_type_ids_batch = batch['token_type_ids'].to(device)
    #         attention_mask_batch = batch['attention_mask'].to(device)
    #         next_sentence_label_batch = batch['next_sentence_label'].to(device)
    #         labels_batch = batch['labels'].to(device)

    #         outputs = model(input_ids_batch, attention_mask=attention_mask_batch,
    #                         token_type_ids=token_type_ids_batch,
    #                         next_sentence_label=next_sentence_label_batch,
    #                         labels=labels_batch)
    #         test_inst_loss = outputs.loss

    #         # ---------- LOG ----------
    #         test_loop.set_description(f'Epoch {epoch+1}')
    #         test_loop.set_postfix(loss=test_inst_loss.item())
    #         test_temp_loss += test_inst_loss.item()

    # One entry per epoch: the sum of that epoch's batch losses.
    train_loss.append(train_temp_loss)
    #test_loss.append(test_temp_loss)

plt.plot(train_loss)