In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pickle
from collections import Counter
from tqdm import tqdm
import itertools
import pandas as pd
from itertools import islice
import numpy as np
from keras.preprocessing.sequence import pad_sequences
import random
!pip install transformers
import torch
from transformers import BertTokenizer, BertModel, BertConfig
from torch.nn import CrossEntropyLoss
from transformers import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Setting device on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

torch.backends.cudnn.deterministic = True

# Set the random seed manually for reproducibility.
torch.manual_seed(1234)

path = "drive/My Drive/FiQA/"

Using TensorFlow backend.


Using device: cuda

Tesla K80
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [0]:
from evaluate import *

In [0]:
def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

def remove_empty(test_set):
    for index, row in enumerate(test_set):
        for doc in row[1]:
            if doc in empty_docs:
                del test_set[index]
    return test_set

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_pickle(path, data):
    with open(path, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def pad_seq(seq, max_seq_len):
    # Pad each seq to be the same length to process in batch.
    # pad_token = 0
    if len(seq) >= max_seq_len:
        seq = seq[:max_seq_len]
    else:
        seq += [0]*(max_seq_len - len(seq))
    return seq

In [0]:
# dict mapping of token to idx
vocab = load_pickle(path + 'vocab_full.pickle')
# dict mapping of docid to doc text
docid_to_text = load_pickle(path + 'label_ans.pickle')

# dict mapping of qid to question text
qid_to_text = load_pickle(path + 'qid_text.pickle')

train_qid_rel = load_pickle(path + "qid_rel_train.pickle")
test_qid_rel = load_pickle(path + "qid_rel_test.pickle")
valid_qid_rel = load_pickle(path + "qid_rel_valid.pickle")

train_set = load_pickle(path + 'data/data_train_50.pickle')
valid_set = load_pickle(path + 'data/data_valid_50.pickle')

test_set = load_pickle(path + 'data/data_test_500_rel.pickle')
test_set_full = load_pickle(path + 'data/data_test_500.pickle')

empty_docs = load_pickle(path+'empty_docs.pickle')

In [5]:
train_set = [x for x in train_set if x[1] not in empty_docs]
valid_set = [x for x in valid_set if x[1] not in empty_docs]

test_set = remove_empty(test_set)
test_set_full = remove_empty(test_set_full)

print("Number of training samples: {}".format(len(train_set)))
print("Number of validation samples: {}".format(len(valid_set)))
print("Number of test samples: {}".format(len(test_set)))

Number of training samples: 283707
Number of validation samples: 31582
Number of test samples: 330


In [6]:
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


In [0]:
label_to_ans = load_pickle(path+"data-bert/label_to_ans.pickle")
qid_to_text = load_pickle(path+"data-bert/qid_to_text.pickle")

In [0]:
def add_question_token(q_tokens):
    c = ["[CLS]"]
    s = ["[SEP]"]
    q_tokens = c + q_tokens
    q_tokens = q_tokens + s

    return q_tokens

def add_ans_token(a_tokens):
    s = ["[SEP]"]
    a_tokens = a_tokens + s

    return a_tokens

def clip(lst):
    max_seq_len = 512
    if len(lst) > max_seq_len:
        lst = lst[:max_seq_len]
    else:
        lst = lst
    
    return lst

def get_input_ids(sequences, max_seq_len):
    # Tokenize all of the sentences and map the tokens to thier word IDs.
    input_ids = []

    for seq in sequences:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Map tokens to their IDs.
        encoded_seq = tokenizer.convert_tokens_to_ids(seq)
        
        # Add the encoded sentence to the list.
        input_ids.append(encoded_seq)

    input_ids = pad_sequences(input_ids, maxlen=max_seq_len, dtype="long", 
                          value=0, truncating="post", padding="post")
    return input_ids

def get_att_mask(input_ids):
    # Create attention masks
    attention_masks = []

    # For each sentence...
    for sent in input_ids:
        
        # Create the attention mask.
        #   - If a token ID is 0, then it's padding, set the mask to 0.
        #   - If a token ID is > 0, then it's a real token, set the mask to 1.
        att_mask = [int(token_id > 0) for token_id in sent]
        
        # Store the attention mask for this sentence.
        attention_masks.append(att_mask)

    return attention_masks

In [0]:
def get_sequence_df(dataset):
    df = pd.DataFrame(dataset)
    df = df.rename(columns={0: 'qid', 1: 'pos', 2:'neg'})
    df_pos = df[['qid', 'pos']]
    df_pos = df_pos.rename(columns={'pos': 'docid'})
    df_pos['label'] = df_pos.apply(lambda x: 1, axis=1)
    df_pos = df_pos.drop_duplicates()

    df_neg = df[['qid', 'neg']]
    df_neg = df_neg.rename(columns={'neg': 'docid'})
    df_neg['label'] = df_neg.apply(lambda x: 0, axis=1)
    data_df = pd.concat([df_pos, df_neg]).sort_values(by=['qid'])

    data_df['question'] = data_df['qid'].apply(lambda x: qid_to_text[x])
    data_df['ans_cand'] = data_df['docid'].apply(lambda x: label_to_ans[x])
    data_df['ques_token'] = data_df['question'].apply(lambda x: add_question_token(x))
    data_df['ans_cand'] = data_df['ans_cand'].apply(lambda x: add_ans_token(x))

    data_df = data_df[['qid', 'docid', 'label', 'ans_cand','ques_token']]
    data_df['seq'] = data_df['ques_token'] + data_df['ans_cand']

    data_df['seq_clipped'] = data_df['seq'].apply(clip)
    # train['len'] = train['seq_clipped'].apply(lambda x: len(x))

    return data_df

In [0]:
def get_pairwise_sequence_df(dataset):
    df = pd.DataFrame(dataset)
    df = df.rename(columns={0: 'qid', 1: 'pos_id', 2:'neg_id'})
    df['pos_label'] = df.apply(lambda x: 1, axis=1)
    df['neg_label'] = df.apply(lambda x: 0, axis=1)

    df['question'] = df['qid'].apply(lambda x: qid_to_text[x])
    df['pos_ans'] = df['pos_id'].apply(lambda x: label_to_ans[x])
    df['neg_ans'] = df['neg_id'].apply(lambda x: label_to_ans[x])

    df['ques_token'] = df['question'].apply(lambda x: add_question_token(x))
    df['pos_ans'] = df['pos_ans'].apply(lambda x: add_ans_token(x))
    df['neg_ans'] = df['neg_ans'].apply(lambda x: add_ans_token(x))

    df = df[['qid', 'pos_id', 'neg_id', 'pos_label', 'neg_label', 'pos_ans', 'neg_ans', 'ques_token']]
    df['pos_seq'] = df['ques_token'] + df['pos_ans']
    df['neg_seq'] = df['ques_token'] + df['neg_ans']

    df['pos_seq_clipped'] = df['pos_seq'].apply(clip)
    df['neg_seq_clipped'] = df['neg_seq'].apply(clip)

    return df

## **Pairwise**

In [0]:
trainset = get_pairwise_sequence_df(train_set)
validset = get_pairwise_sequence_df(valid_set)

# Get the lists of sentences and their labels.
train_pos_seq = trainset.pos_seq_clipped.values
train_neg_seq = trainset.neg_seq_clipped.values
train_pos_labels = trainset.pos_label.values
train_neg_labels = trainset.neg_label.values

valid_pos_seq = validset.pos_seq_clipped.values
valid_neg_seq = validset.neg_seq_clipped.values
valid_pos_labels = validset.pos_label.values
valid_neg_labels = validset.neg_label.values

print(len(train_pos_seq))
print(len(valid_pos_seq))

# train_pos_seq = train_pos_seq[:300]
# train_neg_seq = train_neg_seq[:300]
# train_pos_labels = train_pos_labels[:300]
# train_neg_labels = train_neg_labels[:300]

# valid_pos_seq = valid_pos_seq[:30]
# valid_neg_seq = valid_neg_seq[:30]
# valid_pos_labels = valid_pos_labels[:30]
# valid_neg_labels = valid_neg_labels[:30]

max_seq_len = 512

train_pos_input = get_input_ids(train_pos_seq, max_seq_len)
train_neg_input = get_input_ids(train_neg_seq, max_seq_len)
valid_pos_input = get_input_ids(valid_pos_seq, max_seq_len)
valid_neg_input = get_input_ids(valid_neg_seq, max_seq_len)

train_pos_mask = get_att_mask(train_pos_input)
train_neg_mask = get_att_mask(train_neg_input)
valid_pos_mask = get_att_mask(valid_pos_input)
valid_neg_mask = get_att_mask(valid_neg_input)

283707
31582


In [0]:
# save_pickle(path+'/data-bert/train_pos_labels.pickle', train_pos_labels)
# save_pickle(path+'/data-bert/train_neg_labels.pickle', train_neg_labels)
# save_pickle(path+'/data-bert/valid_pos_labels.pickle', valid_pos_labels)
# save_pickle(path+'/data-bert/valid_neg_labels.pickle', valid_neg_labels)

save_pickle(path+'/data-bert/train_pos_input_512.pickle', train_pos_input)
save_pickle(path+'/data-bert/train_neg_input_512.pickle', train_neg_input)
save_pickle(path+'/data-bert/valid_pos_input_512.pickle', valid_pos_input)
save_pickle(path+'/data-bert/valid_neg_input_512.pickle', valid_neg_input)

save_pickle(path+'/data-bert/train_pos_mask_512.pickle', train_pos_mask)
save_pickle(path+'/data-bert/train_neg_mask_512.pickle', train_neg_mask)
save_pickle(path+'/data-bert/valid_pos_mask_512.pickle', valid_pos_mask)
save_pickle(path+'/data-bert/valid_neg_mask_512.pickle', valid_neg_mask)

In [0]:
train_pos_labels = load_pickle(path+'/data-bert/train_pos_labels.pickle')
train_neg_labels = load_pickle(path+'/data-bert/train_neg_labels.pickle')
valid_pos_labels = load_pickle(path+'/data-bert/valid_pos_labels.pickle')
valid_neg_labels = load_pickle(path+'/data-bert/valid_neg_labels.pickle')

train_pos_input = load_pickle(path+'/data-bert/train_pos_input_512.pickle')
train_neg_input = load_pickle(path+'/data-bert/train_neg_input_512.pickle')
valid_pos_input = load_pickle(path+'/data-bert/valid_pos_input_512.pickle')
valid_neg_input = load_pickle(path+'/data-bert/valid_neg_input_512.pickle')

train_pos_mask = load_pickle(path+'/data-bert/train_pos_mask_512.pickle')
train_neg_mask = load_pickle(path+'/data-bert/train_neg_mask_512.pickle')
valid_pos_mask = load_pickle(path+'/data-bert/valid_pos_mask_512.pickle')
valid_neg_mask = load_pickle(path+'/data-bert/valid_neg_mask_512.pickle')

In [0]:
train_pos_inputs = torch.tensor(train_pos_input)
train_neg_inputs = torch.tensor(train_neg_input)
valid_pos_inputs = torch.tensor(valid_pos_input)
valid_neg_inputs = torch.tensor(valid_neg_input)

train_pos_labels = torch.tensor(train_pos_labels)
train_neg_labels = torch.tensor(train_neg_labels)
valid_pos_labels = torch.tensor(valid_pos_labels)
valid_neg_labels = torch.tensor(valid_neg_labels)

train_pos_masks = torch.tensor(train_pos_mask)
train_neg_masks = torch.tensor(train_neg_mask)
valid_pos_masks = torch.tensor(valid_pos_mask)
valid_neg_masks = torch.tensor(valid_neg_mask)

In [0]:
print(len(train_pos_inputs))
print(len(valid_pos_inputs))

283707
31582


In [0]:
# The DataLoader needs to know our batch size for training, so we specify it 
# here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.

batch_size = 8

# Create the DataLoader for our training set.
train_data = TensorDataset(train_pos_inputs, train_pos_masks, train_pos_labels, train_neg_inputs, train_neg_masks, train_neg_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(valid_pos_inputs, valid_pos_masks, valid_pos_labels, valid_neg_inputs, valid_neg_masks, valid_neg_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [0]:
print(len(train_dataloader))
print(len(validation_dataloader))

35464
3948


In [0]:
import torch.nn as nn

class BertPairwiseClassifier(nn.Module):
    def __init__(self, bert):
        
        super().__init__()

        self.config = BertConfig()
        self.num_labels = self.config.num_labels
        self.bert = bert
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.config.hidden_size, self.config.num_labels)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None):

        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds)

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        return logits

In [0]:
bert = BertModel.from_pretrained('bert-base-uncased')

model = BertPairwiseClassifier(bert)

# Tell pytorch to run this model on the GPU.
model.to(device)

BertPairwiseClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [0]:
def pairwise_loss(pos_scores, neg_scores):

    cross_entropy_loss = -torch.log(pos_scores) - torch.log(1 - neg_scores)

    margin = 0.2

    hinge_loss = torch.max(torch.tensor(0, dtype=torch.float).to(device), margin - pos_scores + neg_scores)

    loss = (0.5 * cross_entropy_loss + 0.5 * hinge_loss)

    return loss

In [0]:
def train_pairwise(model, train_dataloader, optimizer):

    # Store the average loss after each epoch so we can plot them.
    loss_values = []

    # Reset the total loss for this epoch.
    total_loss = 0
    nb_eval_steps, nb_eval_examples = 0, 0

    model.train()

    # For each batch of training data...
    for step, batch in enumerate(tqdm(train_dataloader)):

        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        pos_input = batch[0].to(device)
        pos_mask = batch[1].to(device)
        pos_labels = batch[2].to(device)

        neg_input = batch[3].to(device)
        neg_mask = batch[4].to(device)
        neg_labels = batch[5].to(device)

        model.zero_grad()        

        pos_scores = torch.sigmoid(model(pos_input, token_type_ids=None, attention_mask=pos_mask, labels=pos_labels))[:,1]
        neg_scores = torch.sigmoid(model(neg_input, token_type_ids=None, attention_mask=neg_mask, labels=neg_labels))[:,1]

        loss = pairwise_loss(pos_scores, neg_scores).mean()
        
        # Track the number of batches
        nb_eval_steps += 1

        # Accumulate the training loss over all of the batches
        total_loss += loss.item()
    
        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    return avg_train_loss

In [0]:
def validate_pairwise(model, validation_dataloader):

    model.eval()

    # Tracking variables 
    total_loss = 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in tqdm(validation_dataloader):
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        pos_input, pos_mask, pos_labels, neg_input, neg_mask, neg_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        
            pos_scores = torch.sigmoid(model(pos_input, token_type_ids=None, attention_mask=pos_mask, labels=pos_labels))[:,1]
            neg_scores = torch.sigmoid(model(neg_input, token_type_ids=None, attention_mask=neg_mask, labels=neg_labels))[:,1]

        loss = pairwise_loss(pos_scores, neg_scores).mean()

        # Track the number of batches
        nb_eval_steps += 1

        total_loss += loss.item()

    avg_loss = total_loss / len(validation_dataloader) 

    return avg_loss

In [0]:
optimizer = AdamW(model.parameters(), lr=0.001)

# Lowest validation lost
best_valid_loss = float('inf')

n_epochs = 2

for epoch in range(n_epochs):

    # Evaluate training loss
    train_loss = train_pairwise(model, train_dataloader, optimizer)
    # Evaluate validation loss
    valid_loss = validate_pairwise(model, validation_dataloader)
    
    # At each epoch, if the validation loss is the best
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), path + 'model/' + str(epoch+1)+'_model-bert-pairwise.pt')

    print("\n\n Epoch {}:".format(epoch+1))
    print("\t Train Loss: {}".format(round(train_loss, 3)))
    print("\t Validation Loss: {}\n".format(round(valid_loss, 3)))

  2%|▏         | 616/35464 [08:58<8:27:20,  1.14it/s]

In [0]:
torch.save(model.state_dict(), path + 'model/2_model-bert-pairwise.pt')

In [0]:
print('Memory Usage:')
print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

Memory Usage:
Allocated: 15.1 GB
Cached:    15.2 GB


## **Pointwise**

In [11]:
trainset = get_sequence_df(train_set)
validset = get_sequence_df(valid_set)

# Get the lists of sentences and their labels.
train_sequences = trainset.seq_clipped.values
train_labels = trainset.label.values

valid_sequences = validset.seq_clipped.values
valid_labels = validset.label.values

print(len(train_sequences))
print(len(valid_sequences))

train_sequences = train_sequences[:3000]
train_labels = train_labels[:3000]

valid_sequences = valid_sequences[:300]
valid_labels = valid_labels[:300]

max_seq_len = 512

train_input = get_input_ids(train_sequences, max_seq_len)
valid_input = get_input_ids(valid_sequences, max_seq_len)

train_att_mask = get_att_mask(train_input)
valid_att_mask = get_att_mask(valid_input)

298401
33143


In [0]:
# # train_labels = trainset.label.values
# # valid_labels = validset.label.values

# # save_pickle(path+'/data-bert/train_labels.pickle', train_labels)
# # save_pickle(path+'/data-bert/valid_labels.pickle', valid_labels)

# save_pickle(path+'/data-bert/train_input_512.pickle', train_input)
# save_pickle(path+'/data-bert/valid_input_512.pickle', valid_input)
# save_pickle(path+'/data-bert/train_mask_512.pickle', train_att_mask)
# save_pickle(path+'/data-bert/valid_mask_512.pickle', valid_att_mask)

In [0]:
# train_input = load_pickle(path+'/data-bert/train_input.pickle')
# valid_input = load_pickle(path+'/data-bert/valid_input.pickle')
# train_att_mask = load_pickle(path+'/data-bert/train_mask.pickle')
# valid_att_mask = load_pickle(path+'/data-bert/valid_mask.pickle')

train_input = load_pickle(path+'/data-bert/train_input_512.pickle')
valid_input = load_pickle(path+'/data-bert/valid_input_512.pickle')
train_att_mask = load_pickle(path+'/data-bert/train_mask_512.pickle')
valid_att_mask = load_pickle(path+'/data-bert/valid_mask_512.pickle')

train_labels = load_pickle(path+'/data-bert/train_labels.pickle')
valid_labels = load_pickle(path+'/data-bert/valid_labels.pickle')

In [0]:
# Convert all inputs and labels into torch tensors, the required datatype 
# for our model.
train_inputs = torch.tensor(train_input)
validation_inputs = torch.tensor(valid_input)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(valid_labels)

train_masks = torch.tensor(train_att_mask)
validation_masks = torch.tensor(valid_att_mask)

In [13]:
print(len(train_input))
print(len(valid_input))

3000
300


In [0]:
# The DataLoader needs to know our batch size for training, so we specify it 
# here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.

batch_size = 8

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [19]:
print(len(train_dataloader))
print(len(validation_dataloader))

375
38


In [0]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

## **Model**

In [0]:
import torch.nn as nn

class BertClassifier(nn.Module):
    def __init__(self, bert):
        
        super().__init__()

        self.config = BertConfig.from_pretrained("/content/drive/My Drive/FiQA/bert-lm/test_lm/config.json")
        # self.config = BertConfig()
        self.num_labels = self.config.num_labels
        self.bert = bert
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.config.hidden_size, self.config.num_labels)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None):

        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds)

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

In [17]:
bert = BertModel.from_pretrained('bert-base-uncased')

# model_path = "/content/drive/My Drive/FiQA/bert-lm/test_lm/"
# bert = BertModel.from_pretrained(model_path)

model = BertClassifier(bert)

# Tell pytorch to run this model on the GPU.
model.to(device)

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [22]:
# bert = BertModel.from_pretrained('bert-base-uncased')

model_path = "/content/drive/My Drive/FiQA/bert-lm/test_lm/"
bert = BertModel.from_pretrained(model_path)

model = BertClassifier(bert)

# Tell pytorch to run this model on the GPU.
model.to(device)

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28989, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [23]:
for step, batch in enumerate(tqdm(train_dataloader)):

    b_input_ids = batch[0].to(device)


100%|██████████| 375/375 [00:00<00:00, 4379.81it/s]


In [0]:
def train(model, train_dataloader, optimizer):

    # Store the average loss after each epoch so we can plot them.
    loss_values = []

    # Reset the total loss for this epoch.
    total_loss = 0
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    model.train()

    # For each batch of training data...
    for step, batch in enumerate(tqdm(train_dataloader)):

        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple.
        loss = outputs[0]

        logits = outputs[1]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

        # Accumulate the training loss over all of the batches
        total_loss += loss.item()
    
        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    acc = eval_accuracy/nb_eval_steps

    return avg_train_loss, acc

In [0]:
def validate(model, validation_dataloader):

    model.eval()

    # Tracking variables 
    total_loss = 0
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in tqdm(validation_dataloader):
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask,
                            labels=b_labels)
        
        loss = outputs[0]

        logits = outputs[1]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

        total_loss += loss.item()

    acc = eval_accuracy/nb_eval_steps
    avg_loss = total_loss / len(validation_dataloader) 

    return avg_loss, acc

In [0]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=0.001)

In [28]:
# Lowest validation lost
best_valid_loss = float('inf')

n_epochs = 1

for epoch in range(n_epochs):

    # Evaluate training loss
    train_loss, train_acc = train(model, train_dataloader, optimizer)
    
    # Evaluate validation loss
    valid_loss, valid_acc = validate(model, validation_dataloader)
    
    # At each epoch, if the validation loss is the best
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), path + 'model/' + str(epoch+1)+'_model-lm-test-bert-512.pt')

    print("\n\n Epoch {}:".format(epoch+1))
    print("\t Train Loss: {} | Train Accuracy: {}%".format(round(train_loss, 3), round(train_acc*100, 2)))
    print("\t Validation Loss: {} | Validation Accuracy: {}%\n".format(round(valid_loss, 3), round(valid_acc*100, 2)))

  0%|          | 0/375 [00:00<?, ?it/s]


RuntimeError: ignored

In [0]:
torch.save(model.state_dict(), path + 'model/2_model-bert-512.pt')

In [0]:
print(torch.cuda.get_device_name(0))
print('Memory Usage:')
print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

Tesla P100-PCIE-16GB
Memory Usage:
Allocated: 0.4 GB
Cached:    0.5 GB


In [0]:
for row in test_set:
    row[2] = [x for x in row[2] if x is not 0]

for row in test_set_full:
    row[2] = [x for x in row[2] if x is not 0]

In [0]:
# test_df = pd.DataFrame(test_set_full)
# test_df = test_df.rename(columns={0: 'qid', 1: 'pos', 2:'cand'})
# # test_df = test_df[['qid', 'cand']]

# test_df.head(5)

# test_pos = test_df[['qid', 'pos']]
# test_pos = test_pos.explode('pos')
# test_pos = test_pos.rename(columns={'pos': 'docid'})
# test_pos['label'] = test_pos.apply(lambda x: 1, axis=1)

# len(test_pos)

# test_neg = test_df[['qid', 'cand']]
# test_neg = test_neg.explode('cand')
# test_neg = test_neg.rename(columns={'cand': 'docid'})
# test_neg ['label'] = test_neg .apply(lambda x: 0, axis=1)

# test_neg.head(5)

# test_data = pd.concat([test_pos, test_neg]).sort_values(by=['qid'])

# test_data['question'] = test_data['qid'].apply(lambda x: qid_to_text[x])
# test_data['ans_cand'] = test_data['docid'].apply(lambda x: label_to_ans[x])
# test_data['ques_token'] = test_data['question'].apply(lambda x: add_question_token(x))
# test_data['ans_cand'] = test_data['ans_cand'].apply(lambda x: add_ans_token(x))

# test_data = test_data[['qid', 'docid', 'label', 'ans_cand','ques_token']]
# test_data['seq'] = test_data['ques_token'] + test_data['ans_cand']
# test_data['seq_clipped'] = test_data['seq'].apply(clip)

# test_data.head(5)

# docid_map = test_data[['docid', 'seq_clipped']]
# test_full_docid_to_seq = {}

# for index, row in docid_map.iterrows():
#     test_full_docid_to_seq[row['docid']] = row['seq_clipped']

# print(take(5, test_full_docid_to_seq.items()))

# save_pickle(path+'data-bert/test_full_docid_to_seq.pickle', test_full_docid_to_seq)

In [0]:
# test_docid_to_seq = load_pickle(path+'data-bert/test_docid_to_seq.pickle')
test_docid_to_seq = load_pickle(path+'data-bert/test_full_docid_to_seq.pickle')

In [0]:
def get_rank(model, test_set, qid_rel, max_seq_len):

    qid_pred_rank = {}

    model.eval()

    for i, seq in enumerate(tqdm(test_set)):
        
        qid, label, cands = seq[0], seq[1], seq[2]

        cands_id = np.array(cands)

        scores = []

        for docid in cands:

            seq_text = test_docid_to_seq[docid]

            encoded_seq = tokenizer.convert_tokens_to_ids(seq_text)

            input_ids = pad_seq(encoded_seq, max_seq_len)

            att_mask = torch.tensor([[int(token_id > 0) for token_id in input_ids]]).to(device)
            
            input_ids = torch.tensor([input_ids]).to(device)

            with torch.no_grad():
            # Forward pass, calculate logit predictions
                outputs = model(input_ids, token_type_ids=None, attention_mask=att_mask)

            logits = outputs[0]

            pred = torch.sigmoid(logits)

            # Move logits and labels to CPU
            pred = pred.detach().cpu().numpy()

            scores.append(pred[:,1][0])

        # Get the indices of the sorted similarity scores
        sorted_index = np.argsort(scores)[::-1]

        # Get the docid from the sorted indices
        ranked_ans = cands_id[sorted_index]

        # Dict - key: qid, value: ranked list of docids
        qid_pred_rank[qid] = ranked_ans

    return qid_pred_rank
    # MRR, average_ndcg, precision = evaluate(qid_pred_rank, qid_rel, k)

    # return qid_pred_rank, MRR, average_ndcg, precision

In [0]:
toy_test_label = dict(itertools.islice(test_qid_rel.items(), 10))
toy_test = test_set[:10]

In [0]:
model.load_state_dict(torch.load(path+'model/2_model-bert-full.pt'))

qid_pred_rank = get_rank(model, test_set_full, test_qid_rel, max_seq_len=256)

100%|██████████| 330/330 [40:27<00:00,  7.35s/it]


In [0]:
k = 10

num_q = len(toy_test)

MRR, average_ndcg, precision = evaluate(qid_pred_rank, test_qid_rel, k)

print("\n\nAverage nDCG@{} for {} queries: {}\n".format(k, num_q, average_ndcg))

print("MRR@{} for {} queries: {}\n".format(k, num_q, MRR))

print("Average Precision@{}: {}".format(1, precision))



Average nDCG@10 for 10 queries: 0.045931021765975764

MRR@10 for 10 queries: 0.04352209352209352

Average Precision@1: 0.04242424242424243


In [0]:
save_pickle(path+'rank/2_bert_test_full.pickle', qid_pred_rank)