In [1]:
#!pip install transformers

In [9]:
import pandas as pd
import numpy as np
import transformers
from transformers import BertPreTrainedModel, BertTokenizer, BertModel, BertConfig
import torch
from torch.utils.data import DataLoader,TensorDataset, RandomSampler, SequentialSampler

tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')


I1005 15:36:00.003447 140317477144320 tokenization_utils.py:373] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/enlr/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [12]:
train_df =  pd.read_pickle('train_processed.pkl')
val_df =  pd.read_pickle('val_processed.pkl')

In [13]:
cls_id = tokenizer.convert_tokens_to_ids(tokenizer._cls_token)
sep_id = tokenizer.convert_tokens_to_ids(tokenizer._sep_token)


max_length = 512

def get_features_from_example(ex):
    input = ex.input.copy()
    pab = ex.pab_pos.copy()

    #add special tokens [CLS at beginning], [SEP at end], [optional SEP before pos]
    input = [cls_id]+input.tolist()+[sep_id]
    pab += 1
    
    #attention masking and padding
    mask = [1] * len(input)
    pad_length = max_length -len(input)
    #padding tokens and mask with 0
    input = input + [0]*pad_length
    mask = mask + [0]*pad_length

    assert len(input) == max_length
    assert len(mask) == max_length
    
    return input, mask, pab, int(ex.label)

In [6]:
def create_dataset(df):
    features = [get_features_from_example(df.iloc[i]) for i in range(len(df))]

    ids = torch.tensor([feature[0] for feature in features])
    masks = torch.tensor([feature[1] for feature in features])
    pabs = torch.tensor([feature[2] for feature in features])
    labels = torch.tensor([feature[3] for feature in features])

    print(ids.size(), masks.size(), pabs.size(), labels.size())

    return TensorDataset(ids, masks, pabs, labels)


In [11]:
train_dataset = create_dataset(train_df)

val_dataset = create_dataset(val_df)

torch.Size([2000, 512]) torch.Size([2000, 512]) torch.Size([2000, 3]) torch.Size([2000])
torch.Size([454, 512]) torch.Size([454, 512]) torch.Size([454, 3]) torch.Size([454])


In [129]:
r"""

Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
    **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
        Classification (or regression if config.num_labels==1) loss.
    **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
        list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
        of shape ``(batch_size, sequence_length, hidden_size)``:
        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    **attentions**: (`optional`, returned when ``config.output_attentions=True``)
        list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
    labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
    outputs = model(input_ids, labels=labels)
    loss, logits = outputs[:2]
"""
class BertForPronousResolution(BertPreTrainedModel):
    def __init__(self, config : BertConfig):
        super(BertForPronousResolution, self).__init__(config)
        
        self.bert = BertModel(config)
        
        #[P][A][B] classification layer
        self.classification = torch.nn.Linear(config.hidden_size * 3 , 3)
        
        self.init_weights()
    
    def forward(self, input_ids, attention_mask, pab, labels = None, token_type_ids = None ):
   
        output = self.bert(input_ids, attention_mask, token_type_ids, None, None)
        last_hidden_states = output[0]
        
        print(last_hidden_states[0], last_hidden_states[0][pab[0]])
        
        batches = last_hidden_states.size()[0]
        row_indexes = torch.arange(batches).unsqueeze(1) # row numbers in a column matrix
        pab_hidden_states = last_hidden_states[row_indexes, pab] #batch size x 3 x hidden size
        
        concatenated_states = pab_hidden_states.view(batches,-1)
        
        print(concatenated_states)
        logits = self.classification(concatenated_states)
        
        output = (logits,) + output[2:] #hidden states and attention if present
        
        if labels is not None:
            loss_fun = torch.nn.CrossEntropyLoss()
            loss = loss_fun(logits, labels)
            
            output = (loss,) + output
            
        return output

In [130]:
model = BertForPronousResolution.from_pretrained('bert-base-uncased')

I1005 16:40:02.835144 140317477144320 configuration_utils.py:151] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/enlr/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
I1005 16:40:02.836277 140317477144320 configuration_utils.py:168] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

I1005 16:40:03.004810 140317477144320 modeli

In [131]:
test_data = train_dataset[0:1]
test_data[2:]

(tensor([[62, 42, 45]]), tensor([1]))

In [132]:
model(test_data[0],test_data[1], test_data[2],test_data[3])

tensor([[-0.8597, -0.3160, -0.1135,  ..., -0.1742,  0.5477,  0.5910],
        [-0.4190,  0.3307,  0.3428,  ...,  0.0560,  0.2804, -0.2568],
        [ 1.5684, -0.4883,  0.9586,  ...,  0.8987,  0.0998,  1.1028],
        ...,
        [-0.0892, -0.4428,  0.4997,  ...,  0.2574,  0.2867, -0.4949],
        [ 0.0533, -0.3233,  0.1011,  ...,  0.0119,  0.0863, -0.2838],
        [-0.2587, -0.4198,  0.3070,  ...,  0.1589,  0.2921, -0.5746]],
       grad_fn=<SelectBackward>) tensor([[-0.0562, -1.0524,  0.1111,  ..., -0.2489,  0.2950, -0.1346],
        [ 0.5295,  0.0873,  0.3644,  ..., -0.4559,  0.2341,  0.3187],
        [ 0.3718,  0.0329,  1.0530,  ..., -0.1534,  0.0422,  0.3036]],
       grad_fn=<IndexBackward>)
tensor([[-0.0562, -1.0524,  0.1111,  ..., -0.1534,  0.0422,  0.3036]],
       grad_fn=<ViewBackward>)


(tensor(0.7031, grad_fn=<NllLossBackward>),
 tensor([[-0.5949,  0.2181, -0.3326]], grad_fn=<AddmmBackward>))