
# Imports

In [1]:
from argparse import Namespace

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5Model
import pytorch_lightning as pl

from src.models.modeling_conll2003 import T5ForConll2003
from src.models.modeling_utils import ConfigBase
from src.models.modeling_ner import ModelForNERBase
from src.models.evaluate import get_entities_from_tokens
from src.data.make_conll2003 import get_example_sets

# Features

In [2]:
model = T5ForConll2003.from_pretrained('t5-small', hparams=None)

In [3]:
examples = model.get_examples()

In [4]:
ex = examples['train'][0]

In [5]:
ex

Source: Peter Blackburn
Target: Peter Blackburn <PER>

In [6]:
ENTITIES_TOKENS = [
            '<O>',
            '<PER>',
            '<ORG>',
            '<LOC>',
            '<MISC>',
            '<Ent>'
        ]

In [7]:
LABELS_TOKENS = [
    "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"
]

In [8]:
class InputFeatures:
    
    def __init__(self, input_ids, attention_mask, label_ids, prediction_mask):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.label_ids = label_ids
        self.prediction_mask = prediction_mask

In [9]:
def convert_example_to_features(example, tokenizer, max_length):
    source = example.source
    target = example.target
    
    source_tokens = tokenizer.tokenize(source)
    target_tokens = tokenizer.tokenize(target)
    
    entities = get_entities_from_tokens(target_tokens, tokenizer, ENTITIES_TOKENS, length=len(source_tokens))
        
    assert len(source_tokens) == len(entities), f'{example}'
    
    source_tokens = source_tokens[:min(len(source_tokens), max_length - 1)]
    entities = entities[:min(len(source_tokens), max_length - 1)]
    
    # attention and prediction mask
    attention_mask = [1] * len(source_tokens)
    
    prediction_mask = []
    for word in source.split(' '):
        tokens = tokenizer.tokenize(word)
        prediction_mask += [1] + [0] * (len(tokens) - 1)
        
    prediction_mask = prediction_mask[:len(entities)]
        
    assert len(prediction_mask) == len(entities)
        
    # eos
    source_tokens += [tokenizer.eos_token]
    entities += [tokenizer.eos_token]
    attention_mask += [1]
    prediction_mask += [0]
    
    # padding
    missing = max(0, max_length - len(source_tokens))
    source_tokens += [tokenizer.pad_token] * missing
    attention_mask += [0] * missing
    prediction_mask += [0] * missing
    entities += [tokenizer.pad_token] * missing
    
    # to ids
    input_ids = tokenizer.convert_tokens_to_ids(source_tokens)
    
    # label_ids
    entities2ids = {k:i for i, k in enumerate(LABELS_TOKENS)}
    label_ids = [entities2ids.get(ent, -100) for ent in entities]
    
    assert len(input_ids) == len(attention_mask) == len(label_ids) == len(prediction_mask) == max_length
    
    return InputFeatures(input_ids, attention_mask, label_ids, prediction_mask)

In [10]:
feature = convert_example_to_features(ex, model.tokenizer, 320)

In [11]:
ex

Source: Peter Blackburn
Target: Peter Blackburn <PER>

In [12]:
class NERDataset(Dataset):
    
    def __init__(self, features):
        self.features = features
        
    def __len__(self,):
        return len(self.features)
    
    def __getitem__(self, idx):
        feat = self.features[idx]
        input_ids = torch.tensor(feat.input_ids, dtype=torch.long)
        attention_mask = torch.tensor(feat.attention_mask, dtype=torch.long)
        label_ids = torch.tensor(feat.label_ids, dtype=torch.long)
        prediction_mask = torch.tensor(feat.prediction_mask, dtype=torch.long)
        return input_ids, attention_mask, label_ids, prediction_mask

# New Base

In [13]:
class NewBase(ConfigBase):
    
    def _construct_examples_kwargs(self,):
        kwargs = {}
        return kwargs
    
    def get_features(self, examples):
        kwargs = {
            'max_length': self.max_length,
        }
        return [convert_example_to_features(ex, self.tokenizer, **kwargs) for ex in examples]
    
    def prepare_data(self,):
        if not self._has_cached_datasets():
            examples = self.get_examples()
            train_features = self.get_features(examples['train'])
            val_features = self.get_features(examples['valid'])
            test_features = self.get_features(examples['test'])
            
            self.train_dataset = NERDataset(train_features)
            self.val_dataset = NERDataset(val_features)
            self.test_dataset = NERDataset(test_features)

# New NER Base

In [14]:
class NewNERBase(ModelForNERBase, NewBase):
    
    
    def to_entities(self, samples):
        return [
            LABELS_TOKENS[y] for y in samples 
        ]
    
    def _handle_batch(self, batch):
        input_ids, attention_mask, label_ids, prediction_mask = batch
        outputs = self(input_ids=input_ids,
                       attention_mask=attention_mask,
                       label_ids=label_ids,
                       prediction_mask=prediction_mask)
        return outputs
    
    def _handle_eval_batch(self, batch):
        outputs = self._handle_batch(batch)
        active_labels = outputs[2]
        active_logits = outputs[1]
        
        predicted_entities = active_logits.argmax(-1).cpu().numpy().tolist()
        target_entities = active_labels.cpu().numpy().tolist()
        
        predicted_entities = self.to_entities(predicted_entities)
        target_entities = self.to_entities(target_entities)
        
        return outputs, target_entities, predicted_entities

# Modelo

In [15]:
class T5EncoderForNER(nn.Module):
    
    def __init__(self, pretrained_path, num_labels, weight_O=0.1):
        super().__init__()
        
        self.model = T5Model.from_pretrained(pretrained_path)
        self.num_labels = num_labels
        
        assert isinstance(weight_O, float) and 0 < weight_O < 1
        weights = [weight_O] + [1.] * (num_labels - 1)
        weights = torch.tensor(weights)
        self.loss_fct = torch.nn.CrossEntropyLoss()
        self.classifier = nn.Linear(self.model.config.hidden_size, num_labels)
        
    def active_logits_and_labels(self, logits, label_ids, prediction_mask=None):
        # take the active logits
        active_logits = logits.view(-1, self.num_labels)
        # take the active labels
        active_labels = label_ids.view(-1)
        if prediction_mask is not None:
            prediction_mask = prediction_mask.view(-1)
            active_logits = active_logits[prediction_mask == 1]
            active_labels = active_labels[prediction_mask == 1]
        return active_logits, active_labels
        
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        label_ids=None,
        prediction_mask=None,
    ):

        # Encode if needed (training, first prediction pass)
        encoder_outputs = self.model.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
            
        hidden_states = encoder_outputs[0]        
        logits = self.classifier(hidden_states)
        
        outputs = (logits,)

        if label_ids is not None:
            active_logits, active_labels = self.active_logits_and_labels(
                logits, label_ids, prediction_mask)
            # calc the loss
            loss = self.loss_fct(active_logits, active_labels)

            outputs = (loss, active_logits, active_labels) + outputs
            
        return outputs
        

In [16]:
class T5PL(T5EncoderForNER, pl.LightningModule):
    pass

In [17]:
class T5ForNER(NewNERBase, T5PL):

    def __init__(self, pretrained_path, num_labels, hparams):
        super(T5PL, self).__init__(pretrained_path, num_labels)

        self.hparams = hparams

        self.tokenizer = self.get_tokenizer()
        
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None

    def get_tokenizer(self,):
        pretrained_model = self.get_value_or_default_hparam(
            'pretrained_model', 't5-small')
        return T5Tokenizer.from_pretrained(pretrained_model)

# New Conll2003

In [18]:
class NewConll2003Base:
    @property
    def entities_tokens(self):
        return [
            '<O>',
            '<PER>',
            '<ORG>',
            '<LOC>',
            '<MISC>',
            '<Ent>'
        ]

    @property
    def labels2words(self,):
        return {
            'O': '[Other]',
            'PER': '[Person]',
            'LOC': '[Local]',
            'MISC': '[Miscellaneous]',
            'ORG': '[Organization]',
            'Ent': '[Ent]'
        }

    @property
    def entities2tokens(self,):
        return{
            '[Other]': '<O>',
            '[Person]': '<PER>',
            '[Local]': '<LOC>',
            '[Miscellaneous]': '<MISC>',
            '[Organization]': '<ORG>',
            '[Ent]': '<Ent>'
        }
    
    def get_tokenizer(self,):
        tokenizer = super().get_tokenizer()
        tokenizer.add_tokens(self.entities_tokens)
        return tokenizer
    
    def get_examples(self,):
        kwargs = self._construct_examples_kwargs()
        return get_example_sets(self.datapath, **kwargs)

In [19]:
class NewT5ForConll2003(NewConll2003Base, T5ForNER):
    pass

In [20]:
hparams = {
    "max_length": 320
}
hparams = Namespace(**hparams)

In [25]:
model = NewT5ForConll2003('t5-small', num_labels=9, hparams=hparams)
# model = T5EncoderForNER('t5-small', num_labels=9)

In [26]:
input_ids = torch.arange(32).unsqueeze(0)
input_ids.shape

torch.Size([1, 32])

In [27]:
logits = model(input_ids)[0]

True


# Training

In [29]:
trainer = pl.Trainer(fast_dev_run=True)

Running in fast_dev_run mode: will run a full train, val and test loop using a single batch
GPU available: True, used: False
No environment variable for node rank defined. Set as 0.


In [30]:
trainer.fit(model)


    | Name                                                                  | Type                  | Params
------------------------------------------------------------------------------------------------------------
0   | model                                                                 | T5Model               | 60 M  
1   | model.shared                                                          | Embedding             | 16 M  
2   | model.encoder                                                         | T5Stack               | 35 M  
3   | model.encoder.block                                                   | ModuleList            | 18 M  
4   | model.encoder.block.0                                                 | T5Block               | 3 M   
5   | model.encoder.block.0.layer                                           | ModuleList            | 3 M   
6   | model.encoder.block.0.layer.0                                         | T5LayerSelfAttention  | 1 M   
7   | model.encode

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

True
4


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

False
4



1

In [195]:
model.prepare_data()

In [196]:
batch = next(iter(model.test_dataloader()))

In [200]:
outputs = model._handle_batch(batch)

4


In [201]:
outputs[0]

tensor(2.1361)