In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "2"
from functools import partial
import nltk
from src.contextual_bart import ContextualisedBartModel,BartForContextualRecovery,SimplifiedBeamSearch
from src.dataset_processor import load_all_data
from src.utils import SmartCollator, get_args, setuptokenizer
from src.dataset_processor import (
    ContextGenerationDataset,
)
from transformers import BartTokenizer, BartConfig,BartForConditionalGeneration
from src.model_utils import CustomTrainer, get_training_arguments
import torch
from src.config import DATASET_PATH
from transformers.trainer_callback import EarlyStoppingCallback
import pickle as pk
import torch
from transformers import (    AutoTokenizer,
          AutoModelForSeq2SeqLM,
         LogitsProcessorList,    MinLengthLogitsProcessor, StoppingCriteriaList, MaxLengthCriteria,
         TopKLogitsWarper, TemperatureLogitsWarper,BeamSearchScorer,)

nltk.download("punkt")


def generate_tokenizer_and_data(args):
    """Load the train and dev records and wrap each split in a tokenised dataset.

    Args:
        args: Object exposing ``model_base``; the value is forwarded to
            ``setuptokenizer`` as ``model_base``.

    Returns:
        A 3-tuple ``(train_dataset, test_dataset, [train_data_packet, test_data_packet])``
        where the datasets are ``ContextGenerationDataset`` instances sharing one
        tokenizer and the packets are the raw records returned by ``load_all_data``.
    """
    # Load the raw records for both splits.
    train_data_packet = load_all_data(DATASET_PATH, mode="train")
    test_data_packet = load_all_data(DATASET_PATH, mode="dev")

    print(f"Training Data size: {len(train_data_packet)}")
    # The original printed "Training Data size" for this split as well, which
    # made the two log lines indistinguishable.
    print(f"Dev Data size: {len(test_data_packet)}")

    tokenizer = setuptokenizer(
        model_base=args.model_base,
        special_tokens=[],
    )
    # "[SEP]" is registered as an added token; its id is later looked up via
    # tokenizer.get_added_vocab() and used as the context delimiter.
    tokenizer.add_tokens(["[SEP]"])

    def _build_dataset(records):
        # Both splits are configured identically; only the records differ.
        dataset = ContextGenerationDataset(
            tokenizer=tokenizer, nb_records=len(records),
        )
        dataset.change_data_mode(1)  # NOTE(review): meaning of mode 1 is defined in src.dataset_processor
        dataset.set_record(records)
        return dataset

    train_dataset = _build_dataset(train_data_packet)
    test_dataset = _build_dataset(test_data_packet)

    return train_dataset, test_dataset, [train_data_packet, test_data_packet]



def model_init(
    vocab_size,
    context_delimiter_id,
    model_base="facebook/bart-base",
    use_random_restriction=False,
    section_prob=(0.25, 0.45),
    device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
):
    """Return a zero-argument factory that builds the contextual-recovery model.

    Nothing is loaded when ``model_init`` itself is called; the returned
    ``build_model`` closure does the work each time it is invoked.

    Args:
        vocab_size: Target size passed to ``resize_token_embeddings``.
        context_delimiter_id: Stored on the config as ``context_delimiter_id``.
        model_base: Pretrained checkpoint name used for both config and weights.
        use_random_restriction: Stored on the config under the same name.
        section_prob: Stored on the config under the same name.
        device: Device the built model is moved to. The default is evaluated
            once, when this function is defined.

    Returns:
        A callable taking no arguments and returning the model on ``device``.
    """

    def build_model():
        # Start from the base config and attach the project-specific settings.
        config = BartConfig.from_pretrained(model_base)
        config.context_delimiter_id = context_delimiter_id
        config.use_random_restriction = use_random_restriction
        config.section_prob = section_prob

        model = BartForContextualRecovery.from_pretrained(
            model_base,
            config=config,
            ignore_mismatched_sizes=True,
        )

        # Make the embedding matrix match the (possibly extended) vocabulary.
        model.resize_token_embeddings(vocab_size)  # type: ignore
        return model.to(device)  # type: ignore

    return build_model


[nltk_data] Downloading package punkt to /home/nlplab/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from dataclasses import dataclass
@dataclass
class Args:
    """Lightweight container for the settings `generate_tokenizer_and_data` reads from `args`."""

    # Pretrained model name; forwarded to `setuptokenizer` as `model_base`.
    model_base: str
    
args = Args(model_base="facebook/bart-base")
# Builds both datasets and also returns the raw record lists they were built from.
train_dataset, test_dataset,[train_data_packet,test_data_packet] = generate_tokenizer_and_data(args)

processing files:  ['processed_data/context_generation_train.csv']
processing files:  ['processed_data/context_generation_dev.csv']
Training Data size: 145670
Training Data size: 12151


In [28]:
train_dataset[0].section_point

43

In [21]:
train_dataset[67].labels.shape,train_dataset[67].input_ids.shape,train_dataset[67].section_point

(torch.Size([83]), torch.Size([84]), 33)

In [22]:
train_dataset[67].input_ids

tensor([    0, 35396, 19336, 16441,  1253,   680, 17089, 31062, 27192,     6,
        18603,   661, 11710,  2083,  7864,     6,     8,   559, 34580,  6805,
        22327,   405,     4,    20,  9388,  1548,     6, 31217,    30,  9019,
         3634, 39321, 17589,  4936,  1343,     8,  1918,  2013,  2636,  2211,
            8,   617,  2226,    30,  1030, 20875, 15281,   234,  4781, 19898,
            6, 50265,  5087,  8490,    29,  3519,   223, 18668,  2301,    35,
            5,   588,    12,  8331,  1460,     7,  1760,     4,  1868,     5,
         9388,  1548,     8, 37958,  1809,  3951,  2031,    25,   402,    61,
          531,    28,  5032, 22241,     4,     2])

In [3]:

# Id that the tokenizer assigned to the added "[SEP]" token.
context_delimiter_id = train_dataset.tokenizer.get_added_vocab()['[SEP]']

train_model_path = "trained_models_mtl/bart_base_model_section_point/checkpoint-45525/pytorch_model.bin"

# model_init returns a factory; the trailing () builds the model immediately.
generator = model_init(len(train_dataset.tokenizer),
                       context_delimiter_id=context_delimiter_id,
                       model_base=args.model_base,use_random_restriction=False)()

# Deserialise onto the CPU so the checkpoint loads regardless of which device
# it was saved from; load_state_dict then copies the values into the model's
# own parameters on whatever device model_init placed them.
state_dict = torch.load(train_model_path, map_location="cpu")
generator.load_state_dict(state_dict)

<All keys matched successfully>

In [4]:
# Separate single-record dataset that reuses the test tokenizer; used below to
# turn ad-hoc text into model inputs via `procesTexts`.
dataset = ContextGenerationDataset(test_dataset.tokenizer,nb_records=1,use_random_restrictive=True)
dataset.change_data_mode(1)  # same mode as the train/test datasets built earlier

In [5]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


["<s>Seven (stylized as Se7en) is a 1990 American crime thriller film directed by David Fincher and written by Andrew Kevin Walker. It stars Brad Pitt, Morgan Freeman, Gwyneth Paltrow, and [SEP] John C. McGinley. Set in a crime-ridden, unnamed city, Seven's plot follows disenchanted, near-retirement detective William Somerset (Freeman) and his new partner, the recently transferred David Mills (Pitt), as they attempt to stop a serial killer before he can complete a series of murders based on the seven deadly sins.</s>"]

In [6]:
class SimplifiedBeamSearch:
    """Run beam-sample decoding on a seq2seq generator and return decoded text.

    The encoder is run once on inputs repeated ``num_beams`` times, and the
    result is handed to the generator's ``beam_sample`` together with a
    ``BeamSearchScorer``, a top-k warper and a temperature warper.
    """

    def __init__(self, generator, tokenizer) -> None:
        """Store the model used for generation and the tokenizer used for decoding."""
        self.generator = generator
        self.tokenizer = tokenizer

    def generate(
        self,
        input_ids,
        attention_mask,
        num_beams=5,
        min_length=100,
        max_length=500,
        top_k=50,
        temperature=0.85,
    ):
        """Generate text for a batch of encoder inputs.

        Args:
            input_ids: Encoder token ids, one row per example.
            attention_mask: Mask matching ``input_ids``; its first dimension is
                taken as the batch size.
            num_beams: Number of beams per example.
            min_length: Accepted for interface compatibility but currently NOT
                applied; the EOS-suppression processor below uses a minimum
                length of 1 (see the NOTE in the body).
            max_length: Maximum length passed to ``beam_sample``.
            top_k: ``k`` for the top-k logits warper.
            temperature: Value for the temperature logits warper.

        Returns:
            The list of strings produced by ``tokenizer.batch_decode`` with
            special tokens skipped.
        """
        batch_size = attention_mask.shape[0]
        model_device = self.generator.device

        # One decoder start token per (example, beam) pair. The original
        # allocated only `num_beams` rows, which matches the repeated encoder
        # inputs and the scorer only when batch_size == 1.
        decoder_input_ids = torch.full(
            (batch_size * num_beams, 1),
            self.generator.config.decoder_start_token_id,
            dtype=torch.long,
            device=model_device,
        )

        # Generation only yields decoded strings, so no autograd graph is
        # needed for either the encoder pass or the beam search.
        with torch.no_grad():
            encoder_outputs = self.generator.get_encoder()(
                input_ids.repeat_interleave(num_beams, dim=0),
                attention_mask.repeat_interleave(num_beams, dim=0),
                return_dict=True,
            )
            model_kwargs = {"encoder_outputs": encoder_outputs}

            beam_scorer = BeamSearchScorer(
                batch_size=batch_size,
                num_beams=num_beams,
                device=model_device,
            )

            # NOTE(review): `min_length` is not wired in here. The minimum is
            # fixed at 1 so existing outputs stay unchanged; passing
            # `min_length` instead would force at least 100 tokens by default.
            logits_processor = LogitsProcessorList(
                [
                    MinLengthLogitsProcessor(
                        1, eos_token_id=self.generator.config.eos_token_id
                    )
                ]
            )
            logits_warper = LogitsProcessorList(
                [
                    TopKLogitsWarper(top_k),
                    TemperatureLogitsWarper(temperature),
                ]
            )

            outputs = self.generator.beam_sample(
                decoder_input_ids,
                beam_scorer,
                max_length=max_length,
                logits_processor=logits_processor,
                logits_warper=logits_warper,
                **model_kwargs,
            )

        return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)


In [36]:
from src.dataset_processor import ContextualGenerationData
from pytorch_lightning import seed_everything
data = ContextualGenerationData(input="""
                                Last week I talked with some of my students about what they wanted to do after they graduated, and what kind of job prospects  they thought they had. Given that I teach students who are training to be doctors, I was surprised do find that most thought that they would not be able to get the jobs they wanted without "outside help". "What kind of help is that?" I asked, expecting them to tell me that they would need a   or family friend to help them out. "Surgery ," one replied.
                                I was pretty alarmed by that response. It seems that the graduates of today are increasingly willing to go under the knife to get ahead of others when it comes to getting a job . One girl told me that she was considering surgery to increase her height. "They break your legs, put in special extending screws, and slowly expand the gap between the two ends of the bone as it re-grows, you can get at least 5 cm taller!" At that point, I was shocked. I am short, I can\'t deny that, but I don\'t think I would put myself through months of agony just to be a few centimetres taller.
                                """.replace("\n","").strip(),output="")

# Convert the single text record into model inputs.
batch = dataset.procesTexts(data)
# view(1, -1) reshapes each tensor into a one-row batch before moving it to `device`.
b_input_ids = batch.input_ids.view(1, -1).to(device)
b_input_mask = batch.attention_mask.view(1, -1).to(device)
# Displayed for inspection: the record's section point and the input shape.
batch.section_point, b_input_ids.shape

(104, torch.Size([1, 237]))

In [33]:
dataset.tokenizer.batch_decode(b_input_ids)

['<s>Last week I talked with some of my students about what they wanted to do after they graduated, and what kind of job prospects they thought they had. Given that I teach students who are training to be doctors, I was surprised do find that most thought that they would not be able to get the jobs they wanted without "outside help". "What kind of help is that?" I asked, expecting them to tell me that they would need a or family friend to help them out. "Surgery," one replied. I was pretty alarmed by that response. It seems that the graduates of [SEP] today are increasingly willing to go under the knife to get ahead of others when it comes to getting a job. One girl told me that she was considering surgery to increase her height. "They break your legs, put in special extending screws, and slowly expand the gap between the two ends of the bone as it re-grows, you can get at least 5 cm taller!" At that point, I was shocked. I am short, I can\'t deny that, but I don\'t think I would put m

In [35]:
# Fix the global seed before generating (presumably so the sampling-based
# beam search is repeatable between runs — confirm what seed_everything covers).
seed_everything(100)
bb= SimplifiedBeamSearch(generator,dataset.tokenizer)
# num_beams, max_length and temperature override the defaults of `generate`;
# top_k and min_length are left at their defaults.
bb.generate(input_ids=b_input_ids,
            attention_mask=b_input_mask,
            num_beams=10,
            max_length=270,
            temperature=0.99)

Global seed set to 100


torch.Size([10, 118, 768])


['Last week I talked with some of my students about what they wanted to do after they graduated, and what kind of job prospects they thought they had. Given that I teach students who are training to be doctors, I was surprised do find that most thought that they would not be able to get the jobs they wanted without "outside help". "What kind of help is that?" I asked, expecting them to tell me that they would need a family or family friend to help them out. "Surgery," one replied. It was pretty alarmed by that response. It seems that the graduates of today are increasingly willing to go under the knife to get ahead of others when it comes to getting a job. One girl told me that she was considering surgery to increase her height. "They break your legs, put in special extending screws, and slowly expand the gap between the two ends of the bone as it re-grows, you can get at least 5 cm taller!" At that point, I was shocked. I am short, I can\'t deny that, but I don\'t think I would put my

In [28]:
import torch
import numpy as np

In [56]:
def get_random_embedding_sections(batch_size, max_length, low=0.45, high=0.55):
    """Draw one random cut position per batch item.

    A fraction is sampled uniformly from ``[low, high)`` for every item using
    NumPy's global random state, scaled by ``max_length`` and rounded.

    Args:
        batch_size: Number of positions to draw.
        max_length: Length the sampled fractions are scaled by.
        low: Lower bound of the sampled fraction.
        high: Upper bound of the sampled fraction.

    Returns:
        A 1-D ``torch.long`` tensor with ``batch_size`` entries.
    """
    fractions = np.random.uniform(low=low, high=high, size=(batch_size,))
    # Cast to float32 before rounding, then to integer positions.
    scaled = torch.from_numpy(max_length * fractions).to(torch.float32)
    return scaled.round().long()

In [64]:
get_random_embedding_sections(15,200,)

tensor([ 93, 100, 100, 109,  91,  93, 103,  91,  94, 101, 100,  91, 102, 110,
        102])

In [32]:
import logging
import math
import random
from dataclasses import dataclass
from logging import Logger
from typing import List, Optional, Tuple, Union

import torch
import torch.nn as nn
from transformers import (
    BeamSearchScorer,
    LogitsProcessorList,
    MinLengthLogitsProcessor,
    TemperatureLogitsWarper,
    TopKLogitsWarper,
)
from transformers.models.bart.modeling_bart import (
    BartConfig,
    BartDecoder,
    BartEncoderLayer,
    BartLearnedPositionalEmbedding,
    BartPretrainedModel,
    BaseModelOutput,
    CrossEntropyLoss,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    _expand_mask,
    shift_tokens_right,
)

logger = logging.getLogger(__name__)


@dataclass
class EncoderOutputs(BaseModelOutput):
    """`BaseModelOutput` extended with the attention mask that matches the returned hidden states.

    `RestrictedBartEncoder.forward` fills `attention_mask` with the mask
    returned by its `_strip_context` step, so the mask lines up with the
    shortened `last_hidden_state`.
    """

    # Final hidden states after context stripping.
    last_hidden_state: torch.FloatTensor = None
    # Per-layer hidden states; only collected when output_hidden_states is set.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Per-layer attention weights; only collected when output_attentions is set.
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    # Attention mask aligned with `last_hidden_state`.
    attention_mask: torch.LongTensor = None

In [64]:
class RestrictedBartEncoder(BartPretrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`BartEncoderLayer`].

    After the last layer, a randomly sized leading section of every sequence is removed from the
    returned hidden states and attention mask (see `_strip_context`). The section size is drawn
    per sequence as a fraction of the sequence length from `config.section_prob`.

    Args:
        config: BartConfig. Must additionally define `section_prob` as a `(min, max)` pair.
        embed_tokens (nn.Embedding): optional token embedding to use instead of creating a new one
    """

    def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__(config)

        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        embed_dim = config.d_model
        self.padding_idx = config.pad_token_id

        # Bounds of the fraction of each sequence that is stripped from the output.
        self._min_section_prob, self._max_section_prob = config.section_prob
        self.max_source_positions = config.max_position_embeddings
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = nn.Embedding(
                config.vocab_size, embed_dim, self.padding_idx
            )

        self.embed_positions = BartLearnedPositionalEmbedding(
            config.max_position_embeddings,
            embed_dim,
        )
        self.layers = nn.ModuleList(
            [BartEncoderLayer(config) for _ in range(config.encoder_layers)]
        )
        self.layernorm_embedding = nn.LayerNorm(embed_dim)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def _get_random_embedding_sections(self, batch_size, max_length, low=0.20, high=0.6):
        """Draw one random cut position per batch item.

        A fraction is sampled uniformly from `[low, high)` with NumPy's global random state,
        scaled by `max_length` and rounded.

        :param batch_size: number of positions to draw
        :param max_length: length the sampled fractions are scaled by
        :param low: lower bound of the sampled fraction
        :param high: upper bound of the sampled fraction
        :return: 1-D `torch.long` tensor with `batch_size` entries
        """
        deletion_section_probs = np.random.uniform(size=(batch_size,), low=low, high=high)
        deletion_section = max_length * deletion_section_probs
        return torch.round(
            torch.FloatTensor(deletion_section),
        ).long()

    def _strip_context(self, input_ids, embeddings, attention_mask):
        """Drop a random leading section from every sequence and re-pad the batch.

        For each item a cut position is drawn from `_get_random_embedding_sections`; everything up
        to and including that position is removed from both the embeddings and the attention
        mask. The shortened rows are then right-padded with zeros to a common length.

        :param input_ids: unused; kept so existing call sites stay valid
        :param embeddings: tensor unpacked as (batch, sequence, hidden)
        :param attention_mask: mask with one row per item, or None (treated as all ones)
        :return: tuple (stripped embeddings, stripped attention mask)
        """
        batch_size, seq_len, _ = embeddings.shape

        # forward() declares attention_mask as optional; the original crashed here on None.
        if attention_mask is None:
            attention_mask = embeddings.new_ones((batch_size, seq_len), dtype=torch.long)

        cut_points = self._get_random_embedding_sections(
            batch_size,
            seq_len,
            self._min_section_prob,
            self._max_section_prob,
        ).tolist()

        kept_embeddings: List = []
        kept_masks: List = []
        for cut, embedding, mask in zip(cut_points, embeddings, attention_mask):
            # Always keep at least the final position so no row ends up empty
            # (only differs from the plain `cut + 1` when that would exceed the sequence).
            start = max(min(cut + 1, seq_len - 1), 0)
            kept_embeddings.append(embedding[start:])
            kept_masks.append(mask[start:])

        # pad_sequence right-pads with zeros and preserves each tensor's dtype and device.
        # The previous manual padding appended float zeros to the mask, so the mask dtype
        # changed whenever any row needed padding.
        batch_embeddings = nn.utils.rnn.pad_sequence(kept_embeddings, batch_first=True)
        batch_attention_masks = nn.utils.rnn.pad_sequence(kept_masks, batch_first=True)
        return batch_embeddings, batch_attention_masks

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module."""
        self.embed_tokens = value

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Encode the inputs, then strip a random leading section from the final hidden states.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Mutually exclusive with
                `inputs_embeds`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                When omitted, the stripping step uses an all-ones mask.
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
                representation.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. Per-layer states are collected
                before stripping; only the final `last_hidden_state` is stripped.
            return_dict (`bool`, *optional*):
                Whether or not to return an `EncoderOutputs` instead of a plain tuple.

        Returns:
            `EncoderOutputs` when `return_dict` is true, otherwise a tuple of the non-None values among
            (stripped hidden states, per-layer states, attentions, stripped attention mask).

        Raises:
            ValueError: if both or neither of `input_ids` / `inputs_embeds` are given, or if `head_mask`
                does not have one entry per layer.
        """
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"
            )
        elif input_ids is not None:
            position_source = input_ids
            input_ids = input_ids.view(-1, input_ids.shape[-1])
        elif inputs_embeds is not None:
            position_source = inputs_embeds[:, :, -1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        embed_pos = self.embed_positions(position_source)

        hidden_states = inputs_embeds + embed_pos
        hidden_states = self.layernorm_embedding(hidden_states)
        hidden_states = nn.functional.dropout(
            hidden_states, p=self.dropout, training=self.training
        )

        # Keep the 2-D mask for the stripping step; the layers get the expanded one.
        attention_mask_ = attention_mask

        # expand attention_mask
        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask_ = _expand_mask(attention_mask, inputs_embeds.dtype)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # check if head_mask has a correct number of layers specified if desired
        if head_mask is not None:
            if head_mask.size()[0] != (len(self.layers)):
                raise ValueError(
                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
                    f" {head_mask.size()[0]}."
                )

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = random.uniform(0, 1)
            if self.training and (
                dropout_probability < self.layerdrop
            ):  # skip the layer
                layer_outputs = (None, None)
            else:
                layer_head_mask = head_mask[idx] if head_mask is not None else None
                if self.gradient_checkpointing and self.training:

                    def create_custom_forward(module):
                        def custom_forward(*inputs):
                            return module(*inputs, output_attentions)

                        return custom_forward

                    layer_outputs = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(encoder_layer),
                        hidden_states,
                        attention_mask_,
                        layer_head_mask,
                    )
                else:
                    layer_outputs = encoder_layer(
                        hidden_states,
                        attention_mask_,
                        layer_head_mask=layer_head_mask,
                        output_attentions=output_attentions,
                    )

                hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        # Remove the random leading section and get the matching attention mask.
        hidden_states, batch_encoder_attention_masks = self._strip_context(
            input_ids, hidden_states, attention_mask
        )

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    encoder_states,
                    all_attentions,
                    batch_encoder_attention_masks,
                ]
                if v is not None
            )

        return EncoderOutputs(
            last_hidden_state=hidden_states,
            hidden_states=encoder_states,
            attentions=all_attentions,
            attention_mask=batch_encoder_attention_masks,
        )

In [65]:
import copy
# Work on a deep copy so the edits below do not touch the trained generator's config.
config = copy.deepcopy(generator.config)
# (min, max) pair unpacked by RestrictedBartEncoder.__init__ from config.section_prob.
config.section_prob = (0.2,0.65)
# Reuse the delimiter id stored on the trained generator's encoder.
config.context_delimiter_id = generator.model.get_encoder()._context_delimiter_id

In [60]:
generator.model.get_encoder()._context_delimiter_id

50265

In [None]:
# Build the notebook-defined encoder from the base checkpoint with the modified config.
# NOTE(review): this loads "facebook/bart-base" weights, not the fine-tuned checkpoint loaded into `generator`.
restrictive_encoder = RestrictedBartEncoder.from_pretrained("facebook/bart-base",config=config).to(device)

In [67]:
# Smoke-test the encoder on the single example repeated 4 times along the batch dimension.
ouut = restrictive_encoder(b_input_ids.repeat_interleave(4, dim=0),b_input_mask.repeat_interleave(4, dim=0))

In [52]:
b_input_ids.shape

torch.Size([1, 121])

In [None]:
from datasets import load_dataset

# NOTE(review): this rebinds `dataset`, which earlier cells use for the
# ContextGenerationDataset (e.g. `dataset.procesTexts`, `dataset.tokenizer`).
# Re-running those cells after this one will fail; consider a distinct name.
dataset = load_dataset("race",'all')

In [11]:
cc=dataset['train'].features["article"]

In [23]:
dataset['train'][1]['article'].replace('\n',' ')

'Last week I talked with some of my students about what they wanted to do after they graduated, and what kind of job prospects  they thought they had. Given that I teach students who are training to be doctors, I was surprised do find that most thought that they would not be able to get the jobs they wanted without "outside help". "What kind of help is that?" I asked, expecting them to tell me that they would need a   or family friend to help them out. "Surgery ," one replied. I was pretty alarmed by that response. It seems that the graduates of today are increasingly willing to go under the knife to get ahead of others when it comes to getting a job . One girl told me that she was considering surgery to increase her height. "They break your legs, put in special extending screws, and slowly expand the gap between the two ends of the bone as it re-grows, you can get at least 5 cm taller!" At that point, I was shocked. I am short, I can\'t deny that, but I don\'t think I would put myself