In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "2"
from functools import partial
import nltk
from src.contextual_bart import ContextualisedBartModel,BartForContextualRecovery,SimplifiedBeamSearch
from src.dataset_processor import load_all_data
from src.utils import SmartCollator, get_args, setuptokenizer
from src.dataset_processor import (
    ContextGenerationDataset,
)
from transformers import BartTokenizer, BartConfig,BartForConditionalGeneration
from src.model_utils import CustomTrainer, get_training_arguments
import torch
from src.config import DATASET_PATH
from transformers.trainer_callback import EarlyStoppingCallback
import pickle as pk
import torch
from transformers import (    AutoTokenizer,
          AutoModelForSeq2SeqLM,
         LogitsProcessorList,    MinLengthLogitsProcessor, StoppingCriteriaList, MaxLengthCriteria,
         TopKLogitsWarper, TemperatureLogitsWarper,BeamSearchScorer,)

# Fetch the NLTK "punkt" package (no-op when it is already up to date).
nltk.download("punkt")

# NOTE(review): this rebinds DATASET_PATH and therefore replaces the value
# imported from src.config in the import block above — confirm that the
# override is intentional.
DATASET_PATH = "summarisation_data/"

def generate_data():
    """Load the train, dev and test splits found under ``DATASET_PATH``.

    Prints the number of records in the train and test splits.

    Returns:
        A ``(train, dev, test)`` tuple holding whatever ``load_all_data``
        returns for each mode.
    """
    # load the dataset
    train_data_packet = load_all_data(DATASET_PATH, mode="train")
    dev_data_packet = load_all_data(DATASET_PATH, mode="dev")
    test_data_packet = load_all_data(DATASET_PATH, mode="test")

    print(f"Training Data size: {len(train_data_packet)}")
    # This line reports the test split, so it must not be labelled "Training".
    print(f"Test Data size: {len(test_data_packet)}")
    return train_data_packet, dev_data_packet, test_data_packet

[nltk_data] Downloading package punkt to /home/nlplab/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Load the three splits; generate_data() also prints the split sizes.
train_data_packet,dev_data_packet,test_data_packet = generate_data()

processing files:  ['summarisation_data/xsum_train.csv']
processing files:  ['summarisation_data/xsum_dev.csv']
processing files:  ['summarisation_data/xsum_test.csv']
Training Data size: 203083
Training Data size: 11322


In [6]:
train_data_packet[0]

ContextualGenerationData(input='McGeady will be reunited with new Black Cats boss Simon Grayson, for whom he played 35 games, scoring eight times, at Preston North End last season. The 31-year-old, who began his career at Celtic, joined Everton from Spartak Moscow in January 2014 and subsequently played 41 games, scoring once. He has also made 90 appearances for the Republic of Ireland. Find all the latest football transfers on our dedicated page.', output='Everton winger Aiden McGeady is to join Championship side Sunderland on a permanent deal.')

In [2]:
# (low, high) ranges read by extendData: a fraction is drawn uniformly from the
# range and multiplied by the passage word count to get a split position.
extreme_boundary = (0.7,0.9)
moderate_boundary  = (0.45,0.7)
simple_boundary  = (0.2,0.45)

In [3]:
import copy
from src.dataset_processor import ContextualGenerationData
import numpy as np
def extendData(data: ContextualGenerationData):
    """Return three deep copies of ``data``, each tagged with a random split position.

    One split position is drawn per module-level range (``extreme_boundary``,
    ``moderate_boundary``, ``simple_boundary``) and scaled by the number of
    whitespace-separated words left in ``data.input`` once the literal
    "[SEP]" marker has been removed. The three positions are printed.

    Returns:
        ``[simple, moderate, extreme]`` — copies of ``data`` whose ``boundary``
        attribute holds the corresponding split position.
    """

    def _draw_split(bounds, word_count):
        # Uniform fraction inside ``bounds``, scaled to a rounded word index.
        fraction = np.random.uniform(low=bounds[0], high=bounds[1], size=(1,))
        return round((fraction * word_count)[0])

    # Splitting on whitespace after dropping the marker yields the word list.
    word_count = len(data.input.replace("[SEP]", "").split())

    # Draw order (extreme, moderate, simple) determines how the RNG is consumed.
    hard_cut, medium_cut, easy_cut = (
        _draw_split(bounds, word_count)
        for bounds in (extreme_boundary, moderate_boundary, simple_boundary)
    )

    print(hard_cut, medium_cut, easy_cut)

    variants = []
    for cut in (easy_cut, medium_cut, hard_cut):
        clone = copy.deepcopy(data)
        clone.boundary = cut
        variants.append(clone)
    return variants

In [6]:
# Build the three boundary variants (simple, moderate, extreme) of the first training record.
cc=extendData(train_data_packet[0])

209 165 66


In [16]:
# Split the input text of the first variant into sentences with NLTK.
pp=nltk.tokenize.sent_tokenize(cc[0].input)

In [7]:
from dataclasses import dataclass

import random


In [8]:
from nltk.util import ngrams
def expand_data_ngram(data: ContextualGenerationData,n_count= 2):
    """Create copies of ``data``, each focused on a random sentence n-gram.

    ``data.input`` is split into sentences with NLTK. Random single sentences
    and random runs of 2, 3 and 4 consecutive sentences are then sampled, and
    each sample is stored on a deep copy of ``data`` as ``focus_txt``.

    Args:
        data: Record whose ``input`` text is sampled.
        n_count: Currently unused; kept so existing callers keep working.

    Returns:
        A list of deep copies of ``data``. ``focus_txt`` is a ``str`` for the
        single-sentence samples and a tuple of sentences for the n-gram
        samples. Passages with too few sentences simply yield fewer (possibly
        zero) copies instead of raising.
    """

    def _sample_up_to(population, k):
        # random.sample raises ValueError when k exceeds the population size,
        # which happens for short passages (e.g. a 4-sentence text has only
        # one 4-gram), so clamp the request to what is available.
        return random.sample(population, min(k, len(population)))

    sentences = nltk.tokenize.sent_tokenize(data.input)

    # Two or three single sentences.
    unigram_texts = _sample_up_to(sentences, random.choice([2, 3, 3, 2]))

    # Two runs of two consecutive sentences.
    bi_grams = _sample_up_to(list(ngrams(sentences, 2)), 2)

    # Two runs of three consecutive sentences.
    tri_grams = _sample_up_to(list(ngrams(sentences, 3)), 2)

    # One or two runs of four consecutive sentences.
    four_grams = _sample_up_to(
        list(ngrams(sentences, 4)), random.choice([2, 1, 2])
    )

    examples = unigram_texts + bi_grams + tri_grams + four_grams

    data_pack = []
    for exam in examples:
        d = copy.deepcopy(data)
        d.focus_txt = exam
        data_pack.append(d)
    return data_pack
    
    
    

In [9]:
# Build the sentence n-gram focus variants of the first training record.
ff = expand_data_ngram(train_data_packet[0])

In [11]:
ff[0]



In [None]:
import logging
from typing import List
from src.model_utils import Features
from torch.utils.data import Dataset
from transformers import AutoTokenizer
# Module-level logger; the dataset classes in this notebook use it to report the training mode.
logger = logging.getLogger(__name__)
class ContextGenerationDatasetPicks(Dataset):
    """Dataset that converts ``ContextualGenerationData`` records into ``Features``.

    Each record's input text is split at a word position (``data.boundary``,
    or a random position when that value is negative), the context separator
    is inserted at the split, and the result is tokenized.

    Args:
        tokenizer: Callable tokenizer that also exposes ``get_vocab()``; the
            vocabulary must contain ``context_seperator``.
        nb_records: Initial value reported by ``len()``; replaced by the real
            count once records are set or added.
        max_len: Token-count threshold used by ``procesTexts``.
        use_random_restrictive: Stored on the instance; not read by this class.
        context_seperator: Separator text inserted at the split position.
        use_special_token: Forwarded as ``add_special_tokens`` when the
            composed input is tokenized.
        is_auto_encoder_data: When true the label is the cleaned input text,
            otherwise it is ``data.output``.
        section_boundary: ``(low, high)`` fractions of the word count used to
            draw a random split position when ``data.boundary`` is negative.
    """

    def __init__(
        self,
        tokenizer: AutoTokenizer,
        nb_records: int = 1,
        max_len=700,
        use_random_restrictive: bool = False,
        context_seperator: str = "[SEP]",
        use_special_token: bool = True,
        is_auto_encoder_data: bool = True,
        section_boundary=(0.25, 0.70),
    ) -> None:
        super().__init__()
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.nb_records = nb_records
        self.is_records_set = False
        self.use_random_restrictive = use_random_restrictive
        # procesTexts reads this attribute for records with a negative
        # boundary; it was previously never set, which raised AttributeError.
        self.section_boundary = section_boundary
        self.data: List[ContextualGenerationData] = []
        self.context_seperator = context_seperator
        # Vocabulary id of the separator, used to locate it in tokenized input.
        self._context_delimiter_id = self.tokenizer.get_vocab()[self.context_seperator]
        self.use_special_token = use_special_token
        self._is_auto_encoder_data = is_auto_encoder_data

        if self._is_auto_encoder_data:
            logger.info("The model will be trained as an auto-encoder")
        else:
            logger.info("The model will be trained as a non auto-encoder")

        # Since we will be mainly training, we will set it to 1, during inference, we will set it to 2
        self.change_data_mode(1)

    def __len__(
        self,
    ):
        """Return the current record count."""
        return self.nb_records

    def set_record(self, data):
        """Replace the stored records with ``data`` and refresh the count."""
        self.data = data
        self.nb_records = len(self.data)

    def add_record(self, row):
        """Append one record and refresh the count."""
        self.data.append(row)
        self.nb_records = len(self.data)

    def __getitem__(self, index):
        """Return the ``Features`` built from the record at ``index``."""
        return self.procesTexts(self.data[index])

    def change_data_mode(self, mode=1):
        """Set ``self.mode``: false for ``mode <= 1`` (labels are produced),
        true otherwise (labels are left empty)."""
        self.mode = mode > 1

    def _sent_tokenize(self, text, focus):
        """Placeholder for focus-aware sentence tokenization.

        The original cell held only this method's header (no colon, no body),
        which made the whole class a SyntaxError. The intended behaviour is
        not shown anywhere in the notebook, so this stays an explicit stub.
        """
        raise NotImplementedError("_sent_tokenize has not been implemented yet")

    def procesTexts(self, data: ContextualGenerationData):
        """Tokenize one record into a ``Features`` instance.

        Args:
            data: Record providing ``input``, ``boundary`` and, when the
                dataset is not in auto-encoder mode, ``output``.

        Returns:
            ``Features`` carrying the tokenized composed input, the split
            position, and the tokenized label (empty lists when ``self.mode``
            is true).
        """
        passage = data.input
        # Drop any literal "[SEP]" marker and collapse runs of whitespace.
        clean_passage = " ".join(passage.replace("[SEP]", "").strip().split()).strip()
        passage_sentence_tokenized = clean_passage.strip().split()
        nb_words = len(passage_sentence_tokenized)

        section_point = data.boundary
        if section_point < 0:
            # No usable boundary on the record: draw a fraction of the word
            # count uniformly from self.section_boundary instead.
            section_point = round(
                (
                    np.random.uniform(
                        size=(1,),
                        low=self.section_boundary[0],
                        high=self.section_boundary[1],
                    )
                    * nb_words
                )[0]
            )

        # Re-join the words with the separator placed at the split position.
        composed_input = (
            " ".join(passage_sentence_tokenized[:section_point])
            + f" {self.context_seperator} "
            + " ".join(passage_sentence_tokenized[section_point:])
        )

        label_text = clean_passage if self._is_auto_encoder_data else data.output
        # apply the tokenizer to convert the texts to the appropriate input
        if not self.mode:
            label_pack = self.tokenizer(
                label_text,
                return_tensors="pt",
            )
            label_seq = label_pack["input_ids"].flatten()
            label_attention = label_pack["attention_mask"].flatten()

        passage_pack = self.tokenizer(
            composed_input,
            add_special_tokens=self.use_special_token,
            return_tensors="pt",
        )

        passage_seq = passage_pack["input_ids"].flatten()
        passage_attention = passage_pack["attention_mask"].flatten()

        num_tokens = passage_seq.shape[-1]

        if num_tokens > self.max_len:
            # Index of the first separator token in the tokenized input.
            delimiter_points = passage_seq == self._context_delimiter_id
            delimiter_points_idx = delimiter_points.nonzero(as_tuple=True)[-1][0]
            if delimiter_points_idx > self.max_len:
                # The separator sits beyond max_len, so an extra separator id
                # (with attention 1) is prepended to the sequence.
                passage_seq = torch.concat(
                    [torch.Tensor([self._context_delimiter_id]).long(), passage_seq]
                )
                passage_attention = torch.concat(
                    [torch.Tensor([1]).long(), passage_attention]
                )

        if not self.mode:
            return Features(
                input_ids=passage_seq,
                attention_mask=passage_attention,
                labels=label_seq,
                decoder_attention_mask=label_attention,
                section_point=section_point,
            )
        else:
            return Features(
                input_ids=passage_seq,
                attention_mask=passage_attention,
                labels=[],
                decoder_attention_mask=[],
                section_point=section_point,
            )


In [None]:
class ContextGenerationDatasetBoundary(Dataset):
    """Dataset producing ``Features`` from ``ContextualGenerationData`` records.

    Every record's input text is cut at a word position, the context
    separator is inserted at the cut, and the composed text is tokenized. The
    cut is ``data.boundary`` when that value is non-negative; otherwise it is
    drawn at random from ``section_boundary``.
    """

    def __init__(
        self,
        tokenizer: AutoTokenizer,
        nb_records: int = 1,
        max_len=700,
        section_boundary=(0.25, 0.70),
        use_random_restrictive: bool = False,
        context_seperator: str = "[SEP]",
        use_special_token: bool = True,
        is_auto_encoder_data: bool = True,
    ) -> None:
        super().__init__()

        # Record storage and bookkeeping.
        self.data: List[ContextualGenerationData] = []
        self.nb_records = nb_records
        self.is_records_set = False

        # Tokenization settings.
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.use_special_token = use_special_token
        self.context_seperator = context_seperator
        self._context_delimiter_id = self.tokenizer.get_vocab()[self.context_seperator]

        # Split-position and labelling settings.
        self.section_boundary = section_boundary
        self.use_random_restrictive = use_random_restrictive
        self._is_auto_encoder_data = is_auto_encoder_data

        logger.info(
            "The model will be trained as an auto-encoder"
            if self._is_auto_encoder_data
            else "The model will be trained as a non auto-encoder"
        )

        # Training is the common case (mode 1); inference callers switch to mode 2.
        self.change_data_mode(1)

    def __len__(self):
        """Number of records currently tracked."""
        return self.nb_records

    def set_record(self, data):
        """Swap in a new record collection and update the count."""
        self.data = data
        self.nb_records = len(self.data)

    def add_record(self, row):
        """Append a single record and update the count."""
        self.data.append(row)
        self.nb_records = len(self.data)

    def __getitem__(self, index):
        """Build the ``Features`` for the record stored at ``index``."""
        return self.procesTexts(self.data[index])

    def change_data_mode(self, mode=1):
        """Store whether ``mode`` is greater than 1; labels are skipped when it is."""
        self.mode = mode > 1

    def procesTexts(self, data: ContextualGenerationData):
        """Compose, tokenize and package one record.

        Returns a ``Features`` object with the tokenized composed input, the
        word index used for the cut, and either the tokenized label or empty
        lists when ``self.mode`` is true.
        """
        # Remove the literal marker and normalise whitespace.
        clean_passage = " ".join(data.input.replace("[SEP]", "").strip().split()).strip()
        words = clean_passage.split()

        split_at = data.boundary
        if split_at < 0:
            # Random cut: uniform fraction of the word count.
            fraction = np.random.uniform(
                low=self.section_boundary[0],
                high=self.section_boundary[1],
                size=(1,),
            )
            split_at = round((fraction * len(words))[0])

        head = " ".join(words[:split_at])
        tail = " ".join(words[split_at:])
        composed_input = head + f" {self.context_seperator} " + tail

        if self._is_auto_encoder_data:
            label_text = clean_passage
        else:
            label_text = data.output

        # Labels stay empty unless the dataset is in training mode.
        label_ids, label_mask = [], []
        if not self.mode:
            encoded_label = self.tokenizer(label_text, return_tensors="pt")
            label_ids = encoded_label["input_ids"].flatten()
            label_mask = encoded_label["attention_mask"].flatten()

        encoded_passage = self.tokenizer(
            composed_input,
            add_special_tokens=self.use_special_token,
            return_tensors="pt",
        )
        input_ids = encoded_passage["input_ids"].flatten()
        attention = encoded_passage["attention_mask"].flatten()

        if input_ids.shape[-1] > self.max_len:
            # Position of the first separator token in the sequence.
            separator_hits = input_ids == self._context_delimiter_id
            first_separator = separator_hits.nonzero(as_tuple=True)[-1][0]
            if first_separator > self.max_len:
                # Separator lies past max_len: prepend one more separator id.
                input_ids = torch.concat(
                    [torch.Tensor([self._context_delimiter_id]).long(), input_ids]
                )
                attention = torch.concat([torch.Tensor([1]).long(), attention])

        return Features(
            input_ids=input_ids,
            attention_mask=attention,
            labels=label_ids,
            decoder_attention_mask=label_mask,
            section_point=split_at,
        )


In [12]:
import numpy as np