In [7]:
import pandas as pd
from nltk.tokenize import sent_tokenize
import random
import numpy as np
def contextualized_texts(texts):
    """Turn one passage into sliding-window (input, output) pairs.

    The passage is split into sentences. For every boundary between two
    consecutive sentences, one pair is produced:

    * ``input``  - up to three sentences before the boundary, the literal
      ``' [SEP] '``, then up to three sentences after it;
    * ``output`` - the same "before" sentences on their own.

    Passages with fewer than three sentences yield an empty list.

    NOTE(review): ``output`` is identical to the left half of ``input`` --
    confirm this is the intended target.
    """
    sentences = sent_tokenize(texts)
    if len(sentences) < 3:
        return []

    pairs = []
    for boundary in range(1, len(sentences)):
        # Window of at most three sentences on each side of the boundary.
        preceding = ' '.join(sentences[max(0, boundary - 3):boundary])
        following = ' '.join(sentences[boundary:boundary + 3])
        pairs.append(dict(input=preceding + ' [SEP] ' + following, output=preceding))
    return pairs
            
def contextualized_data(dataset, seed=None):
    """Build a de-duplicated frame of context pairs for a collection of passages.

    Args:
        dataset: iterable of passage strings; each one is passed to
            ``contextualized_texts``.
        seed: optional seed for the down-sampling step. When ``None`` (the
            default) the module-level ``random`` state is used, exactly as
            before; when given, a private ``random.Random(seed)`` makes the
            sample reproducible.

    Returns:
        pandas.DataFrame with columns ``input`` and ``output``. At most three
        pairs are kept per passage, and rows sharing the same ``output`` are
        collapsed to the last occurrence. The two columns are present even
        when no pair was produced.
    """
    rng = random if seed is None else random.Random(seed)

    records = []
    for passage in dataset:
        pack = contextualized_texts(passage)
        if len(pack) > 3:
            # Keep three randomly chosen pairs so long passages do not dominate.
            pack = rng.sample(pack, 3)
        records += pack

    # Naming the columns explicitly keeps the schema stable for an empty
    # result; without it an empty list produces a frame with no columns.
    frame = pd.DataFrame(records, columns=["input", "output"])
    return frame.drop_duplicates(subset=["output"], keep="last")

In [8]:
# squad: train and dev frames, plus the distinct passages of each.
squad_train_data = pd.read_csv("curated_data/squad_train.csv")
squad_dev_data = pd.read_csv("curated_data/squad_dev.csv")
squad_train_text = squad_train_data["input_text"].unique()
squad_dev_text = squad_dev_data["input_text"].unique()

# drop: train and dev.
drop_train_data = pd.read_csv("curated_data/drop_train.csv")
drop_dev_data = pd.read_csv("curated_data/drop_dev.csv")
drop_train_text = drop_train_data["input_text"].unique()
drop_dev_text = drop_dev_data["input_text"].unique()

# extra_data: only a train file is read.
extra_train_data = pd.read_csv("curated_data/extra_data_train.csv")
extra_train_text = extra_train_data["input_text"].unique()

# sci: both frames are read, but only the train passages are extracted here.
sci_train_data = pd.read_csv("curated_data/sci_train.csv")
sci_dev_data = pd.read_csv("curated_data/sci_dev.csv")
sci_train_text = sci_train_data["input_text"].unique()

# rope: train and dev.
rope_train_data = pd.read_csv("curated_data/rope_train.csv")
rope_dev_data = pd.read_csv("curated_data/rope_dev.csv")
rope_train_text = rope_train_data["input_text"].unique()
rope_dev_text = rope_dev_data["input_text"].unique()

# Flat passage arrays across corpora.
train_data_list = np.concatenate(
    (extra_train_text, sci_train_text, squad_train_text, drop_train_text, rope_train_text)
)

# NOTE(review): the sci dev passages are not part of dev_data_list although
# sci_dev.csv is loaded above -- confirm this is intentional.
dev_data_list = np.concatenate((squad_dev_text, drop_dev_text, rope_dev_text))

In [78]:




# Build the context pairs corpus by corpus. The call order is kept as-is
# because contextualized_data samples with the shared `random` state.
squad_train_dataset = contextualized_data(squad_train_text)
squad_dev_dataset = contextualized_data(squad_dev_text)
drop_train_dataset = contextualized_data(drop_train_text)
drop_dev_dataset = contextualized_data(drop_dev_text)
extra_train_dataset = contextualized_data(extra_train_text)

# The sci dev passages are extracted here rather than in the loading cell.
sci_dev_text = sci_dev_data["input_text"].unique()
sci_train_dataset = contextualized_data(sci_train_text)
sci_dev_dataset = contextualized_data(sci_dev_text)

rope_train_dataset = contextualized_data(rope_train_text)
rope_dev_dataset = contextualized_data(rope_dev_text)

In [81]:
# Display the shape of the concatenated dev passage array.
dev_data_list.shape

(6642,)

In [4]:
# Stack the per-corpus frames row-wise. The original row labels are kept, so
# index values repeat across corpora.
train_data = pd.concat(
    [
        squad_train_dataset,
        drop_train_dataset,
        extra_train_dataset,
        sci_train_dataset,
        rope_train_dataset,
    ],
    axis=0,
)
dev_data = pd.concat(
    [squad_dev_dataset, drop_dev_dataset, sci_dev_dataset, rope_dev_dataset],
    axis=0,
)


In [112]:
# Persist both splits as CSV. to_csv is called with its defaults, so the
# DataFrame index is written as an unnamed first column.
train_data.to_csv('processed_data/context_generation_train.csv')
dev_data.to_csv('processed_data/context_generation_dev.csv')

In [14]:
import json

def strip_newline(value):
    """Replace every line break in ``value`` with a single space.

    The string is split with ``str.splitlines`` and the pieces are re-joined
    with ``' '``; a trailing line break therefore disappears, and an empty
    line becomes two adjacent spaces.
    """
    pieces = value.splitlines()
    return ' '.join(pieces)
def _load_plots(path):
    """Return the 'plot' field of every record in a JSON file, line breaks removed.

    The file is read inside a ``with`` block so its handle is closed as soon
    as parsing finishes; the previous inline ``open(...)`` calls were never
    closed explicitly.
    """
    with open(path, encoding="utf-8") as handle:
        return [strip_newline(d['plot']) for d in json.load(handle)]


rc_test = _load_plots('curated_data//ParaphraseRC_test.json')
rc_train = _load_plots('curated_data//ParaphraseRC_tr.json')
rc_dev = _load_plots('curated_data//ParaphraseRC_dev.json')

In [28]:
# NOTE(review): len() without an argument raises TypeError. The argument that
# produced the recorded output is no longer in the cell -- restore it or
# delete the cell.
len()

13

In [15]:
import re
from nltk import ngrams
def normalize_whitespace(string):
    """Collapse each run of one repeated whitespace character to a single one.

    Only repetitions of the *same* character are collapsed (the pattern uses
    a back-reference), so a space followed by a tab is left untouched.
    """
    repeated_whitespace = re.compile(r"(\s)\1{1,}")
    return repeated_whitespace.sub(r"\1", string)


def cleanDocument(document):
    """Strip citation markers and list-style prefixes, then flatten whitespace.

    Steps, in order:
      1. remove bracketed numbers such as ``[12]`` (one optional whitespace
         character before the closing bracket);
      2. remove enumerator-like tokens followed by whitespace: a digit and a
         dot, a lowercase letter and ``)``, a bullet, an uppercase letter and
         a dot, or a run of I/V/X and a dot;
      3. delete newline characters (nothing is inserted in their place);
      4. collapse repeated whitespace and trim both ends.

    NOTE(review): the patterns in step 2 are not anchored to a line or token
    start, so they also match inside running text (e.g. a sentence ending in
    a digit followed by '. ') -- confirm that this is acceptable.
    """
    without_citations = re.sub(r"\[\d+\s?\]", "", document)
    without_markers = re.sub(
        r"(\d\.\s+|[a-z]\)\s+|•\s+|[A-Z]\.\s+|[IVX]+\.\s+)", "", without_citations
    )
    single_line = without_markers.replace("\n", "")
    return normalize_whitespace(single_line).strip()
def collateData(data):
    """Build identity (input == output) examples from a collection of documents.

    Each document is cleaned with ``cleanDocument`` and split into sentences.
    Documents with more than three sentences contribute every run of three
    consecutive sentences; shorter documents contribute all their sentences
    joined into one example.

    Args:
        data: iterable of document strings.

    Returns:
        pandas.DataFrame with columns ``input`` and ``output`` (identical
        text in both), de-duplicated on ``output`` keeping the last
        occurrence. The two columns are present even when no example was
        produced.
    """
    records = []
    for texts in data:
        sent_list = sent_tokenize(cleanDocument(texts))
        if not sent_list:
            # Nothing is left after cleaning; skipping avoids emitting an
            # example whose input and output are both the empty string.
            continue

        if len(sent_list) > 3:
            sentences_pack = [' '.join(window) for window in ngrams(sent_list, 3)]
        else:
            sentences_pack = [' '.join(sent_list)]

        records.extend(dict(input=ss, output=ss) for ss in sentences_pack)

    # Naming the columns explicitly keeps the schema stable for an empty
    # result; without it an empty list produces a frame with no columns.
    frame = pd.DataFrame(records, columns=["input", "output"])
    return frame.drop_duplicates(subset=["output"], keep="last")

In [None]:
# Preview the (input, output) pairs built from the first rc_train plot.
contextualized_texts(rc_train[0])

In [17]:
# Training split: all training plots, the curated training passages, and
# every dev plot together with the first 800 test plots.
rc_train_dataset = pd.concat(
    [
        collateData(rc_train),
        collateData(train_data_list),
        collateData(rc_dev + rc_test[:800]),
    ],
    axis=0,
)

# Dev split: the remaining test plots (index 800 onward) and the curated dev
# passages.
rc_dev_dataset = pd.concat(
    [collateData(rc_test[800:]), collateData(dev_data_list)],
    axis=0,
)

In [23]:
# Write the collated splits under processed_data_new/ (a different directory
# from the processed_data/ files written earlier in this notebook). to_csv
# is called with its defaults, so the index is written as the first column.
rc_train_dataset.to_csv('processed_data_new/context_generation_train.csv')
rc_dev_dataset.to_csv('processed_data_new/context_generation_dev.csv')

In [19]:
# Spot-check: sentence-split one entry taken from the last 800 rows of the
# dev split.
sent_tokenize(rc_dev_dataset.input.values[-800:][500])

['A 2nd grade classroom photo was held today.',
 'They all gathered by age.',
 'Jeremy was the oldest.']

In [86]:
# Collate the curated dev passages on their own.
# NOTE(review): the same call already feeds rc_dev_dataset, and dev_dataset is
# not referenced again in the visible cells -- confirm it is still needed.
dev_dataset = collateData(dev_data_list)

In [21]:
# Display the (rows, columns) shape of both collated splits.
rc_dev_dataset.shape,rc_train_dataset.shape

((28957, 2), (526386, 2))

In [53]:
# Spot-check: sentence-split the first training input.
sent_tokenize(rc_train_dataset.input.values[0])

['Set in the second half of the 22nd century, Mars has been 84% terraformed, allowing humans to walk on the surface without pressure suits.',
 'Martian society has become matriarchal, with women in most positions of authority.',
 'The story concerns police officer Melanie Ballard (Natasha Henstridge), second in command of a team alongside Sergeant Jericho (Jason Statham) sent to a remote mining outpost to transport prisoner Desolation Williams (Ice Cube).',
 'Arriving at the remote mining town, Ballard finds all of the people missing.',
 'She learns that they had discovered an underground doorway created by an ancient Martian civilization.']

In [55]:
# Drop every row that contains a missing value. inplace=True mutates
# rc_train_dataset itself, so cells run before and after this one see
# different contents under the same name.
rc_train_dataset.dropna(axis=0,inplace=True)

In [24]:
# Display the (rows, columns) shape of the training split.
rc_train_dataset.shape

(526386, 2)

In [25]:
import torch

In [26]:
# Scratch tensor for ad-hoc experimentation.
x= torch.Tensor([1,42,3,56])

In [None]:
# NOTE(review): incomplete comparison -- the right-hand operand is missing, so
# this cell is a SyntaxError. Complete the expression or delete the cell.
x==

In [91]:
import re
# URL-matching experiment: every match of the pattern is replaced by "#url#".
pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

# Sample URL with stray spaces inside it.
sent = "http://classes. midlandstech.edu/carterp/courses/bio225/chap05/lecture1. htm."
re.compile(pattern).sub("#url#", sent)

'#url#. #url#. htm.'

In [44]:
# Python code to find the URL from an input string
# Using the regular expression
import re


def Find(string):
    """Return every URL-like substring of ``string``, in order of appearance.

    A candidate starts with ``http://`` / ``https://``, a ``www.``-style
    prefix, or a ``name.tld/`` prefix, and may not end in common trailing
    punctuation.
    """
    url_pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    # Group 1 spans the whole URL; the nested groups only serve to balance
    # parentheses inside it.
    return [match.group(1) for match in re.finditer(url_pattern, string)]


# Driver code: run Find on a sample sentence that contains two URLs and print
# the list it returns.
string = 'My Profile: https://auth.geeksforgeeks.org/user/Chinmoy%20Lenka/articles in the portal of https://www.geeksforgeeks.org/'
print("Urls: ", Find(string))


In [1]:
from transformers import EncoderDecoderModel

In [2]:
from library import ContextualizedSentenceTransformer
import torch

In [3]:
# Instantiate the project's ContextualizedSentenceTransformer with the
# all-mpnet-base-v2 checkpoint name.
context_sentence_transformer = ContextualizedSentenceTransformer(model_name='sentence-transformers/all-mpnet-base-v2')

In [1]:
from transformers import Seq2SeqTrainer, BertGenerationDecoder
from transformers.models.bart.modeling_bart import BartDecoder
from transformers import BartTokenizer
import torch.nn as nn

In [2]:
# Load the tokenizer of the facebook/bart-base checkpoint.
tokeniser = BartTokenizer.from_pretrained('facebook/bart-base')

In [3]:
# Display the tokenizer's separator token.
tokeniser.sep_token

'</s>'

In [None]:

# Load a BartDecoder from the facebook/bart-base checkpoint.
# NOTE(review): `decoder` is not referenced again in the visible cells --
# confirm it is still needed.
decoder = BartDecoder.from_pretrained('facebook/bart-base')

In [None]:
class ContextualGenerator(nn.Module):
    """Pairs a contextualized sentence encoder with a caller-supplied decoder.

    Args:
        decoder_model: decoder module; stored on the instance as
            ``_decoder_model``.
        sentence_transformer_model: model name forwarded to
            ``ContextualizedSentenceTransformer`` (built with
            ``clean_context=True``).
    """

    def __init__(self, decoder_model, sentence_transformer_model='sentence-transformers/all-mpnet-base-v2') -> None:
        super().__init__()
        self._context_sentence_encoder = ContextualizedSentenceTransformer(
            model_name=sentence_transformer_model,
            clean_context=True,
        )
        self._decoder_model = decoder_model

    def forward(self, input_seq,
                attention_mask,
                label,
                decoder_attention_mask):
        """Forward pass -- not implemented yet.

        The definition previously ended after the parameter list (no colon,
        no body), which made the whole cell a SyntaxError. It is now a valid
        stub that fails loudly until the encoder-to-decoder wiring is
        written.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError(
            "ContextualGenerator.forward has not been implemented yet"
        )
        
        

In [127]:
# Wrap the sentence encoder in a Seq2SeqTrainer. No training arguments or
# datasets are passed.
m_trainer = Seq2SeqTrainer(model=context_sentence_transformer,)

In [128]:
# Save through the trainer. Per the message logged below, the model is not a
# PreTrainedModel, so only its state dict is written.
m_trainer.save_model('context_encoder_model/encoder_model')

Saving model checkpoint to context_encoder_model/encoder_model
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


In [132]:
from transformers import MPNetConfig
# Fetch the MPNet configuration published with the all-mpnet-base-v2
# checkpoint.
config = MPNetConfig.from_pretrained('sentence-transformers/all-mpnet-base-v2')

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

loading configuration file config.json from cache at /home/nlplab/.cache/huggingface/hub/models--sentence-transformers--all-mpnet-base-v2/snapshots/bd44305fd6a1b43c16baf96765e2ecb20bca8e1d/config.json
Model config MPNetConfig {
  "_name_or_path": "microsoft/mpnet-base",
  "architectures": [
    "MPNetForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "mpnet",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "relative_attention_num_buckets": 32,
  "transformers_version": "4.23.1",
  "vocab_size": 30527
}



In [134]:
# Write config.json into the directory that already holds the saved encoder
# state dict. That directory is later passed as the encoder path to
# EncoderDecoderModel.from_encoder_decoder_pretrained.
config.save_pretrained('context_encoder_model/encoder_model/')

Configuration saved in context_encoder_model/encoder_model/config.json


In [124]:
# state_dict is a method and must be called: passing the bound method itself
# makes torch.save serialize the method object rather than the mapping of
# parameter names to tensors.
# NOTE(review): torch.save does not write JSON, so the ".json" suffix is
# misleading. The path is kept unchanged so existing readers still find it.
torch.save(context_sentence_transformer.state_dict(), 'context_encoder_model/context_encoder.json')

In [None]:
# NOTE(review): incomplete assignment -- there is no right-hand side, so this
# cell is a SyntaxError. Supply the value or delete the cell.
decoder_model = 

In [None]:
# Assemble an encoder-decoder model: the encoder is read from the local
# directory written by the save cells of this notebook, the decoder from the
# 'roberta-base' checkpoint.
# NOTE(review): other cells load a BART tokenizer and decoder while this one
# uses roberta-base -- confirm which decoder is intended.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(encoder_pretrained_model_name_or_path='context_encoder_model/encoder_model',
                                                            decoder_pretrained_model_name_or_path='roberta-base'
                                                            )

In [147]:
# Tokenize a single sample in which '</s>' separates two text segments.
# NOTE(review): the meaning of the second positional argument (True) is
# defined by ContextualizedSentenceTransformer.tokenize, which is not visible
# here.
chat_data=["Hello world </s> Mate was there last week"]
cc= context_sentence_transformer.tokenize(chat_data,True)

In [152]:
# Inspect the signature and docstring of the encoder's forward method.
help(model.encoder.forward)

Help on method forward in module transformers.models.mpnet.modeling_mpnet:

forward(input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.FloatTensor] = None, position_ids: Optional[torch.LongTensor] = None, head_mask: Optional[torch.FloatTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, **kwargs) -> Union[Tuple[torch.Tensor], transformers.modeling_outputs.BaseModelOutputWithPooling] method of transformers.models.mpnet.modeling_mpnet.MPNetModel instance
    The [`MPNetModel`] forward method, overrides the `__call__` special method.
    
    <Tip>
    
    Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]
    instance afterwards instead of this since the former takes care of running the pre and post processing steps while
    the latter silently ignores them.
    
   

In [148]:
# Run the encoder on the tokenized sample. clean_context is passed as an
# extra keyword; the forward signature printed by help() accepts **kwargs.
dat=model.encoder(**cc,clean_context=True)

In [149]:
# Display the shape of the encoder's last hidden state.
dat['last_hidden_state'].shape

torch.Size([1, 8, 768])

In [2]:
from datasets import load_dataset

# Load configuration '3.0.0' of the cnn_dailymail dataset.
dataset = load_dataset("cnn_dailymail",'3.0.0')

Downloading and preparing dataset cnn_dailymail/3.0.0 to /home/nlplab/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /home/nlplab/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Display the first example of the train split.
dataset['train'][0]

{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office char