In [1]:
# !pip install pandas
# !pip install arxivscraper
# !pip install transformers 
# !pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html
# !pip install scikit-learn
# !pip install gpt_2_simple
# !pip install sentencepiece

In [2]:
import arxivscraper as ax
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import T5ForConditionalGeneration,T5Tokenizer
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
import torch
import warnings

warnings.filterwarnings('ignore')

In [3]:
# scraper = ax.Scraper(category='cs', t=10, filters={'categories':['cs.AI']})
# output = scraper.scrape()

# cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated', 'authors')
# df = pd.DataFrame(output,columns=cols)

# df.to_csv("arxivData.csv", index=False)

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df = pd.read_csv("arxivData.csv")
data = df[["title", "abstract"]]

tokenizer = AutoTokenizer.from_pretrained("Callidior/bert2bert-base-arxiv-titlegen")
model = AutoModelForSeq2SeqLM.from_pretrained("Callidior/bert2bert-base-arxiv-titlegen")

model1 = T5ForConditionalGeneration.from_pretrained("Michau/t5-base-en-generate-headline")
tokenizer1 = T5Tokenizer.from_pretrained("Michau/t5-base-en-generate-headline")
model1 = model1.to(device)

tokenizer2 = GPT2Tokenizer.from_pretrained("gpt2")
model2 = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [5]:
def GenTitlePreTrainedSeq2Seq(df):
    encoding = tokenizer(df["abstract"], return_tensors="pt")
    input_ids = encoding["input_ids"]
    attention_masks = encoding["attention_mask"]

    sample_output = model.generate(
        input_ids=input_ids, 
        max_length=40,
        num_beams=5, 
        temperature=0.7,
        top_k=50, 
        top_p=0.95, 
        early_stopping=True,
        do_sample=True
    )

    title = tokenizer.decode(sample_output[0], skip_special_tokens=True)
    return title

def GenTitlePreTrainedT5(df):
    text =  "headline: " + df["abstract"]
    
    encoding = tokenizer1.encode_plus(text, return_tensors = "pt")
    input_ids = encoding["input_ids"].to(device)
    attention_masks = encoding["attention_mask"].to(device)

    beam_outputs = model1.generate(
        input_ids = input_ids, 
        attention_mask = attention_masks,
        max_length=40,
        num_beams=5, 
        no_repeat_ngram_size=2, 
        temperature=0.7,
        top_k=50, 
        top_p=0.95, 
        early_stopping=True,
        do_sample=True
    )
    
    result = tokenizer1.decode(beam_outputs[0], skip_special_tokens=True)
    return result

def preprocess(PreList):
    for i in PreList:
        try:
            PreList[PreList.index(i)] = i.split(' | ')[0]       
        except:
            continue
        try:
            PreList[PreList.index(i)] = i.split(' - ')[0]
        except:
            continue
        try:
            PreList[PreList.index(i)] = i.split(' — ')[0]
        except:
            continue 
            
    return PreList

def titleTotext(text):
    input_ids = tokenizer2.encode(text["title"], return_tensors='tf')
    greedy_output = model2.generate(input_ids, 
                                    max_length=150,
                                    num_beams=5, 
                                    no_repeat_ngram_size=2, 
                                    temperature=0.7,
                                    top_k=50, 
                                    top_p=0.95, 
                                    early_stopping=True,
                                    do_sample=True)

    result = tokenizer2.decode(greedy_output[0], skip_special_tokens=True)
    return result

In [6]:
titleSaveSeq = []
titleSavet5 = []
OriginalT = []
OriginalText = []
TitleToText = []

for i,j in data.sample(3).iterrows():
    Seq = GenTitlePreTrainedSeq2Seq(j)
    titleSaveSeq.append(Seq)
    t5 = GenTitlePreTrainedT5(j)
    titleSavet5.append(t5)
    T2T = titleTotext(j)
    TitleToText.append(T2T)
    OriginalT.append(j["title"])
    OriginalText.append(j["abstract"])

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


In [7]:
titleSaveSeqProc = preprocess(titleSaveSeq)
for i,j,k,l in zip(titleSaveSeqProc, titleSavet5, OriginalT, OriginalText):
    print("\nOriginal Text: -")
    print(l)
    print("\nOriginal Title: -")
    print(k)
    print("\nTitle From Seq2Seq Model: -")
    print(i)
    print("\nTitle From T5 Model: -")
    print(j)
    print("-"*75)


Original Text: -
controllers for autonomous systems that operate in safety-critical settings must account for stochastic disturbances. such disturbances are often modelled as process noise, and common assumptions are that the underlying distributions are known and/or gaussian. in practice, however, these assumptions may be unrealistic and can lead to poor approximations of the true noise distribution. we present a novel planning method that does not rely on any explicit representation of the noise distributions. in particular, we address the problem of computing a controller that provides probabilistic guarantees on safely reaching a target. first, we abstract the continuous system into a discrete-state model that captures noise by probabilistic transitions between states. as a key contribution, we adapt tools from the scenario approach to compute probably approximately correct (pac) bounds on these transition probabilities, based on a finite number of samples of the noise. we capture

In [8]:
for i,j in zip(OriginalText, TitleToText):
    print("\nOriginal Text: -")
    print(i)
    print("*"*75)
    print("\nCreated from Title text: -")
    print(j)
    print("-"*75)


Original Text: -
controllers for autonomous systems that operate in safety-critical settings must account for stochastic disturbances. such disturbances are often modelled as process noise, and common assumptions are that the underlying distributions are known and/or gaussian. in practice, however, these assumptions may be unrealistic and can lead to poor approximations of the true noise distribution. we present a novel planning method that does not rely on any explicit representation of the noise distributions. in particular, we address the problem of computing a controller that provides probabilistic guarantees on safely reaching a target. first, we abstract the continuous system into a discrete-state model that captures noise by probabilistic transitions between states. as a key contribution, we adapt tools from the scenario approach to compute probably approximately correct (pac) bounds on these transition probabilities, based on a finite number of samples of the noise. we capture