# Imports

In [1]:
import pandas as pd
import numpy as np
import torch
import os
import json
from transformers import BartTokenizer, BartModel, BartForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from itertools import islice
import math

In [3]:
import torch
device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

# Get Dataset from arXiv and PubMed(10 each)

In [4]:
def get_data_from_arXiv_pubMed():
    content={}
    articles={}
    with open("Dataset/arxiv-dataset/test.txt","r") as w:
        for index,line in enumerate(w.readlines()):
            if index==1000:
                break
            content=json.loads(line)
            articles[content["article_id"]]=content
    pubmedArticles={}
    with open("Dataset/pubmed-dataset/test.txt","r") as w:
        for index,line in enumerate(w.readlines()):
            if index==1000:
                break
            content=json.loads(line)
            pubmedArticles[content["article_id"]]=content

    return articles, pubmedArticles


### GET MODELS (FINE TUNED and VANILLA)

In [5]:
from transformers import BartTokenizer, BartModel, BartForConditionalGeneration

def get_model_and_tokenizer(modelName='facebook/bart-large-cnn'):
    tokenizer = BartTokenizer.from_pretrained(modelName)
    model = BartForConditionalGeneration.from_pretrained(modelName).to(device)

    return model, tokenizer

    
    

In [6]:
def get_fine_tuned_model(modelName="BART-E5-ALL.pth"):
    model=BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)
    model.load_state_dict(torch.load(modelName))
    return model

In [7]:
def section_text_cleaning(sectionText):
    section_text=[]
    math_mappings={}
    for section in sectionText:
        cleaned_text=[]
        for i in section:
            text=i.replace("@xcite","")
            for j in text.split():
                if "@xmath" in j:
                    if j in math_mappings:
                        text=text.replace(j,math_mappings[j])
                    else:
                        math_mappings[j]="[equation"+str(len(math_mappings))+"]"
                        text=text.replace(j,math_mappings[j])
            cleaned_text.append(text)
        section_text.append(cleaned_text)
    return section_text

In [8]:
def doc_text_cleaning(docText):
    cleaned_text=[]
    math_mappings={}
    for i in docText:   
        text=i.replace("@xcite","")
        for j in text.split():
            if "@xmath" in j:
                if j in math_mappings:
                    text=text.replace(j,math_mappings[j])
                else:
                    math_mappings[j]="[equation"+str(len(math_mappings))+"]"
                    text=text.replace(j,math_mappings[j])
        cleaned_text.append(text)
        
    return cleaned_text

In [9]:
import math
def handle_size(tokens,default=1000,percent=0.2):
    chunked_ip=[]
    
    #print(tokens.shape[1]/(default-(default*percent)))
    for index in range(math.ceil(tokens.shape[1]/(default))):
        if index==0:
            chunked_ip.append(tokens[ : ,0:default])
        else:
            st_index=index*math.floor(default-(default*percent))
            
            
            chunked_ip.append(tokens[ : ,st_index: st_index+default])
    
    return chunked_ip

In [10]:
def create_summary_from_sections(section_texts,model,tokenizer):
    summary=""
    for i in section_texts:
        #print("before")
        #handle size
        
        tokens=tokenizer([" ".join(i)],max_length=1000,padding="max_length",return_tensors="pt")
        #print(tokens['input_ids
        #print(tokens["input_ids"].shape[1])
        if tokens["input_ids"].shape[1]>=1024:
            chunked_ip=handle_size(tokens["input_ids"])
            for j in chunked_ip:
                #print(j.shape)
                gen_tokens=model.generate(j.to(device),num_beams=4, min_length=50,max_length=100, early_stopping=True)
                decode=tokenizer.batch_decode(gen_tokens,skip_special_tokens=True)
                summary+=" ".join(decode)+"\n"
            continue
        
        #print("error",tokens['input_ids'].shape)
        gen_tokens=model.generate(tokens["input_ids"].to(device),num_beams=4, min_length=50,max_length=100, early_stopping=True)
        decode=tokenizer.batch_decode(gen_tokens,skip_special_tokens=True)
        summary+=" ".join(decode)+"\n"
    return summary

In [11]:
import evaluate
def evaluate_summary(summary,reference):
    # Initialize the rouge metric
    rouge = evaluate.load("rouge")
    results = rouge.compute(predictions=[summary], references=[reference])
    return results



## Centrality Functions

In [12]:
from sentence_transformers import SentenceTransformer

In [13]:

from sklearn.metrics.pairwise import cosine_similarity
def create_cosine_matrix(threshold,sent_emb_map):
    
    #sentence_matrix=[]        
    # Calculate pairwise cosine similarity
    #for sentence in sentBow[cluster]:
    #print(torch.stack(list(cluster_embeddings[cluster].values())).squeeze(1).shape)
    similarity_matrix = cosine_similarity(list(sent_emb_map.values()))
    for i in range(0,similarity_matrix.shape[0]):
        for j in range(0,similarity_matrix.shape[0]):
            if similarity_matrix[i,j]>=threshold:
                similarity_matrix[i,j]=1
            else:
                similarity_matrix[i,j]=0
        # normalize row by div each element in the row with row sum
        similarity_matrix[i]=similarity_matrix[i]/sum(similarity_matrix[i])
        
    return similarity_matrix

In [14]:
import numpy as np
def compute_transition_matrix(similarity_matrix):
    #develop aperodic ,irreducible transition matrix
    #damping factor [0.1,0.2]
    d=0.1
    #uniform probality
    uniform_matrix= np.full((1, similarity_matrix.shape[0]), 1 / similarity_matrix.shape[0])
    dU=d*uniform_matrix
    d_B=(1-d)*similarity_matrix
    transition_Matrix=dU+d_B
    transition_Matrix=transition_Matrix.T
    return transition_Matrix

In [15]:
import math
def compute_centrality(transition_Matrix,threshold):
    N=transition_Matrix.shape[0]
    p0=np.full(N, 1 / N)
    t=0
    while True:
        t=t+1
        #p1=transition_Matrix*p0
        p1 = transition_Matrix @ p0
        error=np.linalg.norm(p1 - p0)
        # print("p1",p1.shape,p0.shape)
        # print("loop",t)
        if error<threshold:
            return p1
        p0=p1
    

In [16]:

def compute_lexrank(sentMatrices):
   
    t_matrix=compute_transition_matrix(sentMatrices)
    sent_rank=compute_centrality(t_matrix, 1e-6).tolist()
    return sent_rank  
        
        
        

In [17]:
# tokenizer=BertTokenizer.from_pretrained("bert-base-cased")
# model=BertModel.from_pretrained("bert-base-cased").to(device)
sent_model=SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2').to(device)
def centroidTfIdf(sentences):
    if not all(isinstance(sentence, str) for sentence in sentences):
        print("All elements in `sentences` must be strings.",sentences)
   
        
    #sentences=sent_tokenize(" ".join(clusterContent[clusterContent["cluster"]==cluster]["content"]))
    embeddings = sent_model.encode(sentences)
    #print(type(sentences))
    #print(type(embeddings))
    sent_list=dict(zip(sentences,embeddings))
    #print("passed")
  
    return sent_list


In [55]:
def getDocSents_Ranking(text):
    #compression rate
    r=0.2#0.2
    
    sent_emb_map=centroidTfIdf(text)
    sim_matrix=create_cosine_matrix(0.5,sent_emb_map)# should be 0.5
    sent_ranks=compute_lexrank(sim_matrix)
    mapped_score=tuple(zip(list(sent_emb_map.keys()),sent_ranks))
    
    sorted_scores=sorted(mapped_score,key=lambda x:x[1],reverse=True)
    n=math.ceil(len(sent_emb_map)*r)
    ranked_sents=list(dict(islice(sorted_scores,0,n)).keys())
    ordered_sents=[]
    for i in sent_emb_map:
        if i in ranked_sents:
            ordered_sents.append(i)
    count_tokens=0
    removed_sents=[]
    for i in ordered_sents:
        if len(i.split())+count_tokens > 1000:
            break
        else:
            removed_sents.append(i)
            count_tokens+=len(i.split())
    #print(removed_sents)
    return removed_sents

## Put back the @xformula and ref:

In [19]:
def replace_placeholders_text(summary,formula,refs):
    refs=dict(zip(list(refs.values()),list(refs.keys())))
    formula=dict(zip(list(formula.values()),list(formula.keys())))
    
       
    for i in formula:
        summary=summary.replace(i,formula[i])    
    return summary
    

## create summary from summaries

In [20]:
def create_summary_from_summary(section_texts,model,tokenizer):
   
    summary=""
      
    #print("before")
    #handle size
    
    tokens=tokenizer(section_texts ,max_length=1000,padding="max_length",return_tensors="pt")
    #print(tokens['input_ids
    #print("here",tokens["input_ids"][0].shape)
    if tokens["input_ids"].shape[1]>=1024:
        chunked_ip=handle_size(tokens["input_ids"],1000,0.4)
        for j in chunked_ip:
            #print(j.shape)
            gen_tokens=model.generate(j.to(device),num_beams=4, min_length=50,max_length=100, early_stopping=True)
            decode=tokenizer.batch_decode(gen_tokens,skip_special_tokens=True)
            summary+=" ".join(decode)+"\n"
        return summary
    
    #print("error",tokens['input_ids'].shape)
    gen_tokens=model.generate(tokens["input_ids"].to(device),num_beams=4, min_length=150,max_length=300, early_stopping=True)
    decode=tokenizer.batch_decode(gen_tokens,skip_special_tokens=True)
    summary+=" ".join(decode)+"\n"
    return summary

In [26]:
article="""In late September the railroads cut mail service 
in and out of Moscow, effectively decapitating the postal system and 
forcing officials to draft a fleet of trucks to move letters in and 
out of the city. What is clear is that the Post 
Office and the Railway Ministry both suffer from what ails every Russian 
venture, private and public alike: Nobody pays his bills. But oddly, real signs of public distress 
are not particularly common, perhaps because the system rarely seems 
to shed a part as big as a postal system. ``People are 
even coming to us, searching for mail that was sent weeks ago.'' Exactly why all this has 
rumbled to a halt is in some dispute. The government has specified nearly 40 categories 
of freight which the railroads must carry for next to nothing. A second airport is still 
demanding 3 million rubles for past-due debts. So much mail is backed 
up that post offices in Moscow and elsewhere have simply stopped accepting 
out-of-town mail, except for areas that can be easily reached by truck. And customers are getting angry. Air mail, which 
amounts to one of every four or five letters, was also suspended at 
one of Moscow's major airports until this week, when the Post Office 
coughed up 5 million rubles for old bills."""

In [36]:
from nltk import sent_tokenize
def sentSplitter(content):
    sentences=sent_tokenize(content)

    return sentences

In [37]:
import re
articles=[]
for cluster in os.listdir(DATA_LOCATION):
    if cluster==".DS_Store" or cluster!="27":
        continue
    
    #print(cluster)
    for index,document in enumerate(os.listdir(DATA_LOCATION+cluster)):
        content=open(DATA_LOCATION+cluster+"/"+document,"r").read()
        sents=sentSplitter(content)
        articles.append(sents)



In [38]:
DATA_LOCATION="Dataset/DUC2004/DUC-2004-Dataset-master/DUC2004_Summarization_Documents/duc2004_testdata/tasks1and2/duc2004_tasks1and2_docs/docs/"

In [19]:
References=("Dataset/DUC2004/DUC-2004-Dataset-master/reference/")
summaryReferences={}

In [20]:
import re
for reference in os.listdir(References):
    if ".txt" not in reference:
        print(reference)
    else:
        task,_=reference.split("_")
        task=re.search(r"[0-9]+",task)[0]
        with open(References+reference,"r") as f:
            summ=f.read()
            if task in summaryReferences:
                
                summaryReferences[task].append(summ)
            else:
                summaryReferences[task]=[summ]
                

In [12]:


#initialize model
model, tokenizer = get_model_and_tokenizer()


#select one article to summarize


#clean the section text
section_text=section_text_cleaning(articles)

#generate summary
summary=create_summary_from_sections(section_text,model, tokenizer)


print("summary",summary)

#evaluate summary
print(evaluate_summary(summary,[summaryReferences['27']]))




NameError: name 'articles' is not defined

# Running pipeline

In [None]:
import pickle
#get data
from evaluate import load
from tqdm import tqdm
arXiv_articles, pubMed_articles=get_data_from_arXiv_pubMed()
bertscore = load("bertscore")
bleu = evaluate.load("google_bleu")
rouge = evaluate.load("rouge")
#initialize model
model, tokenizer = get_model_and_tokenizer()

fine_model = get_fine_tuned_model()

pubMed_scores={"rouge":[],"bleu":[],"bertScore":[]}
arXiv_scores={"rouge":[],"bleu":[],"bertScore":[]}
section_summary=[]
section_fine_summary=[]
ranked_summaries=[]
ranked_fine_summaries=[]
try:
    for index,value in enumerate(tqdm(arXiv_articles)):
        article=arXiv_articles[value]
        section_text=section_text_cleaning(article['sections'])
        article_text=doc_text_cleaning(article["article_text"])
        doc_sentences=getDocSents_Ranking(article_text)
        ranked_summary=create_summary_from_summary(" ".join(doc_sentences),model, tokenizer)
        ranked_fine_summary=create_summary_from_summary(" ".join(doc_sentences),model, tokenizer)
        #generate summary
        summary=create_summary_from_sections(section_text,model, tokenizer)
        fine_tuned_summary=create_summary_from_sections(section_text,fine_model, tokenizer)
        #print("summary",summary)
        overall_summary=create_summary_from_summary([summary],model,tokenizer)
        overall_fine_summary=create_summary_from_summary([fine_tuned_summary],fine_model,tokenizer)
        #evaluate summary
        if index%50==0:
            section_fine_summary.append(overall_fine_summary)
            section_summary.append(overall_summary)
            ranked_summaries.append(ranked_summary)
            ranked_fine_summaries.append(ranked_fine_summary)
        #evaluaate
        reference=" ".join(article['abstract_text']).replace("<S>","").replace("</S>","")
        arXiv_scores["bertScore"].append(bertscore.compute(predictions=[overall_summary],references=[reference], lang="en"))
        arXiv_scores["bertScore"].append(bertscore.compute(predictions=[overall_fine_summary],references=[reference], lang="en"))
        arXiv_scores["bertScore"].append(bertscore.compute(predictions=[ranked_summary],references=[reference], lang="en"))
        arXiv_scores["bertScore"].append(bertscore.compute(predictions=[ranked_fine_summary],references=[reference], lang="en"))

        #BLEU
        arXiv_scores["bleu"].append(bleu.compute(predictions=[overall_summary],references=[reference]))
        arXiv_scores["bleu"].append(bleu.compute(predictions=[overall_fine_summary],references=[reference]))
        arXiv_scores["bleu"].append(bleu.compute(predictions=[ranked_summary],references=[reference]))
        arXiv_scores["bleu"].append(bleu.compute(predictions=[ranked_fine_summary],references=[reference]))

        #ROUGE
        arXiv_scores["rouge"].append(rouge.compute(predictions=[overall_summary],references=[reference]))
        arXiv_scores["rouge"].append(rouge.compute(predictions=[overall_fine_summary],references=[reference]))
        arXiv_scores["rouge"].append(rouge.compute(predictions=[ranked_summary],references=[reference]))
        arXiv_scores["rouge"].append(rouge.compute(predictions=[ranked_fine_summary],references=[reference]))
        if index%100==0:
            pickle.dump(arXiv_scores,open("arxiv_Scores.pickle","wb"))
            #pickle.dump(pubMed_scores,open("pubMed_scores.pickle","wb"))
            torch.cuda.empty_cache()
    for index,value in enumerate(tqdm(pubMed_articles)):
        article=pubMed_articles[value]
        section_text=section_text_cleaning(article['sections'])
        article_text=doc_text_cleaning(article["article_text"])
        doc_sentences=getDocSents_Ranking(article_text)
        ranked_summary=create_summary_from_summary(" ".join(doc_sentences),model, tokenizer)
        ranked_fine_summary=create_summary_from_summary(" ".join(doc_sentences),model, tokenizer)
        #generate summary
        summary=create_summary_from_sections(section_text,model, tokenizer)
        fine_tuned_summary=create_summary_from_sections(section_text,fine_model, tokenizer)
        #print("summary",summary)
        overall_summary=create_summary_from_summary([summary],model,tokenizer)
        overall_fine_summary=create_summary_from_summary([fine_tuned_summary],fine_model,tokenizer)
        #evaluate summary
        if index%50==0:
            section_fine_summary.append(overall_fine_summary)
            section_summary.append(overall_summary)
            ranked_summaries.append(ranked_summary)
            ranked_fine_summaries.append(ranked_fine_summary)
        #evaluaate
        reference=" ".join(article['abstract_text']).replace("<S>","").replace("</S>","")
        pubMed_scores["bertScore"].append(bertscore.compute(predictions=[overall_summary],references=[reference], lang="en"))
        pubMed_scores["bertScore"].append(bertscore.compute(predictions=[overall_fine_summary],references=[reference], lang="en"))
        pubMed_scores["bertScore"].append(bertscore.compute(predictions=[ranked_summary],references=[reference], lang="en"))
        pubMed_scores["bertScore"].append(bertscore.compute(predictions=[ranked_fine_summary],references=[reference], lang="en"))

        #BLEU
        pubMed_scores["bleu"].append(bleu.compute(predictions=[overall_summary],references=[reference]))
        pubMed_scores["bleu"].append(bleu.compute(predictions=[overall_fine_summary],references=[reference]))
        pubMed_scores["bleu"].append(bleu.compute(predictions=[ranked_summary],references=[reference]))
        pubMed_scores["bleu"].append(bleu.compute(predictions=[ranked_fine_summary],references=[reference]))

        #ROUGE
        pubMed_scores["rouge"].append(rouge.compute(predictions=[overall_summary],references=[reference]))
        pubMed_scores["rouge"].append(rouge.compute(predictions=[overall_fine_summary],references=[reference]))
        pubMed_scores["rouge"].append(rouge.compute(predictions=[ranked_summary],references=[reference]))
        pubMed_scores["rouge"].append(rouge.compute(predictions=[ranked_fine_summary],references=[reference]))
        if index%100==0:
            #pickle.dump(arXiv_scores,open("arxiv_Scores.pickle","wb"))
            pickle.dump(pubMed_scores,open("pubMed_scores.pickle","wb"))
            torch.cuda.empty_cache()
except Exception as e:
    print("error",e)
    
finally:
    pickle.dump(arXiv_scores,open("arxiv_Scores.pickle","wb"))
    pickle.dump(pubMed_scores,open("pubMed_scores.pickle","wb"))
    

In [23]:
pickle.dump(section_fine_summary,open("fine summary.pickle","wb"))
#pickle.dump(ranked_summaries,open("ranked summary.pickle","wb"))
#pickle.dump(section_summary,open("summary.pickle","wb"))
pickle.dump(ranked_fine_summaries,open("ranked fine summary.pickle","wb"))

In [22]:
import pickle
#get data
from evaluate import load
from tqdm import tqdm
arXiv_articles, pubMed_articles=get_data_from_arXiv_pubMed()
bertscore = load("bertscore")
bleu = evaluate.load("google_bleu")
rouge = evaluate.load("rouge")
#initialize model
model, tokenizer = get_model_and_tokenizer()

fine_model = get_fine_tuned_model()

pubMed_scores={"rouge":[],"bleu":[],"bertScore":[]}
arXiv_scores={"rouge":[],"bleu":[],"bertScore":[]}
section_summary=[]
section_fine_summary=[]
ranked_summaries=[]
ranked_fine_summaries=[]
try:
    for index,value in enumerate(tqdm(arXiv_articles)):
        if index<900:
            continue
        article=arXiv_articles[value]
        #print(value)
        section_text=section_text_cleaning(article['sections'])
        article_text=doc_text_cleaning(article["article_text"])
        doc_sentences=getDocSents_Ranking(article_text)
        #ranked_summary=create_summary_from_summary(" ".join(doc_sentences),model, tokenizer)
        ranked_fine_summary=create_summary_from_summary(" ".join(doc_sentences),fine_model, tokenizer)
        #generate summary
        #summary=create_summary_from_sections(section_text,model, tokenizer)
        fine_tuned_summary=create_summary_from_sections(section_text,fine_model, tokenizer)
        #print("summary",summary)
        #overall_summary=create_summary_from_summary([summary],model,tokenizer)
        overall_fine_summary=create_summary_from_summary([fine_tuned_summary],fine_model,tokenizer)
        #print("summary",overall_fine_summary)
        #evaluate summary
        if index%50==0:
            section_fine_summary.append(overall_fine_summary)
            ranked_fine_summaries.append(ranked_fine_summary)
        #evaluaate
        reference=" ".join(article['abstract_text']).replace("<S>","").replace("</S>","")
       
        arXiv_scores["bertScore"].append(bertscore.compute(predictions=[overall_fine_summary],references=[reference], lang="en"))
        #arXiv_scores["bertScore"].append(bertscore.compute(predictions=[ranked_summary],references=[reference], lang="en"))
        arXiv_scores["bertScore"].append(bertscore.compute(predictions=[ranked_fine_summary],references=[reference], lang="en"))

        #BLEU
        #arXiv_scores["bleu"].append(bleu.compute(predictions=[overall_summary],references=[reference]))
        arXiv_scores["bleu"].append(bleu.compute(predictions=[overall_fine_summary],references=[reference]))
        #arXiv_scores["bleu"].append(bleu.compute(predictions=[ranked_summary],references=[reference]))
        arXiv_scores["bleu"].append(bleu.compute(predictions=[ranked_fine_summary],references=[reference]))

        #ROUGE
        #arXiv_scores["rouge"].append(rouge.compute(predictions=[overall_summary],references=[reference]))
        arXiv_scores["rouge"].append(rouge.compute(predictions=[overall_fine_summary],references=[reference]))
        #arXiv_scores["rouge"].append(rouge.compute(predictions=[ranked_summary],references=[reference]))
        arXiv_scores["rouge"].append(rouge.compute(predictions=[ranked_fine_summary],references=[reference]))
        if index%100==0:
            pickle.dump(arXiv_scores,open("arxiv_Scores_ranked_fine_epoch_5_901_1000.pickle","wb"))
            #pickle.dump(pubMed_scores,open("pubMed_scores_ranked_fine.pickle","wb"))
            torch.cuda.empty_cache()
    for index,value in enumerate(tqdm(pubMed_articles)):
        break
        article=pubMed_articles[value]
        section_text=section_text_cleaning(article['sections'])
        article_text=doc_text_cleaning(article["article_text"])
        doc_sentences=getDocSents_Ranking(article_text)
        #ranked_summary=create_summary_from_summary(" ".join(doc_sentences),model, tokenizer)
        ranked_fine_summary=create_summary_from_summary(" ".join(doc_sentences),fine_model, tokenizer)
        #generate summary
        #summary=create_summary_from_sections(section_text,model, tokenizer)
        fine_tuned_summary=create_summary_from_sections(section_text,fine_model, tokenizer)
        #print("summary",summary)
        #overall_summary=create_summary_from_summary([summary],model,tokenizer)
        overall_fine_summary=create_summary_from_summary([fine_tuned_summary],fine_model,tokenizer)
        #evaluate summary
        if index%50==0:
           
            section_fine_summary.append(overall_fine_summary)
            ranked_fine_summaries.append(ranked_fine_summary)
        #evaluaate
        reference=" ".join(article['abstract_text']).replace("<S>","").replace("</S>","")
        
        #pubMed_scores["bertScore"].append(bertscore.compute(predictions=[overall_summary],references=[reference], lang="en"))
        
        #pubMed_scores["bertScore"].append(bertscore.compute(predictions=[ranked_summary],references=[reference], lang="en"))
        pubMed_scores["bertScore"].append(bertscore.compute(predictions=[ranked_fine_summary],references=[reference], lang="en"))
        pubMed_scores["bertScore"].append(bertscore.compute(predictions=[overall_fine_summary],references=[reference], lang="en"))

        #BLEU
        #pubMed_scores["bleu"].append(bleu.compute(predictions=[overall_summary],references=[reference]))
        
        #pubMed_scores["bleu"].append(bleu.compute(predictions=[ranked_summary],references=[reference]))
        pubMed_scores["bleu"].append(bleu.compute(predictions=[ranked_fine_summary],references=[reference]))
        pubMed_scores["bleu"].append(bleu.compute(predictions=[overall_fine_summary],references=[reference]))

        #ROUGE
        #pubMed_scores["rouge"].append(rouge.compute(predictions=[overall_summary],references=[reference]))
       
        #pubMed_scores["rouge"].append(rouge.compute(predictions=[ranked_summary],references=[reference]))
        pubMed_scores["rouge"].append(rouge.compute(predictions=[ranked_fine_summary],references=[reference]))
        pubMed_scores["rouge"].append(rouge.compute(predictions=[overall_fine_summary],references=[reference]))
        if index%100==0:
            #pickle.dump(arXiv_scores,open("arxiv_Scores.pickle","wb"))
            pickle.dump(pubMed_scores,open("pubMed_scores_ranked_fine_epoch_5_201_204.pickle","wb"))
            torch.cuda.empty_cache()
except Exception as e:
    print("error",e)
    
finally:
    pickle.dump(arXiv_scores,open("arxiv_Scores_ranked_fine_epoch_5_901-1000.pickle","wb"))
    #pickle.dump(pubMed_scores,open("pubMed_scores_ranked_epoch_5_201_204_complete.pickle","wb"))
    

  model.load_state_dict(torch.load(modelName))
  0%|          | 0/1000 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1000/1000 [24:36<00:00,  1.48s/it]
  0%|          | 0/1000 [00:00<?, ?it/s]
