# Simple summarization of paper abstracts demo

## Read the PDF papers and collect the abstract from the first page of each

In [9]:
import os
import fitz
# Collect the first-page text of every file under the current working
# directory whose name contains "paper_". Only the first page is read because
# that is where the abstract is looked for later on.
top = os.getcwd()
papers = []

for root, dirs, files in os.walk(top, topdown=False):
    for fl in files:
        if "paper_" in fl:
            # os.walk yields bare file names; join with the directory being
            # visited so that files located in sub-directories open correctly
            # instead of being resolved against the working directory.
            with fitz.open(os.path.join(root, fl)) as doc:
                text = ""
                for page in doc:
                    text += page.get_text()
                    break  # stop after the first page
                papers.append(text)

In [13]:
# Heuristically cut the abstract out of each paper's first-page text:
# keep what follows the first "abstract" heading, drop everything from the
# "introduction" heading or a DOI marker onwards, and tidy the trailing
# remains of a URL such as "https://" that preceded the DOI marker.
abstracts = []
dividers = ["Abstract", "abstract", "ABSTRACT"]
finishers = ["INTRODUCTION", "Introduction", "introduction"]
for paper in papers:
    # Flatten the page into a single line so line breaks do not get in the way.
    abstract = " ".join(paper.split("\n"))
    for div in dividers:
        if div in abstract:
            abstract = abstract.split(div)[1]
            break
    for fin in finishers:
        abstract = abstract.split(fin)[0]
    abstract = abstract.split("doi:")[0].split("DOI:")[0].strip("/").strip(":")
    # Remove a dangling URL scheme left at the very end of the text.
    # str.strip("https") would treat its argument as a *set of characters*
    # and eat any leading/trailing h, t, p or s of real words
    # (e.g. "results" -> "resul"), so the suffix is removed explicitly.
    for scheme in ("https", "http"):
        if abstract.endswith(scheme):
            abstract = abstract[: -len(scheme)]
            break
    abstracts.append(abstract)

## Summarize using the Pegasus transformer

In [14]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

# One entry per abstract; each entry is the list returned by batch_decode.
summaries = []

# Load the pretrained Pegasus checkpoint and its tokenizer, placing the model
# on the GPU when one is available.
model_name = "google/pegasus-xsum"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

# Summarize the abstracts one at a time (each call is a batch of one text).
for abstract_text in abstracts:
    encoded = tokenizer(
        [abstract_text],
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )
    encoded = encoded.to(device)
    generated_ids = model.generate(**encoded)
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    summaries.append(decoded)

### Debugging section: Print abstracts and their summaries

In [15]:
# Show every generated summary next to the abstract it was produced from.
for idx, summary in enumerate(summaries):
    # Each summary entry is a list; its first element is the summary text.
    header = "Summary:\n" + summary[0] + "\n ----------- Abstract:\n"
    print(header, abstracts[idx])
    print("------------------------")

Summary:
All photographs are copyrighted.
 ----------- Abstract:
 ing with credit is permitted. To copy otherwise, or republish, to post on servers or to redistribute to lists, requires prior specific permission and/or a fee. Request permissions from permissions@acm.org. © 2019 Association for Computing Machinery. XXXX-XXXX/2019/4-ART $15.00 https://doi.org/10.1145/nnnnnnn.nnnnnnn , Vol. 1, No. 1, Article . Publication date: April 2019. arXiv:1901.06796v3  [cs.CL]  11 Apr 2019 
------------------------
Summary:
The theme of this year's World Health Organization (WHO) World Health Day is autism spectrum disorder.
 ----------- Abstract:
  Autism spectrum disorder is a term used to describe a constellation of early-appearing social  communication deficits and repetitive sensory–motor behaviours associated with a strong genetic  component as well as other causes. The outlook for many individuals with autism spectrum  disorder today is brighter than it was 50 years ago; more people with the