# Arxiv quick survey generator

Sometimes when I want to learn more about a topic by reading some papers, it is hard to tell which papers are worth reading. This script uses ChatGPT to help with that.

The approach is simple:
1. Given the topic of interest, query arxiv for documents.
2. Extract the abstract and conclusion from the pdf.
3. Send them to ChatGPT to answer some basic questions about the article.
4. Present the results in a table and save them in a csv file.


In [5]:
import arxiv
import openai
import numpy as np
from numpy.linalg import norm
import pypdfium2 as pdfium   # used to parse the pdf
import re
import time
import os
import pandas as pd
from IPython.display import display
import random

# Shared arxiv API client; getPapers() runs its searches through it.
axClient = arxiv.Client()

EMODEL = "text-embedding-ada-002"  # OpenAI embedding model used to score query/summary similarity
CMODEL = "gpt-3.5-turbo"  # chat model
SEARCHMULT = 2  # multiply this by the desired number of papers for search
# Downloaded PDFs are kept here so that a paper is only fetched once.
PDFCACHEDIR = '/tmp/pdfs/'
os.makedirs(PDFCACHEDIR, exist_ok=True)

# Extracting conclusions and possibly abstracts is done with heuristic methods that
# use these parameters
MAXTITLELEN = 50 # a line holding a section keyword counts as a title if it is at most this long
MINCONCLLEN = 1000  # expect conclusions to be longer than this many chars
TTLEXP = 100  # to verify we have a section title, look this many chars ahead and back

RETRIES = 3  # maximum number of attempts at downloading a paper

def _embedText(text: str) -> np.ndarray:
    """Return the EMODEL embedding of `text` as a numpy vector."""
    resp = openai.Embedding.create(input=text, model=EMODEL)
    return np.array(resp['data'][0]['embedding'])


def getPapers(query: str = 'tree search',
              num: int = 10,
              simThreshold: float = 0.75) -> list[dict]:
    """
    Query arxiv for the topic of interest and rank the hits by how similar
    their summary is to the query string.

    - query: free-text topic; used both as the arxiv search string and as
      the reference text for the similarity score
    - num: desired number of papers; num * SEARCHMULT results are requested
      so that some can be filtered out later
    - simThreshold: papers whose cosine similarity is below this are dropped

    Returns a list of {'paper': <arxiv result>, 'score': <cosine similarity>}
    in order of decreasing score. The list can hold more than `num` entries;
    trimming is left to the caller.
    TBD: whether this ordering is different from the arxiv ordering
    """
    search = arxiv.Search(
        query=query,
        max_results=num * SEARCHMULT,
        sort_by=arxiv.SortCriterion.Relevance,
    )
    qEmb = _embedText(query)
    qNorm = norm(qEmb)  # loop invariant, computed once

    docNscores = []
    for r in axClient.results(search):
        pEmb = _embedText(r.summary)
        score = np.dot(pEmb, qEmb) / (qNorm * norm(pEmb))
        if score >= simThreshold:
            docNscores.append({'paper': r, 'score': score})
    docNscores.sort(key=lambda x: x['score'], reverse=True)
    return docNscores

def addChunks(docNscores: list[dict]) -> list[dict]:
    """
    Adds abstract and conclusion to the input data.

    For every entry the paper's PDF is taken from PDFCACHEDIR or, if it is
    not there yet, downloaded (at most RETRIES attempts). The entry then gets
    two new keys:
    - 'abstract': the arxiv summary, or the text found by getAbs() when the
      summary is empty
    - 'conclusion': the text found by getConcl()

    Returns the entries (same dict objects, in input order) for which both
    texts are available. Papers that cannot be downloaded or that lack an
    abstract or a conclusion are reported on stdout and left out.
    """
    dsc = []
    for ds in docNscores:
        paper = ds['paper']
        fname = os.path.join(PDFCACHEDIR, paper._get_default_filename())
        success = os.path.isfile(fname)  # already cached by an earlier run
        cnt = 0
        while not success and cnt < RETRIES:
            cnt += 1
            try:
                fname = paper.download_pdf(PDFCACHEDIR)
                success = True
            except Exception as e:
                # best effort: report the failure and try again
                print(e)
            # randomised pause between requests
            time.sleep(0.5 + random.random() * 2)
        if not success:
            print(f"Cannot download {paper.title}")
            continue

        pdf = pdfium.PdfDocument(fname)
        try:
            # the summary is supposed to be the abstract
            abstract = paper.summary
            # in case it is not present, we try to find it in the pdf
            if abstract.strip() == '':
                abstract = getAbs(pdf, False)
                if abstract.strip() == '':
                    print(f"eliminating {paper.title} because no summary (abstract) provided or found")
                    continue
            # store whichever text was found (previously the text recovered
            # from the pdf was discarded)
            ds['abstract'] = abstract
            concl = getConcl(pdf, False)
        finally:
            # release the document handle even when the paper is skipped
            pdf.close()

        ds['conclusion'] = concl
        if concl != '':
            dsc.append(ds)
        else:
            print(f"eliminating {paper.title} because no conclusion found")
    return dsc

def _findHeading(text, keyword):
    """Return the index of the first `keyword` in `text` that looksLikeTitle accepts, or -1."""
    pos = text.find(keyword)
    while pos > -1:
        if looksLikeTitle(text, pos, keyword):
            return pos
        pos = text.find(keyword, pos + 1)
    return -1


def getConcl(pdf, hailMary=False):
    """
    Assume the conclusion is between "Conclusion" and "References".
    This is a heuristic approach to the problem. A ML approach might get better results.
    This works good enough for a v0.

    - pdf: sequence of pages; each page must offer
      get_textpage().get_text_range() returning the page text
    - hailMary: when the headings cannot be located, return a best guess
      instead of ''

    The pages are scanned from the last one backwards for a 'reference'
    heading. The conclusion heading is then looked for on that page and, if
    not found there, on the page before it. Returns the lower-cased text from
    the conclusion heading up to the references heading, or '' when nothing
    suitable was found.
    """
    nPages = len(pdf)
    if nPages == 0:
        return ''

    # try to find the references, starting from the back of the document
    pIdx = nPages - 1
    endIdx = -1
    lp = ''
    while pIdx > -1:
        lp = pdf[pIdx].get_textpage().get_text_range().lower()
        endIdx = _findHeading(lp, 'reference')
        if endIdx > -1:
            break
        pIdx -= 1

    if endIdx == -1:
        # No references heading anywhere, so the end of the conclusion is
        # unknown. Only guess (the last page) when explicitly asked to.
        if hailMary:
            return pdf[nPages - 1].get_textpage().get_text_range().lower()
        return ''

    # should also look for 'discussion'
    beforeRefs = lp[:endIdx]
    cpos = _findHeading(beforeRefs, 'conclusion')
    if cpos > -1:
        return beforeRefs[cpos:]

    # it looks like the conclusion spans a page
    conclusion = beforeRefs
    if pIdx == 0:
        # there is no earlier page to look at
        return conclusion

    prevPage = pdf[pIdx - 1].get_textpage().get_text_range().lower()
    cpos = _findHeading(prevPage, 'conclusion')
    if cpos > -1:
        return prevPage[cpos:] + conclusion
    if hailMary:
        # Can't find the conclusion heading: return the last chunk if it is
        # long enough, otherwise add the whole previous page to it.
        if len(conclusion) > MINCONCLLEN:
            return conclusion
        return prevPage + conclusion
    return ''

def getAbs(pdf, hailMary=False, minPageLen=200):
    """
    The abstract is between 'Abstract' and 'Introduction'.

    - pdf: sequence of pages; each page must offer
      get_textpage().get_text_range() returning the page text
    - hailMary: when the headings cannot be located, return a best guess
      instead of ''
    - minPageLen: leading pages with at most this many characters and no
      'abstract' on them are skipped. The default is a heuristic guess;
      tune it as needed.

    Returns the lower-cased text from 'abstract' up to 'introduction'
    (looking at most one page ahead), or '' when no abstract was found.
    With hailMary a page lacking the keyword is returned unchanged.
    """
    nPages = len(pdf)
    pIdx = 0
    startIdx = -1
    lp = ''
    while pIdx < nPages:
        page = pdf[pIdx].get_textpage().get_text_range()
        lp = page.lower()
        startIdx = lp.find('abstract')
        if startIdx >= 0:
            break
        if len(page) <= minPageLen:
            # nearly empty page: keep looking
            pIdx += 1
            continue
        # a full page without the keyword: give up (or guess)
        return page if hailMary else ''

    if startIdx < 0:
        # ran out of pages without ever seeing 'abstract'
        return ''

    introIdx = lp.find('introduction', startIdx)
    if introIdx >= 0:
        # the introduction starts on the same page
        return lp[startIdx:introIdx]

    if pIdx + 1 < nPages:
        # looks like the abstract runs to the next page
        abstract = lp[startIdx:]
        lp = pdf[pIdx + 1].get_textpage().get_text_range().lower()
        introIdx = lp.find('introduction')
        if introIdx >= 0:
            abstract += lp[:introIdx]
        elif hailMary:
            # add this page in
            abstract += lp
        return abstract
    return ''
    

def looksLikeTitle(page, pos, title):
    """
    Heuristically decide whether the keyword `title` found at `pos` in `page`
    is a section heading rather than a word in running text.

    The rule: a heading sits on a relatively short line. Only the text up to
    TTLEXP characters on either side of `pos` is inspected; the answer is True
    if any line in that window contains the keyword (optionally followed by
    's') and is at most MAXTITLELEN characters long.
    """
    window = page[max(0, pos - TTLEXP): (pos + TTLEXP)]
    linePattern = re.compile('([^\n]*' + title + 's?[^\n]*)', re.MULTILINE|re.DOTALL)
    # every hit is one whole line (within the window) that holds the keyword
    return any(len(hit[1]) <= MAXTITLELEN for hit in linePattern.finditer(window))


def queryOpenai(query: list[dict] = None,
               instr: str = 'You are a helpful assistant.',
               temperature: float = 0.0,
               ) -> tuple[object, list[str]]:
    """
    Send a chat request to CMODEL.

    - query: chat messages ({"role": ..., "content": ...}) that follow the
      system message; None or empty sends the system message only
    - instr: content of the system message
    - temperature: sampling temperature passed on to the API

    Returns (response, answers): the response as returned by the API and the
    message content of each of its choices.
    """
    qObj = [{"role": "system", "content": instr}]
    if query:
        qObj.extend(query)
    response = openai.ChatCompletion.create(
                   model = CMODEL,
                   messages = qObj,
                   temperature = temperature)
    answers = [x['message']['content'] for x in response['choices']]
    return response, answers


def getOneSummary(abs: str='', concl:str = '')->list[str]:
    """
    Ask the chat model five standard questions about one paper.

    - abs: the abstract of the paper
    - concl: the conclusion of the paper

    Returns the non-blank lines of the first answer. Blank lines, which
    would otherwise take up a slot in the list, are dropped. An empty list
    is returned when the model gave no answer.
    """
    queryAC = [{"role": "user", 
                "content": f"""Given the paper abstract and conclusion below, answer the following questions. 

Questions:
1. What is the paper about?
2. What do the authors plan to show?
3. Why is it significant?
4. How do they intend to do it?
5. What are the results?

Abstract:
{abs}

Conclusion:
{concl}
"""}]
    instr = "You are a scientific researcher who has read thousands of papers and can accurately summarize the contents."
    resp, ans = queryOpenai(queryAC, instr)
    if not ans:
        return []
    return [line for line in ans[0].split('\n') if line.strip()]

# the main method
def getArxivSummaries(topic:str = 'NER with LLMs', num:int = 10, fname:str = None):
    """
    Build the survey table for a topic.

    - topic: what to search arxiv for
    - num: maximum number of papers in the table
    - fname: if given, the table is also written to this csv file

    Fetches candidate papers, extracts abstract and conclusion, asks the chat
    model about each paper and displays one row per paper with the columns
    title, url, date, about, aims, significance, methods, results.
    """
    infoCols = ['title', 'url', 'date']
    answerCols = ['about', 'aims', 'significance', 'methods', 'results']
    nAns = len(answerCols)

    print('Getting references from Arxiv')
    docs = getPapers(topic, num)
    print('Downloading and parsing papers')
    docs = addChunks(docs)
    results = []
    print('Querying chat-gpt')
    for d in docs[:num]:
        paper = d['paper']
        pinfo = [paper.title, paper.entry_id, paper.published.strftime('%Y-%m')]
        # The reply is free text, so it need not split into exactly one line
        # per question. Force it into nAns cells, otherwise the DataFrame
        # constructor below rejects the row.
        ans = [a for a in getOneSummary(d['abstract'], d['conclusion']) if a.strip()]
        if len(ans) > nAns:
            # keep surplus lines by folding them into the last cell
            ans = ans[:nAns - 1] + [' '.join(ans[nAns - 1:])]
        ans.extend([''] * (nAns - len(ans)))
        pinfo.extend(ans)
        results.append(pinfo)
    dfr = pd.DataFrame(results, columns=infoCols + answerCols)
    if fname is not None:
        dfr.to_csv(fname, index=False)
    display(dfr)
