# Arxiv quick survey generator

Sometimes when I want to learn more about a topic by reading some papers, it is hard to find which papers are worth reading. This script uses chatgpt to help with that.

The approach is simple:
1. Given the topic of interest, query arxiv for documents.
2. Extract the abstract and conclusion from the pdf.
3. Send them to chat-gpt to answer some basic questions about the article
4. Rresent the results in a table and save them in a csv file.

## TODO
- add the fname in the summary generated
- maintain a cache of previously seen papers and reuse if they show up again


In [1]:
import arxiv
import openai
import numpy as np
from numpy.linalg import norm
import pypdfium2 as pdfium   # used to parse the pdf
import re
import time
import os
import pandas as pd
from IPython.display import display
import random
import google.generativeai as palm
import google.generativeai.types.safety_types as safety_types
import logging

gMODEL = 'models/text-bison-001'
g_api_key = os.environ['PALM_API_KEY']
palm.configure(api_key=g_api_key)
gEMODEL = 'models/embedding-gecko-001'

SAFETY = [
    {'category': safety_types.HarmCategory.HARM_CATEGORY_DEROGATORY,
     'threshold': safety_types.HarmBlockThreshold.BLOCK_NONE,
    },
    {'category': safety_types.HarmCategory.HARM_CATEGORY_TOXICITY,
     'threshold': safety_types.HarmBlockThreshold.BLOCK_NONE,
    },
    {'category': safety_types.HarmCategory.HARM_CATEGORY_VIOLENCE,
     'threshold': safety_types.HarmBlockThreshold.BLOCK_NONE,
    },
    {'category': safety_types.HarmCategory.HARM_CATEGORY_SEXUAL,
     'threshold': safety_types.HarmBlockThreshold.BLOCK_NONE,
    },
    {'category': safety_types.HarmCategory.HARM_CATEGORY_MEDICAL,
     'threshold': safety_types.HarmBlockThreshold.BLOCK_NONE,
    },
    {'category': safety_types.HarmCategory.HARM_CATEGORY_DANGEROUS,
     'threshold': safety_types.HarmBlockThreshold.BLOCK_NONE,
    },
]
axClient = arxiv.Client()

EMODEL = "text-embedding-ada-002"  # embedding model is openai
#CMODEL = "gpt-3.5-turbo"  # chat model
CMODEL = "gpt-4"
SEARCHMULT = 2  # multiply this by the desired number of papers for search
PDFCACHEDIR = '/tmp/pdfs/'
os.makedirs(PDFCACHEDIR, exist_ok=True)
SIMTHRESHOLD = 0.5

DEBUG = 1

# Extracting conclusions and possibly abstracts are done with heuristic methods that 
# use these parameters
MAXTITLELEN = 50 # title of conclusion can be as long as this
MINCONCLLEN = 1000  # epect conclusions to be longer than these many chars
TTLEXP = 100  # to verify we have a section title, lookahead and back

RETRIES = 3  # rety getting papers

BASEDIR = os.getcwd()
LOGFILE = os.path.join(BASEDIR, 'log/genSurvey.log')

logging.basicConfig(format='%(message)s',
                    filename=LOGFILE, 
                    level=logging.DEBUG)


def getPapers(uquery: str = 'tree search', 
              num: int = 10, 
              startDate: str = None,
              endDate: str = None,
              llm: str = 'google', 
              simThreshold: float = SIMTHRESHOLD) -> list[dict()]:
    """
    - Queries arxiv for the topic of interest
    - **the query is a comma-separated list of terms**
    - **terms will be searched for in the abstract and we get the conjunction**
    - finds cosine similarity between the summary and the query string
    - returns a list of {paper, cosineSim} in order of decreasing sim
    TBD: whether this ordering is differnet from the arxiv ordering
    """
    logging.info(f"getPapers, query: {uquery}, num: {num}")
    terms = uquery.split(',')
    query = f"abs:{terms[0].strip()}"
    for t in terms[1:]:
        query += f" AND abs:{t.strip()}"
    if startDate is not None and endDate is not None:
        axquery = f"ti:{query} AND submittedDate:[{startDate} TO {endDate}]"
    else:
        axquery = query
    logging.debug(f"arxiv query: {axquery}")
    print(f"arxiv query: {axquery}")
    try:
        search = arxiv.Search(
          query = axquery,
          max_results = num * SEARCHMULT,
          sort_by = arxiv.SortCriterion.Relevance
        )
    except Exception as e:
        logging.error(f"Cannot get arxiv data\n{str(e)}")
        print(f"Cannot get arxiv data\n{str(e)}")
        return []
    results = axClient.results(search)
    logging.debug(f"Arxiv returns")
    qel = getEmbedding(query, llm)
    qEmb = np.array(qel)
    logging.debug(f"Embedding: {str(qEmb.shape)}")
    qNorm = norm(qEmb)
    docNscores = []
    for r in results:
        rs = {'paper': r}
        logging.debug(f"Proessing {r.title}")
        pEmb = np.array(getEmbedding(r.summary, llm))
        #pEmb = np.array(openai.Embedding.create(input = r.summary,
        #                                 model = EMODEL)['data'][0]['embedding'])
        rs['score'] = np.dot(pEmb, qEmb)/(qNorm * norm(pEmb))
        if rs['score'] >= simThreshold:
            docNscores.append(rs)
    docNscores.sort(key=lambda x: x['score'], reverse=True)
    return docNscores

def getEmbedding(query:str = '',
                 llm:str = 'palm')->list[float]:
    logging.debug(f"Embedding {query}")
    if llm == 'openai':
        qel = openai.Embedding.create(input=query, 
                                      model=EMODEL)['data'][0]['embedding']
    elif llm == 'google':
        qel = palm.generate_embeddings(model = gEMODEL, text = query)['embedding']
    else:
        print('Unknown embedding')
        qel= None
    return qel

def addChunks(docNscores: list[dict()],
              noConclusionsOK: bool = False,
              dumpDir:str = PDFCACHEDIR,) -> list[dict()]:
    """
     Adds abstract and conclusion to the input data
    """
    logging.info("addChunks")
    dsc = []
    success = False
    cnt = 0
    for ds in docNscores:
        cnt = 0
        success = False
        fname = os.path.join(dumpDir, ds['paper']._get_default_filename())
        if os.path.isfile(fname):
            success = True
        while not success and cnt < RETRIES:
            cnt += 1
            try:
                fname = ds['paper'].download_pdf(dumpDir)
                logging.debug(f"Done download {ds['paper']}")
                success = True
            except Exception as e:
                print(e)
            time.sleep(0.5 + random.random() * 2)
        if not success:
            logging.error(f"Cannot download {ds['paper'].title}")
            continue
        pdf = pdfium.PdfDocument(fname)
        # the summary is supposed to be the abstract
        ds['abstract'] = ds['paper'].summary
        # in case it is not present, we try to find it
        if ds['abstract'].strip() == '':
            abs = getAbs(pdf, False)
            if abs == '':
                print(f"eliminating {ds['paper'].title} because no summary (abstract) provided or found")
                continue
        concl = getConcl(pdf, False)
        ds['conclusion'] = concl
        if concl != '' or noConclusionsOK: 
            dsc.append(ds)
        else:
            print(f"eliminating {ds['paper'].title} because no conclusion found")
    return dsc

def getConcl(pdf, hailMary=False):
    """
    Assume the conclusion is between "Conclusion" or "Discussion" and "References"
    This is a heuristic approach to the probelm. A ML approach might get better results.
    This works good enough for a v0.
    """
    logging.info('Trying to find conclusion from pdf')
    nPages = len(pdf)
    done = False
    pIdx = nPages - 1
    conclusion = ''
    startIdx = -1
    endIdx = -1
    re1 = re.compile('\W*conclusion.*?\n.*')   # and is < MAXCONCLTLEN
    # try fo find the referenes.
    while pIdx > -1:
        page = pdf[pIdx].get_textpage().get_text_range()
        lp = page.lower()
        spx = 0
        epx = len(lp)
        while spx < epx:
            if (rpos := lp[spx:epx].find('reference')) > -1:
                if looksLikeTitle(lp, rpos, 'reference'):
                    endIdx = rpos
                    break
                else: spx += rpos + 1
            else: 
                break
            time.sleep(1)
        if endIdx == -1:
            pIdx -= 1
        else: break
    # should look fro all instances of conclusion on each page
    # should also look for 'discussion'
    if (cpos := lp[:endIdx].find('conclusion')) > -1 and looksLikeTitle(lp[:endIdx], cpos, 'conclusion'):
        # print(f"Done conclusion {cpos}: {endIdx}")
        conclusion = lp[cpos:endIdx] + conclusion
        return conclusion
    else:
        # it looks like the conclusion spans a page
        conclusion = lp[:endIdx]
        endIdx = None
        if pIdx > 0:
            pIdx -= 1
            # print(f"looking for conclusion at {pIdx}")
            page = pdf[pIdx].get_textpage().get_text_range()
            lp = page.lower()
            if (cpos := lp.find('conclusion')) > -1 and looksLikeTitle(lp[:endIdx], cpos, 'conclusion'):
                conclusion = lp[cpos:] + conclusion
                return conclusion
            elif hailMary:
                if len(conclusion) > MINCONCLLEN:
                    # print(f"Cant find conclusion, returning last chunk")
                    return conclusion
                else: return lp + conclusion
            else: return ''
        else: return conclusion

def getAbs(pdf, hailMary=False):
    """
     THe abstract is between 'Abstract' and 'Introduction'
    """
    logging.info('Trying to find abstract from pdf')
    nPages = len(pdf)
    done = False
    pIdx = 0
    abs = ''
    startIdx = -1
    endIdx = -1
    abstract = ''
    re1 = re.compile('\W*abstract\W*\n')
    while not done and pIdx < nPages:
        page = pdf[pIdx].get_textpage().get_text_range()
        lp = page.lower()
        if (ax := lp.find('abstract')) >= 0:
            m1 = reAbs.match(lp[:ax])
            startIdx = ax
            break
        elif len(page) <= MINPAGELEN:
            pIdx += 1
            continue
        else:
            if hailMary:
                # print("cannot find absract. return the whole page")
                return page
            else: return ''
    if (ix := lp[ax:].find('introduction')) >= 0:
        # assume intro is on same page
        abstract += lp[startIdx:(ax + ix)]
        return abstract
    elif pIdx + 1 < nPages:
        # looks like the abstract runs to the next page
        abstract += lp[startIdx:]
        page = pdf[pIdx + 1].get_textpage().get_text_range()
        lp = page.lower()
        if (ix := lp.find('introduction')) >= 0:
            abstract += lp[:ix]
            return abstract
        else:
            if hailMary:
                # add this page in
                abstract += lp
            return abstract
    return ''
    

def looksLikeTitle(page, pos, title):
    # we might have the keyword 'conclusion' say in the text or in a title. 
    # this is a heuristic way to figure that out.
    # the heuristic is that a title is a relatively short string on a line
    strOI = page[max(0, pos - TTLEXP): (pos + TTLEXP)]
    theRE = re.compile('([^\n]*' + title + 's?[^\n]*)', re.MULTILINE|re.DOTALL)
    while True:
        m1 = re.search(theRE, strOI)
        if not m1: break
        if len(m1[1]) <= MAXTITLELEN: return True
        strOI = strOI[m1.end():]
    return False

# should use queryLLM.ipynb
def queryOpenai(query: list[dict] = [],
               instr: str = 'You are a helpful assistant.',
               temperature: float= 0.0,
               ) -> (str, list[str]):
    qObj = [{"role": "system", "content": instr}]
    qObj.extend(query)
    response = openai.ChatCompletion.create(
                   model = CMODEL,
                   messages = qObj,
                   temperature = temperature)
    answers = [x['message']['content'] for x in response['choices']]
    return response, answers


def getOpenaiSummary(abs: str='', concl:str = '')->list[str]:
    queryAC = [{"role": "user", 
                "content": f"""Given the paper abstract {'and conclusion ' if len(concl) > 10 else ''}below, answer the following questions. 

Questions:
1. What is the paper about?
2. What do the authors plan to show?
3. Why is it significant?
4. How do they intend to do it?
5. What are the results?

Abstract:
{abs}

{'Conclusion: ' + f'{chr(10)}{concl}' if len(concl) > 10 else ''}
"""}]
    instr = "You are a scientific researcher who has read thousands of papers and can accurately summarize the contents."
    resp, ans = queryOpenai(queryAC, instr)
    return re.split('\n+', ans[0])

def queryPalmText(query: str,
                  temperature: float = 0.0,
                  model: str = gMODEL,
                  fobj = None) -> (str, list[str]):
    queryObj = query
    palmResp = palm.generate_text(
        model = gMODEL,
        prompt = query,
        temperature = temperature,
        candidate_count = 1,
        safety_settings = SAFETY
    )
    answers = [x['output'] for x in palmResp.candidates]
    if fobj is not None:
        fobg.write(f"\n{'-'*10}\nPROMPT: {query}\nTEMP: {temperature}\n")
        for a in answers:
            fobj.write(f"\n--\n{a}")
        fobj.flush()
    return palmResp, answers

def getGoogleSummary(abs:str = '', concl:str = ''):
    query = f"""Given the paper abstract {'and conclusion ' if len(concl) > 10 else ''}below, answer the following questions. 

Questions:
1. What is the paper about?
2. What do the authors plan to show?
3. Why is it significant?
4. How do they intend to do it?
5. What are the results?

Abstract:
{abs}

{'Conclusion: ' + f'{chr(10)}{concl}' if len(concl) > 10 else ''}
"""
 
    response, answers = queryPalmText(query)
    if DEBUG > 0:
        print('RESPONSE\n', response)
        print('\n----\nANSWERS\n', answers) 
    if len(answers) > 0:
        return re.split('\n+', answers[0])
    else: return None

    
# the main method
def getArxivSummaries(topic:str = 'LLM',  # comma separated terms
                      num:int = 10, 
                      llm: str = 'openai',  # 'google' or 'openai'
                      startDate: str = None, # YYYYMMDD
                      endDate: str = None,
                      noConclusionsOK: bool = True,
                      pdfCacheDir: str = None, 
                      fname:str = None,   # uses default name if None
                     ): 
    if pdfCacheDir is None:
        pdfCacheDir = PDFCACHEDIR
    if startDate is not None and endDate is not None:
        dumpDir = os.path.join(pdfCacheDir, f"{topic.replace(' ', '_')}_{startDate}_{endDate}")
    else:
        dumpDir = os.path.join(pdfCacheDir, f"{topic.replace(' ', '_')}")
    if os.path.exists(dumpDir):
        dumpDir += f"_{random.randint(0, 1000)}"
    logging.info(f"Main. query: {topic}, numer: {num}, dumpdir: {dumpDir}")
    print(f"Main. query: {topic}, numer: {num}, dumpdir: {dumpDir}")

    os.makedirs(dumpDir, exist_ok=True)
    if fname is None:
        fname = os.path.join(dumpDir, "summaries.csv")
    print('Getting references from Arxiv')
    docs = getPapers(topic, num, startDate=startDate, endDate=endDate)
    print(f"Num papers found: {len(docs)}")
    print('Downloading and parsing papers')
    docs = addChunks(docs, noConclusionsOK, dumpDir)
    results = []
    print('Querying llm')
    for d in docs[:num]:
        logging.info(f"Summarizing {d['paper'].title[:40]}")
        print(f"Summarizing {d['paper'].title[:40]}")
        pinfo = [d['paper'].title, d['paper'].entry_id, d['paper'].published.strftime('%Y-%m')]
        if llm == 'openai':
            ans = getOpenaiSummary(d['abstract'], d['conclusion'])
        elif llm == 'google':
            ans = getGoogleSummary(d['abstract'], d['conclusion'])
        else:
            logging.error('Unknoen llm')
            print('Unknown llm')
            return
        if ans is None:
            continue
        pinfo.extend(ans)
        if len(ans) != 5:
            logging.error(f"chatgpt answer not right length")
            print('chatgpt answer not right length')
            print(pinfo)
        else:
            results.append(pinfo)
    try:
        dfr = pd.DataFrame(results, 
                           columns=['title', 'url', 'date', 'about', 'aims', 'significance', 'methods', 'results'])
        if fname is not None:
            dfr.to_csv(fname, index=False)
    except Exception as e:
        dfr = None
        try:
            jres = json.dumps(results)
            dumpRes = jres
        except:
            logging.error('cannot serialize to json')
            print('cannot serialize to json')
            dumpRes = str(results)
        with open(fname, 'w') as ox:
            ox.write(dumpRes)
        logging.info(f"Dumped results to {fname}")
        print(f"Dumped results to {fname}")
    if not dfr is None:
        display(dfr)

print('Done')

Done


In [2]:
getArxivSummaries(topic = 'essay scoring', 
                  num = 30, 
                  llm = 'openai', 
                  startDate = None,
                  endDate = None,
                  noConclusionsOK = True,
                  pdfCacheDir = '/Users/kp/projects/documents/genSummary',
                  fname = None)

Main. query: essay scoring, numer: 30, dumpdir: /Users/kp/projects/documents/genSummary/essay_scoring
Getting references from Arxiv
arxiv query: abs:essay scoring
Num papers found: 60
Downloading and parsing papers
Querying llm
Summarizing An Automated System for Essay Scoring of
Summarizing Automated assessment of non-native learn
Summarizing Many Hands Make Light Work: Using Essay 
Summarizing Toward Educator-focused Automated Scorin
Summarizing Automatic Essay Scoring in a Brazilian S
Summarizing Cognitively Aided Zero-Shot Automatic Es
Summarizing Transformer-based Joint Modelling for Au
Summarizing Improving Performance of Automated Essay
Summarizing H-AES: Towards Automated Essay Scoring f
Summarizing Automated Topical Component Extraction U
Summarizing Prompt- and Trait Relation-aware Cross-p
Summarizing Automated essay scoring with string kern
Summarizing My Teacher Thinks The World Is Flat! Int
Summarizing Corruption Is Not All Bad: Incorporating
Summarizing DREsS: Dataset for

Unnamed: 0,title,url,date,about,aims,significance,methods,results
0,An Automated System for Essay Scoring of Onlin...,http://arxiv.org/abs/1611.02815v1,2016-11,1. The paper is about an automated system for ...,2. The authors plan to show that their propose...,3. The significance of this paper lies in its ...,4. The authors have developed an online exam b...,5. The results of the paper indicate that the ...
1,Automated assessment of non-native learner ess...,http://arxiv.org/abs/1612.00729v1,2016-12,1. The paper is about Automatic Essay Scoring ...,2. The authors plan to show which specific lin...,3. The study is significant because while AES ...,4. The authors intend to do this by modeling t...,5. The results show that the feature set used ...
2,Many Hands Make Light Work: Using Essay Traits...,http://arxiv.org/abs/2102.00781v1,2021-02,1. The paper is about the use of multi-task le...,2. The authors aim to demonstrate that their M...,3. The research is significant because it offe...,4. The authors use a multi-task learning appro...,5. The results show that the MTL-based BiLSTM ...
3,Toward Educator-focused Automated Scoring Syst...,http://arxiv.org/abs/2112.11973v1,2021-12,1. The paper is about improving automated essa...,2. The authors plan to show how neural network...,3. The significance of this paper lies in its ...,4. The authors intend to do this by employing ...,5. The results of the paper are not explicitly...
4,Automatic Essay Scoring in a Brazilian Scenario,http://arxiv.org/abs/2401.00095v1,2023-12,1. The paper is about a new Automatic Essay Sc...,2. The authors plan to show that their AES alg...,3. The significance of this research lies in i...,4. They intend to do this by leveraging advanc...,5. The abstract does not provide specific resu...
5,Cognitively Aided Zero-Shot Automatic Essay Gr...,http://arxiv.org/abs/2102.11258v1,2021-02,1. The paper is about the problem of zero-shot...,2. The authors aim to demonstrate that using g...,3. The significance of this research lies in i...,4. The authors intend to achieve their goal by...,5. The results of the experiments show that us...
6,Transformer-based Joint Modelling for Automati...,http://arxiv.org/abs/2404.08655v1,2024-03,1. The paper is about improving Automated Essa...,2. The authors plan to show that their propose...,3. The study is significant because it address...,4. The authors intend to do this by proposing ...,5. The results show that the proposed method o...
7,Improving Performance of Automated Essay Scori...,http://arxiv.org/abs/2203.00354v2,2022-03,1. The paper is about improving the performanc...,2. The authors plan to show that their propose...,3. The significance of this paper lies in its ...,4. The authors intend to achieve their goal by...,5. The results showed that the performance of ...
8,H-AES: Towards Automated Essay Scoring for Hindi,http://arxiv.org/abs/2302.14635v1,2023-02,1. The paper is about the application of Natur...,2. The authors aim to demonstrate that AES can...,3. The study is significant because AES in Hin...,4. The authors train and evaluate their models...,5. The results of the study show that the auth...
9,Automated Topical Component Extraction Using N...,http://arxiv.org/abs/2008.01809v1,2020-08,1. The paper is about linking automated writin...,"2. The authors plan to show that their method,...",3. The significance of this paper lies in its ...,4. They intend to do this by using the attenti...,5. The results show that T-Cattn outperforms a...
