In [1]:
import json
import re
import numpy as np
import anthropic
import requests
from bs4 import BeautifulSoup
import nest_asyncio
import asyncio
from playwright.async_api import async_playwright
from markdownify import markdownify as md
import yaml



In [2]:
import os

# Read the NASA ADS API token from the environment rather than committing it to the notebook.
# Export ADS_DEV_KEY before starting Jupyter; with an empty token the ADS request below is
# sent unauthenticated and will be rejected by the API.
# NOTE(review): the token that used to be hardcoded here has been exposed and should be revoked.
ads_token = os.environ.get("ADS_DEV_KEY", "")

In [264]:
from urllib.parse import urlencode, quote_plus

# Ask ADS for up to 1000 records matching bibstem "ara&a", returning only the fields used below.
query = {"q": "bibstem:ara&a", "fl": "title, year, bibcode, identifier", "rows": 1000}

encoded_query = urlencode(query)

results = requests.get(
    "https://api.adsabs.harvard.edu/v1/search/query?{}".format(encoded_query),
    headers={'Authorization': 'Bearer ' + ads_token},
    timeout=60,  # do not hang the kernel forever if the API is unreachable
)
# Fail loudly on HTTP errors (bad token, rate limit, ...) instead of caching an error
# payload in araa.json and hitting a KeyError on 'response' much later.
results.raise_for_status()

json_results = results.json()

['10.48550/arXiv.0909.0948',
 '2009arXiv0909.0948A',
 'arXiv:0909.0948',
 '2009ARA&A..47..481A',
 '10.1146/annurev.astro.46.060407.145222']

In [265]:
with open('araa.json', 'w') as f:
    json.dump(json_results, f)

In [3]:
with open('araa.json', 'r') as f:
    json_results = json.load(f)

In [53]:
def pull_arxiv_and_doi(idlist):
    """Pick the arXiv id and the Annual Reviews DOI out of a list of identifier strings.

    Args:
        idlist: iterable of identifier strings (e.g. 'arXiv:0909.0948',
            '10.1146/annurev.astro.46.060407.145222').

    Returns:
        Tuple ``(arxiv, doi)``. ``arxiv`` is the text after 'arXiv:' of an entry that
        starts with 'arXiv:NNNN.NNNN'; ``doi`` is an entry containing '10.1146/annurev'.
        Either is "" when no entry matches; when several match, the last one wins.
    """
    arxiv_prefix = re.compile(r'arXiv:\d{4}\.\d{4}')
    found_arxiv = ""
    found_doi = ""

    for identifier in idlist:
        # An entry is classified as arXiv first; only non-arXiv entries are tested as DOIs.
        if arxiv_prefix.match(identifier) is not None:
            found_arxiv = identifier.split('arXiv:')[1]
            continue
        if '10.1146/annurev' in identifier:
            found_doi = identifier

    return found_arxiv, found_doi

In [54]:
def format_reviews(json_docs, cutoff = 2000):
    """Turn ADS result documents into review records that have both an arXiv id and a DOI.

    Args:
        json_docs: iterable of dicts with 'year', 'identifier' and 'title' keys.
        cutoff: only documents whose year is strictly greater than this are kept.

    Returns:
        List of dicts with keys 'title' (first element of the document's title list),
        'id' (arXiv id) and 'url' (Annual Reviews URL built from the DOI).
    """
    base_url = "https://www.annualreviews.org/content/journals/"
    selected = []

    for doc in json_docs:
        if int(doc['year']) <= cutoff:
            continue

        arxiv_id, doi = pull_arxiv_and_doi(doc['identifier'])
        # Both identifiers are required; skip the document if either is missing.
        if doi == "" or arxiv_id == "":
            continue

        selected.append({'title': doc['title'][0], "id": arxiv_id, 'url': base_url + doi})

    return selected

In [55]:
reviews = format_reviews(json_results['response']['docs'])

In [190]:
review_list = ["""{ "title": "Circumbinary Accretion: From Binary Stars to Massive Binary Black Holes",
                "id": "2211.00028",
                "url": "https://www.annualreviews.org/content/journals/10.1146/annurev-astro-052622-022933"} """,
                """{ "title": "Hydrodynamical Simulations of the Galaxy Population: Enduring Successes and Outstanding Challenges",
                "id": "2309.17075",
                "url": "https://www.annualreviews.org/content/journals/10.1146/annurev-astro-041923-043618"}"""]

In [191]:
# Parse each hand-written JSON snippet in `review_list` into a dict.
# NOTE: when the notebook is run top to bottom this rebinds `reviews`, replacing the list
# built earlier by format_reviews(...) with just these entries.
reviews = [json.loads(text) for text in review_list]

In [322]:
!ls

araa.json             [34mcomputer-vision[m[m       [34mraptor[m[m
araa_evals.ipynb      [34mir-tutorial-notebooks[m[m
[34margubot[m[m               [34mjhu-mt-hw[m[m


In [8]:
# Load the Anthropic API key from a YAML config file so the key itself is not stored in the notebook.
# NOTE(review): the path is absolute and machine-specific, so this cell only works on the
# original author's machine; consider a configurable or relative path.
with open("/users/christineye/retrieval/config.yaml", 'r') as stream:
    api_key = yaml.safe_load(stream)['anthropic_api_key']

In [9]:
client = anthropic.Anthropic(api_key = api_key)

def generate_query(text):
    """Ask Claude for a single research question that the given paragraph answers.

    Args:
        text: paragraph text, sent as the only user message.

    Returns:
        The text of the first content block of the model's reply.
    """
    system_prompt = """You are an expert astronomer. Given a paragraph from a scientific paper, return a specific research query that the paragraph answers.
                    Formulate the question such that it is focused and concise, but covers all topics in the paragraph. 
                    Be concise and return just the question."""
    user_turn = {"role": "user", "content": [{"type": "text", "text": text}]}

    response = client.messages.create(
        model="claude-3-5-sonnet-20240620",
        max_tokens=100,
        temperature=0,
        system=system_prompt,
        messages=[user_turn],
    )

    first_block = response.content[0]
    return first_block.text

In [24]:
generate_query(review["text"])

'What do numerical simulations reveal about the formation and evolution of circumbinary accretion disks around binary massive black holes following galaxy mergers?'

In [10]:
import requests

def download_html(url):
    """Fetch `url` with browser-like request headers and return the page's text content.

    Args:
        url: address of the page to download.

    Returns:
        The text extracted from the HTML by BeautifulSoup's ``get_text()``.

    Raises:
        urllib.error.URLError: (including HTTPError) if the request fails.
    """
    # The notebook never binds the name `urllib`, so the original body raised NameError;
    # import the submodule locally to keep this function self-contained.
    import urllib.request

    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')

    # Use the response as a context manager so the connection is always closed.
    with urllib.request.urlopen(req) as response:
        raw_html = response.read()

    soup = BeautifulSoup(raw_html, features="html.parser")

    return soup.get_text()

In [11]:
nest_asyncio.apply()

async def fetch_page_content(url):
    """Render `url` in Chromium via Playwright and return the text of its <p> elements.

    Args:
        url: page to open.

    Returns:
        List of strings, one per <p> element of the rendered page, each passed
        through markdownify (`md`).

    Any exception raised while navigating or waiting for the page propagates to the
    caller; the browser is closed first.
    """
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=False)  # Set headless=True if you don't need a browser UI
        try:
            context = await browser.new_context()
            page = await context.new_page()

            await page.goto(url)
            await page.wait_for_load_state('networkidle')

            content = await page.content()
        finally:
            # Close the browser even if navigation or the idle wait raised, so a failed
            # page does not leave a Chromium window/process behind.
            await browser.close()

    # Parsing needs only the captured HTML, so do it after the browser is released.
    soup = BeautifulSoup(content, 'html.parser')

    paragraphs = [md(p.text) for p in soup.find_all('p')]

    return paragraphs

async def fetch_multiple_pages(urls):
    """Scrape all `urls` concurrently with fetch_page_content.

    Returns:
        List of per-page paragraph lists, in the same order as `urls`.
    """
    pending = []
    for url in urls:
        pending.append(fetch_page_content(url))

    pages = await asyncio.gather(*pending)
    return pages

def get_page_contents(reviews):
    """Synchronously scrape the page behind each review's 'url' entry.

    Args:
        reviews: iterable of dicts that each have a "url" key.

    Returns:
        Whatever fetch_multiple_pages returns for those URLs (one entry per review).
    """
    urls = []
    for review in reviews:
        urls.append(review["url"])

    scrape_job = fetch_multiple_pages(urls)
    return asyncio.run(scrape_job)

In [12]:
def scrape_citations(text):
    """Find author-year style citations in `text` using a fixed set of regex patterns.

    Args:
        text: paragraph text to scan.

    Returns:
        List of tuples of captured groups, e.g. ('Salim', '2007') or
        ('Haardt', 'Madau', '2012'); the last element is always the year. Matches are
        grouped by pattern, in pattern order. The patterns overlap, so one citation can
        appear more than once (e.g. "Haardt & Madau 2012" also yields ('Madau', '2012')).
    """
    # Raw strings: the original non-raw literals relied on invalid escape sequences
    # (\s, \d, \., \(), which Python warns about and will eventually reject.
    patterns = [r'([A-Z][a-z´]+)\s+(\d{4})', # Name Year
                r'([A-Z][a-z´]+) et al\. (\d{4})', # Name et al. Year
                r'([A-Z][a-z´]+) et al\. \((\d{4})\)', # Name et al. (Year)
                r'([A-Z][a-z´]+) & ([A-Z][a-z]+) (\d{4})', # Name & Name Year
                r'([A-Z][a-z´]+) & ([A-Z][a-z]+) \((\d{4})\)', # Name & Name (Year)
                r'([A-Z][a-z´]+),\s+([A-Z][a-z]+) & ([A-Z][a-z]+) (\d{4})'] # Name, Name & Name Year

    citations = []
    for pattern in patterns:
        citations.extend(re.findall(pattern, text))

    return citations

In [13]:
def citation_density(content, k, mode = "topk", maxn = 12):
    """Select paragraph indices by how many citations each paragraph contains.

    Args:
        content: sequence of paragraph strings.
        k: in "topk" mode, how many of the most-cited paragraphs to keep; in
            "threshold" mode, the minimum citation count (inclusive).
        mode: "topk" or "threshold".
        maxn: exclusive upper bound on the citation count; used in "threshold" mode only.

    Returns:
        NumPy array of selected paragraph indices, sorted ascending.

    Raises:
        ValueError: if `mode` is not "topk" or "threshold" (previously this surfaced as
            an UnboundLocalError on `indices`).
    """
    if mode not in ("topk", "threshold"):
        raise ValueError(f"mode must be 'topk' or 'threshold', got {mode!r}")

    num_citations = np.array([len(scrape_citations(p)) for p in content])

    if mode == "topk":
        # argsort is ascending, so flip it to take the k largest counts.
        indices = np.flip(np.argsort(num_citations))[:k]
    else:
        mask = np.logical_and(num_citations >= k, num_citations < maxn)
        indices = np.arange(len(content))[mask]

    return np.sort(indices)

In [14]:
def get_best_paragraphs(content, k, mode = "threshold"):
    """Render the paragraphs selected by citation_density as one indexed text block.

    Args:
        content: sequence of paragraph strings.
        k: forwarded to citation_density.
        mode: forwarded to citation_density.

    Returns:
        A string with one "<index>: <paragraph>" entry per selected paragraph, each
        followed by a blank line.
    """
    selected = citation_density(content, k, mode)
    entries = [str(index) + ": " + content[index] + "\n\n" for index in selected]
    return "".join(entries)

In [137]:
def claude_paragraphs(paper):
    """Ask Claude to write a question per paragraph and return the 3 best pairs.

    Args:
        paper: text containing the candidate paragraphs, sent as the only user message.

    Returns:
        The raw message object returned by ``client.messages.create``; the prompt asks
        for lines of the form "(index, question)".
    """
    # Idea not enabled in the prompt: also return the 3 paragraphs and corresponding
    # questions that are least on-topic and related to the research question.
    instructions = """You are an expert astronomer. Given this list of paragraphs from a scientific paper, generate a focused research question for each paragraph.
                Formulate the question such that it is focused and concise, but covers all topics in the paragraph. 
                Then assess which paragraphs are most on-topic and closely related to their research question.
                If the question has multiple sub-questions, a good and focused paragraph shoudl address all of them.
                Return the 3 best question-paragraph pairs in this format: (index, question).
                Do not include any text before or after each (index, question), including any introduction or rationale."""
    user_turn = {"role": "user", "content": [{"type": "text", "text": paper}]}

    reply = client.messages.create(
        model="claude-3-5-sonnet-20240620",
        max_tokens=500,
        temperature=0,
        system=instructions,
        messages=[user_turn],
    )
    return reply

In [30]:
len(reviews) # only after 2000; only with arXiv

161

In [75]:
batch_size = 5
# Index of the first batch to scrape. 29 resumes the run shown in this cell's output;
# set to 0 to scrape everything from the start.
start_batch = 29

# Keep results accumulated by earlier (possibly interrupted) runs of this cell, but make
# the cell work on a fresh kernel where the list does not exist yet.
try:
    reviews_with_text
except NameError:
    reviews_with_text = []

num_batches = -(-len(reviews) // batch_size)  # ceiling division: no empty trailing batch

for i in range(start_batch, num_batches):
    print(i)
    offset = i * batch_size
    # Take the full batch; the previous `batch_size - 1` upper bound silently skipped
    # every fifth review.
    batch = reviews[offset : offset + batch_size]
    content = get_page_contents(batch)

    for j, paper in enumerate(content):
        # Skip pages with no paragraphs (paper[-1] would raise) and pages whose last
        # paragraph is the subscription notice.
        if paper and "institutional or personal subscription" not in paper[-1]:
            review = reviews[offset + j].copy()
            review['text'] = paper
            reviews_with_text.append(review)

29


Task exception was never retrieved
future: <Task finished name='Task-2622' coro=<Connection.run() done, defined at /Users/christineye/anaconda3/lib/python3.9/site-packages/playwright/_impl/_connection.py:265> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "/Users/christineye/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/kg/0g3xrys53nd6r_vs_7tph9y00000gn/T/ipykernel_4675/2564133765.py", line 7, in <module>
    content = get_page_contents(batch)
  File "/var/folders/kg/0g3xrys53nd6r_vs_7tph9y00000gn/T/ipykernel_4675/4225371177.py", line 28, in get_page_contents
    return asyncio.run(fetch_multiple_pages(urls))
  File "/Users/christineye/anaconda3/lib/python3.9/site-packages/nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
  File "/Users/christineye/anaconda3/lib/python3.9/site-packages/nest_asyncio.py", line 92, 

30
31
32


In [78]:
with open('araa_papers.json', 'w') as f:
    json.dump(reviews_with_text, f)

In [37]:
reviews[:10]

[{'title': 'The Chemical Composition of the Sun',
  'id': '0909.0948',
  'url': 'https://www.annualreviews.org/content/journals/10.1146/annurev.astro.46.060407.145222'},
 {'title': 'Coevolution (Or Not) of Supermassive Black Holes and Host Galaxies',
  'id': '1304.7762',
  'url': 'https://www.annualreviews.org/content/journals/10.1146/annurev-astro-082708-101811'},
 {'title': 'Cosmic Star-Formation History',
  'id': '1403.0007',
  'url': 'https://www.annualreviews.org/content/journals/10.1146/annurev-astro-081811-125615'},
 {'title': 'Star Formation in the Milky Way and Nearby Galaxies',
  'id': '1204.3552',
  'url': 'https://www.annualreviews.org/content/journals/10.1146/annurev-astro-081811-125610'},
 {'title': 'The Circumgalactic Medium',
  'id': '1709.09180',
  'url': 'https://www.annualreviews.org/content/journals/10.1146/annurev-astro-091916-055240'},
 {'title': 'Observational Evidence of Active Galactic Nuclei Feedback',
  'id': '1204.4114',
  'url': 'https://www.annualreviews.o

In [236]:
message = claude_paragraphs(get_best_paragraphs(contents[1], 5))

In [245]:
message.content[0].text.split('\n\n')[1]

'(9, How have cosmological simulations of galaxy formation improved over time in reproducing observed galaxy properties, and what key challenges remain?)'

In [238]:
print(message.content[0].text)

Here are the 3 best question-paragraph pairs based on relevance and focus:

(9, How have cosmological simulations of galaxy formation improved over time in reproducing observed galaxy properties, and what key challenges remain?)

(55, How do cosmological simulations model the multiphase structure of the interstellar medium given resolution limitations?)

(151, What are the main limitations of current state-of-the-art cosmological simulations of galaxy formation, and what key physical processes are still neglected?)


In [164]:
papers[1]

{'title': 'Theory of Star Formation',
 'id': '0707.3514',
 'url': 'https://www.annualreviews.org/content/journals/10.1146/annurev.astro.45.051806.110602',
 'text': ['We use cookies to track usage and preferences.I Understand',
  'We review current understanding of star formation, outlining an overall theoretical framework and the observations that motivate it. A conception of star formation has emerged in which turbulence plays a dual role, both creating overdensities to initiate gravitational contraction or collapse, and countering the effects of gravity in these overdense regions. The key dynamical processes involved in star formation—turbulence, magnetic fields, and self-gravity—are highly nonlinear and multidimensional. Physical arguments are used to identify and explain the features and scalings involved in star formation, and results from numerical simulations are used to quantify these effects. We divide star formation into large-scale and small-scale regimes and review each in 

In [None]:
# ask it to consider time?

In [251]:
def _parse_question_pairs(response_text):
    """Extract (index, question) tuples from lines shaped like '(12, Some question?)'."""
    # One pair per line; anything else (an introduction, blank lines, stray prose with
    # parentheses) is ignored instead of crashing int().
    pair_pattern = re.compile(r'^[ \t]*\((\d+),[ \t]*(.*?)[ \t]*\)[ \t]*$', re.MULTILINE)
    return [(int(index), question) for index, question in pair_pattern.findall(response_text)]


def process_paragraphs(content):
    """Pick citation-dense paragraphs, have Claude write questions, and pair them up.

    Args:
        content: list of paragraph strings for one paper.

    Returns:
        List of dicts with keys 'question' (whitespace-stripped question text),
        'paragraph' (the paragraph at the index Claude returned) and 'citations'
        (output of scrape_citations for that paragraph). Pairs whose index is outside
        `content` are skipped.
    """
    paragraphs = get_best_paragraphs(content, 5)
    message = claude_paragraphs(paragraphs)
    response_text = message.content[0].text

    results = []
    for index, question in _parse_question_pairs(response_text):
        # Guard against the model returning an index that does not exist in this paper.
        if index >= len(content):
            continue
        paragraph = content[index]
        results.append({'question': question, 'paragraph': paragraph, 'citations': scrape_citations(paragraph)})

    return results

In [145]:
len(set(scrape_citations(papers[0]['text'][93])))

9

In [59]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [60]:
import multi_paper

In [79]:
papers = multi_paper.load_papers(from_file = False)

In [157]:
print('Number of papers:', len(papers))

# Run multi_paper.process_paper on the first five papers only.
test_papers = papers[:5]
# One entry per paper, holding whatever process_paper returned for it.
query_pairs = []
for paper in test_papers:
    query_pairs.append(multi_paper.process_paper(paper))

Number of papers: 86


In [158]:
query_pairs

[[{'title': 'Cosmic Star-Formation History',
   'id': '1403.0007',
   'question': ' What is the most appropriate FUV conversion factor for estimating star formation rates across cosmic time, and how does it compare to commonly used values in the literature?',
   'paragraph': 'In this review, we adopt a constant FUV conversion factor \n\n year−1 erg−1 s Hz (we typically leave out the units) as a compromise value based on the evolutionary scenario from \nFigure 3\n. The widely used value from Kennicutt (1998) (and based on the calibration by Madau et al. 1998b), \n\n, is 20% larger than our calibration. Other recent analyses based on the GALAXEV libraries have also found lower mean conversion factors, for both low- and high-redshift galaxy populations (e.g., Salim et al. 2007, Haardt & Madau 2012). The FUV conversion tabulated in Kennicutt & Evans (2012) (from Murphy et al. 2011), if rescaled from the Kroupa to Salpeter IMF, is very close to the z = 0 solar metallicity value of \n\n in \

In [159]:
# Write every question/paragraph entry to disk, pretty-printing all fields except
# 'citations', which is kept on a single compact line.
# NOTE: the entries are written one after another, separated by newlines, so the file is
# a concatenation of JSON objects rather than a single JSON document.
with open('../data/multi_paper_examples.json', 'w') as json_file:
    for paper in query_pairs:
        for entry in paper:
            # Indented JSON for everything but the citations.
            partial_json = json.dumps({k: v for k, v in entry.items() if k != 'citations'}, indent=2)
            # Compact (no-whitespace) JSON for the citation list.
            citations_json = json.dumps(list(entry['citations']), separators=(',', ':'))
            # Strip the closing brace of the indented object and splice the citations in before re-closing it.
            combined_json = partial_json.rstrip('}') + ',"citations": ' + citations_json + '\n}'
            json_file.write(combined_json)
            json_file.write('\n')

In [160]:
papers[0]

{'title': 'Cosmic Star-Formation History',
 'id': '1403.0007',
 'url': 'https://www.annualreviews.org/content/journals/10.1146/annurev-astro-081811-125615',
 'text': ['We use cookies to track usage and preferences.I Understand',
  "Over the past two decades, an avalanche of new data from multiwavelength imaging and spectroscopic surveys has revolutionized our view of galaxy formation and evolution. Here we review the range of complementary techniques and theoretical tools that allow astronomers to map the cosmic history of star formation, heavy element production, and reionization of the Universe from the cosmic “dark ages” to the present epoch. A consistent picture is emerging, whereby the star-formation rate density peaked approximately 3.5 Gyr after the Big Bang, at z≈1.9, and declined exponentially at later times, with an e-folding timescale of 3.9 Gyr. Half of the stellar mass observed today was formed before a redshift z = 1.3. About 25% formed before the peak of the cosmic star-

In [312]:
reviews_with_text = multi_paper.scrape_all_papers(reviews[10:11])

100%|█████████████████████████████████████████████| 1/1 [00:40<00:00, 40.46s/it]


In [605]:
reviews_with_text[0]['text']

['We use cookies to track usage and preferences.I Understand',
 'We describe ongoing searches for intermediate-mass black holes with MBH ≈ 10–105 M⊙. We review a range of search mechanisms, both dynamical and those that rely on accretion signatures. We find the following conclusions: \n',
 '',
 'Article metrics loading...',
 'We describe ongoing searches for intermediate-mass black holes with M\nBH ≈ 10–105 M⊙. We review a range of search mechanisms, both dynamical and those that rely on accretion signatures. We find the following conclusions: \n\n',
 '',
 'Intermediate-mass black holes (IMBHs) are often introduced as what they are not. They are not stellar-mass black holes, which are formed in the deaths of massive stars, and are historically thought to be ∼10 M⊙ (Remillard & McClintock 2006). They are not supermassive black holes, which are historically considered to have masses of 106–1010 M⊙. The question is often framed thus: Are there black holes with masses between these two cla

In [614]:
citations = multi_paper.scrape_citations(reviews_with_text[0]['text'][9])

In [615]:
citations

[('Metzger', '2016'),
 ('Velzen', '2018'),
 ('Amaro-Seoane', '2015'),
 ('MacLeod', '2016'),
 ('Eracleous', '2019'),
 ('Maksym', '2013'),
 ('Wevers', '2017'),
 ('Stone', 'Metzger', '2016'),
 ('MacLeod', '2016')]

In [473]:
def link_to_arxiv(review, citations):
    """Match scraped citation tuples to entries of the review's bibliography.

    Args:
        review: dict with a 'fullbib' list; each reference is a dict with 'year',
            'surnames' and 'collab' keys.
        citations: iterable of hashable tuples ``(name, ..., year)`` such as those
            produced by scrape_citations.

    Returns:
        List of matching reference dicts, each at most once, ordered by the first
        citation that matched them (and by 'fullbib' order within a citation).
        A reference matches when its year equals the citation's year and either all
        cited names are among its surnames or the first cited name equals its 'collab'.
    """
    cited_refs = []
    # dict.fromkeys de-duplicates citations while keeping first-seen order; iterating a
    # set made the result order vary between runs.
    for citation in dict.fromkeys(citations):
        year = citation[-1]
        names = set(citation[:-1])
        for ref in review['fullbib']:
            if ref['year'] != year:
                continue
            if names.issubset(ref['surnames']) or citation[0] == ref['collab']:
                # Overlapping citations (('Metzger', '2016') and ('Stone', 'Metzger', '2016'))
                # hit the same reference; record it only once.
                if not any(ref is seen for seen in cited_refs):
                    cited_refs.append(ref)

    return cited_refs

In [617]:
cited_refs = link_to_arxiv(reviews_with_text[0], citations)

In [350]:
import arxiv
arxiv_client = arxiv.Client()

In [686]:
def search_arxiv(ref):
    """Look up a bibliography reference on arXiv by its author surnames.

    Args:
        ref: dict with 'surnames' (ordered list of author surnames) and 'year'
            (string or int convertible with int()).

    Returns:
        The last path component of the matching result's ``entry_id``, or None when the
        reference has no surnames or none of the first 10 results qualifies. A result
        qualifies when it was published within 2 years of the reference year and each
        surname occurs in the name of the author at the same position.
    """
    surnames = ref['surnames']
    if not surnames:
        # An empty query would be sent and any result in the year window accepted,
        # because there are no names to check against.
        return None

    terms = []
    for surname in surnames:
        cleaned = surname.replace("'", "")
        # Hyphenated surnames are sent without the "au:" field prefix
        # (presumably the author field handles hyphens poorly — TODO confirm).
        terms.append(cleaned if '-' in surname else "au:" + cleaned)
    query = " AND ".join(terms)
    # Adding ref['year'] to the query was tried and did not work well.

    search = arxiv.Search(query = query, max_results = 10)
    ref_year = int(ref['year'])

    for r in arxiv_client.results(search):
        if abs(r.published.year - ref_year) > 2:
            continue
        # A result with fewer authors than the reference cannot match position by
        # position (indexing r.authors[i] used to raise IndexError here).
        if len(r.authors) < len(surnames):
            continue
        if all(surname in author.name for surname, author in zip(surnames, r.authors)):
            return r.entry_id.split('/')[-1]

    return None

In [687]:
cited_refs[5]

{'surnames': ['Amaro-Seoane', 'Gair', 'Pound', 'Hughes', 'Sopuerta'],
 'given_names': ['P', 'JR', 'A', 'SA', 'CF'],
 'year': '2015',
 'source': 'J. Phys. Conf. Ser.',
 'collab': None}

In [688]:
search_arxiv(cited_refs[2])

'http://arxiv.org/abs/1307.6556v1'

In [674]:
for r in arxiv_client.results(arxiv.Search(query="Amaro-Seoane AND au:Gair AND au:Pound AND au:Hughes AND au:Sopuerta")):
    print(r)

http://arxiv.org/abs/1410.0958v1


In [None]:
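# TODO: handle special characters in author surnames when building arXiv queries
# TODO: handle dashes (hyphenated surnames) — presumably relates to the au: prefix handling in search_arxiv; confirm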