In [19]:
import json
import pandas as pd
import numpy as np
import aiohttp
import asyncio
from requests import get

In [20]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

async def async_map_get(urls, chunk_size=95, pause=60*5 + 10):
    """GET every URL in rate-limited batches and return the decoded JSON bodies.

    Requests inside one batch run concurrently; consecutive batches are
    separated by ``pause`` seconds. No pause is added after the final batch.

    Args:
        urls: Iterable of URLs to fetch.
        chunk_size: Maximum number of requests issued concurrently per batch.
        pause: Seconds to sleep between two consecutive batches.
            NOTE(review): the defaults (95 requests, 310 s) presumably track
            the API's rate limit -- confirm against the current quota.

    Returns:
        A list holding one decoded JSON payload per URL, in the order of ``urls``.
    """
    urls = list(urls)
    if not urls:
        # Nothing to fetch: avoid opening a session at all.
        return []

    async def _fetch_json(session, url):
        # Decode the body while the session is still open; the context
        # manager releases the connection back to the pool afterwards.
        async with session.get(url) as response:
            return await response.json()

    payloads = []
    async with aiohttp.ClientSession() as session:
        for index, batch in enumerate(chunks(urls, chunk_size)):
            if index:
                # Only wait *between* batches, never after the last one.
                await asyncio.sleep(pause)
            payloads.extend(
                await asyncio.gather(*(_fetch_json(session, url) for url in batch))
            )
    return payloads

def paper_url(paper_id, fields):
    """Build the Semantic Scholar Graph API URL for one paper.

    ``fields`` is an iterable of field names; they are joined with commas
    into the ``fields`` query parameter.
    """
    field_list = ",".join(fields)
    base = "https://api.semanticscholar.org/graph/v1/paper/"
    return base + f"{paper_id}?fields={field_list}"

In [21]:
# Initial (seed) papers: one page of search results for "machine learning".
n_papers = 30   # page size requested from the search endpoint
offset = 1040   # index of the first search result to return
url = f"https://api.semanticscholar.org/graph/v1/paper/search?query=machine+learning&limit={n_papers}&offset={offset}&fields=paperId,title"
response = get(url)
print(response)
# Fail fast on an HTTP error (e.g. rate limiting) instead of hitting a
# confusing KeyError on "data" further down.
response.raise_for_status()
initial_papers = response.json()
ids = np.array([paper["paperId"] for paper in initial_papers["data"]])
print(len(ids))

<Response [200]>
30


array(['bf79c966b293dbc5551de9785a696c099dff355b',
       '86b1a0e7f9f778ba6e5a6c547d70dfebfa45ed95',
       '0228810a988f6b8f06337e14f564e2fd3f6e1056',
       '60e28c7da56eb61dd8ddb710a6f079ef02668014',
       'c4ec5dc7d68d858e141113feca9921c632b3b2d5',
       '56703e0ccba03378962f5006f299cd98d48198f9',
       '3cb54f04765c19e7e0580196c29c64e49f63a744',
       'd3f788ee95e16dac7a2e4d65aa095199bbc3439f',
       '5194b668c67aa83c037e71599a087f63c98eb713',
       '9c7f4412b8f0310a91334aed79b8553b2ad70908',
       'a39398f68ae7e042f2ef5009e31b4e6a20fd5736',
       '6989e13df80edfc6e638e8d8502cb0739d494ca6',
       '89b22325e7d72d11c5bad8f3893d45d0e184fa9b',
       '6965c8e2dcd6ecf2a5f3f9320de5fced37391e42',
       'b6b743de242a2987b3ed0349d90971fdf7ed2faf',
       '05e5a4c51b6df1ffa8d9b76f88ad0d2c92f4627a',
       '5c89852b90a1e9e506d237749c745bf42ac0f737',
       '0f910174d2e19101ca8f008909006e79416821fd',
       'd26a48aff2abc3460c1018d5b410766f698d696c',
       '04f67e55a636b9053ddc30f

In [22]:
# Fetch title, reference list and embedding for each seed paper
# (one blocking request per paper).
urls = [
    paper_url(seed_id, ["title", "references", "embedding"])
    for seed_id in ids
]
initial_paper_data = [get(seed_url).json() for seed_url in urls]

In [23]:
# Build the citation edge list (seed paper -> referenced paper), collect the
# set of every paper id encountered, and persist the edges to disk.
citation_rows = []
all_ids = set()
for paper in initial_paper_data:
    source_id = paper["paperId"]
    all_ids.add(source_id)
    # Guard against a missing/null reference list -- treated as "no references".
    for target in paper.get("references") or []:
        target_id = target["paperId"]
        if target_id:  # references without an id cannot be looked up later
            citation_rows.append([source_id, target_id])
            all_ids.add(target_id)
citations = pd.DataFrame(citation_rows, columns=["source", "target"])
citations.to_csv("citations.csv", index=False)
print(len(all_ids))

1288
1288


In [24]:
urls = [paper_url(id, ["title", "embedding"]) for id in all_ids - {None}]
paper_data = await async_map_get(urls)
paper_data = pd.DataFrame(paper_data)
paper_data["embedding"] = paper_data["embedding"].apply(lambda x: x["vector"])
paper_data.to_csv("paper_data.csv", index=False)