In [1]:
inbox_papers = [
    "ImageNet Classification with Deep Convolutional Neural Networks",
    "Very Deep Convolutional Networks for Large-Scale Image Recognition",
    "Deep Residual Learning for Image Recognition",
    "Generative Adversarial Networks",
    "Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks",
    "Wasserstein GAN",
    "Improved Training of Wasserstein GANs",
    "Progressive Growing of GANs for Improved Quality, Stability, and Variation",
    "Reducing the Dimensionality of Data with Neural Networks",
    "Auto-Encoding Variational Bayes",
    "Autoencoding beyond pixels using a learned similarity metric",
    "Neural Discrete Representation Learning",
    "Taming Transformers for High-Resolution Image Synthesis",
    "Scaling the Codebook Size of VQGAN to 100,000 with a Utilization Rate of 99%",
    "Language Models are Unsupervised Multitask Learners",
    "OpenELM: An Efficient Language Model Family with Open Training and Inference Framework",
    "LLaMA: Open and Efficient Foundation Language Models",
    "Llama 2: Open foundation and fine-tuned chat models",
    "High-Resolution Image Synthesis with Latent Diffusion Models",
    "Denoising Diffusion Implicit Models",
    "Denoising Diffusion Probabilistic Models",
]


In [2]:
import requests
import sqlite3
from tqdm import tqdm

In [26]:
db = sqlite3.connect("papers.db")
cursor = db.cursor()

with open("schema.sql") as f:
    cursor.executescript(f.read())
    
db.commit()

In [4]:
URL = "https://api.semanticscholar.org/graph/v1/paper"
endpoints = {
    "match_paper": "/search/match",
    "batch": "/batch"
}
def match_paper(query: str) -> dict:
    response = requests.get(URL + endpoints["match_paper"], params={"query": query})
    response.raise_for_status()
    return response.json()

def batch_requests(query: list[str], fields: list[str]) -> list[dict]:
    response = requests.post(URL + endpoints["batch"], json={"ids": query}, params={'fields': ",".join(fields)})
    response.raise_for_status()
    return response.json()

In [5]:
index_ids = []

for paper in tqdm(inbox_papers): # TODO: make async
    response = match_paper(paper)
    index_ids.append(response["data"][0]["paperId"])

index_ids

  0%|          | 0/21 [00:00<?, ?it/s]

100%|██████████| 21/21 [00:21<00:00,  1.02s/it]


['abd1c342495432171beb7ca8fd9551ef13cbd0ff',
 'eb42cf88027de515750f230b23b1a057dc782108',
 '2c03df8b48bf3fa39054345bafabfeff15bfd11d',
 '13bc4e683075bdd6a3f0155241c276a772d4aa06',
 '8388f1be26329fa45e5807e968a641ce170ea078',
 '2f85b7376769473d2bed56f855f115e23d727094',
 'edf73ab12595c6709f646f542a0d2b33eb20a3f4',
 '744fe47157477235032f7bb3777800f9f2f45e52',
 '7c59908c946a4157abc030cdbe2b63d08ba97db3',
 'ef4f5a50837a7c1b3e87b9300ffc7ba00d461a0f',
 'e8b8a7778ace2a02f8db6fe321a54520c6b283ca',
 'f466157848d1a7772fb6d02cdac9a7a5e7ef982e',
 '47f7ec3d0a5e6e83b6768ece35206a94dc81919c',
 '4240fa51b58a07ce8ffd283298b57f171f1584fc',
 '9405cc0d6169988371b2755e573cc28650d14dfe',
 'db229da82d4821446fb14d084ec58da7d964b7ce',
 '57e849d0de13ed5f91d086936296721d4ff75a75',
 '104b0bb1da562d53cbda87aec79ef6a2827d191a',
 'c10075b3746a9f3dd5811970e93c8ca3ad39b39d',
 '014576b866078524286802b1d0e18628520aa886',
 '5c126ae3421f05768d8edd97ecd44b1364e2c99a']

In [27]:
index_details = batch_requests(index_ids,
['paperId','corpusId','title','abstract','url','year','referenceCount','citationCount','influentialCitationCount',
 'references','references.paperId','references.corpusId','references.title','references.abstract','references.url','references.year','references.referenceCount','references.citationCount','references.influentialCitationCount'])

In [34]:
for read_paper in index_details:
    # Check if the read_ already exists
    cursor.execute("SELECT paperId FROM papers WHERE paperId = ?", (read_paper['paperId'],))
    existing_paper = cursor.fetchone()

    if existing_paper:
        # print(f"Paper {read_paper['title']} already exists in the database.")
        # If the paper exists, update its 'read' status to True
        cursor.execute("UPDATE papers SET read = True WHERE paperId = ?", (read_paper['paperId'],))
        # paper_id = existing_paper[0]  # Get the existing paper's ID
    else:
        # print(f"Inserting paper {read_paper['title']} into the database.")
        # If the paper doesn't exist, insert it
        values = (
            read_paper['paperId'], read_paper['corpusId'], read_paper['title'], read_paper['abstract'], read_paper['url'], 
            read_paper['year'], read_paper['referenceCount'], read_paper['citationCount'], read_paper['influentialCitationCount'], True
        )
        cursor.execute("INSERT INTO papers (paperId, corpusId, title, abstract, url, year, referenceCount, citationCount, influentialCitationCount, read) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", values)
        # paper_id = cursor.lastrowid  # Get the new paper's ID

    db.commit()

    for reference in read_paper['references']:
        # Check if the referenced paper already exists
        cursor.execute("SELECT paperId FROM papers WHERE paperId = ?", (reference['paperId'],))
        existing_reference = cursor.fetchone()
        print(existing_reference)
        if existing_reference:
            # print(f"Referenced paper {reference['title']} already exists in the database.")
            pass
        else:
            # print(f"Inserting referenced paper {reference['title']} into the database.")
            # If the referenced paper doesn't exist, insert it with 'read' as False
            values = (
                reference['paperId'], reference['corpusId'], reference['title'], reference['abstract'], 
                reference['url'], reference['year'], reference['referenceCount'], reference['citationCount'], 
                reference['influentialCitationCount'], False
            )
            cursor.execute("INSERT INTO papers (paperId, corpusId, title, abstract, url, year, referenceCount, citationCount, influentialCitationCount, read) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", values)

        # Insert the reference into the paper_references table
        cursor.execute("INSERT INTO paper_references (paper_id, referenced_paper_id) VALUES (?, ?)", (read_paper['paperId'], reference['paperId']))

        db.commit()

('3a4a53fe47036ac89dad070ab87a9d8795b139b1',)
('5bdfd78fb2285b9306e93bd3a4b534d19bf55f06',)
('0060745e006c5f14ec326904119dca19c6545e51',)
('398c296d0cc7f9d180f84969f8937e6d3a413796',)
('eefcc7bcc05436dac9881acb4ff4e4a0b730e175',)
('82b9099ddf092463f497bd48bb112c46ca52c4d1',)
('d54cb764ce7e132eef60b5eced8752fe7d19377c',)
('c43025c429b1fbf6f1379f61801a1b40834d62e7',)
('a538b05ebb01a40323997629e171c91aa28b8e2f',)
('feacb4cf21eaf068197f80b164827db888ddd28d',)
('d46fd54609e09bcd135fd28750003185a5ee4125',)
('1f88427d7aa8225e47f946ac41a0667d7b69ac52',)
('d2c733e34d48784a37d717fe43d9e93277a8c53e',)
('2a73a19dc5564e8fd3d734fe522e85fe979e8401',)
('1e80f755bcbf10479afd2338cec05211fdbd325c',)
('092c275005ae49dc1303214f6d02d134457c7053',)
('688b6fbc3c5c06e254961f70de9d855d3d008d09',)
('a9b861ea10ec2acdebb0ee9bc29e3360cd09d9b6',)
('5a5effa909cdeafaddbbb7855037e02f8e25d632',)
('f354310098e09c1e1dc88758fca36767fd9d084d',)
('ed9db7b20e019cdb1c7db8b7921221ee2d9f36e2',)
('5562a56da3a96dae82add7de705e2bd8

In [29]:
import pprint

In [30]:
# select * papers
cursor.execute("SELECT count(*) FROM paper_references")
result = cursor.fetchall()
pprint.pprint(result)

[(1304,)]


In [51]:
# find the paper with the most references
cursor.execute("""
SELECT
    papers.id,
    papers.title,
    papers.url,
    papers.read,
    COUNT(*) AS reference_count
FROM
    paper_references
JOIN papers ON papers.id = paper_references.referenced_paper_id
GROUP BY
    paper_references.referenced_paper_id,
    papers.title,
    papers.url,
    papers.read
ORDER BY
    reference_count DESC
LIMIT
    5
""")
result = cursor.fetchall()
pprint.pprint(result)


[(229,
  'Auto-Encoding Variational Bayes',
  'https://www.semanticscholar.org/paper/5f5dc5b9a2ba710937e2c413b37b053cd673df02',
  0,
  11),
 (228,
  'Stochastic Backpropagation and Approximate Inference in Deep Generative '
  'Models',
  'https://www.semanticscholar.org/paper/484ad17c926292fbe0d5211540832a8c8a8e958b',
  0,
  8),
 (264,
  'GENERATIVE ADVERSARIAL NETS',
  'https://www.semanticscholar.org/paper/c68796f833a7151f0a63d1d1608dc902b4fdc9b6',
  0,
  8),
 (14,
  'ImageNet: A large-scale hierarchical image database',
  'https://www.semanticscholar.org/paper/d2c733e34d48784a37d717fe43d9e93277a8c53e',
  0,
  7),
 (280,
  'LSUN: Construction of a Large-scale Image Dataset using Deep Learning with '
  'Humans in the Loop',
  'https://www.semanticscholar.org/paper/4dcdae25a5e33682953f0853ee4cf7ca93be58a9',
  0,
  7)]


In [53]:
# papers that cite 229
cursor.execute("""
SELECT
    papers.id,
    papers.title,
    papers.url,
    papers.read
FROM
    paper_references
JOIN papers ON papers.id = paper_references.paper_id
WHERE
    paper_references.referenced_paper_id = 264
""")
result = cursor.fetchall()
pprint.pprint(result)

[(123,
  'Generative adversarial networks',
  'https://www.semanticscholar.org/paper/13bc4e683075bdd6a3f0155241c276a772d4aa06',
  1),
 (209,
  'Unsupervised Representation Learning with Deep Convolutional Generative '
  'Adversarial Networks',
  'https://www.semanticscholar.org/paper/8388f1be26329fa45e5807e968a641ce170ea078',
  1),
 (185,
  'Wasserstein GAN',
  'https://www.semanticscholar.org/paper/2f85b7376769473d2bed56f855f115e23d727094',
  1),
 (178,
  'Improved Training of Wasserstein GANs',
  'https://www.semanticscholar.org/paper/edf73ab12595c6709f646f542a0d2b33eb20a3f4',
  1),
 (439,
  'Autoencoding beyond pixels using a learned similarity metric',
  'https://www.semanticscholar.org/paper/e8b8a7778ace2a02f8db6fe321a54520c6b283ca',
  1),
 (455,
  'Neural Discrete Representation Learning',
  'https://www.semanticscholar.org/paper/f466157848d1a7772fb6d02cdac9a7a5e7ef982e',
  1),
 (881,
  'Denoising Diffusion Implicit Models',
  'https://www.semanticscholar.org/paper/014576b8660785