In [22]:
import pandas as pd
import numpy as np
from requests import get

In [23]:
# Citation table; the `target` column is compared against known paper ids below.
citations = pd.read_csv("data/citations.csv")
# Paper table; the `paperId` column identifies the papers already on disk.
paper_data = pd.read_csv("data/paper_data.csv")

In [24]:
# Sorted, unique citation targets that have no row in paper_data
# (np.setdiff1d returns the unique values of the first array not in the second).
missingpapers = np.setdiff1d(citations["target"], paper_data["paperId"])

def paper_url(paper_id, fields):
    """Build the Semantic Scholar Graph API URL for a single paper.

    Args:
        paper_id: Identifier interpolated into the URL path.
        fields: Field names to request. Either an iterable of strings, or a
            single string (one field name or an already comma-separated list).

    Returns:
        The request URL, with the requested fields comma-joined in the
        ``fields`` query parameter.
    """
    # A bare string is itself iterable: joining it would insert a comma between
    # every character ("title" -> "t,i,t,l,e"), so pass strings through as-is.
    field_list = fields if isinstance(fields, str) else ",".join(fields)
    return f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}?fields={field_list}"

# One request URL per missing paper, asking for its title and embedding.
urls = [
    paper_url(missing_id, ["title", "embedding"])
    for missing_id in missingpapers
]
urls

['https://api.semanticscholar.org/graph/v1/paper/172dbf8c7352bab99cdbfb6ebad04349ac2444f0?fields=title,embedding',
 'https://api.semanticscholar.org/graph/v1/paper/2d1343d550e9c575594485851df8e78941a07511?fields=title,embedding',
 'https://api.semanticscholar.org/graph/v1/paper/5aef733cdfad14196e49443e974d0dbf49a0dd2e?fields=title,embedding']

In [25]:
# Fetch every missing paper. Fail loudly on HTTP errors (rate limiting, unknown
# id) instead of parsing an error payload as if it were a paper record.
results = []
for url in urls:
    response = get(url, timeout=30)  # seconds; avoid hanging forever on a stalled connection
    response.raise_for_status()
    results.append(response.json())

# Naming the columns keeps the frame well-formed even when `urls` is empty
# (otherwise the `embedding` lookup below raises KeyError).
new_df = pd.DataFrame(results, columns=["paperId", "title", "embedding"])

# Unwrap the embedding payload to its "vector" entry. Rows whose embedding is
# missing/null are kept (with None) rather than dropped, so that new_df stays
# positionally aligned with the list of requested ids.
new_df["embedding"] = new_df["embedding"].apply(
    lambda x: x["vector"] if isinstance(x, dict) else None
)
new_df.head()

Unnamed: 0,paperId,title,embedding
0,58a5cf0cb1b243062465500f68321b639f6cd306,NONPARAMETRIC IDENTIFICATION,"[-3.0155277252197266, -2.0252156257629395, -1...."
1,1276d81dd7dbf684d501ea9154661a8babfaf8c9,Operations management.,"[0.37417685985565186, -2.443171739578247, 1.08..."
2,e109a80439cc73611f70315cff2cc7e9b4e34f7d,Gaussian Processes for Global Optimization,"[-3.205744743347168, -0.6856855154037476, 0.63..."


In [26]:
# Append the newly fetched papers to the existing table.
# The API can report a paperId different from the one requested, so a fetched
# paper may already be present in paper_data, or two requested ids may resolve
# to the same paper. Keep the first row per paperId (existing rows win) so the
# saved table does not accumulate duplicates.
combined = (
    pd.concat([paper_data, new_df], ignore_index=True)
    .drop_duplicates(subset="paperId", keep="first")
    .reset_index(drop=True)
)
combined.head()

Unnamed: 0,paperId,title,embedding
0,b1cdd6b0ab88dd50b2d228854bd9de3512785444,Optimizing Energy Management Strategy and Degr...,"[-1.3419313430786133, 2.403834819793701, -3.24..."
1,e2316b446637ac8e7363af4a95e95b49e3178d97,Data driven quantitative trust model for the I...,"[-6.265685558319092, -2.0490870475769043, -2.8..."
2,b1e538dbf538fd9fdf5f5870c5b7416ae08c9882,Differentially Private Federated Learning: A C...,"[-5.616217613220215, -0.3546842336654663, -1.3..."
3,0300e96335922acbdee5b9ace2529c2c2c5726a9,Machine Recognition of Hand Printing,"[1.551882028579712, -4.45127010345459, -0.4198..."
4,6fc4c7a500a90bb23dbd33d3020338ea3f707019,Bayesian optimization for sensor set selection,"[-2.694502830505371, 0.4022102355957031, -1.16..."


In [27]:
# Pair each requested id with the paperId returned for it (positional pairing:
# new_df rows are in the same order as missingpapers).
replacements = {
    requested_id: returned_id
    for requested_id, returned_id in zip(missingpapers, new_df["paperId"])
}
# Rewrite citation targets so they use the returned ids.
new_target = citations["target"].replace(replacements)
citations["target"] = new_target

In [29]:
# Persist both tables. These are the same paths the notebook loads from at the
# top, so the input files are overwritten in place.
citations.to_csv(path_or_buf="data/citations.csv", index=False)
combined.to_csv(path_or_buf="data/paper_data.csv", index=False)