## Paper ingestion
Given a link to a paper, I want to ingest its title, abstract, DOI, and the DOIs of the papers it cites.

This will likely use [this](https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data/operation/get_graph_paper_bulk_search) API from Semantic Scholar, with SQLite as the database.



In [2]:

# Sample identifiers for trying out the helpers below.
# A DOI; pass it to create_url(..., is_doi=True).
test_doi = '10.1371/journal.pone.0286404'
# A Semantic Scholar paperId; used later in the notebook as the starting paper.
test_paper_id = 'c951ac9a54bba70c6e8337ab5815f3ac45434ec5'

def create_url(id, is_doi=False):
    """Build the Semantic Scholar Graph API URL for a single paper.

    Args:
        id: The paper identifier as a string (a paperId, or a DOI when
            `is_doi` is true).
        is_doi: When true, the identifier is prefixed with "DOI:".

    Returns:
        The paper endpoint URL with the fixed `fields` query string appended.
    """
    base = "https://api.semanticscholar.org/graph/v1/paper/"
    query = '?fields=title,abstract,url,year,authors,references'
    identifier = "DOI:" + id if is_doi else id
    return base + identifier + query

In [3]:
import requests

def api_call(api_endpoint, timeout=30):
    """Call the Semantic Scholar API once and return the decoded JSON body.

    Args:
        api_endpoint: Full URL to request (see `create_url`).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` waits indefinitely, which would stall a crawl
            on a single unresponsive connection.

    Returns:
        The parsed JSON payload when the response status is 200; otherwise
        the endpoint is printed and None is returned.

    Raises:
        requests.exceptions.RequestException: On network-level failures,
            including `Timeout` once `timeout` seconds have elapsed.
    """
    response = requests.get(api_endpoint, timeout=timeout)

    # Anything other than 200 is treated as "no data": log the URL and move on.
    if response.status_code == 200:
        return response.json()

    print(api_endpoint)
    return None


In [25]:
import time

def try_api_call(api_endpoint, max_retries=3, wait_seconds=20, timeout=30):
    """Fetch JSON from the API, backing off and retrying while rate limited.

    A 429 response triggers a pause of `wait_seconds` followed by another
    attempt, up to `max_retries` retries. Each retry is handled the same way
    as the first attempt, so a second 429 no longer drops the paper.

    Args:
        api_endpoint: Full URL to request (see `create_url`).
        max_retries: How many times to retry after a 429 before giving up.
        wait_seconds: Pause between a 429 response and the next attempt.
        timeout: Seconds to wait for each response; prevents one stalled
            connection from hanging the whole crawl.

    Returns:
        The parsed JSON payload, or None when the request failed (HTTP error,
        network error, or still rate limited after all retries). Failures are
        printed rather than raised.
    """
    try:
        for attempt in range(max_retries + 1):
            response = requests.get(api_endpoint, timeout=timeout)
            if response.status_code != 429:
                response.raise_for_status()  # Raises HTTPError for other bad status codes
                return response.json()
            if attempt < max_retries:
                print("Rate limit reached. Waiting before retrying...")
                time.sleep(wait_seconds)  # Back off before the next attempt
        print(f"Rate limit still in effect after {max_retries} retries - {api_endpoint}")
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err} - {api_endpoint}")
    except Exception as err:
        print(f"An error occurred: {err} - {api_endpoint}")
    return None

In [4]:
import json

def pprint(data):
    """Print `data` as JSON text, indented by four spaces with sorted keys."""
    rendered = json.dumps(data, sort_keys=True, indent=4)
    print(rendered)
    

In [5]:
# Keyword vocabularies used by the relevance filters.
# Entries are written in lowercase because the checks lowercase the text
# and then test each keyword as a substring of it.

# Terms that signal a modelling / machine-learning angle.
ml_keywords = [
    'machine learning', 'regression', 'neural network', 'model',
    'projecting', 'forecast', 'predict', 'modeling',
    'classification', 'clustering', 'support vector machine', 'decision tree',
    'random forest', 'learning', 'gradient boosting', 'data mining',
    'natural language processing', 'computer vision', 'algorithm', 'optimization',
]

# Terms that signal an armed-conflict angle.
conflict_keywords = [
    'armed', 'civil war', 'conflict', 'insurgency',
    'terrorism', 'extremism', 'revolution', 'violence',
    'warfare', 'battle', 'combat', 'militia',
    'security', 'peacekeeping', 'genocide', 'massacre',
    'ceasefire', 'rebellion', 'humanitarian', 'occupation',
]

# Union of both vocabularies, ML terms first.
all_keywords = [*ml_keywords, *conflict_keywords]

In [6]:
def keyword_check(string):
    """Return True when `string` contains at least one entry of `all_keywords`.

    The text is lowercased before matching, and each keyword is tested as a
    plain substring.
    """
    lowered = string.lower()
    for keyword in all_keywords:
        if keyword in lowered:
            return True
    return False

def double_keyword_check(string):
    """Return True when `string` mentions both an ML and a conflict keyword.

    Matching is a case-insensitive substring test against `ml_keywords` and
    `conflict_keywords`.

    Args:
        string: Text to inspect. None or an empty string is treated as "no
            match" instead of raising, because the API can return a null
            title or abstract.

    Returns:
        True only if at least one keyword from each list occurs in the text.
    """
    if not string:
        return False
    text = string.lower()
    has_ml_term = any(keyword.lower() in text for keyword in ml_keywords)
    has_conflict_term = any(keyword.lower() in text for keyword in conflict_keywords)
    return has_ml_term and has_conflict_term

def paper_is_relevant(json_blob):
    """Decide whether a paper is relevant from its title, then its abstract.

    Args:
        json_blob: Paper dictionary as returned by the API. The 'title' and
            'abstract' entries may be missing or None; such fields are
            skipped rather than passed to the keyword check.

    Returns:
        True if the title passes `double_keyword_check`, or failing that the
        abstract does; False otherwise.
    """
    # Title first, so the abstract is only examined when needed.
    for field in ('title', 'abstract'):
        text = json_blob.get(field)
        if text and double_keyword_check(text):
            return True
    return False


In [7]:
# In-memory "database"; to be replaced by sqlite later.
# key: paperId
# value: [paper data from the API with 'references' removed,
#         filtered list of references,
#         how many times the paper has been hit]
papers = dict()

In [8]:
def filter_refs(paper_refs):
    """First-pass filter: keep references whose title passes `keyword_check`.

    Args:
        paper_refs: Iterable of reference dictionaries carrying a 'title'
            entry, or None. The callers in this notebook obtain it with
            `.pop('references', None)`, so None must be accepted.

    Returns:
        A new list of the references whose title matches at least one
        keyword. References with a missing or empty title are dropped. An
        empty list is returned for None or empty input.
    """
    if not paper_refs:
        return []
    return [
        ref for ref in paper_refs
        if ref.get('title') and keyword_check(ref['title'])
    ]

# pprint(filter_refs(res))

In [9]:
# testing all the helpers thus far
    
def add_to_dict(res):
    """Store a fetched paper in the global `papers` dictionary.

    The stored value is `[res, filtered_refs, 1]`, where 'references' has
    been popped out of `res` (so `res` is modified in place) and the final
    element is the hit counter.

    Args:
        res: Paper dictionary from the API, or None/empty for a failed call,
            in which case nothing happens. Must contain 'paperId'.
    """
    if not res:
        return

    paper_id = res['paperId']
    if paper_id in papers:
        print('paper already in database')
        return

    # pop() falls back to None when the blob has no 'references' entry;
    # normalise that to an empty list before filtering.
    refs = filter_refs(res.pop('references', None) or [])
    papers[paper_id] = [res, refs, 1]
        


In [13]:
def from_a_node(key):
    """Expand one stored paper by visiting each of its filtered references.

    For every reference of `papers[key]`:
      * if it is already stored, its hit counter is incremented;
      * otherwise it is fetched, and stored via `add_to_dict` when its
        abstract passes `double_keyword_check`.
    Finally the hit counter of `key` itself is incremented.

    Args:
        key: paperId of a paper already present in `papers`.

    Raises:
        KeyError: If `key` is not in `papers`.
    """
    value = papers[key]
    for ref in value[1]:
        ref_id = ref['paperId']
        if not ref_id:
            # A reference without a paperId cannot be looked up.
            continue

        if ref_id in papers:
            # if the ref is already in the database, increment the hit count
            papers[ref_id][2] += 1
            continue

        # if the ref is not in the database, fetch it and add it when relevant
        res = api_call(create_url(ref_id))
        if not res:
            # api_call returns None on a failed request; skip this reference.
            continue
        abstract = res.get('abstract')
        if abstract and double_keyword_check(abstract):
            add_to_dict(res)

    # touched again
    papers[key][2] += 1

In [12]:
# Reset the in-memory store so this cell always starts from an empty database
papers = {}

# add the first node: fetch the start paper by its paperId and store it
url = create_url(test_paper_id, False)
res = api_call(url)
add_to_dict(res)

# Show the store after seeding it with the start paper
print(papers)



In [84]:
# Expand the start paper: visit its filtered references and store the relevant ones
from_a_node(test_paper_id)

{'paperId': '1ee89d8c4546a34238d715c073d248617d0f67ff', 'url': 'https://www.semanticscholar.org/paper/1ee89d8c4546a34238d715c073d248617d0f67ff', 'title': 'Quantifying the influence of climate variability on armed conflict in Africa, 2000–2015', 'abstract': None, 'year': 2022, 'authors': [{'authorId': '2155616795', 'name': 'Qian Wang'}, {'authorId': '145381688', 'name': 'Mengmeng Hao'}, {'authorId': '39172521', 'name': 'D. Helman'}, {'authorId': '11258201', 'name': 'Fangyu Ding'}, {'authorId': '2004912912', 'name': 'Dong Jiang'}, {'authorId': '2111366015', 'name': 'Xiaolan Xie'}, {'authorId': '2110791008', 'name': 'Shuai Chen'}, {'authorId': '2147331267', 'name': 'Tian Ma'}], 'references': [{'paperId': 'f430e7bffb938f921f479993c962b2cf93856589', 'title': 'Security implications of climate change: A decade of scientific progress'}, {'paperId': 'df55a75026085154d0a7cbdf794999939d372b3c', 'title': 'Temperature anomalies affect violent conflicts in African and Middle Eastern warm regions'}, 

In [1]:
# Display the in-memory store; a cell that defines `papers` must have been run first
papers

NameError: name 'papers' is not defined

In [89]:
# Snapshot the in-memory `papers` store to disk.
# Target file; opened in 'w' mode below, so an existing file is overwritten.
filename = 'papers.json'

# Write the dictionary to the file as JSON, indented by four spaces
with open(filename, 'w') as file:
    json.dump(papers, file, indent=4)

print(f"Dictionary saved to {filename}")

Dictionary saved to papers.json


In [20]:
def save_graph_to_file(graph_data, file_name):
    """Write `graph_data` to `file_name` as indented JSON, replacing it atomically.

    The JSON is first written to a sibling temporary file (`<file_name>.tmp`)
    and then moved over the target with `os.replace`. If serialisation fails
    or the run is interrupted mid-write, the previous backup is left intact
    instead of being truncated.

    Args:
        graph_data: JSON-serialisable structure. Values that `json` cannot
            encode are converted with `str` (via `default=str`).
        file_name: Destination path (str or os.PathLike).

    Raises:
        TypeError: If the data contains dictionary keys `json` cannot encode.
        OSError: If the file cannot be written or replaced.
    """
    import os  # local import so this cell does not depend on another cell's imports

    target = os.fspath(file_name)
    temp_path = target + '.tmp'
    try:
        with open(temp_path, 'w') as file:
            json.dump(graph_data, file, indent=4, default=str)  # default=str handles non-serializable values
        os.replace(temp_path, target)
    except BaseException:
        # Leave no half-written temp file behind, then let the error propagate.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise

In [26]:

def generate_reference_network(start_paper_id, max_depth=2, save_interval=3,
                               backup_file='graph_backup.json'):
    """Crawl the citation graph breadth-first from `start_paper_id`.

    Each processed paper has its references filtered by title
    (`filter_refs`), then every remaining reference is fetched and kept as an
    edge when its abstract passes `double_keyword_check`. Kept references are
    queued for processing one level deeper.

    Args:
        start_paper_id: paperId of the root paper.
        max_depth: Papers deeper than this are not processed (root is depth 0).
        save_interval: Write a backup after every this many processed papers.
            A falsy value disables the periodic backups.
        backup_file: Path handed to `save_graph_to_file` for backups.

    Returns:
        dict mapping paperId -> (paper_info, relevant_refs), where
        `paper_info` is the API blob with 'references' removed and
        `relevant_refs` is the list of relevant reference paperIds. Empty if
        the start paper could not be fetched.
    """
    from collections import deque  # local import keeps the cell self-contained

    graph = {}  # key: paper_id, value: (paper_blob, list of relevant reference ids)
    start_blob = try_api_call(create_url(start_paper_id))
    if not start_blob:
        # try_api_call returns None on failure; there is nothing to crawl.
        print(f"Could not fetch start paper {start_paper_id}; returning an empty graph.")
        return graph

    queue = deque([(start_paper_id, start_blob, 0)])  # (paper_id, paper_blob, depth)
    visited = set()  # paper IDs already processed
    is_relevant = {}  # paper_id -> relevance verdict, so each paper is fetched once
    processed_papers = 0
    last_saved = 0

    while queue:
        current_paper_id, paper_info, current_depth = queue.popleft()
        if current_paper_id in visited or current_depth > max_depth:
            continue

        print(f"Processing paper {current_paper_id} at depth {current_depth}")
        visited.add(current_paper_id)

        # 'references' may be absent or null in the blob; treat that as none.
        references = filter_refs(paper_info.pop('references', None) or [])
        relevant_refs = []
        for ref in references:
            ref_id = ref['paperId']
            if not ref_id:
                continue

            if ref_id not in is_relevant:
                res = try_api_call(create_url(ref_id))
                if not res:
                    # Failed fetch: leave unclassified so a later encounter retries.
                    continue
                abstract = res.get('abstract')
                is_relevant[ref_id] = bool(abstract and double_keyword_check(abstract))
                if is_relevant[ref_id]:
                    # Queue only on first sight; BFS makes this the shallowest depth.
                    queue.append((ref_id, res, current_depth + 1))

            if is_relevant[ref_id]:
                relevant_refs.append(ref_id)

        graph[current_paper_id] = (paper_info, relevant_refs)
        processed_papers += 1

        # Periodic backup so a long crawl can be recovered after a failure.
        if save_interval and processed_papers % save_interval == 0:
            save_graph_to_file(graph, backup_file)
            last_saved = processed_papers
            print(f"Saved backup of graph at {processed_papers} papers.")

    # Final backup so papers processed after the last periodic save are not lost.
    if processed_papers != last_saved:
        save_graph_to_file(graph, backup_file)
        print(f"Saved backup of graph at {processed_papers} papers.")

    return graph

# Build the reference graph from the start paper: a dictionary keyed by paper ID whose
# values are (paper_info, relevant_refs) pairs.
reference_network = generate_reference_network(test_paper_id)


Processing paper c951ac9a54bba70c6e8337ab5815f3ac45434ec5 at depth 0
Processing paper 5e2092e569a49b8404e4646eb7bfc8df51bc2fba at depth 1
Processing paper 01028e69dfd57d2738dcb39c057bbf952dbcb0af at depth 1
Saved backup of graph at 3 papers.
Processing paper b8974a3094f93e59507c3b775822f3e819ead4db at depth 1
Processing paper 1e52a479cad4fedd32384aa346233203f9d19bbf at depth 1
Processing paper e3239a1afb502fe194c5fda4675662772a829a1d at depth 1
Saved backup of graph at 6 papers.
Processing paper 2e128cd02947725de76622dc04435b217fe4d56d at depth 1
Processing paper aeac3f12ec7b2a490f1f67dcb6c13bc58faaed83 at depth 1
HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.semanticscholar.org/graph/v1/paper/142505bae9092220e0ae4583a7fa6576e84c2a22?fields=title,abstract,url,year,authors,references - https://api.semanticscholar.org/graph/v1/paper/142505bae9092220e0ae4583a7fa6576e84c2a22?fields=title,abstract,url,year,authors,references
Processing paper 2a31dd2f126cd187cc5

In [None]:
## took 23 min 40 sec to get to 129 papers
## rate is about 5 papers per minute