## paper ingestion
Given a link, I want to ingest the title, abstract, DOI, and the DOIs of the papers it cites (as well as the papers those papers cite). How many levels down I go in the graph will depend on how stringent my filtering function is.  

Currently using [this](https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data/operation/get_graph_paper_bulk_search) API from Semantic Scholar and storing data locally as JSON. In the future, I will aim to use SQLite for persistence instead.



In [2]:

# DOI-form identifier kept for manual experiments (create_url needs is_doi=True for it);
# not referenced by any other code visible in this notebook.
test_doi = '10.1371/journal.pone.0286404'
# paperId used as the seed node by the cells further down
# NOTE(review): presumably the same paper as test_doi -- confirm.
test_paper_id = 'c951ac9a54bba70c6e8337ab5815f3ac45434ec5'

def create_url(id, is_doi=False, check_citations=False):
    """Build the Semantic Scholar Graph API URL for one paper.

    id: the paper identifier (a paperId, or a DOI when ``is_doi`` is true).
    is_doi: when true, the identifier is prefixed with ``DOI:``.
    check_citations: when true, ``citations`` is appended to the requested fields.

    Returns the full request URL as a string.
    """
    base = "https://api.semanticscholar.org/graph/v1/paper/"
    query = '?fields=title,abstract,url,year,authors,references'
    query = (query + ',citations') if check_citations else query
    paper_ref = ("DOI:" + id) if is_doi else id
    return base + paper_ref + query

In [3]:
import requests

# calling the semantic scholar api
def api_call(api_endpoint):
    """GET ``api_endpoint`` and return the decoded JSON body.

    Any status other than 200 prints the endpoint and returns None.
    Exceptions raised by ``requests`` itself are not caught here.
    """
    reply = requests.get(api_endpoint)

    # Guard clause: anything but a plain 200 counts as a failure
    if reply.status_code != 200:
        print(api_endpoint)
        return None

    return reply.json()


In [22]:
import time

# a lot of error handling, hasn't broken so far
def try_api_call(api_endpoint, max_retries=3, retry_wait=20):
    """GET ``api_endpoint`` with rate-limit back-off; return the JSON body or None.

    api_endpoint: full request URL.
    max_retries: how many times to re-issue the request after an HTTP 429.
    retry_wait: seconds to sleep before each retry.

    Returns the decoded JSON on success. Any HTTP error status (including a
    429 that persists after the last retry) or any other exception is printed
    together with the endpoint and None is returned; nothing is raised.
    """
    try:
        response = requests.get(api_endpoint)
        retries_left = max_retries
        # Retry in place rather than delegating to api_call, so that a second
        # 429 is handled the same way as the first one.
        while response.status_code == 429 and retries_left > 0:
            print("Rate limit reached. Waiting before retrying...")
            time.sleep(retry_wait)
            response = requests.get(api_endpoint)
            retries_left -= 1
        response.raise_for_status()  # Will raise an HTTPError for other bad status codes
        # NOTE: trimming the author list is currently disabled; the snippet was:
        # if 'authors' in data and len(data['authors']) > 3:
        #         data['authors'] = data['authors'][:3]
        return response.json()
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err} - {api_endpoint}")
        return None
    except Exception as err:
        # Deliberately broad: one bad paper must not abort a long crawl.
        print(f"An error occurred: {err} - {api_endpoint}")
        return None

In [5]:
import json
# pretty print json object
def pprint(data):
    """Print ``data`` as JSON, indented by four spaces with keys sorted."""
    rendered = json.dumps(data, sort_keys=True, indent=4)
    print(rendered)
    

In [19]:
# keywords
# Machine-learning vocabulary. Entries are matched as lowercase substrings,
# so short entries ("model", "learning") also cover the longer phrases.
ml_keywords = [
    "machine learning",
    "regression",
    "neural network",
    "model",
    "projecting",
    "forecast",
    "predict",
    "modeling",
    "classification",
    "clustering",
    "support vector machine",
    "decision tree",
    "random forest",
    "learning",
    "gradient boosting",
    "data mining",
    "natural language processing",
    "computer vision",
    "algorithm",
    "optimization",
]

# Armed-conflict vocabulary. "violen" is a stem so that both "violence" and
# "violent" match the substring test.
conflict_keywords = [
    "armed",
    "civil",
    "battle",
    "conflict",
    "warfare",
    "casualties",
    "combat",
    "violen",
    # Candidate terms, currently disabled:
    # "insurgency",
    # "terrorism",
    # "extremism",
    # "revolution",
    # "violence",
    # "militia",
    # "security",
    # "peacekeeping",
    # "genocide",
    # "massacre",
    # "ceasefire",
    # "rebellion",
    # "humanitarian",
    # "occupation",
]

# Union of both vocabularies, ML terms first.
all_keywords = [*ml_keywords, *conflict_keywords]

In [7]:
def keyword_check(string):
    """Return True if ``string`` contains any entry of ``all_keywords``.

    The match is a case-insensitive substring test. ``None`` or an empty
    string yields False instead of raising, because callers pass titles
    straight from API responses without checking them first.
    """
    if not string:
        return False
    lowered = string.lower()
    return any(keyword in lowered for keyword in all_keywords)

## used to create v0 of the graph
## generally checks for machine learning + conflicts related keywords (both must be present)
def double_keyword_check(string):
    """Return True only if ``string`` matches both keyword lists.

    At least one ``ml_keywords`` entry AND at least one ``conflict_keywords``
    entry must occur as a case-insensitive substring. ``None`` or an empty
    string yields False instead of raising, since abstracts may be missing.
    """
    if not string:
        return False
    lowered = string.lower()
    has_ml = any(keyword.lower() in lowered for keyword in ml_keywords)
    has_conflict = any(keyword.lower() in lowered for keyword in conflict_keywords)
    return has_ml and has_conflict

def paper_is_relevant(json_blob):
    """Return True if the paper's title or abstract passes ``double_keyword_check``.

    json_blob: a paper record (dict) with optional 'title' and 'abstract'.

    The title is tried first; the abstract is consulted only when the title
    does not match. Missing or ``None`` fields count as "no match" rather than
    raising.
    """
    title = json_blob.get('title')
    if title and double_keyword_check(title):
        return True
    abstract = json_blob.get('abstract')
    return bool(abstract) and double_keyword_check(abstract)


In [14]:
# "database": will make this a sqlite later
# key: paperId, value: [api call data object, filtered references, how many times hit]
papers = {}

In [8]:
def filter_refs(paper_refs):
    """Keep only the references whose title passes ``keyword_check``.

    paper_refs: list of reference dicts (each expected to carry a 'title'),
        or None when the API response had no reference list.

    Returns a new list; ``None`` or empty input gives ``[]``. References with
    a missing or empty title are dropped without being keyword-checked.
    """
    if not paper_refs:
        return []
    # first pass at filtering out references that don't pass the general keyword check
    return [
        ref for ref in paper_refs
        if ref.get('title') and keyword_check(ref['title'])
    ]

# pprint(filter_refs(res))

In [9]:
# testing all the helpers thus far
    
def add_to_dict(res):
    """Insert an API paper record into the global ``papers`` store.

    res: paper record as returned by the API, or a falsy value when the call
        failed (in which case nothing happens).

    A new entry is stored as ``[res, filtered_refs, 1]``. The 'references' key
    is removed from ``res`` first, so the stored record does not duplicate the
    reference list. Papers already present are left untouched and a message
    is printed.
    """
    if not res:
        return

    paper_id = res['paperId']
    if paper_id in papers:
        print('paper already in database')
        return

    # A record without a reference list must not crash the filter.
    refs = filter_refs(res.pop('references', None) or [])
    papers[paper_id] = [res, refs, 1]
        


In [10]:
def from_a_node(key):
    """Expand one stored paper: fetch and store its relevant references.

    key: paperId of an entry already present in ``papers``.

    For each filtered reference of the entry: if it is already stored, its hit
    counter is incremented; otherwise it is fetched and added when its abstract
    passes ``double_keyword_check``. References without a paperId and failed
    API calls are skipped. Finally the hit counter of ``key`` itself is
    incremented.
    """
    value = papers[key]
    # iterate through the refs
    for ref in value[1]:
        ref_id = ref.get('paperId')
        if not ref_id:
            # no id means the URL cannot be built
            continue
        if ref_id in papers:
            # if the ref is already in the database, increment the hit count
            papers[ref_id][2] += 1
            continue
        # if the ref is not in the database, fetch it and add it when relevant
        res = api_call(create_url(ref_id))
        # api_call returns None on failure, so check res before indexing it
        if res and res.get('abstract') and double_keyword_check(res['abstract']):
            add_to_dict(res)
    # touched again
    papers[key][2] += 1

In [18]:
# start from an empty in-memory "database"
papers = {}

# add the first node
url = create_url(test_paper_id, False)
res = api_call(url)
# add_to_dict ignores a failed call (res is None), leaving papers empty
add_to_dict(res)

# print(papers)
print(papers)



In [89]:
# no need to run
# expands the seed paper: one API call per filtered reference not yet in `papers`
from_a_node(test_paper_id)

# Define the filename where you want to store the data
filename = 'papers.json'

# Write the dictionary to a file as JSON
with open(filename, 'w') as file:
    json.dump(papers, file, indent=4)

print(f"Dictionary saved to {filename}")

Dictionary saved to papers.json


In [21]:
def save_graph_to_file(graph_data, file_name):
    """Write ``graph_data`` to ``file_name`` as 4-space-indented JSON.

    Values the JSON encoder cannot handle natively are written as ``str(value)``.
    The file is overwritten.
    """
    with open(file_name, 'w') as out:
        json.dump(graph_data, out, default=str, indent=4)
        
def save_queue_to_file(queue, file_name):
    """Write the pending queue to ``file_name`` as 4-space-indented JSON.

    Each queue entry is converted to a list before dumping; values the JSON
    encoder cannot handle natively are written as ``str(value)``.
    """
    with open(file_name, 'w') as out:
        rows = list(map(list, queue))
        json.dump(rows, out, default=str, indent=4)

In [23]:

## defaults to double keyword check (ML + conflicts) as filtering
def generate_reference_network(start_paper_id, max_depth=2, filter_func=double_keyword_check, res_name='graph.json', check_citations=False, queue_name='queue.json'):
    """Crawl the reference (and optionally citation) network around a paper.

    start_paper_id: paperId of the seed paper.
    max_depth: papers deeper than this are not processed.
    filter_func: predicate applied to a child's abstract; only children for
        which it returns true are recorded as relevant and queued.
    res_name: file the graph is backed up to after every processed paper.
    check_citations: also request and follow the papers' citations.
    queue_name: file the pending queue is backed up to.

    Returns a dict mapping paperId -> (paper_info, relevant_child_ids, depth).
    ``paper_info`` has its 'references' (and 'citations') lists removed.

    Children whose API call failed are re-queued with no data and retried once
    when they are dequeued. A paper recovered that way is not added to its
    parent's relevant-children list, because the parent entry was already
    written.
    """
    # local import keeps this cell self-contained
    from collections import deque

    max_children = 100  # cap per list (references / citations) of one paper
    save_interval = 1   # write a backup after this many processed papers

    ## setting up all variables
    graph = {}  # key: paper_id, value: (paper_blob, relevant child ids, depth)
    start_blob = try_api_call(create_url(start_paper_id, check_citations=check_citations))
    queue = deque([(start_paper_id, start_blob, 0)])  # (paper_id, paper_blob or None, depth)
    visited = set()  # Set of visited paper IDs to avoid duplicates
    processed_papers = 0

    while queue:
        current_paper_id, paper_info, current_depth = queue.popleft()

        # Skip before retrying, so no API call is spent on a paper that would
        # be discarded anyway.
        if current_paper_id in visited or current_depth > max_depth:
            continue

        ## retrying failed api calls added to end of queue
        if not paper_info:
            # Retry the paper that was dequeued, not whichever child id the
            # previous iteration happened to leave behind.
            paper_info = try_api_call(create_url(current_paper_id, check_citations=check_citations))
            if not paper_info:
                continue
            # A retried child never went through the relevance filter; apply
            # it now. The seed paper (depth 0) is never filtered.
            if current_depth > 0:
                abstract = paper_info.get('abstract')
                if not (abstract and filter_func(abstract)):
                    continue

        print(f"Processing paper {current_paper_id} at depth {current_depth}")
        visited.add(current_paper_id)

        # Retrieve references
        if 'references' not in paper_info:
            continue
        references = filter_refs(paper_info.pop('references') or [])
        print("------- the num of references are: ", len(references))

        ## sheer off excess refs (slicing copies, so the lists stay independent)
        children = references[:max_children]

        ## retrieve citations, if requested
        if check_citations and 'citations' in paper_info:
            citations = filter_refs(paper_info.pop('citations') or [])
            print("------- the num of citations are: ", len(citations))
            children = children + citations[:max_children]

        print("------- the num of children are: ", len(children))

        ## iterate through references and citations
        relevant_refs = []
        for ref in children:
            ref_id = ref.get('paperId')
            if not ref_id:
                continue
            res = try_api_call(create_url(ref_id, check_citations=check_citations))

            if not res:
                ## failed call: queue it without data so it is retried later
                queue.append((ref_id, None, current_depth + 1))
            elif res.get('abstract') and filter_func(res['abstract']):
                relevant_refs.append(ref_id)
                queue.append((ref_id, res, current_depth + 1))

        ## add processed paper to graph
        graph[current_paper_id] = (paper_info, relevant_refs, current_depth)
        print("------- the rel references are: ", relevant_refs)

        processed_papers += 1

        # Save the graph and queue every `save_interval` papers processed
        if processed_papers % save_interval == 0:
            save_graph_to_file(graph, res_name)
            save_queue_to_file(queue, queue_name)
            print("Saved backup of graph + queue.")

    return graph



In [48]:
## took 23 min 40 sec to get to 129 papers
## rate is about 5 papers per minute

# This will return a graph dictionary with each paper ID as keys and 
# the tuple (paper_info, list of relevant reference IDs, depth) as values.
# reference_network = generate_reference_network(test_paper_id)

## Random Forest Network
Trying this exclusively for papers about RF as a conflict modeling tactic

TODO
- [x] when api gateway time out happens, append them to the back of the queue
- [x] add depth information to the network as part of each node's entry
- [ ] adding functionality to terminate and keep running 
- [x] clip the authors after more than three authors
- [x] if something has too many references or citations, skip it: 25badc676197a70aaf9911865eb03469e402ba57
- [ ] add queue to graph
- [x] dump queue into file

In [24]:
## RANDOM FOREST PAPERS
# first node: 
# 10.1371/journal.pone.0286404

# Tree-ensemble vocabulary, matched as lowercase substrings by filter_trees.
# Short entries also match inside longer words (e.g. "tree" inside
# "decision tree", "boost" inside "adaboost").
rf_keywords = [
    "forest",
    "tree",
    "random",
    "decision tree",
    "regression trees",
    "boost",
    "gradient",
    "bagging",
    "boosting",
    "lightgbm",
    "cart",
    "adaboost",
]

## generally checks for keywords relating to random forest classification
def filter_trees(string):
    """Return True if ``string`` mentions both a tree-method and a conflict keyword.

    Both tests are case-insensitive substring matches, against ``rf_keywords``
    and ``conflict_keywords`` respectively.
    """
    lowered = string.lower()
    # cheap rejection first: without a tree keyword the second scan is pointless
    if not any(kw.lower() in lowered for kw in rf_keywords):
        return False
    return any(kw.lower() in lowered for kw in conflict_keywords)


In [25]:
## running the random forest tree generation 
# title: Comparing Random Forest with Logistic Regression for Predicting Class-Imbalanced Civil War Onset Data
# published: 2016

rf_doi = '10.1093/pan/mpv024'
# look the paper up by DOI (is_doi=True) and also request its citations (check_citations=True)
res = try_api_call(create_url(rf_doi, True, True))

# seed id for the random-forest crawl
# NOTE(review): presumably the paperId returned for rf_doi above -- confirm.
rf_paper_id = '4b6555beef240120bacb699c7d2f7c8e806b5747'

In [26]:
## Actual generation
# time: 6 min 20s for 26 papers

# crawl from rf_paper_id down to depth 3, following citations as well as references;
# filter_trees decides which children are kept, and the graph is backed up to rf_graph.json
rf_network = generate_reference_network(rf_paper_id, max_depth=3, filter_func=filter_trees, res_name='rf_graph.json', check_citations=True)

Processing paper 4b6555beef240120bacb699c7d2f7c8e806b5747 at depth 0
------- the num of references are:  46
------- the num of citations are:  141
------- the num of children are:  146
HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.semanticscholar.org/graph/v1/paper/a4d4053fa12ac75164fe2df0b20a4d3883c292c3?fields=title,abstract,url,year,authors,references,citations - https://api.semanticscholar.org/graph/v1/paper/a4d4053fa12ac75164fe2df0b20a4d3883c292c3?fields=title,abstract,url,year,authors,references,citations
HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.semanticscholar.org/graph/v1/paper/5455087bbbbddae0e76fb85690758aec9fcce69e?fields=title,abstract,url,year,authors,references,citations - https://api.semanticscholar.org/graph/v1/paper/5455087bbbbddae0e76fb85690758aec9fcce69e?fields=title,abstract,url,year,authors,references,citations
------- the rel references are:  ['17c7d2aada72fe59c8d282235b26089d526e44ee', '12e64de7bba4

In [34]:
# try_api_call(create_url("10.1007/978-3-319-55708-3_19", True))
