## paper ingestion (historical)

**Note:** this is an edited version of `ingestion.ipynb` that focuses only on the larger, seminal papers of the last few decades. The cutoff for a "seminal" paper is more than 50 citations.

The crawl uses the 2010 Goldstone et al. paper as its starting point; it can be found [here](https://www.semanticscholar.org/paper/A-Global-Model-for-Forecasting-Political-Goldstone-Bates/5642a582b9a9a238e0211ca85e796e9238cd6057).


In [66]:

# DOI used by the cells below to look up the test / starting paper
test_doi = '10.1111/J.1540-5907.2009.00426.X'

def create_url(id, is_doi=False, full_details=True):
    """Build the Semantic Scholar Graph API URL for a single paper.

    id: the paper identifier (a Semantic Scholar paperId, or a DOI when is_doi is True).
    is_doi: when True, the identifier is prefixed with "DOI:".
    full_details: when True, the citations and references fields are requested as well.
    Returns the request URL as a string.
    """
    requested_fields = ['title', 'abstract', 'url', 'year', 'authors', 'referenceCount', 'citationCount']
    if full_details:
        requested_fields += ['citations', 'references']

    paper_ref = "DOI:" + id if is_doi else id
    query = '?fields=' + ','.join(requested_fields)
    return "https://api.semanticscholar.org/graph/v1/paper/" + paper_ref + query

In [29]:
import requests
import time

# calling the semantic scholar api
def api_call(api_endpoint):
    """Send one GET request and return the decoded JSON body.

    Returns the parsed JSON when the status code is 200. For any other status
    the endpoint is printed and None is returned. Exceptions raised by the
    request itself are not caught here.
    """
    reply = requests.get(api_endpoint)

    # Anything other than 200 counts as a failure: log the URL and give up
    if reply.status_code != 200:
        print(api_endpoint)
        return None

    return reply.json()

# a lot of error handling, hasn't broken so far
def try_api_call(api_endpoint, max_retries=1, retry_wait=20, timeout=30):
    """GET an endpoint and return its JSON, retrying on rate limits and never raising.

    api_endpoint: full request URL.
    max_retries: how many extra attempts to make after an HTTP 429 response
        (the default of 1 matches the original single retry).
    retry_wait: seconds to sleep before each retry.
    timeout: seconds to wait on the server before giving up on a request, so a
        stalled connection cannot hang a long crawl indefinitely.

    Returns the decoded JSON on success, or None after printing a message when
    the request fails for any reason (HTTP error status, timeout, bad JSON, ...).
    """
    attempts = max(0, max_retries) + 1
    try:
        for attempt in range(attempts):
            response = requests.get(api_endpoint, timeout=timeout)
            if response.status_code != 429:
                break
            # Only wait when another attempt is still to come
            if attempt < attempts - 1:
                print("Rate limit reached. Waiting before retrying...")
                time.sleep(retry_wait)
        # Raises HTTPError for any 4xx/5xx, including a 429 that outlived the retries
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err} - {api_endpoint}")
        return None
    except Exception as err:
        # Deliberately broad: timeouts, connection errors and JSON decoding
        # problems are all reported the same way and turned into None
        print(f"An error occurred: {err} - {api_endpoint}")
        return None


In [30]:
# Smoke test: fetch the test paper, including its citations and references.
# create_url's flag for this is `full_details`; it has no `check_citations` parameter.
test_url = create_url(test_doi, is_doi=True, full_details=True)
test_json = try_api_call(test_url)

In [26]:
import json
# pretty print json object
def pprint(data):
    """Print data as JSON with a 4-space indent and alphabetically sorted keys."""
    rendered = json.dumps(data, sort_keys=True, indent=4)
    print(rendered)
    

In [31]:
# keywords
# Substrings that mark a text as being about prediction / machine learning
pred_keywords = [
    'machine learning', 'forecast', 'predict',
    'regression', 'neural network', 'model',
    'projecting', 'modeling', 'classification',
    'clustering', 'support vector machine', 'decision tree',
    'random forest', 'learning', 'gradient boosting',
    'natural language processing', 'algorithm', 'optimization',
]

# Substrings that mark a text as being about armed conflict
conflict_keywords = [
    'armed', 'civil', 'battle', 'conflict',
    'warfare', 'casualties', 'combat',
]

# Union of both groups, prediction terms first
all_keywords = [*pred_keywords, *conflict_keywords]

In [32]:
# loose check on title
def keyword_check(string):
    """Return True when the text contains at least one entry of all_keywords.

    The match is a case-insensitive substring test. A missing text (None or
    an empty string) counts as no match instead of raising.
    """
    if not string:
        return False
    string = string.lower()
    return any(keyword in string for keyword in all_keywords)

## generally checks for prediction + conflicts related keywords (both must be present in the abstract)
def double_keyword_check(string):
    """Return True only when the text mentions both a prediction and a conflict keyword.

    Matching is a case-insensitive substring test against pred_keywords and
    conflict_keywords.
    """
    lowered = string.lower()

    def mentions_any(keywords):
        # True as soon as one keyword occurs somewhere in the lowered text
        for keyword in keywords:
            if keyword.lower() in lowered:
                return True
        return False

    has_prediction_term = mentions_any(pred_keywords)
    has_conflict_term = mentions_any(conflict_keywords)
    return has_prediction_term and has_conflict_term

def paper_is_relevant(json_blob):
    """Decide whether a paper blob is relevant based on its title or abstract.

    json_blob: dict for one paper; the 'title' and 'abstract' entries are read.
    Returns True when either text passes double_keyword_check, title first.
    A missing or empty field (other cells in this notebook guard against a
    None abstract) is skipped instead of raising.
    """
    for field in ('title', 'abstract'):
        text = json_blob.get(field)
        if text and double_keyword_check(text):
            return True
    return False


In [35]:
def filter_refs(paper_refs):
    """Keep only the references whose title passes the loose keyword check.

    paper_refs: list of reference dicts, or None. Callers pass the result of
        dict.pop('references', None), so None is treated as "no references".
    Returns a new list; entries without a title are dropped.
    """
    if not paper_refs:
        return []
    # first pass at filtering out references that don't pass the general keyword check
    return [
        ref for ref in paper_refs
        if ref.get('title') and keyword_check(ref['title'])
    ]


In [45]:
# testing all the helpers thus far
# key: paperId, value: [api call data object (minus 'references'), filtered references, hit count]
papers = {}

def add_to_dict(res):
    """Store an API response in the papers dict unless it is already there.

    res: decoded paper blob, or None/empty when the API call failed (then
        nothing happens).
    Side effects: removes the 'references' entry from res, and adds
    [res, filtered references, 1] to papers under the paper's id. A blob
    without a 'references' entry is stored with an empty reference list.
    """
    if not res:
        return

    paper_id = res['paperId']
    if paper_id in papers:
        print('paper already in database')
        return

    # pop() yields None when references were not requested; normalise to a list
    refs = filter_refs(res.pop('references', None) or [])
    papers[paper_id] = [res, refs, 1]
        


In [38]:
# not used i think
def from_a_node(key):
    """Expand one stored paper: count hits on known references and fetch unknown ones.

    key: id of a paper already present in the papers dict.
    For each stored reference, either the hit count of the already-known paper
    is incremented, or the paper is fetched and added when its abstract passes
    double_keyword_check. Finally the hit count of the paper itself is incremented.
    """
    value = papers[key]
    # iterate through the refs
    for ref in value[1]:
        ref_id = ref['paperId']
        if not ref_id:
            # no id to look up; create_url cannot build a URL from None
            continue
        if ref_id in papers:
            # if the ref is already in the database, increment the hit count
            papers[ref_id][2] += 1
        else:
            # if the ref is not in the database, add it
            url = create_url(ref_id)
            res = api_call(url)
            # api_call returns None for any non-200 response, so check res first
            if res and res['abstract'] and double_keyword_check(res['abstract']):
                add_to_dict(res)
    # touched again
    papers[key][2] += 1

In [46]:

# Seed the papers dict with the starting paper, looked up by DOI with full details
url = create_url(test_doi, is_doi=True, full_details=True)
res = try_api_call(url)
add_to_dict(res)

# Show what ended up in the dict
print(papers)



In [47]:
# Target file for the papers dict
filename = 'hist_papers.json'

# Serialise the dict as indented JSON and write it out
with open(filename, 'w') as file:
    file.write(json.dumps(papers, indent=4))

print(f"Dictionary saved to {filename}")

Dictionary saved to hist_papers.json


In [69]:
def _write_json_atomically(payload, file_name):
    """Write payload as indented JSON so that file_name is never left half-written."""
    import os

    tmp_name = f"{os.fspath(file_name)}.tmp"
    try:
        with open(tmp_name, 'w') as file:
            json.dump(payload, file, indent=4, default=str)  # Using default=str to handle non-serializable data
        # Swap the finished file into place in one step
        os.replace(tmp_name, file_name)
    except BaseException:
        # Also covers KeyboardInterrupt: discard the partial temp file and
        # leave the previous backup untouched
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
        raise


def save_dict_to_file(graph_data, file_name):
    """Save a dict to file_name as indented JSON.

    Values json cannot serialise are converted with str(). The file is
    written to a temporary sibling first and then moved into place, so an
    interrupted save does not corrupt an existing backup.
    """
    _write_json_atomically(graph_data, file_name)


def save_queue_to_file(queue, file_name):
    """Save the crawl queue to file_name as a JSON list of lists.

    queue: iterable of tuples; each tuple becomes one inner list.
    Uses the same temp-file-then-replace write as save_dict_to_file.
    """
    listed = [list(tup) for tup in queue]
    _write_json_atomically(listed, file_name)

In [70]:

def _seminal_summary(paper_blob):
    """Pick out the fields that are stored in seminal.json for one paper blob."""
    return {
        "url": paper_blob["url"],
        "title": paper_blob["title"],
        "year": paper_blob["year"],
        "citationCount": paper_blob["citationCount"],
    }


## defaults to double keyword check (ML + conflicts) as filtering
def generate_reference_network(start_paper_id, max_depth=2, filter_func=double_keyword_check, res_name='graph.json',
                               seminal_min_citations=50, max_children_per_kind=100):
    """Breadth-first crawl of the references and citations around a starting paper.

    start_paper_id: Semantic Scholar paperId of the starting paper.
    max_depth: papers deeper than this are not processed.
    filter_func: called with a paper's abstract; only papers for which it
        returns a truthy value are followed.
    res_name: file the graph backup is written to.
    seminal_min_citations: a processed paper with strictly more citations than
        this is recorded as seminal (the starting paper is always recorded).
    max_children_per_kind: at most this many title-filtered references, and
        this many title-filtered citations, are followed per paper.

    Returns the graph dict: paper id -> (paper blob, relevant child ids, depth).
    Returns an empty dict when the starting paper cannot be fetched.
    Side effects: prints progress, and after every processed paper writes the
    graph to res_name, the queue to 'queue.json' and the seminal papers to
    'seminal.json'.
    """
    seminal = {}

    ## setting up all variables
    graph = {}  # key: paper_id, value: (paper_blob, relevant child ids, depth)
    start_blob = try_api_call(create_url(start_paper_id))
    print(f"start_blob is {start_blob}")
    if not start_blob:
        # try_api_call returns None on failure; without the start paper there is nothing to crawl
        print(f"Could not fetch start paper {start_paper_id}; nothing to crawl.")
        return graph

    queue = [(start_paper_id, start_blob, 0)]  # Queue of (paper_id, paper_blob or None, depth)
    seminal[start_paper_id] = _seminal_summary(start_blob)

    visited = set()  # Set of visited paper IDs to avoid duplicates
    save_interval = 1
    processed_papers = 0

    while queue:
        current_paper_id, paper_info, current_depth = queue.pop(0)

        # Skip before any retry so no API call is spent on a paper that is ignored anyway
        if current_paper_id in visited or current_depth > max_depth:
            continue

        ## retrying failed api calls added to end of queue
        if not paper_info:
            paper_info = try_api_call(create_url(current_paper_id))
        if not paper_info:
            continue

        print(f"Processing paper {current_paper_id} at depth {current_depth}")
        visited.add(current_paper_id)

        if (paper_info['citationCount'] or 0) > seminal_min_citations:
            seminal[current_paper_id] = _seminal_summary(paper_info)
            # Single quotes inside the f-string keep this valid before Python 3.12
            print(f"Seminal paper found: {paper_info['title']} with {paper_info['citationCount']} citations")

        # Retrieve references
        if 'references' not in paper_info:
            print('no refs found')
            continue

        references = filter_refs(paper_info.pop('references', None) or [])
        print("------- the num of references are: ", len(references))
        relevant_refs = []

        ## sheer off excess refs
        children = references[:max_children_per_kind]

        ## retrieve citations, if requested
        if 'citations' in paper_info:
            citations = filter_refs(paper_info.pop('citations', None) or [])
            print("------- the num of citations are: ", len(citations))
            # New list instead of += so the references list is not extended in place
            children = children + citations[:max_children_per_kind]

        print("------- the num of children are: ", len(children))

        ## iterate through references and citations
        for ref in children:
            ref_id = ref['paperId']
            if not ref_id:
                continue
            res = try_api_call(create_url(ref_id))

            if not res:
                # failed fetch: queue without a blob so it is retried when popped
                queue.append((ref_id, None, current_depth + 1))
            elif res.get('abstract') and filter_func(res['abstract']):
                relevant_refs.append(ref_id)
                queue.append((ref_id, res, current_depth + 1))

        ## add processed paper to graph
        graph[current_paper_id] = (paper_info, relevant_refs, current_depth)
        print("------- the rel references are: ", relevant_refs)

        processed_papers += 1

        # Save a backup every `save_interval` processed papers
        if processed_papers % save_interval == 0:
            save_dict_to_file(graph, res_name)
            save_queue_to_file(queue, 'queue.json')
            save_dict_to_file(seminal, 'seminal.json')
            print(f"**** Saved backup of graph + queue at {processed_papers} papers.")

    return graph



In [None]:
## notes from random forest run 
## took 23 min 40 sec to get to 129 papers
## rate is about 5 papers per minute

# This will return a graph dictionary with each paper ID as keys and 
# the tuple (paper_info, lists of relevant references) as values.
# reference_network = generate_reference_network(test_paper_id)

## running the analysis 

In [52]:
## despite the name, requires BOTH a prediction keyword and a conflict keyword
def filter_pred(string):
    """Return True when the text contains a prediction keyword and a conflict keyword.

    Matching is a case-insensitive substring test. The conflict keywords are
    only scanned when a prediction keyword was found.
    """
    lowered = string.lower()
    # for speed: stop early when no prediction keyword is present
    if not any(term.lower() in lowered for term in pred_keywords):
        return False
    return any(term.lower() in lowered for term in conflict_keywords)


In [50]:
## setting up the seminal paper run
# starting paper id; the run output reports it as
# "A Global Model for Forecasting Political Instability"
hist_paper_id = '5642a582b9a9a238e0211ca85e796e9238cd6057'

# fetch the test paper by DOI with full details (citations + references)
res = try_api_call(create_url(test_doi, is_doi=True))

In [71]:
## Actual generation
# time: 238m 46.5s
# 461 papers

# Crawl three levels out from the starting paper, following only abstracts
# that pass filter_pred, and back the graph up to hist_papers.json
hist_network = generate_reference_network(
    hist_paper_id,
    max_depth=3,
    filter_func=filter_pred,
    res_name='hist_papers.json',
)

Processing paper 5642a582b9a9a238e0211ca85e796e9238cd6057 at depth 0
Seminal paper found: A Global Model for Forecasting Political Instability with 745 citations
------- the num of references are:  24
------- the num of citations are:  273
------- the num of children are:  124
------- the rel references are:  ['257cb4317665951b77aa52d4510c31c06c11de87', '40a41644bf3682f9153b717e898c0e5364363618', 'b4fdbc2ba5d4278f39538cc1ad87048c3e2b84c6', '612de74da35f0270f7c56267c7313483df78ac6a', 'adf93e726f33066c6d9ac53332274b287878786d', '0a737f9ccd28a2409471388fedac2c07e95ae5c9', 'e3004293f727e6cfaa0b5e3cd918efbf4310a9f6', '2e8fe12635daa9aedab25eba850881bf26412cff', '5aad2cf82e1717ab101786f14571bb318399da64', 'c951ac9a54bba70c6e8337ab5815f3ac45434ec5', '6b61207075681296cb1709ec55b092b5877489b8', '5b5d1e14f01af4e1b77ab69f0b3ebe9cc7ca04f2', 'c096fe94092cbc01d10c412152fcfca8f60284d8', '2f7057d8dad78605481df528a2ffd3891b670e97', '7ba687152ed7ab2d59e44563514b94454dde5ea7', '2814386d119dd12615a3623fb5a

KeyboardInterrupt: 