## Paper ingestion
Given a link to a paper, I want to ingest its title, abstract, DOI, and the DOIs of the papers it cites.

I will likely use [this](https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data/operation/get_graph_paper_bulk_search) API from Semantic Scholar to fetch the paper data, and SQLite for the database.



In [11]:

# Sample identifiers used to try out the helpers in this notebook.
test_doi = "10.1371/journal.pone.0286404"
# Semantic Scholar paperId (passed to create_url with is_doi=False).
test_paper_id = "c951ac9a54bba70c6e8337ab5815f3ac45434ec5"

def create_url(id, is_doi):
    """Build the Semantic Scholar Graph API URL for a single-paper lookup.

    Args:
        id: Paper identifier string, either a DOI or a Semantic Scholar
            paper ID.
        is_doi: When truthy, ``id`` is prefixed with ``DOI:`` in the URL.

    Returns:
        The paper endpoint, the (optionally prefixed) identifier, and the
        fixed ``fields`` query string, concatenated into one URL.
    """
    # NOTE: `id` shadows the builtin; the name is kept because it is part of
    # the signature callers see.
    base = "https://api.semanticscholar.org/graph/v1/paper/"
    query = "?fields=title,abstract,url,year,authors,references"
    prefix = "DOI:" if is_doi else ""
    return base + prefix + id + query

In [44]:
import requests

# calling the semantic scholar api
def api_call(api_endpoint, timeout=10):
    """Send a GET request to ``api_endpoint`` and return the decoded JSON body.

    Args:
        api_endpoint: Full URL to request.
        timeout: Seconds to wait on the server before giving up. Defaults
            to 10.

    Returns:
        The parsed JSON payload when the response status is 200. Otherwise
        ``None``, after printing the endpoint that failed. Network-level
        failures (connection errors, timeouts) are reported the same way as
        a non-200 response.
    """
    try:
        # Without an explicit timeout, requests can block indefinitely on a
        # stalled connection.
        response = requests.get(api_endpoint, timeout=timeout)
    except requests.RequestException:
        # Same failure path as a bad status code: report the URL, return None.
        print(api_endpoint)
        return None

    # Check if the request was successful
    if response.status_code == 200:
        return response.json()

    print(api_endpoint)
    return None
    


In [18]:
import json

# pretty print json object
def pprint(data):
    """Print ``data`` as JSON with 4-space indentation and sorted keys.

    Args:
        data: Any JSON-serialisable object.
    """
    rendered = json.dumps(data, sort_keys=True, indent=4)
    print(rendered)
    

In [39]:
# keywords
# Terms that suggest a paper is about machine learning / modelling.
# They are matched as lower-case substrings, so e.g. "model" also hits
# "modeling" and "learning" also hits "machine learning".
ml_keywords = [
    "machine learning",
    "regression",
    "neural network",
    "model",
    "projecting",
    "forecast",
    "predict",
    "modeling",
    "classification",
    "clustering",
    "support vector machine",
    "decision tree",
    "random forest",
    "learning",
    "gradient boosting",
    "data mining",
    "natural language processing",
    "computer vision",
    "algorithm",
    "optimization",
]

# Terms that suggest a paper is about armed conflict.
conflict_keywords = [
    "armed",
    "civil war",
    "conflict",
    "insurgency",
    "terrorism",
    "extremism",
    "revolution",
    "violence",
    "warfare",
    "battle",
    "combat",
    "militia",
    "security",
    "peacekeeping",
    "genocide",
    "massacre",
    "ceasefire",
    "rebellion",
    "humanitarian",
    "occupation",
]

# Combined list: ML terms first, then conflict terms.
all_keywords = [*ml_keywords, *conflict_keywords]

In [38]:
def keyword_check(string):
    """Return True if ``string`` contains any entry of ``all_keywords``.

    The text is lower-cased first; each keyword is then tested as a plain
    substring.
    """
    lowered = string.lower()
    for keyword in all_keywords:
        if keyword in lowered:
            return True
    return False

def double_keyword_check(string):
    """Return True only if ``string`` hits both keyword lists.

    The text must contain at least one entry of ``ml_keywords`` AND at least
    one entry of ``conflict_keywords``. Both the text and the keywords are
    lower-cased before the substring test.
    """
    lowered = string.lower()

    def _contains_any(keywords):
        # True when some keyword occurs as a substring of the lowered text.
        return any(kw.lower() in lowered for kw in keywords)

    return _contains_any(ml_keywords) and _contains_any(conflict_keywords)

def paper_is_relevant(json_blob):
    """Decide whether a paper matches both the ML and conflict keyword lists.

    The title is checked first; the abstract is consulted only when the
    title does not match.

    Args:
        json_blob: Paper dict from the API. ``title`` and ``abstract`` may be
            missing or ``None``; both cases are treated as empty text rather
            than raising.

    Returns:
        True if the title, or failing that the abstract, passes
        ``double_keyword_check``; otherwise False.
    """
    # `or ''` covers both a missing key and an explicit null value, either of
    # which would otherwise fail on `.lower()`.
    title = json_blob.get('title') or ''
    if double_keyword_check(title):
        return True

    abstract = json_blob.get('abstract') or ''
    return double_keyword_check(abstract)


In [60]:
# "database": will make this a sqlite later
# key: paperId
# value: [api call data object (with 'references' popped out),
#         filtered references, how many times hit]
papers = {}

In [64]:
def filter_refs(paper_refs):
    """Keep only the references whose title passes the general keyword check.

    This is a first, coarse pass: a reference is kept when its title
    contains any keyword at all (see ``keyword_check``).

    Args:
        paper_refs: Iterable of reference dicts, or ``None`` / empty when the
            paper has no reference list.

    Returns:
        A new list of the references whose title matched. References with a
        missing or empty title are dropped, since they cannot be checked.
    """
    if not paper_refs:
        # Nothing to filter (e.g. the response had no 'references' field).
        return []

    return [
        ref for ref in paper_refs
        # Guard against null/missing titles before the substring check.
        if ref.get('title') and keyword_check(ref['title'])
    ]

# pprint(filter_refs(res))

In [65]:
# testing all the helpers thus far
def add_to_dict(url):
    """Fetch the paper at ``url`` and record it in the global ``papers`` store.

    A paper seen for the first time is stored under its ``paperId`` as
    ``[paper_data, filtered_references, 1]``, where ``paper_data`` is the API
    response with its ``references`` entry removed. A paper that is already
    stored only has its hit counter (index 2) incremented.

    Nothing is stored when ``api_call`` returns a falsy result.

    Args:
        url: Paper endpoint URL, e.g. one built by ``create_url``.
    """
    res = api_call(url)
    if not res:
        # No response: leave the store untouched.
        return

    paper_id = res['paperId']
    if paper_id in papers:
        papers[paper_id][2] += 1
        return

    # A response without a 'references' field yields an empty list rather
    # than passing None into the filter.
    refs = filter_refs(res.pop('references', None) or [])
    papers[paper_id] = [res, refs, 1]
        


In [66]:
# Look up the sample paper by its Semantic Scholar ID (is_doi=False) and
# store the result in `papers`.
url = create_url(test_paper_id, False)
add_to_dict(url)

# print(papers)
# NOTE: raises KeyError if the API call failed, because add_to_dict stores
# nothing in that case.
print(papers[test_paper_id])



In [70]:
# Index 0 of a `papers` entry is the API payload (without its references).
pprint(papers[test_paper_id][0])

{
    "authors": [
        {
            "authorId": "2111366015",
            "name": "Xiaolan Xie"
        },
        {
            "authorId": "2004912912",
            "name": "Dong Jiang"
        },
        {
            "authorId": "145381688",
            "name": "Mengmeng Hao"
        },
        {
            "authorId": "11258201",
            "name": "Fangyu Ding"
        }
    ],
    "paperId": "c951ac9a54bba70c6e8337ab5815f3ac45434ec5",
    "title": "Modeling analysis of armed conflict risk in sub-Saharan Africa, 2000\u20132019",
    "url": "https://www.semanticscholar.org/paper/c951ac9a54bba70c6e8337ab5815f3ac45434ec5",
    "year": 2023
}


In [33]:
import networkx as nx

# Create an empty directed graph to represent citations: a directed edge
# A -> B stands for "paper A cites paper B".
G = nx.DiGraph()