# Funding Data Utility

### Note:
Please ensure all of the imports listed in the first code block are available and can be used in your notebook environment before running. Some of these may be available by defualt, as part of Ananconda or similar. Others can be installed using pip commands.

Before running, please read the doccumentation to ensure all the required files are in the right place. Then, navigate to "cell", and execute the command "run all". Scroll down to see statistics.

In [3]:
import pandas as pd
import numpy as np
import bibtexparser
import rispy
import json
import requests
import sys

In [7]:
file = open('config.json')
file = json.load(file)

In [8]:
x = file['input']['file']

In [9]:
try:
    #if file is a .bib -> use bibtexparser
    if '.bib' in x:
        y = open(x, "r", encoding = file['input']['encoding'])
        y = y.read()
        y = bibtexparser.loads(y)
        df = pd.DataFrame(y.entries)

    #if file is a .ris -> use rispy
    elif '.ris' in x:
        y = open(x, "r", encoding = file['input']['encoding'])
        y = rispy.load(y)
        df = pd.DataFrame(y)
        #df = df.rename(columns = {"authors": "author"})  code to clean specific table name to work with sample input data

    #standard csv input
    else:
        df = pd.read_csv(x, encoding = file['input']['encoding'])
except:
    print("Error: bad input")
    sys.exit()

In [25]:
# Creates output dataframe
df2 = pd.DataFrame()
# Bases dataframe on the research work titles
df2['title'] = df.title

# create empty columns
funders = []
found = []
for i in range(len(df)):
    funders.append('')
    found.append(False)
    
# Specifies a few fields used to calculate the statistics
df2['funder'] = funders
df2['found'] = found # for calculating statistics, will be remove before output
df2['doi'] = found # for calculating statistics, will be removed before output


counter = 0
for i in df.doi:
    # Check for if ther is a DOI
    if i == None or type(i) != str:
        # No DOI
        q = 'https://api.crossref.org/works/?'
        if (df['author'][counter] != None and type(df['author'][counter]) == str):
            #adds author if present
            if (type(df['author'][counter]) == list):
                q = q + 'query.bibliographic=' + df['author'][counter][0] + '&'
            else:
                q = q + 'query.bibliographic=' + df['author'][counter] + '&'
        if (df['title'][counter] != None and type(df['title'][counter]) == str):
            #adds title if present
            q = q + 'query.bibliographic=' + df['title'][counter] + '&'
        if ('date' in df.columns):
            #adds date if present
            if (df['date'][counter] != None):
                q = q + 'query.bibliographic=' + df['title'][counter] + '&'
                response3 = requests.get('https://api.crossref.org/works/?query.bibliographic=' + df['date'][counter])
            
        if (q[-1] != '&'):
            #move to next entry if none of the above metadata is given
            continue
        q = q[:-1]
        response = requests.get(q)
        if response.status_code == 200:
            df2['found'][counter] = True
            x = json.loads(response.content.decode('utf-8'))['message']
            #if return json has a funder
            if 'funder' in x.keys():
                df2['funder'][counter] = x['funder'][0]['name']
            counter = counter + 1
            continue
        else:
            counter = counter + 1
            continue
            
    # Has a DOI
    df2.doi[counter] = True
    response = requests.get('https://api.crossref.org/works/' + i)
    if response.status_code == 200:
        df2['found'][counter] = True
        x = json.loads(response.content.decode('utf-8'))['message']
    else:
        counter = counter + 1
        continue
    #if return json has a funder
    if 'funder' in x.keys():
        df2['funder'][counter] = x['funder'][0]['name']
    counter = counter + 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.doi[counter] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['found'][counter] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['funder'][counter] = x['funder'][0]['name']


KeyboardInterrupt: 

In [11]:
df2.head(20)

Unnamed: 0,title,funder,funders_doi,found,doi
0,Effect of two physical exercise protocols on c...,Coordenação de Aperfeiçoamento de Pessoal de N...,,True,True
1,Effect of a qigong exercise programme on elder...,,,True,True
2,"Aerobic exercise, psychological well-being, an...",,,True,True
3,Above-moderate physical activity reduces both ...,National Research Foundation of Korea,,True,True
4,A randomized controlled clinical trial of the ...,,,True,True
5,"A prospective study of overweight, physical ac...",,,True,True
6,Longitudinal association between habitual phys...,Tokyo Metropolitan Institute of Gerontology,,True,True
7,Physical inactivity and depression in the comm...,,,True,True
8,Leisure-time physical activity and marital sta...,,,True,True
9,Dysglycemia in a community sample of people tr...,,,True,True


In [26]:
print("Find Rate: ", df2.found.sum()/len(df2))
print("DOI Find Rate: ", len(df2[(df2.found == True) & (df2.doi == True)])/len(df2[df2.doi == True]))
print("Non DOI Find Rate: ", len(df2[(df2.found == True) & (df2.doi == False)])/len(df2[df2.doi == False]))
print("Funder Find Rate", len(df2[df2.funder != ""])/len(df2))
print("DOI Funder Find Rate", len(df2[(df2.funder != "") & (df2.doi == True)])/len(df2[df2.doi == True]))
print("Non DOI Funder Find Rate", len(df2[(df2.funder != "") & (df2.doi == False)])/len(df2[df2.doi == False]))

#uncomment if you want stat results in a separate file
#statfile = open("results.txt", "w")
#statfile.write("Find Rate: " + str(df2.found.sum()/len(df2)) + '\n')
#statfile.write("DOI Find Rate: " + str(len(df2[(df2.found == True) & (df2.doi == True)])/len(df2[df2.doi == True])) + '\n')
#statfile.write("Non DOI Find Rate: " + str(len(df2[(df2.found == True) & (df2.doi == False)])/len(df2[df2.doi == False])) + '\n')
#statfile.write("Funder Find Rate: " + str(len(df2[df2.funder != ""])/len(df2)) + '\n')
#statfile.write("Non DOI Funder Find Rate: " + str(len(df2[(df2.funder != "") & (df2.doi == False)])/len(df2[df2.doi == False])) +'\n')
#statfile.write("Non DOI Funder Find Rate: "+ str(len(df2[(df2.funder != "") & (df2.doi == False)])/len(df2[df2.doi == False])) + '\n')
#statfile.close()

df2 = df2.drop('found', axis = 1)
df2 = df2.drop('doi', axis = 1)
df2.to_csv('funders.csv')

Find Rate:  0.051771117166212535
DOI Find Rate:  1.0
Non DOI Find Rate:  0.0
Funder Find Rate 0.008174386920980926
DOI Funder Find Rate 0.15789473684210525
Non DOI Funder Find Rate 0.0


In [None]:
if (file['neo4j']['link'] == ''):
    sys.exit()

from neo4j import GraphDatabase

driver = GraphDatabase.driver(file['neo4j']['link'], auth=("neo4j", file['neo4j']['password']))
session = driver.session()

In [None]:
nodes = []
count = 0
for i in df2.title:
    nodes.append(dict({"name": i}))
    count = count + 1

    
funders = []
x = dict()
for i in df2.funder:
    if i != "" and i not in x:
        x[i] = count
        funders.append(dict({"name": i}))
        count = count + 1

relationships = []
for i in range(len(df2)):
    if df2.funder[i] != "":
        y = (df2.title[i], "funded by", df2.funder[i])
        relationships.append(y)

In [None]:
query = """
MATCH (n)
DETACH DELETE n
"""
session.run(query)

In [None]:
for i in nodes:
    session.run("CREATE (:Research {name: $name})", **i)
    
for i in funders:
    session.run("CREATE (:Funder {name: $name})", **i)

In [None]:
for rel in relationships:
    session.run("MATCH (a:Research {name: $a}), (b:Funder {name: $b}) CREATE (a)-[:FUNDEDBY]->(b)", a=rel[0], b=rel[2])

In [None]:
session.close()
driver.close()