In [1]:
# To install the official MySQL Connector for Python, please use the name mysql-connector-python:
# pip install mysql-connector-python
import mysql.connector
import datetime
import pandas as pd
import numpy as np
import urllib.request
import json

In [14]:
def load_data(file_name): 
    df = pd.read_csv(file_name)
    df.doi = df.doi.str.lower()
    return df
# search Unpaywall Database
def search_db(db_name, search_df): 
    cnx = mysql.connector.connect(user='scott', password='tiger',
                                  host='127.0.0.1',
                                  database=db_name,
                                  use_pure=False)
    cursor = cnx.cursor()
    search_doi ='\',\''.join(search_df.doi)
    query = ("SELECT doi, is_oa FROM tmp WHERE doi IN (\'{}\')").format(search_doi)
    print ("Executing query:\n" + query + "\n")
    cursor.execute(query)
    result = {doi: 0 if is_oa == 'False' else 1 for (doi, is_oa) in cursor}
    print("# of article found in upw db:{}".format(len(result)))
    return result
# check missing and brocken doi based on the result returned by search_db
def call_crossref_api(list_of_doi):
    crossref_base = 'https://api.crossref.org/works/{}/agency'
    resp, err = [], []
    for doi in list_of_doi:
        try:
            with urllib.request.urlopen(crossref_base.format(doi)) as url:
                resp.append(json.loads(url.read().decode()))
        except (UnicodeEncodeError, urllib.error.HTTPError) as e:
            err.append({doi : 'Code: {c}, Message, {m}'.format(c = type(e).__name__, m = str(e))})
            continue
    return resp, err
# query valid missing doi with unpaywall API
def call_upw_api(list_of_doi):
    upw_base = 'https://api.unpaywall.org/v2/{}?email=hanlin.zhang@unc.edu'
    resp = []
    for doi in list_of_doi:
        with urllib.request.urlopen(upw_base.format(doi)) as url:
            resp.append(json.loads(url.read().decode()))
    return resp
# Add missing query result by calling `call_crossref_apiand` and `call_upw_api`
def search_missing(input_df, upw_db_result):
    diff = list(set(input_df.doi)-set(upw_db_result))
    print('Missing doi:\n{}'.format(diff))
    # Searching missing doi with Crossref
    crsref_resp, crsref_err = call_crossref_api(diff)
    vaild_doi = [json['message']['DOI'] for json in crsref_resp]
    print("# total missing doi: {}".format(len(diff)))
    print("# valid missing doi: {}".format(len(vaild_doi)))
    print("# of brocken missing doi: {}".format(len(crsref_err)))
    # Searching valid doi with Unpaywall API
    upw_resp = call_upw_api(vaild_doi)
    add_result = {json['doi']: 0 if json['is_oa'] != True else 1 for json in upw_resp}
    return add_result
# validate Unpaywall result with Google Scholar
def validate(vaild_df, query_df):
    vaild_df.is_oa = vaild_df.is_oa.astype(bool) *1
    query_df.upw_oa = query_df.upw_oa.astype(bool)*1
    tmp = pd.merge(vaild_df[['doi', 'is_oa']], query_df, on='doi', how='left')
    tmp['check'] = tmp.apply(lambda x : 0 if x['is_oa'] != x['upw_oa'] else 1, axis=1)
    print("*"*18 + "OA Result Summary" + "*"*20 +"\n")
    print("# of articles Google Scholar found OA: {}".format(tmp.is_oa.sum()))
    print("# of articles Google Scholar found NOT OA: {} \n".format(len(vaild_df) - tmp.is_oa.sum()))
    print("# of articles Unpaywall found OA: {}".format(tmp.upw_oa.sum()))
    print("# of articles Unpaywall found NOT OA: {} \n".format(len(vaild_df) - tmp.upw_oa.sum()))
    print("*"*18 + "End OA Result Summary" + "*"*16)
    return tmp
# calculate Precision and Racall
def evaluation(result_df):
    # masks for T&F
    t_mask = result_df.check == 1
    f_mask = result_df.check == 0
    # Get upw OA counts
    upw_oa_count = result_df.upw_oa.sum()
    upw_n_oa_count = len(result_df)- upw_oa_count
    # Get Google Scholar OA counts
    gs_oa_count= result_df.is_oa.sum()
    gs_n_oa_count = len(result_df)- gs_oa_count
    # Set Google Scholar as bench mark, find True-Positive for Unpaywall:
    TP = result_df[t_mask].is_oa.sum()
    print("Within those {} articles Unpaywall found OA:\n" \
      "{} is (are) acutally OA. \n".format(upw_oa_count, TP))
    # Set Google Scholar as bench mark, find True-Negative for Unpaywall:
    TN = len(result_df[t_mask]) - result_df[t_mask].upw_oa.sum()
    print("Within those {} articles Unpaywall NOT found OA:\n" \
      "{} is (are) acutally NOT OA. \n".format(upw_n_oa_count, TN))
    print('Precision = {} / {} = {}%'.format(TP, upw_oa_count, TP/upw_oa_count*100))
    print('Racall = {} / {} = {}%'.format(TP, gs_oa_count, TP/gs_oa_count*100))
    
    return result_df[f_mask]

### JASIST

In [3]:
jasist = load_data('JASIST_sampler - Sheet1.csv')
jasist.head()

Unnamed: 0,doi,title,author,year_published,jasist_vol_Issue,is_oa
0,10.1002/asi.23606,Predicting information searchers' topic knowle...,"Liu, J., Liu, C., & Belkin, N. J.",2016,"Volume67, Issue11",0
1,10.1002/asi.23609,Automated arabic text classification with P‐S ...,"Kanan, T., & Fox, E. A.",2016,"Volume67, Issue11",1
2,10.1002/asi.23612,Predicting the impact of scientific concepts u...,"McKeown, K., Daume III, H., Chaturvedi, S., Pa...",2016,"Volume67, Issue11",1
3,10.1002/asi.23617,Is exploratory search different? A comparison ...,"Athukorala, K., Głowacka, D., Jacucci, G., Oul...",2016,"Volume67, Issue11",1
4,10.1002/asi.23620,Chatting through pictures? A classification of...,"Thelwall, M., Goriunova, O., Vis, F., Faulkner...",2016,"Volume67, Issue11",1


In [4]:
upw_db_result = search_db('saa', jasist)

Executing query:
SELECT doi, is_oa FROM tmp WHERE doi IN ('10.1002/asi.23606','10.1002/asi.23609','10.1002/asi.23612','10.1002/asi.23617','10.1002/asi.23620','10.1002/asi.23621','10.1002/asi.23622','10.1002/asi.23625','10.1002/asi.23628','10.1002/asi.23629','10.1002/asi.23804','10.1002/asi.23805','10.1002/asi.23813','10.1002/asi.23814','10.1002/asi.23815','10.1002/asi.23820','10.1002/asi.23821','10.1002/asi.23834','10.1002/asi.23837','10.1002/asi.23838','10.1002/asi.23844','10.1002/asi.23850','10.1002/asi.23861','10.1002/asi.23862','10.1002/asi.23863','10.1002/asi.23872','10.1002/asi.23915','10.1002/asi.23918','10.1002/asi.23919','10.1002/asi.23925')

# of article found in upw db:27


In [5]:
upw_add_result = search_missing(jasist, upw_db_result)
upw_df = pd.DataFrame(list({**upw_db_result, **upw_add_result}.items()), columns=['doi','upw_oa'])

Missing doi:
['10.1002/asi.23813', '10.1002/asi.23629', '10.1002/asi.23621']
# total missing doi: 3
# valid missing doi: 3
# of brocken missing doi: 0


In [6]:
jasist_result = validate(jasist, upw_df)
jasist_result.head()

******************OA Result Summary********************

# of articles Google Scholar found OA: 20
# of articles Google Scholar found NOT OA: 10 

# of articles Unpaywall found OA: 16
# of articles Unpaywall found NOT OA: 14 

******************End OA Result Summary****************


Unnamed: 0,doi,is_oa,upw_oa,check
0,10.1002/asi.23606,0,0,1
1,10.1002/asi.23609,1,1,1
2,10.1002/asi.23612,1,0,0
3,10.1002/asi.23617,1,1,1
4,10.1002/asi.23620,1,1,1


In [20]:
jasist_diff = evaluation(jasist_result)

Within those 16 articles Unpaywall found OA:
14 is (are) acutally OA. 

Within those 14 articles Unpaywall NOT found OA:
8 is (are) acutally NOT OA. 

Precision = 14 / 16 = 87.5%
Racall = 14 / 20 = 70.0%


In [21]:
jasist_diff

Unnamed: 0,doi,is_oa,upw_oa,check
2,10.1002/asi.23612,1,0,0
6,10.1002/asi.23622,1,0,0
10,10.1002/asi.23804,1,0,0
17,10.1002/asi.23834,1,0,0
18,10.1002/asi.23837,1,0,0
20,10.1002/asi.23844,1,0,0
26,10.1002/asi.23915,0,1,0
27,10.1002/asi.23918,0,1,0


### Han's 50 article sampler

In [8]:
oa_sampler = load_data('oa_atricle_sampler.csv')
oa_sampler.head()

Unnamed: 0,doi,title,author,year_published,field of study,is_oa
0,10.3842/sigma.2012.016,Introduction to loop quantum cosmology\n,"Banerjee, K., Calcagni, G., & Martín-Benito, M.",2010,Physics,1
1,10.1016/j.physrep.2012.01.001,Modified gravity and cosmology,"Clifton, T., Ferreira, P. G., Padilla, A., & S...",2012,Physics,1
2,10.1146/annurev-astro-081811-125615,Cosmic star-formation history,"Madau, P., & Dickinson, M.",2014,Physics,1
3,10.1016/j.physletb.2008.07.018,Review of Particle Physics\n,"C. Amsler, M. Doser, M. Antonelli, D. M. Asner...",2008,Physics,1
4,10.1126/science.1202043,Improved Learning in a Large-Enrollment Physic...,"Deslauriers, L., Schelew, E., & Wieman, C.",2011,Physics,1


In [9]:
upw_db_result = search_db('saa', oa_sampler)

Executing query:
SELECT doi, is_oa FROM tmp WHERE doi IN ('10.3842/sigma.2012.016','10.1016/j.physrep.2012.01.001','10.1146/annurev-astro-081811-125615','10.1016/j.physletb.2008.07.018','10.1126/science.1202043','10.1140/epjc/s10052-016-4099-4','10.1016/j.aop.2005.04.002','10.1103/physrevlett.94.111601','10.1143/ptps.183.1','10.1143/ptp.113.843','10.1371/journal.pone.0064841','10.1108/jd-03-2016-0030','10.1007/s10900-018-0547-4','10.1016/j.ijinfomgt.2018.07.004','10.1016/j.lisr.2017.03.001','10.1016/j.lisr.2017.03.005','10.1002/asi.23124','10.1145/2998181.2998204','10.1108/jd-06-2017-0095','10.1080/10447318.2017.1365459','10.1177/1464884905056815','10.1177/1464884903004001484','10.1080/1461670x.2011.571825','10.1080/17512780802281065','10.1080/17512786.2012.667269','10.1207/s15506878jobem4604_3','10.1177/1461444809341393','10.1177/0016549205057564','10.1080/14616700500533643','10.1080/14616700118394','10.3390/ijerph14091002','10.1097/phh.0000000000000347','10.11604/pamj.supp.2017.27.1.

In [10]:
upw_add_result = search_missing(oa_sampler, upw_db_result)
upw_df = pd.DataFrame(list({**upw_db_result, **upw_add_result}.items()), columns=['doi','upw_oa'])

Missing doi:
['10.1016/j.physletb.2008.07.018', '10.1177/0016549205057564', '10.2753/mis0742-1222240302', '10.1145/330908.331819', '10.1080/17512780802281065', '10.1207/s15506878jobem4604_3', '10.1103/physrevlett.94.111601', '10.1097/phh.0000000000000347']
# total missing doi: 8
# valid missing doi: 8
# of brocken missing doi: 0


In [12]:
oa_result = validate(oa_sampler, upw_df)
oa_result.head()

******************OA Result Summary********************

# of articles Google Scholar found OA: 46
# of articles Google Scholar found NOT OA: 4 

# of articles Unpaywall found OA: 25
# of articles Unpaywall found NOT OA: 25 

******************End OA Result Summary****************


Unnamed: 0,doi,is_oa,upw_oa,check
0,10.3842/sigma.2012.016,1,1,1
1,10.1016/j.physrep.2012.01.001,1,1,1
2,10.1146/annurev-astro-081811-125615,1,1,1
3,10.1016/j.physletb.2008.07.018,1,0,0
4,10.1126/science.1202043,1,1,1


In [18]:
oa_diff = evaluation(oa_result)

Within those 25 articles Unpaywall found OA:
25 is (are) acutally OA. 

Within those 25 articles Unpaywall NOT found OA:
4 is (are) acutally NOT OA. 

Precision = 25 / 25 = 100.0%
Racall = 25 / 46 = 54.347826086956516%


In [19]:
oa_diff

Unnamed: 0,doi,is_oa,upw_oa,check
3,10.1016/j.physletb.2008.07.018,1,0,0
11,10.1108/jd-03-2016-0030,1,0,0
12,10.1007/s10900-018-0547-4,1,0,0
17,10.1145/2998181.2998204,1,0,0
18,10.1108/jd-06-2017-0095,1,0,0
19,10.1080/10447318.2017.1365459,1,0,0
20,10.1177/1464884905056815,1,0,0
21,10.1177/1464884903004001484,1,0,0
24,10.1080/17512786.2012.667269,1,0,0
25,10.1207/s15506878jobem4604_3,1,0,0
