In [100]:
import numpy as np
import pandas as pd
from collections import defaultdict

import requests
from bs4 import BeautifulSoup
from re import *
import os
import time
import csv
import os

if not os.path.exists('data/'):
    os.makedirs('data/')


In [None]:
# Get PMCID from PMID
def get_pmcid(pmid):
    base_url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    try:
        pmcid = soup.find_all('a', {'class' : 'id-link', 'data-ga-action' : 'PMCID'})[0].text.strip()
    except:
        pmcid = None
    return pmcid

print("Closed example: ", get_pmcid("35770940"))
print("Open example: ", get_pmcid("36792646"))

In [101]:
def get_pmids_open(author):
    
    # Parse author name, build first+last
    author = author.split()
    if len(author) > 1:
        aname = author[0]
        for name in author[1:]:
            aname += "%" + name
    else:
        aname = author[0]
        
    #find total number of pages
    base_url= f'https://pubmed.ncbi.nlm.nih.gov/?term={aname}&page='
    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    pages = soup.find_all('label', {'class' : 'of-total-pages'})
    p = pages[0].text.strip()
    p = p.split()
    pagenum=int(p[1])
    
    # scrape pubmed
    pmids = []
    entries = []
    for i in range(1,pagenum+1): #change # into max num of pages
        URL = f'https://pubmed.ncbi.nlm.nih.gov/?term={aname}&page={i}'
        response = requests.get(URL)
        soup = BeautifulSoup(response.content, 'html.parser')
        pmids += soup.find_all('span', {'class' : 'docsum-pmid'})
        entries += soup.find_all("div", class_='docsum-content')
    
    # build dictionary of id:pmcid
    ids = {}
    for id,entry in zip(pmids,entries):
        pmcid = get_pmcid(id.text.strip())
        
        if pmcid:
            ids[id.text.strip()] = pmcid
        elif search(r"Free",entry.text.strip()): # use regex to search for "Free" in docsum-content
            ids[id.text.strip()] = "open"
        else:
            ids[id.text.strip()] = "closed"
        
    return ids




In [None]:
ids = get_pmids_open("Franz Weber")

In [88]:
# Load you personal API key
apikey = open('apikey.txt', 'r').read()

In [89]:
# Load keywords and create a list of open science categories, appending whether the full text of each paper is available online or not
terms = pd.read_csv('keywords.csv')
categories = terms['category']
category_descriptions = terms['category_description']
categories_unique = np.unique(np.array(categories))
category_descriptions = category_descriptions.unique().tolist()
full_text = 'full_text'
category_descriptions.append(full_text)


In [97]:
#create df with all the unique categories with one row for each paper located:
data = (len(ids), len(category_descriptions))
o_idx_df = pd.DataFrame(np.zeros(data, dtype = np.int8), columns = category_descriptions)
o_idx_df

Unnamed: 0,data_shared,code_relevant,preprint,data_upon_request,code_shared,code_upon_request,preregistration,OSF_exploratory,full_text
0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
124,0,0,0,0,0,0,0,0,0
125,0,0,0,0,0,0,0,0,0
126,0,0,0,0,0,0,0,0,0
127,0,0,0,0,0,0,0,0,0


In [91]:
#Loop through ids, if there is full text available append 1 to the "full text" column of the dara frame
df_list = [0] * len(category_descriptions)
pmcids = []
for i, item in enumerate(ids): 
    if ids[item] == 'closed':
        continue 
    if ids[item] == 'open':
        o_idx_df.loc[[i],['full_text']] = 1
    else:
        pmcids.append(ids[item])


true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
true
['PMC5554302', 'PMC4353871', 'PMC7385183', 'PMC6784493', 'PMC4095581', 'PMC5037725', 'PMC7573145', 'PMC5073642', 'PMC9659376', 'PMC5455478', 'PMC8752505', 'PMC10318054', 'PMC9162343', 'PMC5079482', 'PMC8875550', 'PMC8423363', 'PMC5783937', 'PMC6832025', 'PMC7074310', 'PMC6060436', 'PMC5296751', 'PMC9952497', 'PMC10299220', 'PMC7731226', 'PMC7683412', 'PMC9947381', 'PMC5350124', 'PMC6198759', 'PMC5521800', 'PMC4852286', 'PMC7141038', 'PMC4840488', 'PMC5455435', 'PMC10056849', 'PMC6275050', 'PMC6704863', 'PMC5552080', 'PMC3129225', 'PMC3982283', 'PMC9219867', 'PMC7040842', 'PMC8541071', 'PMC4437296', 'PMC2863978', 'PMC3031557', 'PMC4623986', 'PMC3079696', 'PMC3355499', 'PMC3731688', 'PMC4642719']
     data_shared  code_relevant  preprint  data_upon_request  code_shared  \
0   

In [98]:
#Loop through open science catagories and scrape full text papeps for keyworkds from ketword.csv

db = 'pmc'
base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
dict_term = defaultdict(list)
fulfilled_categories = [0] * len(categories_unique)
for j, pmcid in enumerate(pmcids):
    s = '{:s}db={:s}&id={:s}'.format(base, db, pmcid, apikey)
    out = requests.get(s)
    bs = BeautifulSoup(out.content, 'lxml')
    # Check if full text is available; if not - move to the next paper
    full_text_available = not (bs.findAll('sec') == [])
    if full_text_available is True:
        #print("Full text is available")
        for i, categoryInd in enumerate (categories_unique):
            found_keyword = False
            # Loop through specific keywords related to each open-science category
            for term in terms['keyword'][terms['category'] == categoryInd]:
                for s in re.finditer(term, out.text, re.IGNORECASE):
                    o_idx_df.iloc[[j],[i]] = 1
                    found_keyword = True

                # If one keyword is found, stop with searching for this category
                if found_keyword is True:
                    break




Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available
Full text is available


In [99]:
#Print final O-Index data frame
o_idx_df


Unnamed: 0,data_shared,code_relevant,preprint,data_upon_request,code_shared,code_upon_request,preregistration,OSF_exploratory,full_text
0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
124,0,0,0,0,0,0,0,0,0
125,0,0,0,0,0,0,0,0,0
126,0,0,0,0,0,0,0,0,0
127,0,0,0,0,0,0,0,0,0


In [None]:
#O index calculation 