# Define Functions to Get PM(C)IDs and Openness

In [1]:
import requests
from bs4 import BeautifulSoup
from re import *

import numpy as np
import pandas as pd
from collections import defaultdict
import os

In [3]:
# Get PMCID and citation year from PMID
def get_pmcid_year(pmid):
    """Scrape one PubMed record page for its PMCID and citation year.

    Requests ``https://pubmed.ncbi.nlm.nih.gov/<pmid>/`` and parses the
    returned HTML.

    Args:
        pmid: PubMed ID to look up; it is interpolated into the URL as-is.

    Returns:
        A tuple ``(pmcid, date)``:

        * ``pmcid`` -- stripped text of the first ``a.id-link`` element
          whose ``data-ga-action`` is ``PMCID``, or ``None`` when the page
          has no such link.
        * ``date`` -- the leading run of digits of the first ``span.cit``
          element (a four-digit year in the examples run below), or
          ``None`` when the page has no ``span.cit`` element.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            exceeds the timeout.
    """
    base_url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
    # Without a timeout a single stalled connection blocks forever, which
    # matters because callers invoke this once per paper.
    response = requests.get(base_url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    citations = soup.find_all('span', {'class': 'cit'})
    if citations:
        # Keep everything before the first non-digit character.
        date = split(r"\D", citations[0].text.strip())[0]
    else:
        date = None

    # An empty result simply means the record has no PMCID; test for it
    # explicitly instead of hiding every possible error behind `except:`.
    pmcid_links = soup.find_all(
        'a', {'class': 'id-link', 'data-ga-action': 'PMCID'}
    )
    pmcid = pmcid_links[0].text.strip() if pmcid_links else None
    return (pmcid, date)

# Smoke-test get_pmcid_year on two PMIDs; each call fetches a PubMed page.
for label, example_pmid in (
    ("Closed example: ", "35770940"),
    ("Open example: ", "35165460"),
):
    print(label, get_pmcid_year(example_pmid))

Closed example:  (None, '2023')
Open example:  ('PMC7612594', '2022')


In [99]:
# Get list of PMIDs from author name
def get_pmids_open(author):
    """Search PubMed for an author and classify each hit's availability.

    Walks every result page of the PubMed search for ``author``, then calls
    ``get_pmcid_year`` once per PMID found (one extra request per paper).

    Args:
        author: Author name as whitespace-separated words, e.g.
            ``"jennifer jahncke"``.

    Returns:
        A tuple ``(ids, years)`` of dicts keyed by PMID string:

        * ``ids[pmid]`` is the PMCID when the record has one, otherwise
          ``"open"`` when the search-result entry text contains ``Free``,
          otherwise ``"closed"``.
        * ``years[pmid]`` is the year value returned by ``get_pmcid_year``.

    Raises:
        ValueError: if ``author`` is empty or only whitespace.
        requests.exceptions.RequestException: if a request fails or
            exceeds the timeout.
    """
    from urllib.parse import quote_plus

    # Parse author name, build first+last. quote_plus turns the joining
    # spaces into "+" and escapes characters such as "&" or "#" that would
    # otherwise corrupt the query string.
    name_parts = author.split()
    if not name_parts:
        raise ValueError("author must contain at least one name")
    aname = quote_plus(" ".join(name_parts))

    search_url = f'https://pubmed.ncbi.nlm.nih.gov/?term={aname}&page='

    # find total number of pages
    response = requests.get(search_url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    page_labels = soup.find_all('label', {'class': 'of-total-pages'})
    if page_labels:
        # Label text is two tokens with the count second; commas are
        # dropped in case large counts are rendered with separators.
        count_token = page_labels[0].text.split()[1]
        pagenum = int(count_token.replace(",", ""))
    else:
        # No pager on the page: still scan one page rather than crash.
        pagenum = 1

    # scrape pubmed
    pmid_tags = []
    entries = []
    for page in range(1, pagenum + 1):
        response = requests.get(f'{search_url}{page}', timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        pmid_tags += soup.find_all('span', {'class': 'docsum-pmid'})
        entries += soup.find_all("div", class_='docsum-content')

    # build dictionary of id:pmcid
    # NOTE(review): zip pairs tags and entries by position, so this assumes
    # every result entry carries exactly one PMID span -- confirm.
    ids = {}
    years = {}
    for pmid_tag, entry in zip(pmid_tags, entries):
        pmid = pmid_tag.text.strip()
        pmcid, year = get_pmcid_year(pmid)
        years[pmid] = year
        if pmcid:
            ids[pmid] = pmcid
        elif "Free" in entry.text:  # entry advertises free full text
            ids[pmid] = "open"
        else:
            ids[pmid] = "closed"

    return ids, years

# Smoke-test get_pmids_open on three authors; each call scrapes PubMed.
for label, author_name in (
    ("Jennifer: ", "jennifer jahncke"),
    ("Emily L: ", "emily lecy"),
    ("Arpy: ", "arpiar saunders"),
):
    print(label, get_pmids_open(author_name))

Jennifer:  ({'35770940': 'closed', '35944998': 'PMC9480892', '32321746': 'PMC7204083', '37540708': 'open'}, {'35770940': '2023', '35944998': '2022', '32321746': '2020', '37540708': '2023'})
Emily L:  ({'36792646': 'PMC9932154', '36582611': 'PMC9792983', '33906174': 'PMC8504120'}, {'36792646': '2023', '36582611': '2022', '33906174': '2021'})
Arpy:  ({'30471926': 'PMC6655561', '30096299': 'PMC6447408', '34197733': 'PMC8376805', '32999462': 'PMC7957574', '29632380': 'PMC5896795', '26220313': 'PMC4584188', '26131660': 'PMC4545963', '22660328': 'PMC3367801', '36384944': 'PMC9668842', '33230336': 'closed', '26551563': 'PMC4716836', '32613945': 'PMC7360370', '25723967': 'PMC4371381', '26905595': 'PMC4764347', '25739505': 'PMC4425585', '21825165': 'PMC3174680', '28384468': 'PMC5439268', '22866029': 'PMC3406316', '19056989': 'closed', '23403489': 'PMC3566411', '22325203': 'PMC3278709', '26104011': 'closed'}, {'30471926': '2019', '30096299': '2018', '34197733': '2021', '32999462': '2020', '29632

In [118]:
def get_openness(author, api):
    """Score every PubMed paper of an author on open-science categories.

    Steps:
      1. ``get_pmids_open(author)`` yields each PMID's availability and year.
      2. ``keywords.csv`` (read from the working directory) supplies the
         ``category``, ``category_description``, ``keyword`` and ``weight``
         columns used for scoring.
      3. For each paper with a PMCID, the PMC record is fetched through
         NCBI efetch and searched (case-insensitively, keywords treated as
         regular expressions) for each category's keywords.

    Args:
        author: Author name, passed to ``get_pmids_open``.
        api: Path to a text file holding the NCBI API key.

    Returns:
        A DataFrame with one row per paper: one column per category
        description except ``code_relevant`` (dropped before returning),
        plus ``full_text``, ``pmid``, ``year`` and ``Score`` (row-wise mean
        of the numeric columns). The mean of ``Score`` is also printed.

    NOTE(review): the positional column arithmetic below (``0:2``,
    ``cat - 1``, ``cat - 2``) assumes category IDs 1, 2, 3 correspond, in
    order, to the first three category columns and that 2 is the code
    category and 3 is "code relevant" -- confirm against keywords.csv.
    """
    ids, years = get_pmids_open(author)

    # Close the key file promptly and drop the trailing newline so the key
    # can be placed in a URL.
    with open(api, 'r') as key_file:
        apikey = key_file.read().strip()

    # Load keywords and create open-science categories
    keyword_df = pd.read_csv('keywords.csv')
    categoryIDs = np.unique(np.array(keyword_df['category']))
    category_descriptions = keyword_df['category_description'].unique().tolist()
    category_descriptions.append('full_text')

    # One zero-initialised row per paper, one column per category.
    o_idx_df = pd.DataFrame(
        np.zeros((len(ids), len(category_descriptions))),
        columns=category_descriptions,
    )

    pmcids = []
    for i, item in enumerate(ids):
        o_idx_df.loc[[i], ['pmid']] = item
        o_idx_df.loc[[i], ['year']] = years[item]
        status = ids[item]
        if status == 'closed':
            pmcids.append(None)
            # No text to search: blank the first two category columns.
            o_idx_df.iloc[[i], 0:2] = None
        elif status == 'open':
            # Free full text exists but there is no PMC record to search.
            pmcids.append(None)
            o_idx_df.loc[[i], ['full_text']] = 1
            o_idx_df.iloc[[i], 0:2] = None
        else:
            pmcids.append(status)
            o_idx_df.loc[[i], ['full_text']] = 1

    db = 'pmc'
    # HTTPS so the API key is not sent in clear text.
    base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
    for j, pmcid in enumerate(pmcids):
        if pmcid is None:
            continue
        url = '{:s}db={:s}&id={:s}'.format(base, db, pmcid)
        if apikey:
            # The key was previously passed to format() without a
            # placeholder, so it never reached the request.
            url += '&api_key={:s}'.format(apikey)
        out = requests.get(url, timeout=30)
        article_text = out.text

        for cat in categoryIDs:
            found_keyword = False
            # Take keyword and weight from the same filtered rows; indexing
            # the full table by position within the subset returned the
            # weight of an unrelated keyword.
            cat_rows = keyword_df[keyword_df['category'] == cat]
            for keyword, weight in zip(cat_rows['keyword'], cat_rows['weight']):
                # One hit per category is enough.
                if search(keyword, article_text, IGNORECASE):
                    o_idx_df.iloc[[j], [cat - 1]] = weight
                    found_keyword = True
                    break

            # Code keyword found: no need to test whether code is relevant.
            if cat == 2 and found_keyword:
                break
            # No code found and code not relevant: blank the code score
            # (the column just before this one) instead of leaving it 0.
            if cat == 3 and not found_keyword:
                o_idx_df.iloc[[j], [cat - 2]] = None
                break

    # Helper column only; it must not contribute to the score.
    del o_idx_df['code_relevant']

    o_idx_df.loc[:, 'Score'] = o_idx_df.mean(numeric_only=True, axis=1)

    OIndex = o_idx_df["Score"].mean()
    print(OIndex)

    return o_idx_df

def oindex(df, column="o-score"):
    """Return the mean of one score column of ``df``.

    Args:
        df: Table supporting ``df[column].mean()``, e.g. a pandas DataFrame.
        column: Name of the score column; defaults to ``"o-score"``.

    Returns:
        The value of ``df[column].mean()``. Previously the mean was
        computed and then discarded, so the function always returned
        ``None``.
    """
    return df[column].mean()
    
    

# Example run: prints the mean Score and, as the last expression of the
# cell, leaves the per-paper DataFrame as the cell's value. The second
# argument is the path to the file holding the NCBI API key.
get_openness("jason early", "../apikey.txt")

0.7037037037037037


Unnamed: 0,data,code,full_text,pmid,year,Score
0,0.0,,1.0,35165460,2022,0.5
1,,,1.0,28618073,2017,1.0
2,1.0,1.0,1.0,32364583,2020,1.0
3,,,0.0,34866633,2021,0.0
4,1.0,,1.0,33296680,2021,1.0
5,0.0,1.0,1.0,34270947,2021,0.666667
6,1.0,1.0,1.0,29979149,2018,1.0
7,0.0,1.0,1.0,29628374,2018,0.666667
8,0.0,,1.0,28542521,2017,0.5
