In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
# Load OA papers that cite AF levels 0-3
# Currently using no_mesh
# When mesh has been added use "oa.data_processing.depth.mesh.{level}.intermediate" (make note in PR)
level_0_primary = catalog.load("oa.data_processing.depth.no_mesh.0.intermediate")
level_1_primary = catalog.load("oa.data_processing.depth.no_mesh.1.intermediate")
level_2_primary = catalog.load("oa.data_processing.depth.no_mesh.2.intermediate")
level_3_primary = catalog.load("oa.data_processing.depth.no_mesh.3.intermediate")

In [3]:
# Combine OA papers that cite AF levels 0-3
af_oa_data = pd.concat([level_0_primary, level_1_primary, level_2_primary, level_3_primary], axis=0)

In [82]:
# Drop duplicates 
af_oa_data = af_oa_data.drop_duplicates(subset=["doi", "level"])

In [83]:
# Display af_oa_data
af_oa_data

Unnamed: 0,parent_id,id,level,doi,publication_date,mesh_terms,cited_by_count
0,W3177828909,W3211795435,0,10.1093/nar/gkab1061,2021-11-17,"[[D030562, Databases, Protein], [D017510, Prot...",3437
1,W3177828909,W4281790889,0,10.1038/s41592-022-01488-1,2022-05-30,"[[D017510, Protein Folding], [D012984, Softwar...",3021
2,W3177828909,W3183475563,0,10.1038/s41586-021-03828-1,2021-07-22,"[[D019295, Computational Biology], [D000077321...",1688
3,W3177828909,W3202105508,0,10.1101/2021.10.04.463034,2021-10-04,,1241
4,W3177828909,W4206153788,0,10.1016/j.cell.2021.12.046,2022-02-01,,775
...,...,...,...,...,...,...,...
0,W4391997926,W4391997979,3,10.2903/sp.efsa.2024.en-8597,2024-02-01,,0
0,W4392058636,W4392041217,3,10.1103/physrevlett.132.080402,2024-02-22,,0
0,W4392192985,W4392198228,3,10.1038/s42255-024-01013-y,2024-02-27,,0
0,W4392198237,W4392198357,3,10.1038/s41588-023-01646-x,2024-02-27,,0


In [5]:
# Create list of dois of OA papers that cite AF levels 0-3 to match on
af_oa_dois = af_oa_data.doi.to_list()

In [6]:
# Load icite data in chunks to find matches using doi to OA papers that cite AF levels 0-3
doi_matches = []

for chunk in pd.read_csv("/Users/jr/Downloads/icite/icite_metadata.csv", chunksize=1_000_000):
    current_matches = chunk.query(f"doi in {af_oa_dois}")
    doi_matches.append(current_matches)

In [7]:
# Combine icite papers that cite AF levels 0-3
icite_matches = pd.concat(doi_matches)

In [8]:
# Check how many papers have been lost from OA to icite
print(f"{round(len(icite_matches) / len(af_oa_data) * 100, 2)}% papers matched on doi")

24.57% papers matched on doi


In [9]:
# Fix some cited_by_clin values being recorded as floats (this is stopped their pmid being looked up
# through biopython and not counting num_cited_by_clin correctly)

def convert_single_pubmed_id_to_int_string(value):
    """
    Converts a value in the 'cited_by_clin' column to the correct format.

    If the value is a single number (float), it's converted to an int then to a string.
    If the value is a string with multiple numbers, it's left as is.
    NaN values are left unchanged.

    Args:
        value (float, str, or nan): The original value in the 'cited_by_clin' column.

    Returns:
        str or nan: The converted value.
    """
    # Check if the value is a single float (a single PubMed ID)
    if isinstance(value, float) and not pd.isna(value):
        return str(int(value))
    # Return the value unchanged if it's a string (multiple PubMed IDs) or NaN
    return value

icite_matches['cited_by_clin'] = icite_matches['cited_by_clin'].apply(convert_single_pubmed_id_to_int_string)

In [10]:
# Add column for number of times cited by clinical trial
icite_matches['num_cited_by_clin'] = (
    icite_matches.cited_by_clin
    .str.count(' ')
    .add(1)
    .fillna(0, downcast='infer')
)

In [11]:
from Bio import Entrez
from typing import Optional, List
import logging
from datetime import datetime
import numpy as np

def convert_month_abbreviation_to_number(month_abbr: str) -> str:
    """
    Convert a three-letter month abbreviation to a two-digit month number.

    Args:
        month_abbr (str): A three-letter abbreviation of a month (e.g., 'Jan', 'Feb', etc.)

    Returns:
        str: The two-digit month number corresponding to the abbreviation.
    """
    if month_abbr == 'XX':
        return 'XX'
    return datetime.strptime(month_abbr, '%b').strftime('%m')


def get_publication_date(pmid: str) -> str:
    """
    Fetches the publication date of a PubMed article using its PMID.
    Returns np.nan if the date cannot be retrieved.

    Args:
        pmid (str): The PubMed ID of the article.

    Returns:
        str: The publication date of the article in YYYY-MM-DD format, or np.nan.
    """
    try:
        handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
        records = Entrez.read(handle)
        handle.close()
 
        date = records['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']
        year = date.get('Year', 'XXXX')
        month = convert_month_abbreviation_to_number(date.get('Month', 'XX'))
        day = date.get('Day', 'XX')
        return f"{year}-{month}-{day}"
    except Exception:
        return np.nan


def get_pub_dates_for_pmids(pmids: str) -> List[str]:
    """
    Fetches publication dates for a list of PubMed IDs.
    Skips if the input is NaN or non-string.

    Args:
        pmids (str): A string containing PubMed IDs separated by spaces.

    Returns:
        List[str]: A list of publication dates in YYYY-MM-DD format or np.nan.
                  Returns an empty list if pmids is NaN or non-string.
    """
    if pd.isna(pmids) or not isinstance(pmids, str):
        return np.nan
    return [get_publication_date(pmid) for pmid in pmids.split()]

In [12]:
# Trying out single publication date retrieval using one pmid
pmid = "33422626"
print(get_publication_date(pmid))

2021-05-XX


In [13]:
# Trying out publication date retrieval for multiple pmids
get_pub_dates_for_pmids('33422626 34153729 34139335 35598005 34535158 35907774 33634996 36645234 35488261 34493750 36272908 35727348')


[1m[[0m
    [32m'2021-05-XX'[0m,
    [32m'2021-08-XX'[0m,
    [32m'2021-10-XX'[0m,
    [32m'2022-05-21'[0m,
    [32m'2021-09-17'[0m,
    [32m'2023-02-XX'[0m,
    [32m'2021-02-XX'[0m,
    [32m'2023-09-01'[0m,
    [32m'2022-04-29'[0m,
    [32m'2021-09-07'[0m,
    [32m'2023-01-XX'[0m,
    [32m'2022-07-XX'[0m
[1m][0m

In [14]:
# Add clinial article publication dates
icite_matches['cited_by_clin_pub_dates'] = icite_matches['cited_by_clin'].apply(get_pub_dates_for_pmids)

In [15]:
# Find records that don't have dates when they have a clin reference
icite_matches.query("cited_by_clin.notna()").query("cited_by_clin_pub_dates.isna()")

Unnamed: 0,pmid,doi,title,authors,year,journal,is_research_article,citation_count,field_citation_rate,expected_citations_per_year,citations_per_year,relative_citation_ratio,nih_percentile,human,animal,molecular_cellular,x_coord,y_coord,apt,is_clinical,cited_by_clin,cited_by,references,provisional,last_modified,num_cited_by_clin,cited_by_clin_pub_dates


In [34]:
# Check where XX is typically given in the month
missing_pubmed_date_counts = {'year': 0, 'month': 0, 'day': 0}

for date_list in icite_matches.query("cited_by_clin_pub_dates.notna()").cited_by_clin_pub_dates.values:
    for date in date_list:
        if isinstance(date, str):
            if date[0:4] == 'XXXX':
                missing_pubmed_date_counts['year'] += 1
            if date[5:7] == 'XX':
                missing_pubmed_date_counts['month'] += 1
            if date[8:10] == 'XX':
                missing_pubmed_date_counts['day'] += 1

missing_pubmed_date_counts

[1m{[0m[32m'year'[0m: [1;36m7[0m, [32m'month'[0m: [1;36m527[0m, [32m'day'[0m: [1;36m2236[0m[1m}[0m

In [35]:
# Number of clinical articles that cite a paper in the AF citation chain
icite_matches.num_cited_by_clin.sum()

[1;36m4162[0m

In [42]:
# Replace XX missing dates with random number
import pandas as pd
import numpy as np
import re
import random

def replace_missing_dates(df: pd.DataFrame, dates_col: str, rand_dates_col) -> pd.DataFrame:
    """
    Replace dates with missing components in a DataFrame column.

    Args:
        dataframe (pd.DataFrame): The DataFrame containing the date column.
        dates_col (str): The name of the column with dates.
        rand_dates_col (str): The name of the new column which
            will store the dates with XX replaced with random dates.

    Returns:
        pd.DataFrame: The DataFrame with modified dates.
    """
    def replace_date(date: str) -> str:
        if 'XXXX' == date[:4]:
            return np.nan
        month = random.choice([f'{i:02d}' for i in range(1, 13)]) if 'XX' in date[5:7] else date[5:7]
        day = random.choice([f'{i:02d}' for i in range(1, 29)]) if 'XX' in date[8:10] else date[8:10]
        return f"{date[:4]}-{month}-{day}"

    df[rand_dates_col] = df[dates_col].apply(
        lambda x: [replace_date(date) if isinstance(date, str) else np.nan for date in x] if isinstance(x, list) else np.nan
    )

    return df

icite_matches = replace_missing_dates(icite_matches, "cited_by_clin_pub_dates", "cited_by_clin_pub_dates_rand")

In [43]:
# Check an example with missing date info
icite_matches.cited_by_clin_pub_dates.values[0]


[1m[[0m
    [32m'2021-05-XX'[0m,
    [32m'2021-08-XX'[0m,
    [32m'2021-10-XX'[0m,
    [32m'2022-05-21'[0m,
    [32m'2021-09-17'[0m,
    [32m'2023-02-XX'[0m,
    [32m'2021-02-XX'[0m,
    [32m'2023-09-01'[0m,
    [32m'2022-04-29'[0m,
    [32m'2021-09-07'[0m,
    [32m'2023-01-XX'[0m,
    [32m'2022-07-XX'[0m
[1m][0m

In [44]:
# Check same example with missing date info replaced with random dates
icite_matches.cited_by_clin_pub_dates_rand.values[0]


[1m[[0m
    [32m'2021-05-04'[0m,
    [32m'2021-08-12'[0m,
    [32m'2021-10-02'[0m,
    [32m'2022-05-21'[0m,
    [32m'2021-09-17'[0m,
    [32m'2023-02-19'[0m,
    [32m'2021-02-24'[0m,
    [32m'2023-09-01'[0m,
    [32m'2022-04-29'[0m,
    [32m'2021-09-07'[0m,
    [32m'2023-01-05'[0m,
    [32m'2022-07-04'[0m
[1m][0m

In [57]:
# Find earliest clinical article publication date
import pandas as pd
from typing import List, Union
from datetime import datetime

def find_earliest_date(dates: Union[List[str], float]) -> Union[str, float]:
    """
    Finds the earliest date in a list of dates.

    Args:
        dates (Union[List[str], float]): A list of date strings or nan.

    Returns:
        Union[str, float]: The earliest date in string format or nan.
    """
    if isinstance(dates, list):
        dates = [date for date in dates if pd.notna(date)]
    else:
        return np.nan
    if dates == []:
        return np.nan
    datetime_list = pd.to_datetime(dates, errors='coerce')
    min_date = min(datetime_list)
    return min_date.strftime('%Y-%m-%d') if pd.notna(min_date) else np.nan

icite_matches['cited_by_clin_earliest_pub_date'] = icite_matches['cited_by_clin_pub_dates_rand'].apply(find_earliest_date)

In [58]:
# Find cases where `cited_by_clin_earliest_pub_date` is na but 
# `cited_by_clin_pub_dates` is not
icite_matches.query("cited_by_clin_pub_dates.notna()").query("cited_by_clin_earliest_pub_date.isna()")

Unnamed: 0,pmid,doi,title,authors,year,journal,is_research_article,citation_count,field_citation_rate,expected_citations_per_year,citations_per_year,relative_citation_ratio,nih_percentile,human,animal,molecular_cellular,x_coord,y_coord,apt,is_clinical,cited_by_clin,cited_by,references,provisional,last_modified,num_cited_by_clin,cited_by_clin_pub_dates,cited_by_clin_pub_dates_rand,cited_by_clin_earliest_pub_date
33799437,35265944,10.1093/immadv/ltab025,Seven mysteries of LAG-3: a multi-faceted immu...,"Stephanie E A Burnell, Lorenzo Capitani, Bruce...",2021,Immunother Adv,False,23,11.827166,4.104216,7.666667,1.87,72.5,0.0,0.5,0.5,0.0,-0.5,0.75,False,37921680,36824824 38000024 37921680 38026700 37004702 3...,15381730 31847878 20421648 31665515 29282307 2...,No,"01/28/2024, 13:32:43",1,[XXXX-XX-XX],[nan],
35533557,37004702,10.1007/s11912-023-01406-4,The Latest Option: Nivolumab and Relatlimab in...,"Lea Jessica Albrecht, Elisabeth Livingstone, L...",2023,Curr Oncol Rep,False,4,12.694714,,4.0,,,1.0,0.0,0.0,0.0,1.0,0.25,False,37921680,38022646 37921680 37511453 37332039,31160251 27533448 29127120 34183437 23851140 3...,No,"01/28/2024, 12:28:19",1,[XXXX-XX-XX],[nan],
35601436,37072748,10.1186/s12967-023-04100-y,"The ""Great Debate"" at Melanoma Bridge 2022, Na...","Paolo A Ascierto, Christian Blank, Alexander M...",2023,J Transl Med,True,1,21.218139,,1.0,,,1.0,0.0,0.0,0.0,1.0,0.5,False,37921680,37921680,28591523 31160251 33857412 29477665 36423526 3...,No,"01/28/2024, 12:31:54",1,[XXXX-XX-XX],[nan],


In [84]:
# Calculate number of days between publication date of paper to earliest 
# publication date of clinical article where paper has been cited
icite_oa_comb = icite_matches.merge(af_oa_data, how="left", on="doi")
icite_oa_comb['cited_by_clin_earliest_pub_date'] = pd.to_datetime(icite_oa_comb['cited_by_clin_earliest_pub_date'])
icite_oa_comb['publication_date'] = pd.to_datetime(icite_oa_comb['publication_date'])

# Calculate the difference in days
icite_oa_comb['days_to_clinical_trial'] = (
    icite_oa_comb['cited_by_clin_earliest_pub_date'] - icite_oa_comb['publication_date']
).dt.days

In [85]:
af_oa_data.doi.value_counts()


[1;36m10.1093[0m/genetics/iyad134       [1;36m4[0m
[1;36m10.3390[0m/eng5010006             [1;36m4[0m
[1;36m10.1093[0m/femsre/fuad060         [1;36m4[0m
[1;36m10.3389[0m/fpls.[1;36m2023.1237722[0m      [1;36m4[0m
[1;36m10.3390[0m/catal13030591          [1;36m4[0m
                              ..
[1;36m10.3390[0m/antibiotics13010080    [1;36m1[0m
[1;36m10.1021[0m/acs.jctc.3c00770       [1;36m1[0m
[1;36m10.3390[0m/molecules28041933      [1;36m1[0m
[1;36m10.1039[0m/d3ya00149k             [1;36m1[0m
[1;36m10.1242[0m/jcs.[1;36m261999[0m             [1;36m1[0m
Name: doi, Length: [1;36m218987[0m, dtype: int64

In [86]:
af_oa_data.query("doi == '10.1093/genetics/iyad134'")

Unnamed: 0,parent_id,id,level,doi,publication_date,mesh_terms,cited_by_count
8007,W3177828909,W4384819068,0,10.1093/genetics/iyad134,2023-07-19,"[[D012441, Saccharomyces cerevisiae], [D029701...",1
1639,W3211795435,W4384819068,1,10.1093/genetics/iyad134,2023-07-19,"[[D012441, Saccharomyces cerevisiae], [D029701...",1
1639,W3211795435,W4384819068,2,10.1093/genetics/iyad134,2023-07-19,"[[D012441, Saccharomyces cerevisiae], [D029701...",1
12,W4315645064,W4384819068,3,10.1093/genetics/iyad134,2023-07-19,"[[D012441, Saccharomyces cerevisiae], [D029701...",1


In [71]:
# Check cases where the time to clinical trial is negative
icite_oa_comb.query("days_to_clinical_trial < 0")

Unnamed: 0,pmid,doi,title,authors,year,journal,is_research_article,citation_count,field_citation_rate,expected_citations_per_year,citations_per_year,relative_citation_ratio,nih_percentile,human,animal,molecular_cellular,x_coord,y_coord,apt,is_clinical,cited_by_clin,cited_by,references,provisional,last_modified,num_cited_by_clin,cited_by_clin_pub_dates,cited_by_clin_pub_dates_rand,cited_by_clin_earliest_pub_date,parent_id,id,level,publication_date,mesh_terms,cited_by_count,days_to_clinical_trial
0,33129791,10.1016/j.chest.2020.10.054,Impact of Corticosteroids in Coronavirus Disea...,"Edison J Cano, Xavier Fonseca Fuentes, Cristin...",2021,Chest,True,122,8.269781,2.932214,40.666667,13.87,98.9,0.50,0.00,0.50,-0.433013,0.2500,0.95,False,33422626 34153729 34139335 35598005 34535158 3...,35880006 36819444 36243331 35136768 35764344 3...,32396996 32619760 32390367 32582740 32075786 3...,No,"01/28/2024, 12:07:34",12,"[2021-05-XX, 2021-08-XX, 2021-10-XX, 2022-05-2...","[2021-05-04, 2021-08-12, 2021-10-02, 2022-05-2...",2021-02-24,W4294636903,W3110379274,3,2021-03-01,"[[D000086382, COVID-19], [D000093485, COVID-19...",143,-5.0
47,33903131,10.1136/bmj.n949,Prophylaxis against covid-19: living systemati...,"Jessica J Bartoszko, Reed A C Siemieniuk, Elen...",2021,BMJ,True,67,9.359802,3.291328,22.333333,6.79,95.9,0.50,0.00,0.50,-0.433013,0.2500,0.95,False,33649077,36047644 36558489 37634703 35958161 35730623 3...,33289973 34931202 36385690 25600106 33284679 2...,No,"01/28/2024, 12:41:42",1,[2021-03-01],[2021-03-01],2021-03-01,W4226525051,W3158131439,2,2021-04-26,"[[D000086382, COVID-19], [D002351, Carrageenan...",76,-56.0
48,33903131,10.1136/bmj.n949,Prophylaxis against covid-19: living systemati...,"Jessica J Bartoszko, Reed A C Siemieniuk, Elen...",2021,BMJ,True,67,9.359802,3.291328,22.333333,6.79,95.9,0.50,0.00,0.50,-0.433013,0.2500,0.95,False,33649077,36047644 36558489 37634703 35958161 35730623 3...,33289973 34931202 36385690 25600106 33284679 2...,No,"01/28/2024, 12:41:42",1,[2021-03-01],[2021-03-01],2021-03-01,W3133718957,W3158131439,3,2021-04-26,"[[D000086382, COVID-19], [D002351, Carrageenan...",76,-56.0
49,33903131,10.1136/bmj.n949,Prophylaxis against covid-19: living systemati...,"Jessica J Bartoszko, Reed A C Siemieniuk, Elen...",2021,BMJ,True,67,9.359802,3.291328,22.333333,6.79,95.9,0.50,0.00,0.50,-0.433013,0.2500,0.95,False,33649077,36047644 36558489 37634703 35958161 35730623 3...,33289973 34931202 36385690 25600106 33284679 2...,No,"01/28/2024, 12:41:42",1,[2021-03-01],[2021-03-01],2021-03-01,W4226525051,W3158131439,3,2021-04-26,"[[D000086382, COVID-19], [D002351, Carrageenan...",76,-56.0
109,34201767,10.3390/v13071211,SARS-CoV-2 Variants: A Synopsis of In Vitro Ef...,"Daniele Focosi, Marco Tuccori, Andreina Baj, F...",2021,Viruses,False,29,13.872805,4.778164,9.666667,2.02,75.0,0.50,0.00,0.50,-0.433013,0.2500,0.75,False,34691078 35196802,34691078 34260956 34535792 35023163 34964565 3...,35087066 33857453 34119826 34051887 33948956 3...,No,"01/28/2024, 12:58:12",2,"[2021-XX-XX, 2022-02-23]","[2021-06-03, 2022-02-23]",2021-06-03,W3135758766,W3177020296,3,2021-06-23,"[[D000911, Antibodies, Monoclonal], [D000914, ...",35,-20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198599,37653342,10.1038/s41591-023-02517-y,Original SARS-CoV-2 monovalent and Omicron BA....,"Spyros Chalkias, Jordan L Whatley, Frank Eder,...",2023,Nat Med,True,4,22.397057,,4.000000,,,0.67,0.00,0.33,-0.288675,0.5000,0.75,True,37799724,37799724 37932355 38112071 38168473,34731553 37011668 36265510 35785530 35714668 3...,No,"01/28/2024, 13:06:12",1,[2023-XX-XX],[2023-05-22],2023-05-22,W4362522950,W4386324890,3,2023-08-31,"[[D000086382, COVID-19], [D000086663, COVID-19...",9,-101.0
198600,37653342,10.1038/s41591-023-02517-y,Original SARS-CoV-2 monovalent and Omicron BA....,"Spyros Chalkias, Jordan L Whatley, Frank Eder,...",2023,Nat Med,True,4,22.397057,,4.000000,,,0.67,0.00,0.33,-0.288675,0.5000,0.75,True,37799724,37799724 37932355 38112071 38168473,34731553 37011668 36265510 35785530 35714668 3...,No,"01/28/2024, 13:06:12",1,[2023-XX-XX],[2023-05-22],2023-05-22,W4381194848,W4386324890,3,2023-08-31,"[[D000086382, COVID-19], [D000086663, COVID-19...",9,-101.0
200962,37666466,10.1016/j.jdent.2023.104690,Calvaria defect regeneration via human periodo...,"Zeqing Zhao, Yaxi Sun, Qingchen Qiao, Michael ...",2023,J Dent,True,1,3.395063,,1.000000,,,0.12,0.38,0.50,-0.108253,-0.3125,0.50,False,38126565,38126565,33021062 33065287 26961805 30718418 28714276 3...,No,"01/28/2024, 13:07:02",1,[2023-XX-XX],[2023-01-22],2023-01-22,W4378575054,W4386422543,3,2023-11-01,"[[D010513, Periodontal Ligament], [D054457, Ti...",2,-283.0
211393,37725432,10.2196/45767,Using Social Media to Help Understand Patient-...,"Elham Dolatabadi, Diana Moyano, Michael Bales,...",2023,J Med Internet Res,True,1,7.372108,,1.000000,,,1.00,0.00,0.00,0.000000,1.0000,0.50,False,37569003,37569003,33789877 32936777 36639608 34737325 32665317 3...,No,"01/28/2024, 13:10:15",1,[2023-07-27],[2023-07-27],2023-07-27,W4310459980,W4379469539,2,2023-09-19,"[[D000086382, COVID-19], [D061108, Social Medi...",3,-54.0


In [None]:
# Print dois when there is no match to check format

In [25]:
# Create list of dois of OA papers that cite AF levels 0-3 to match on (separate for each level)
af_oa_dois_0 = af_oa_data.query("level == 0").doi.to_list()
af_oa_dois_1 = af_oa_data.query("level == 1").doi.to_list()
af_oa_dois_2 = af_oa_data.query("level == 2").doi.to_list()
af_oa_dois_3 = af_oa_data.query("level == 3").doi.to_list()

In [None]:
# Load icite data in chunks to find matches using doi to OA papers that cite AF levels 0-3 (separately)
doi_matches_0 = []
doi_matches_1 = []
doi_matches_2 = []
doi_matches_3 = []

for chunk in pd.read_csv("/Users/jr/Downloads/icite/icite_metadata.csv", chunksize=1_000_000):
    current_matches_0 = chunk.query(f"doi in {af_oa_dois_0}")
    doi_matches_0.append(current_matches_0)
    
    current_matches_1 = chunk.query(f"doi in {af_oa_dois_1}")
    doi_matches_1.append(current_matches_1)

    current_matches_2 = chunk.query(f"doi in {af_oa_dois_2}")
    doi_matches_2.append(current_matches_2)

    current_matches_3 = chunk.query(f"doi in {af_oa_dois_3}")
    doi_matches_3.append(current_matches_3)

In [27]:
# Combine icite papers that cite AF levels 0-3 (separately)
icite_matches_0 = pd.concat(doi_matches_0)
icite_matches_1 = pd.concat(doi_matches_1)
icite_matches_2 = pd.concat(doi_matches_2)
icite_matches_3 = pd.concat(doi_matches_3)

In [28]:
# Check how many papers have been lost from OA to icite
print(f"{round(len(icite_matches_0) / len(af_oa_dois_0) * 100, 2)}% papers matched on doi level 0")
print(f"{round(len(icite_matches_1) / len(af_oa_dois_1) * 100, 2)}% papers matched on doi level 1")
print(f"{round(len(icite_matches_2) / len(af_oa_dois_2) * 100, 2)}% papers matched on doi level 2")
print(f"{round(len(icite_matches_3) / len(af_oa_dois_3) * 100, 2)}% papers matched on doi level 3")

65.61% papers matched on doi level 0
39.63% papers matched on doi level 1
37.77% papers matched on doi level 2
33.19% papers matched on doi level 3


In [90]:
# Check clinical citation percentage in each level
print(len(icite_matches_0.query("cited_by_clin.notna()")) / len(icite_matches_0) * 100)
print(len(icite_matches_1.query("cited_by_clin.notna()")) / len(icite_matches_1) * 100)
print(len(icite_matches_2.query("cited_by_clin.notna()")) / len(icite_matches_2) * 100)
print(len(icite_matches_3.query("cited_by_clin.notna()")) / len(icite_matches_3) * 100)
# Assumption that papers further down the citation chain are more applied seems to
# ring true as the % of papers cited in a clinical trials increases

0.2999454644610071
0.9003303964757708
1.747880585729035
2.2957662492546214


In [None]:
# Run matches using pubmed ids (use new version of oa level data)