In [6]:
import pandas as pd
import numpy as np
# Show every column when a DataFrame is rendered (None removes the column cap)
pd.options.display.max_columns = None

In [7]:
# Load papers that cite AlphaFold and prepare them to be matched against
# clinical-trial references.
#
# The catalog entry is indexed by work id and the selected item is a callable
# that returns the records when invoked. It is kept under its own name so that
# `papers_that_cite_af` only ever refers to the prepared DataFrame.
# NOTE(review): 'W3177828909' is presumably the OpenAlex id of the AlphaFold
# paper - confirm.
load_papers_that_cite_af = catalog.load("oa.data_collection.direction.cites.raw")['W3177828909']
papers_that_cite_af = (
    pd.DataFrame(load_papers_that_cite_af())
    # Pull the PubMed link out of the per-paper `ids` mapping (NaN when absent)
    .assign(pmid=lambda df: df.ids.apply(lambda x: x.get('pmid', np.nan)))
    .drop(columns=["grants", "ids"])
    # Strip the URL prefixes so only the bare identifiers remain. The dots are
    # escaped so they match a literal '.' rather than any character.
    .assign(doi=lambda df: df['doi'].str.replace(r'^https://doi\.org/', '', regex=True))
    # `.str` is only available on object/string columns: when no record carries
    # a PMID the extracted column is all-NaN float64 and `.str` would raise, so
    # force object dtype before stripping the prefix.
    .assign(
        pmid=lambda df: df['pmid']
        .astype(object)
        .str.replace(r'^https://pubmed\.ncbi\.nlm\.nih\.gov/', '', regex=True)
    )
    # Numeric PMIDs; missing values stay NaN
    .astype({"pmid": float})
)

In [8]:
# Load the clinical-trial records that carry links to papers
# (matched below through their `ref_doi` / `ref_pmid` columns)
ct_links_dataset = "nih.data_processing.clinical_trials_links_to_papers.intermediate"
clinical_trials_links_to_papers = catalog.load(ct_links_dataset)

In [9]:
# Match citing papers to clinical-trial references on DOI.
# Rows without a DOI on either side are discarded first so that missing keys
# never take part in the join.
papers_with_doi = papers_that_cite_af.dropna(subset=['doi'])
ct_links_with_doi = clinical_trials_links_to_papers.dropna(subset=['ref_doi'])
doi_matches = pd.merge(
    papers_with_doi,
    ct_links_with_doi,
    how="inner",
    left_on="doi",
    right_on="ref_doi",
)

In [11]:
# Match citing papers to clinical-trial references on PubMed id.
# Rows without a PMID on either side are discarded first so that missing keys
# never take part in the join.
papers_with_pmid = papers_that_cite_af.dropna(subset=['pmid'])
ct_links_with_pmid = clinical_trials_links_to_papers.dropna(subset=['ref_pmid'])
pmid_matches = pd.merge(
    papers_with_pmid,
    ct_links_with_pmid,
    how="inner",
    left_on="pmid",
    right_on="ref_pmid",
)

In [13]:
# Stack the DOI and PMID matches, then keep a single row per paper `id`.
# The first occurrence wins, so a DOI match takes precedence over a PMID match,
# and a paper referenced by several trials keeps only one of its rows.
all_matches = pd.concat([doi_matches, pmid_matches])
papers_that_cite_af_cited_in_ct = all_matches.drop_duplicates(subset="id")

In [14]:
# Display the papers citing AlphaFold that are themselves referenced by a
# clinical trial (one row per paper after de-duplication on `id`)
papers_that_cite_af_cited_in_ct

Unnamed: 0,id,doi,display_name,title,publication_date,abstract,authorships,cited_by_count,concepts,keywords,referenced_works,pmid,nct_id,ref_pmid,ref_doi,ref_citation,ref_type,ref_retraction_list
0,https://openalex.org/W4207015910,10.3390/nu14020274,Role of Probiotics in the Management of COVID-...,Role of Probiotics in the Management of COVID-...,2022-01-10,Coronavirus disease 2019 (COVID-19) was declar...,"[{'author_position': 'first', 'author': {'id':...",37,"[{'id': 'https://openalex.org/C2777165150', 'w...","[{'keyword': 'probiotics', 'score': 0.6673}]","[https://openalex.org/W1031578623, https://ope...",35057455.0,NCT05474144,35057455.0,10.3390/nu14020274,"Nguyen QV, Chong LC, Hor YY, Lew LC, Rather IA...",background,
1,https://openalex.org/W4307297657,10.1016/j.ejmech.2022.114861,Ferroptosis-related small-molecule compounds i...,Ferroptosis-related small-molecule compounds i...,2022-12-01,Ferroptosis is a novel type of regulated cell ...,"[{'author_position': 'first', 'author': {'id':...",15,"[{'id': 'https://openalex.org/C185592680', 'wi...","[{'keyword': 'cancer therapy', 'score': 0.3574...","[https://openalex.org/W323406638, https://open...",36332549.0,NCT06134388,36332549.0,10.1016/j.ejmech.2022.114861,"Yin L, Liu P, Jin Y, Ning Z, Yang Y, Gao H. Fe...",background,
2,https://openalex.org/W4311811125,10.1186/s13023-022-02592-3,"NGLY1 deficiency: estimated incidence, clinica...","NGLY1 deficiency: estimated incidence, clinica...",2022-12-17,Abstract Purpose NGLY1 Deficiency is an ultra-...,"[{'author_position': 'first', 'author': {'id':...",6,"[{'id': 'https://openalex.org/C61511704', 'wik...","[{'keyword': 'ngly1 registry', 'score': 0.578}...","[https://openalex.org/W1803309642, https://ope...",36528660.0,NCT06199531,36528660.0,10.1186/s13023-022-02592-3,"Stanclift CR, Dwight SS, Lee K, Eijkenboom QL,...",background,
3,https://openalex.org/W4366395412,10.1093/narcan/zcad017,<b>HiTAIC: hi</b>erarchical tumor artificial i...,<b>HiTAIC: hi</b>erarchical tumor artificial i...,2023-03-11,Human cancers are heterogenous by their cell c...,"[{'author_position': 'first', 'author': {'id':...",1,"[{'id': 'https://openalex.org/C190727270', 'wi...","[{'keyword': 'tumor type', 'score': 0.487}, {'...","[https://openalex.org/W1974029385, https://ope...",37089814.0,NCT06140992,37089814.0,10.1093/narcan/zcad017,"Zhang Z, Lu Y, Vosoughi S, Levy JJ, Christense...",result,
