In [1]:
%load_ext kedro.ipython

In [2]:
import numpy as np
import pandas as pd

In [3]:
alphafold_data_dup = catalog.load("oa.data_processing.depth.duplicated.primary")

In [4]:
alphafold_data_all = catalog.load("oa.data_processing.depth.intermediate")

In [8]:
# merge the strength of alphafold_data_dup in alphafold_data_all using parent_id and child_id
alphafold_data_all = alphafold_data_all.merge(alphafold_data_dup[["parent_id", "id", "strength"]], on=["parent_id", "id"], how="left")

In [13]:
icites = catalog.load("pubmed.data_processing.icite.intermediate")

In [77]:
alphafold_data = alphafold_data_all

In [78]:
# Create a mapping from id to doi and pmid for each level
alphafold_data["parent_level"] = alphafold_data["level"] - 1

alphafold_data = pd.merge(
    alphafold_data,
    alphafold_data,
    left_on=["parent_id", "parent_level"],
    right_on=["id", "level"],
    how="left",
    suffixes=("", "_parent"),
)
alphafold_data.rename(
    columns={"doi_parent": "parent_doi", "pmid_parent": "parent_pmid"}, inplace=True
)

# drop the other _parent columns
alphafold_data.drop(
    columns=[col for col in alphafold_data.columns if "_parent" in col], inplace=True
)

# manually set the parent_pmid and parent_doi for AlphaFold
alphafold_data.loc[alphafold_data["level"] == 0, "parent_doi"] = (
    "10.1038/s41586-021-03819-2"
)
alphafold_data.loc[alphafold_data["level"] == 0, "parent_pmid"] = "34265844"


In [79]:
# explode strength
alphafold_data_exploded = alphafold_data.explode("strength") 

In [80]:
alphafold_data_exploded.dropna(subset=["strength"], inplace=True) # added with the "all" data

In [81]:
alphafold_data_exploded["intent"] = alphafold_data_exploded["strength"].apply(lambda x: x["intent"] if x else None)
alphafold_data_exploded["context"] = alphafold_data_exploded["strength"].apply(lambda x: x["context"] if x else None)

In [82]:
# W3013924576

In [83]:
# Define the custom sorting order
sort_order = {'methodology': 0, 'result': 1, 'background': 2, '': 3, None: 4}

# Create a new column 'sort_order' based on the 'intent' column
alphafold_data_exploded['sort_order'] = alphafold_data_exploded['intent'].map(sort_order)

# drop if intent is background, '', or None
alphafold_data_exploded = alphafold_data_exploded[~alphafold_data_exploded["intent"].isin(["background", "", None])]

# Sort the DataFrame by the 'sort_order' column and drop duplicates
# alphafold_data_exploded = alphafold_data_exploded.sort_values('sort_order').drop_duplicates(subset='id')

# Drop the 'sort_order' column as it's no longer needed
alphafold_data_exploded = alphafold_data_exploded.drop(columns='sort_order')

In [85]:
# keep only if pmid is not null
alphafold_data_exploded = alphafold_data_exploded[alphafold_data_exploded["pmid"].notnull()]

In [86]:
alphafold_data_exploded.reset_index(drop=True, inplace=True)

In [87]:
# Group the data by 'parent_id' and 'level', and aggregate the 'id' children into lists
grouped = alphafold_data_exploded.groupby(['parent_id', 'level'])['id'].apply(list).reset_index()

# Create a pivot table with 'id' as the values and 'level' as the columns
pivot_id = grouped.pivot(index='parent_id', columns='level', values='id')

# Rename the columns
pivot_id.columns = [f'level_{i}_id' for i in range(4)]

pivot_id

Unnamed: 0_level_0,level_0_id,level_1_id,level_2_id,level_3_id
parent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
W3113096828,,,,"[W4390345176, W4390345176]"
W3118451921,,,[W4206953760],
W3123083763,,,"[W3207224860, W3207224860]",
W3124804039,,[W3217186197],,
W3127618774,,[W4322769977],,
...,...,...,...,...
W4390005064,,,,"[W4391029856, W4391029856]"
W4390038867,,,,"[W4390332145, W4390332145]"
W4390402910,,,,[W4390933492]
W4390430072,,,"[W4390649572, W4390649572, W4390649572]","[W4390649572, W4390649572, W4390649572]"


In [88]:
alphafold_data_exploded.to_csv("baseline_pubmed.csv")

In [89]:
big_dict = {}

level_0 = alphafold_data_exploded.loc[alphafold_data_exploded.level==0]

unique_parent = level_0.parent_pmid.unique()

for parent in unique_parent:

    big_dict[parent] = {}
    # find all rows in level_0 with relevant key parent id
    matches = level_0.loc[level_0.parent_pmid==parent]["pmid"].to_list()

    for item in matches:
        big_dict[parent].update({item: {}})

In [90]:
big_dict


[1m{[0m
    [32m'34265844'[0m: [1m{[0m
        [32m'34791371'[0m: [1m{[0m[1m}[0m,
        [32m'35081335'[0m: [1m{[0m[1m}[0m,
        [32m'34718731'[0m: [1m{[0m[1m}[0m,
        [32m'35273146'[0m: [1m{[0m[1m}[0m,
        [32m'34759384'[0m: [1m{[0m[1m}[0m,
        [32m'34533838'[0m: [1m{[0m[1m}[0m,
        [32m'34599769'[0m: [1m{[0m[1m}[0m,
        [32m'34644530'[0m: [1m{[0m[1m}[0m,
        [32m'36420884'[0m: [1m{[0m[1m}[0m,
        [32m'35900023'[0m: [1m{[0m[1m}[0m,
        [32m'37733863'[0m: [1m{[0m[1m}[0m,
        [32m'35679397'[0m: [1m{[0m[1m}[0m,
        [32m'34861190'[0m: [1m{[0m[1m}[0m,
        [32m'35199061'[0m: [1m{[0m[1m}[0m,
        [32m'35953649'[0m: [1m{[0m[1m}[0m,
        [32m'35365602'[0m: [1m{[0m[1m}[0m,
        [32m'36419248'[0m: [1m{[0m[1m}[0m,
        [32m'35951700'[0m: [1m{[0m[1m}[0m,
        [32m'36108048'[0m: [1m{[0m[1m}[0m,
        [32m'35679401'

In [91]:
alphafold_data_exploded.intent.value_counts()


intent
methodology    [1;36m31572[0m
result         [1;36m10748[0m
Name: count, dtype: int64

In [92]:
def build_dict(df, parent, level):
    # Base case: if level is greater than 3, return an empty dictionary
    if level > 3:
        return {}
    
    # Find all rows in the current level with the relevant parent id
    matches = df.loc[df['parent_pmid'] == parent]['pmid'].to_list()
    
    # Initialize the dictionary for the current parent
    dict_ = {}
    
    # For each match, recursively build the dictionary for the next level
    for item in matches:
        dict_[item] = build_dict(df, item, level + 1)
    
    return dict_

# Initialize the dictionary with the AlphaFold paper
big_dict = build_dict(alphafold_data_exploded, '34265844', 0)

big_dict


[1m{[0m
    [32m'34791371'[0m: [1m{[0m
        [32m'36191189'[0m: [1m{[0m[1m}[0m,
        [32m'37522378'[0m: [1m{[0m[1m}[0m,
        [32m'36951104'[0m: [1m{[0m[1m}[0m,
        [32m'38141614'[0m: [1m{[0m[1m}[0m,
        [32m'38194144'[0m: [1m{[0m[1m}[0m,
        [32m'36606712'[0m: [1m{[0m[1m}[0m,
        [32m'38015436'[0m: [1m{[0m[1m}[0m,
        [32m'38168668'[0m: [1m{[0m[1m}[0m,
        [32m'35932489'[0m: [1m{[0m[1m}[0m,
        [32m'36866427'[0m: [1m{[0m[1m}[0m,
        [32m'37409874'[0m: [1m{[0m[1m}[0m,
        [32m'37665209'[0m: [1m{[0m[1m}[0m,
        [32m'38015445'[0m: [1m{[0m[1m}[0m,
        [32m'38063081'[0m: [1m{[0m[1m}[0m,
        [32m'37688380'[0m: [1m{[0m[1m}[0m,
        [32m'35844795'[0m: [1m{[0m[1m}[0m,
        [32m'35357484'[0m: [1m{[0m[1m}[0m,
        [32m'35518358'[0m: [1m{[0m[1m}[0m,
        [32m'36110544'[0m: [1m{[0m[1m}[0m,
        [32m'36786034'

In [68]:
# a limiting factor is that to avoid duplication and explosion in chains, we kept the works unique, ie. we only register when they appear once, irrespective of the type of strength. This would require running the old S2 pipeline. 
# Would the old pipeline work? Ultimately we care about just doing non-uniques at the level (we shouldn't care if it appears at a level below)

In [93]:
def flatten_dict(d, parent_keys=[], sep='_'):
    rows = []
    for k, v in d.items():
        new_keys = parent_keys + [k]
        if isinstance(v, dict) and v:
            rows.extend(flatten_dict(v, new_keys, sep=sep))
        else:
            row = {f'level_{i}': key for i, key in enumerate(new_keys)}
            rows.append(row)
    return rows

# Flatten the dictionary
flat_rows = flatten_dict(big_dict)

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(flat_rows)

In [94]:
df["level_neg1"] = "34265844"

In [95]:
df = df[
    [
        "level_neg1",
        "level_0",
        "level_1",
        "level_2",
        "level_3",
    ]
]

In [97]:
# Create a copy of the DataFrame to avoid modifying the original one
df_copy = df.copy()

# For each level, merge the DataFrame with the alphafold_data_exploded DataFrame
for i in range(4):
    if i == 0:

        df_copy = df_copy.merge(
            alphafold_data_exploded[['parent_pmid', 'pmid', 'intent']],
            left_on=[f'level_neg1', f'level_0'],
            right_on=['parent_pmid', 'pmid'],
            how='left'
        )
    else:
        df_copy = df_copy.merge(
            alphafold_data_exploded[['parent_pmid', 'pmid', 'intent']],
            left_on=[f'level_{i-1}', f'level_{i}'],
            right_on=['parent_pmid', 'pmid'],
            how='left'
        )
    # Rename the intent column
    df_copy.rename(columns={'intent': f'intent_{i}'}, inplace=True)

    # drop duplicate columns
    df_copy.drop(columns=['parent_pmid', 'pmid'], inplace=True)

df_copy

Unnamed: 0,level_neg1,level_0,level_1,level_2,level_3,intent_0,intent_1,intent_2,intent_3
0,34265844,34791371,36191189,,,methodology,methodology,,
1,34265844,34791371,36191189,,,methodology,methodology,,
2,34265844,34791371,36191189,,,methodology,methodology,,
3,34265844,34791371,37522378,,,methodology,methodology,,
4,34265844,34791371,36951104,,,methodology,methodology,,
...,...,...,...,...,...,...,...,...,...
18014,34265844,36543879,,,,methodology,,,
18015,34265844,36555373,,,,methodology,,,
18016,34265844,36551274,,,,methodology,,,
18017,34265844,36671428,,,,methodology,,,


In [190]:
df_copy.to_csv("2_all_chains.csv")

In [108]:
# substitute empty string in intent for None
for i in range(4):
    df_copy[f'intent_{i}'] = df_copy[f'intent_{i}'].apply(lambda x: x if x != '' else None)

In [109]:
import numpy as np
# In the intent columns, replace NaN with 'N/A'
for i in range(4):
    df_copy[f'intent_{i}'] = df_copy[f'intent_{i}'].apply(lambda x: 'N/A' if x is np.nan else x)   

In [110]:
df_copy[df_copy["level_0"]=="37089814"]

Unnamed: 0,level_neg1,level_0,level_1,level_2,level_3,intent_0,intent_1,intent_2,intent_3


In [112]:
def breaks_chain(row):
    intents = ['intent_0', 'intent_1', 'intent_2', 'intent_3']
    for i in range(len(intents) - 1):
        if (pd.isna(row[intents[i]]) or row[intents[i]] == 'N/A') and not pd.isna(row[intents[i+1]]) and row[intents[i+1]] != 'N/A':
            return True
    return False

df_copy_full = df_copy[~df_copy.apply(breaks_chain, axis=1)]

# drop rows that have all four intent as nan or N/A
df_copy_full = df_copy_full[~df_copy_full[['intent_0', 'intent_1', 'intent_2', 'intent_3']].applymap(lambda x: x == 'N/A' or pd.isna(x)).all(axis=1)]

In [113]:
df_copy_full

Unnamed: 0,level_neg1,level_0,level_1,level_2,level_3,intent_0,intent_1,intent_2,intent_3
0,34265844,34791371,36191189,,,methodology,methodology,,
1,34265844,34791371,36191189,,,methodology,methodology,,
2,34265844,34791371,36191189,,,methodology,methodology,,
3,34265844,34791371,37522378,,,methodology,methodology,,
4,34265844,34791371,36951104,,,methodology,methodology,,
...,...,...,...,...,...,...,...,...,...
18014,34265844,36543879,,,,methodology,,,
18015,34265844,36555373,,,,methodology,,,
18016,34265844,36551274,,,,methodology,,,
18017,34265844,36671428,,,,methodology,,,


In [114]:
df_copy_full[df_copy_full["level_0"]=="37089814"]

Unnamed: 0,level_neg1,level_0,level_1,level_2,level_3,intent_0,intent_1,intent_2,intent_3


In [115]:
df_copy_full.to_csv("2_complete_chains.csv")

In [116]:
# drop if at least one intent (intent_0, intent_1, intent_2, intent_3) is "methodology"
df_copy_methods = df_copy_full[~df_copy_full[['intent_0', 'intent_1', 'intent_2', 'intent_3']].isin(['background']).any(axis=1)]

In [117]:
df_copy_methods

Unnamed: 0,level_neg1,level_0,level_1,level_2,level_3,intent_0,intent_1,intent_2,intent_3
0,34265844,34791371,36191189,,,methodology,methodology,,
1,34265844,34791371,36191189,,,methodology,methodology,,
2,34265844,34791371,36191189,,,methodology,methodology,,
3,34265844,34791371,37522378,,,methodology,methodology,,
4,34265844,34791371,36951104,,,methodology,methodology,,
...,...,...,...,...,...,...,...,...,...
18014,34265844,36543879,,,,methodology,,,
18015,34265844,36555373,,,,methodology,,,
18016,34265844,36551274,,,,methodology,,,
18017,34265844,36671428,,,,methodology,,,


In [118]:
df_copy_methods.to_csv("3_relevant_chains.csv")

In [119]:
def transform_long(data, intents):

    # select all level_0 with an intent_0 that is either result or methodology
    level_0 = data.loc[
        df_copy['intent_0'].isin(intents)
    ]

    level_0_pmids = list(set(level_0["level_0"].to_list()))

    # do the same for level_1 if level_0 is result and methodology
    level_1 = level_0.loc[
        level_0['intent_1'].isin(intents)
    ]

    level_1_pmids = list(set(level_1["level_1"].to_list()))

    # do the same for level_2 if level_1 is result and methodology
    level_2 = level_1.loc[
        level_1['intent_2'].isin(intents)
    ]

    level_2_pmids = list(set(level_2["level_2"].to_list()))

    # do the same for level_3 if level_2 is result and methodology
    level_3 = level_2.loc[
        level_2['intent_3'].isin(intents)
    ]

    level_3_pmids = list(set(level_3["level_3"].to_list()))

    # Concatenate the lists of PMIDs
    pmids = level_0_pmids + level_1_pmids + level_2_pmids + level_3_pmids

    # Create a list of levels
    levels = [0]*len(level_0_pmids) + [1]*len(level_1_pmids) + [2]*len(level_2_pmids) + [3]*len(level_3_pmids)

    # Create a dataframe from the lists of PMIDs and levels
    df_pmids = pd.DataFrame({
        'pmids': pmids,
        'level': levels
    })

    df_pmids["pmids"] = df_pmids["pmids"].astype(int)

    df_pmids.drop_duplicates(subset=['pmids'], inplace=True)

    return df_pmids

In [120]:
df_pmids_all = transform_long(df_copy_full, ["result", "methodology", "background"])

In [122]:
df_pmids_all

Unnamed: 0,pmids,level
0,36844395,0
1,37338376,0
2,36829979,0
3,35668373,0
4,36338847,0
...,...,...
3374,36904782,3
3375,37766145,3
3376,37632651,3
3377,36366583,3


In [123]:
df_pmids = transform_long(df_copy_full, ["result", "methodology"])

In [125]:
# iterate over pmids to find the matching icites "pmid" value, and return "cited_by_clin"
df_pmids["cited_by_clin"] = df_pmids["pmids"].apply(lambda x: icites.loc[icites["pmid"]==x]["cited_by_clin"].values[0] if x in icites["pmid"].to_list() else None)

In [126]:
df_pmids_nona = df_pmids[df_pmids["cited_by_clin"].astype(str) != "nan"]

In [128]:
# split cited_by_clin by " " and then explode the df
df_pmids_nona["cited_by_clin"] = df_pmids_nona["cited_by_clin"].apply(lambda x: x.split(" "))
df_pmids_nona = df_pmids_nona.explode("cited_by_clin")

In [129]:
df_pmids_nona["ca_link"] = df_pmids_nona["cited_by_clin"].apply(lambda x: f"https://pubmed.ncbi.nlm.nih.gov/{x}")

In [131]:
df_pmids_nona.rename(columns={"pmids":"paper_pmid"}, inplace=True)

In [132]:
df_pmids_nona.reset_index(inplace=True, drop=True)

In [140]:
# drop pmids, cited_by_clin duplicates
df_pmids_nona.drop_duplicates(subset=["paper_pmid", "cited_by_clin"], inplace=True)

## Label CA with Biopython

In [144]:
import pandas as pd
from Bio import Entrez
Entrez.email = "david.ampudia@nesta.org.uk" 

In [145]:
df_pmids_nona["pmid"] = df_pmids_nona["ca_link"].str.replace("https://pubmed.ncbi.nlm.nih.gov/", "")

In [146]:
def get_entrez_ptype(pmid):
    stream = Entrez.efetch(db="pubmed", id=pmid, retmax="1")
    record = Entrez.read(stream)
    return str(record["PubmedArticle"][0]["MedlineCitation"]["Article"].get("PublicationTypeList")[0])

In [147]:
df_pmids_nona["publication_type"] = df_pmids_nona["pmid"].apply(get_entrez_ptype)

In [148]:
df_pmids_nona.to_csv("4_clinical_trials_using_AF.csv")

In [151]:
# get the counts of each level and publication type pair
output_types = df_pmids_nona.groupby(["level", "publication_type"]).size().reset_index(name='counts')

output_types.to_csv("5_output_types.csv")

In [171]:
# match the pmid with the pmid in alphafold_data
df_pmids_nona["pmid"] = df_pmids_nona["pmid"].astype(str)

df_pmids_nona = df_pmids_nona.merge(alphafold_data_exploded[["pmid", "intent"]], on="pmid", how="left")


In [173]:
alphafold_data_exploded

Unnamed: 0,parent_id,id,pmid,level,doi,publication_date,mesh_terms,cited_by_count,authorships,strength,parent_level,parent_pmid,parent_doi,intent,context
0,W3177828909,W3211795435,34791371,0,10.1093/nar/gkab1061,2021-11-17,"[[D030562, Databases, Protein], [D017510, Prot...",3590,"[[A5054254768, I1303153112, first], [A50577045...",{'context': 'The methodology of AlphaFold and ...,-1,34265844,10.1038/s41586-021-03819-2,methodology,The methodology of AlphaFold and insights gain...
1,W3177828909,W4206153788,35081335,0,10.1016/j.cell.2021.12.046,2022-02-01,,786,"[[A5072476030, I1336263701, first], [A50724760...",{'context': 'The structural impact of the nume...,-1,34265844,10.1038/s41586-021-03819-2,methodology,The structural impact of the numerous mutation...
2,W3177828909,W4206153788,35081335,0,10.1016/j.cell.2021.12.046,2022-02-01,,786,"[[A5072476030, I1336263701, first], [A50724760...",{'context': 'Models of Omicron RBD andNTDwere ...,-1,34265844,10.1038/s41586-021-03819-2,methodology,Models of Omicron RBD andNTDwere derived using...
3,W3177828909,W4206153788,35081335,0,10.1016/j.cell.2021.12.046,2022-02-01,,786,"[[A5072476030, I1336263701, first], [A50724760...","{'context': '01 (Jumper et al., 2021) download...",-1,34265844,10.1038/s41586-021-03819-2,methodology,"01 (Jumper et al., 2021) downloaded and instal..."
4,W3177828909,W3211014030,34718731,0,10.1093/nar/gkab951,2021-10-28,"[[D016208, Databases, Factual], [D000818, Anim...",371,"[[A5063323901, , first]]","{'context': '…from all resources in CNCB-NGDC,...",-1,34265844,10.1038/s41586-021-03819-2,methodology,"…from all resources in CNCB-NGDC, 39 partner r..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42315,W4319986951,W4382723733,37447859,3,10.3390/s23136010,2023-06-28,"[[D060825, Cognitive Dysfunction], [D003704, D...",0,"[[A5059820754, I12832649, first], [A5078646311...",{'context': 'The cognitive therapy method usin...,2,,10.1109/access.2023.3241628,methodology,The cognitive therapy method using the metaver...
42316,W4319986951,W4382723733,37447859,3,10.3390/s23136010,2023-06-28,"[[D060825, Cognitive Dysfunction], [D003704, D...",0,"[[A5059820754, I12832649, first], [A5078646311...",{'context': 'The cognitive therapy method usin...,2,,10.1109/access.2023.3241628,methodology,The cognitive therapy method using the metaver...
42317,W4319986951,W4382723733,37447859,3,10.3390/s23136010,2023-06-28,"[[D060825, Cognitive Dysfunction], [D003704, D...",0,"[[A5059820754, I12832649, first], [A5078646311...",{'context': 'The cognitive therapy method usin...,2,,10.1109/access.2023.3241628,methodology,The cognitive therapy method using the metaver...
42318,W4317757150,W4384461266,37560710,3,10.1016/j.heliyon.2023.e18153,2023-08-01,,0,"[[A5041389967, I17404232, first], [A5041389967...","{'context': 'Besides, 62 nm ZnO NPs fabricated...",2,36747522,10.1016/j.heliyon.2023.e13089,result,"Besides, 62 nm ZnO NPs fabricated from pH 10 w..."


In [172]:
df_pmids_nona

Unnamed: 0,paper_pmid,level,cited_by_clin,ca_link,pmid,publication_type,intent
0,35655773,0,36404171,https://pubmed.ncbi.nlm.nih.gov/36404171,36404171,Randomized Controlled Trial,
1,35655773,0,36211366,https://pubmed.ncbi.nlm.nih.gov/36211366,36211366,Journal Article,
2,35104687,0,38254046,https://pubmed.ncbi.nlm.nih.gov/38254046,38254046,Observational Study,
3,34718731,0,37094424,https://pubmed.ncbi.nlm.nih.gov/37094424,37094424,Clinical Trial,
4,34718731,0,36357415,https://pubmed.ncbi.nlm.nih.gov/36357415,36357415,"Clinical Trial, Phase II",
5,34718731,0,36532042,https://pubmed.ncbi.nlm.nih.gov/36532042,36532042,Observational Study,
6,34718731,0,36723013,https://pubmed.ncbi.nlm.nih.gov/36723013,36723013,Randomized Controlled Trial,
7,34718731,0,37580352,https://pubmed.ncbi.nlm.nih.gov/37580352,37580352,"Clinical Trial, Phase II",
8,35081335,0,35710572,https://pubmed.ncbi.nlm.nih.gov/35710572,35710572,Clinical Trial Protocol,
9,35081335,0,36411267,https://pubmed.ncbi.nlm.nih.gov/36411267,36411267,Randomized Controlled Trial,


#### Doing all chains

In [312]:
stream = Entrez.efetch(db="pubmed", id="34265844", retmax="1")
record = Entrez.read(stream)

## Clinical Trials from NIH

In [152]:
clinical_trials_links_to_papers = catalog.load("nih.data_processing.clinical_trials_links_to_papers.intermediate")

In [153]:
clinical_trials_links_to_papers["ref_pmid"] = clinical_trials_links_to_papers["ref_pmid"].fillna("0")
clinical_trials_links_to_papers["ref_pmid"] = clinical_trials_links_to_papers["ref_pmid"].astype(np.int64).astype(str)

In [154]:
doi_matches = (
    alphafold_data
    .dropna(subset=['doi'])
    .merge(clinical_trials_links_to_papers.dropna(subset=['ref_doi']), how="inner", left_on="doi", right_on="ref_doi")
)

# Find matches based on pmid
pmid_matches = (
    alphafold_data
    .dropna(subset=['pmid'])
    .merge(clinical_trials_links_to_papers.dropna(subset=['ref_pmid']), how="inner", left_on="pmid", right_on="ref_pmid")
)

# Concat pmid and doi matches and removes duplicates
papers_that_cite_af_cited_in_ct = pd.concat([doi_matches, pmid_matches]).drop_duplicates(subset="id")

In [155]:
# explode strength
papers_that_cite_af_cited_in_ct =papers_that_cite_af_cited_in_ct.explode("strength") 

papers_that_cite_af_cited_in_ct["intent"] = papers_that_cite_af_cited_in_ct["strength"].apply(lambda x: x["intent"] if x else None)
papers_that_cite_af_cited_in_ct["context"] = papers_that_cite_af_cited_in_ct["strength"].apply(lambda x: x["context"] if x else None)

# filter rows to have unique ids. We should prioritise rows with intent being result or methodology, then followed by background, then followed by other
papers_that_cite_af_cited_in_ct = papers_that_cite_af_cited_in_ct.sort_values("intent", ascending=False).drop_duplicates(subset="id").reset_index(drop=True)

In [156]:
papers_that_cite_af_cited_in_ct["publication_type"] = papers_that_cite_af_cited_in_ct["pmid"].apply(get_entrez_ptype)

In [324]:
z = papers_that_cite_af_cited_in_ct[
    ['id', 'doi', 'pmid', 'level',
       'publication_date', 'mesh_terms', 'cited_by_count', 'authorships',
       'parent_level', 'ref_citation', "publication_type"]
]
z.to_csv("clinical_trials_citations.csv")

In [157]:
df_pmids_strong = transform_long(df_copy_full, ["result", "methodology"])
df_pmids_all = transform_long(df_copy_full, ["result", "methodology", "background"])

In [158]:
df_pmids["pmids"]


[1;36m0[0m       [1;36m36844395[0m
[1;36m1[0m       [1;36m37338376[0m
[1;36m2[0m       [1;36m36829979[0m
[1;36m3[0m       [1;36m35668373[0m
[1;36m4[0m       [1;36m36338847[0m
          [33m...[0m   
[1;36m3374[0m    [1;36m36904782[0m
[1;36m3375[0m    [1;36m37766145[0m
[1;36m3376[0m    [1;36m37632651[0m
[1;36m3377[0m    [1;36m36366583[0m
[1;36m3378[0m    [1;36m37374958[0m
Name: pmids, Length: [1;36m3379[0m, dtype: int64

In [159]:
papers_that_cite_af_cited_in_ct["pmid"]


[1;36m0[0m      [1;36m36385449[0m
[1;36m1[0m      [1;36m35305258[0m
[1;36m2[0m      [1;36m35617368[0m
[1;36m3[0m      [1;36m36899952[0m
[1;36m4[0m      [1;36m36200701[0m
         [33m...[0m   
[1;36m236[0m    [1;36m35605625[0m
[1;36m237[0m    [1;36m37154020[0m
[1;36m238[0m    [1;36m36403582[0m
[1;36m239[0m    [1;36m36473651[0m
[1;36m240[0m    [1;36m35764593[0m
Name: pmid, Length: [1;36m241[0m, dtype: object

In [160]:
# check how many of the pmid are in df_pmids
df_pmids_nona[df_pmids_nona["pmid"].isin(papers_that_cite_af_cited_in_ct["pmid"])]["pmid"].to_list()

[1m[[0m[32m'35710572'[0m, [32m'35443106'[0m, [32m'35688164'[0m, [32m'35798000'[0m, [32m'35834252'[0m[1m][0m

In [164]:
# check how many of the pmid are in df_pmids
df_pmids_nona[df_pmids_nona["paper_pmid"].isin(papers_that_cite_af_cited_in_ct["pmid"].astype(int))]

Unnamed: 0,paper_pmid,level,cited_by_clin,ca_link,pmid,publication_type


In [166]:
# check how many of the pmid are in df_pmids
df_pmids_strong[df_pmids_strong["pmids"].isin(papers_that_cite_af_cited_in_ct["pmid"].astype(int))]

Unnamed: 0,pmids,level


In [168]:
# check how many of the pmid are in df_pmids
df_pmids_all[df_pmids_all["pmids"].isin(papers_that_cite_af_cited_in_ct["pmid"].astype(int))]

Unnamed: 0,pmids,level


In [204]:
df_pmids[df_pmids["pmids"] == 37089814]

Unnamed: 0,pmids,level


In [201]:
papers_that_cite_af_cited_in_ct

Unnamed: 0,parent_id,parent_doi,parent_pmid,id,doi,pmid,level,publication_date,mesh_terms,cited_by_count,authorships,parent_level,strength,nct_id,ref_pmid,ref_doi,ref_citation,ref_type,ref_retraction_list
0,W3177828909,10.1038/s41586-021-03819-2,34265844,W4207015910,10.3390/nu14020274,35057455,0,2022-01-10,"[[D000086382, COVID-19], [D000086382, COVID-19...",39,"[[A5075210718, I4210096221, first], [A50874775...",-1,,NCT05474144,35057455,10.3390/nu14020274,"Nguyen QV, Chong LC, Hor YY, Lew LC, Rather IA...",background,
1,W3177828909,10.1038/s41586-021-03819-2,34265844,W4307297657,10.1016/j.ejmech.2022.114861,36332549,0,2022-12-01,"[[D002289, Carcinoma, Non-Small-Cell Lung], [D...",18,"[[A5053410112, I103890011, first], [A503025490...",-1,,NCT06134388,36332549,10.1016/j.ejmech.2022.114861,"Yin L, Liu P, Jin Y, Ning Z, Yang Y, Gao H. Fe...",background,
2,W3177828909,10.1038/s41586-021-03819-2,34265844,W4311811125,10.1186/s13023-022-02592-3,36528660,0,2022-12-17,"[[D018981, Congenital Disorders of Glycosylati...",7,"[[A5053607431, I169912796, first], [A507984668...",-1,,NCT06199531,36528660,10.1186/s13023-022-02592-3,"Stanclift CR, Dwight SS, Lee K, Eijkenboom QL,...",background,
3,W3177828909,10.1038/s41586-021-03819-2,34265844,W4366395412,10.1093/narcan/zcad017,37089814,0,2023-03-11,,2,"[[A5078774318, I107672454, first], [A501743967...",-1,[{'context': 'Perhaps the most popular example...,NCT06140992,37089814,10.1093/narcan/zcad017,"Zhang Z, Lu Y, Vosoughi S, Levy JJ, Christense...",result,
4,W4200519222,10.1101/2021.12.16.472979,,W4205797652,10.1021/acs.biochem.1c00758,34989554,1,2022-01-06,"[[D064307, Microbiota], [D014780, Viruses], [D...",7,"[[A5078081844, I4210140114, first], [A50780818...",0,,NCT05906641,34989554,10.1021/acs.biochem.1c00758,"Zhao X, Yang X, Hang HC. Chemoproteomic Analys...",result,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271,W4308959764,10.1158/2159-8290.cd-22-0342,36374194,W4365997950,10.1158/2159-8290.cd-22-1020,37061969,3,2023-04-16,"[[D000970, Antineoplastic Agents], [D009369, N...",6,"[[A5039574165, I2799503643, first], [A50623209...",2,,NCT06009835,37061969,10.1158/2159-8290.CD-22-1020,"Mundi PS, Dela Cruz FS, Grunn A, Diolaiti D, M...",background,
274,W4389475443,10.1371/journal.pone.0295418,38064447,W4375862750,10.1161/circulationaha.123.063901,37154020,3,2023-06-20,"[[D000086382, COVID-19], [D013927, Thrombosis]...",7,"[[A5047198456, I1283280774, first], [A50471984...",2,,NCT04508023,37154020,10.1161/CIRCULATIONAHA.123.063901,"Piazza G, Spyropoulos AC, Hsia J, Goldin M, To...",result,
277,W4295899573,10.15585/mmwr.mm7137a4,36107788,W4379380907,10.7326/m22-3565,37276589,3,2023-06-01,"[[D000086382, COVID-19], [D014728, Veterans], ...",23,"[[A5068844252, I165690674, first], [A503144217...",2,,NCT06160128,37276589,10.7326/M22-3565,"Bajema KL, Berry K, Streja E, Rajeevan N, Li Y...",result,
282,W4225295888,10.1001/jamanetworkopen.2022.9317,35482308,W4382362292,10.1097/phh.0000000000001780,37379511,3,2023-06-28,"[[D000086382, COVID-19], [D000328, Adult], [D0...",1,"[[A5007300429, I1311060795, first], [A50073004...",2,,NCT04437706,37379511,10.1097/PHH.0000000000001780,"Pasquale DK, Welsh W, Olson A, Yacoub M, Moody...",result,


In [200]:
papers_that_cite_af_cited_in_ct["strength"].iloc[3]


[1;35marray[0m[1m([0m[1m[[0m[1m{[0m[32m'context'[0m: [32m'Perhaps the most popular example of these technologies is the AI-powered protein folding prediction research [0m[32m([0m[32m20[0m[32m)[0m[32m.'[0m, [32m'influential'[0m: [3;91mFalse[0m, [32m'intent'[0m: [32m'background'[0m[1m}[0m[1m][0m,
      [33mdtype[0m=[35mobject[0m[1m)[0m