In [2]:
%load_ext kedro.ipython

In [3]:
import numpy as np
import pandas as pd

In [3]:
alphafold_data = catalog.load("oa.data_processing.depth.all.primary")

In [None]:
alphafold_data_int = catalog.load("oa.data_processing.depth.no_mesh.3.intermediate")

In [None]:
alphafold_data_int[alphafold_data_int["parent_id"]=="W3211014030"]

In [None]:
alphafold_data_int[alphafold_data_int["parent_id"]=="W3211014030"]

In [None]:
alphafold_data_int[alphafold_data_int["pmid"]=="34718731"]

In [None]:
icites = catalog.load("pubmed.data_processing.icite.intermediate")

In [None]:
alphafold_data_all = catalog.load("oa.data_processing.depth.intermediate")

In [None]:
alphafold_data_all[alphafold_data_all["level"]==0]["parent_id"].value_counts()

In [None]:
alphafold_data[alphafold_data["level"]==0]["parent_id"].value_counts()

In [None]:
# explode strength
alphafold_data_exploded = alphafold_data.explode("strength") 

In [None]:
alphafold_data_exploded.dropna(subset=["strength"], inplace=True) # added with the "all" data

In [None]:
alphafold_data_exploded["intent"] = alphafold_data_exploded["strength"].apply(lambda x: x["intent"] if x else None)
alphafold_data_exploded["context"] = alphafold_data_exploded["strength"].apply(lambda x: x["context"] if x else None)

In [None]:
# W3013924576

In [None]:
# Keep only citations whose intent signals substantive use of the parent work:
# drop rows whose intent is "background", empty (''), or missing.
#
# NOTE: an intent-priority de-duplication per work id
# (methodology > result > background > '' > missing) is currently disabled, so
# the temporary 'sort_order' helper column it relied on is no longer created
# (it used to be added and then dropped again without being used).
weak_intent = (
    alphafold_data_exploded["intent"].isna()
    | alphafold_data_exploded["intent"].isin(["background", ""])
)
alphafold_data_exploded = alphafold_data_exploded[~weak_intent]

In [None]:
# keep only if pmid is not null
alphafold_data_exploded = alphafold_data_exploded[alphafold_data_exploded["pmid"].notnull()]

In [None]:
alphafold_data_exploded.reset_index(drop=True, inplace=True)

In [None]:
# Group the data by 'parent_id' and 'level', and aggregate the child 'id's into lists
grouped = (
    alphafold_data_exploded
    .groupby(['parent_id', 'level'])['id']
    .apply(list)
    .reset_index()
)

# One row per parent, one column per level; each cell holds the list of child ids
pivot_id = grouped.pivot(index='parent_id', columns='level', values='id')

# Name the columns after the levels actually present in the data. Hard-coding
# four names raised a length-mismatch error whenever the data did not contain
# exactly four levels.
pivot_id.columns = [f'level_{int(level)}_id' for level in pivot_id.columns]

pivot_id

In [None]:
alphafold_data_exploded.to_csv("baseline_pubmed.csv")

In [None]:
# Map every level-0 parent to its direct citing papers:
# {parent_pmid: {child_pmid: {}, ...}, ...}
level_0 = alphafold_data_exploded.loc[alphafold_data_exploded.level == 0]

unique_parent = level_0.parent_pmid.unique()

big_dict = {}
for parent in unique_parent:
    # pmids of the level-0 rows whose parent is this paper
    children = level_0.loc[level_0.parent_pmid == parent]["pmid"].to_list()
    # each child starts with an empty subtree
    big_dict[parent] = {child: {} for child in children}

In [None]:
big_dict

In [None]:
alphafold_data_exploded.intent.value_counts()

In [None]:
alphafold_data_exploded[alphafold_data_exploded.level==0]["parent_id"].value_counts()

In [None]:
def build_dict(df, parent, level, max_level=3, _children=None):
    """Recursively build the nested citation tree hanging off ``parent``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'parent_pmid' and 'pmid' columns; every row is read as one
        parent -> child link.
    parent :
        Value looked up in 'parent_pmid'.
    level : int
        Depth of the current call; the root call passes 0.
    max_level : int, default 3
        Deepest level that is still expanded. Calls with ``level > max_level``
        return an empty dict (3 was previously hard-coded).
    _children : dict, optional
        Internal parent -> [children] lookup shared by the recursive calls.
        Callers should leave it unset.

    Returns
    -------
    dict
        ``{child_pmid: subtree}`` for every row whose 'parent_pmid' equals
        ``parent``; ``{}`` when there are none or the depth limit is exceeded.

    Notes
    -----
    Rows are matched on 'parent_pmid' only; no other column is consulted.
    The depth limit is also what terminates the recursion if the links
    contain a cycle.
    """
    # Base case: past the deepest level we want to expand
    if level > max_level:
        return {}

    # Build the parent -> children lookup once, instead of scanning the whole
    # DataFrame again for every node of the tree.
    if _children is None:
        _children = {}
        links = df.dropna(subset=['parent_pmid'])  # a missing parent never matches
        for link_parent, child in zip(links['parent_pmid'], links['pmid']):
            _children.setdefault(link_parent, []).append(child)

    # For each child, recursively build the subtree of the next level
    return {
        child: build_dict(df, child, level + 1, max_level, _children)
        for child in _children.get(parent, [])
    }

# Initialize the dictionary with the AlphaFold paper #TOFIX
big_dict = build_dict(alphafold_data_exploded, '34265844', 0)

big_dict

In [None]:
# A limiting factor: to avoid duplication and an explosion in the number of chains, we kept the works unique, i.e. we only register a work once, irrespective of the type of strength. Changing this would require running the old S2 pipeline.
# Would the old pipeline work? Ultimately we only care about allowing non-unique works within a level (we shouldn't care if a work appears again at a level below).

In [None]:
def flatten_dict(d, parent_keys=None, sep='_'):
    """Flatten a nested dict into one row per root-to-leaf path.

    Parameters
    ----------
    d : dict
        Nested mapping; a value that is an empty dict (or not a dict at all)
        marks the end of a path.
    parent_keys : sequence, optional
        Keys already traversed above ``d``. Defaults to no keys. ``None`` is
        used as the default instead of a shared mutable ``[]``.
    sep : str, default '_'
        Not used by the implementation; kept so existing calls keep working.

    Returns
    -------
    list of dict
        One dict per path, keyed 'level_0', 'level_1', ... in traversal order.
    """
    path = [] if parent_keys is None else list(parent_keys)
    rows = []
    for key, subtree in d.items():
        new_keys = path + [key]
        if isinstance(subtree, dict) and subtree:
            # non-empty subtree: keep descending
            rows.extend(flatten_dict(subtree, new_keys, sep=sep))
        else:
            # leaf: emit the full path as one row
            rows.append({f'level_{i}': k for i, k in enumerate(new_keys)})
    return rows

# Flatten the dictionary
flat_rows = flatten_dict(big_dict)

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(flat_rows)

In [None]:
df

In [None]:
df["level_neg1"] = "34265844"

In [None]:
# Order the chain columns root-first. `reindex` is used rather than `df[[...]]`
# so the cell does not raise a KeyError when no chain reaches the deepest
# levels: a level column that is absent is created and filled with NaN.
df = df.reindex(
    columns=[
        "level_neg1",
        "level_0",
        "level_1",
        "level_2",
        "level_3",
    ]
)

In [None]:
df

In [None]:
# Work on a copy so that `df` keeps the bare chain table
df_copy = df.copy()

# Columns of one chain, root first. Link i joins the paper in position i to the
# paper in position i + 1; the root column is called 'level_neg1', which is why
# the names are listed explicitly instead of being derived from i - 1.
chain_columns = ['level_neg1', 'level_0', 'level_1', 'level_2', 'level_3']
links = alphafold_data_exploded[['parent_pmid', 'pmid', 'intent']]

# For each link of the chain, look up its intent and store it as intent_<i>
for i, (parent_col, child_col) in enumerate(zip(chain_columns, chain_columns[1:])):
    df_copy = (
        df_copy
        .merge(
            links,
            left_on=[parent_col, child_col],
            right_on=['parent_pmid', 'pmid'],
            how='left'
        )
        # Rename the intent column so the next merge does not collide with it
        .rename(columns={'intent': f'intent_{i}'})
        # drop the join keys brought in from the right-hand side
        .drop(columns=['parent_pmid', 'pmid'])
    )

df_copy

In [None]:
df_copy.to_csv("2_all_chains.csv")

In [None]:
# substitute empty string in intent for None
for i in range(4):
    df_copy[f'intent_{i}'] = df_copy[f'intent_{i}'].apply(lambda x: x if x != '' else None)

In [None]:
import numpy as np
# In the intent columns, replace missing values with 'N/A'.
# `fillna` replaces the previous `x is np.nan` identity test, which only
# recognises the np.nan singleton and silently skips any other NaN object.
# Note that `fillna` also fills None values.
for i in range(4):
    df_copy[f'intent_{i}'] = df_copy[f'intent_{i}'].fillna('N/A')

In [None]:
df_copy[df_copy["level_0"]=="37089814"]

In [None]:
def breaks_chain(row):
    """Tell whether a chain row has a gap in its intents.

    A row "breaks" when some intent among intent_0..intent_2 is blank (null
    according to ``pd.isna`` or the string 'N/A') while the intent that
    immediately follows it is not blank.

    ``row`` is any mapping-like object indexable by the four intent column
    names. Returns True for a broken chain, False otherwise.
    """
    def is_blank(value):
        return pd.isna(value) or value == 'N/A'

    columns = ('intent_0', 'intent_1', 'intent_2', 'intent_3')
    # walk over consecutive (current, following) intent pairs
    for current, following in zip(columns, columns[1:]):
        if is_blank(row[current]) and not is_blank(row[following]):
            return True
    return False

intent_columns = ['intent_0', 'intent_1', 'intent_2', 'intent_3']

# keep only the chains without a gap in their intents
df_copy_full = df_copy[~df_copy.apply(breaks_chain, axis=1)]

# drop rows that have all four intents as nan or N/A
# (vectorised isna()/eq() instead of the deprecated DataFrame.applymap)
blank_intent = df_copy_full[intent_columns].isna() | df_copy_full[intent_columns].eq('N/A')
df_copy_full = df_copy_full[~blank_intent.all(axis=1)]

In [None]:
df_copy_full

In [None]:
df_copy_full[df_copy_full["level_0"]=="37089814"]

In [None]:
df_copy_full.to_csv("2_complete_chains.csv")

In [None]:
# drop chains in which at least one intent (intent_0, intent_1, intent_2, intent_3) is "background"
df_copy_methods = df_copy_full[~df_copy_full[['intent_0', 'intent_1', 'intent_2', 'intent_3']].isin(['background']).any(axis=1)]

In [None]:
df_copy_methods

In [None]:
df_copy_methods.to_csv("3_relevant_chains.csv")

In [None]:
def transform_long(data, intents, n_levels=4):
    """Turn the wide chain table into a long (pmid, level) table.

    Starting from all rows of ``data``, each level keeps only the rows that
    survived the previous level and whose ``intent_<level>`` is in ``intents``,
    then collects the distinct values of ``level_<level>`` from them.

    Parameters
    ----------
    data : pandas.DataFrame
        Wide table with ``level_<k>`` and ``intent_<k>`` columns for
        ``k`` in ``range(n_levels)``.
    intents : list-like
        Intent labels that count as a valid link.
    n_levels : int, default 4
        Number of levels to walk (4 reproduces the previous hard-coded
        level_0..level_3).

    Returns
    -------
    pandas.DataFrame
        Columns 'pmids' (cast to int) and 'level'. A pmid collected at several
        levels is kept once, at the shallowest level. Rows are ordered by
        level, then by first appearance in ``data``.
    """
    pmids = []
    levels = []

    surviving = data
    for level in range(n_levels):
        # Filter on `data` itself. The first filter previously read the global
        # `df_copy`, so the result silently depended on notebook state.
        surviving = surviving.loc[surviving[f'intent_{level}'].isin(intents)]

        # distinct pmids at this level, in order of first appearance
        # (deterministic, unlike iterating over a set)
        level_pmids = surviving[f'level_{level}'].drop_duplicates().to_list()

        pmids.extend(level_pmids)
        levels.extend([level] * len(level_pmids))

    # Create a dataframe from the lists of PMIDs and levels
    df_pmids = pd.DataFrame({
        'pmids': pmids,
        'level': levels
    })

    df_pmids["pmids"] = df_pmids["pmids"].astype(int)

    # shallower levels come first, so the first occurrence is the one to keep
    return df_pmids.drop_duplicates(subset=['pmids'])

In [None]:
df_pmids_all = transform_long(df_copy_full, ["result", "methodology", "background"])

In [None]:
df_pmids_all

In [None]:
df_pmids = transform_long(df_copy_full, ["result", "methodology"])

In [None]:
df_pmids["pmids"]

In [None]:
icites.head(2)

In [None]:
# For each paper, fetch the "cited_by_clin" value of the iCite row with the same
# pmid; papers without a match get a missing value. A one-off pmid-indexed
# lookup replaces the per-row scan of `icites`, which rebuilt the full pmid list
# for every paper. When a pmid occurs more than once in iCite the first row
# wins, as before.
cited_by_clin_lookup = icites.drop_duplicates(subset="pmid").set_index("pmid")["cited_by_clin"]
df_pmids["cited_by_clin"] = df_pmids["pmids"].map(cited_by_clin_lookup)

In [None]:
# Keep only the papers that have clinical citations: drop missing values
# (NaN / None) as well as the literal string "nan".
has_clinical_citations = (
    df_pmids["cited_by_clin"].notnull()
    & (df_pmids["cited_by_clin"].astype(str) != "nan")
)

# .copy() makes this an independent frame, so the column assignments in the
# following cells do not write into a slice of df_pmids.
df_pmids_nona = df_pmids[has_clinical_citations].copy()

In [None]:
# split cited_by_clin by " " and then explode the df
df_pmids_nona["cited_by_clin"] = df_pmids_nona["cited_by_clin"].apply(lambda x: x.split(" "))
df_pmids_nona = df_pmids_nona.explode("cited_by_clin")

In [None]:
df_pmids_nona["ca_link"] = df_pmids_nona["cited_by_clin"].apply(lambda x: f"https://pubmed.ncbi.nlm.nih.gov/{x}")

In [None]:
df_pmids_nona.rename(columns={"pmids":"paper_pmid"}, inplace=True)

In [None]:
df_pmids_nona.reset_index(inplace=True, drop=True)

In [None]:
# drop pmids, cited_by_clin duplicates
df_pmids_nona.drop_duplicates(subset=["paper_pmid", "cited_by_clin"], inplace=True)

## Label CA with Biopython

In [None]:
import pandas as pd
from Bio import Entrez
Entrez.email = "david.ampudia@nesta.org.uk" 

In [None]:
# Recover the clinical-article pmid by stripping the PubMed URL prefix from `ca_link`.
# regex=False: the prefix is a literal string. Read as a regular expression its
# dots would match any character, and the default of `regex` differs between
# pandas versions.
df_pmids_nona["pmid"] = df_pmids_nona["ca_link"].str.replace("https://pubmed.ncbi.nlm.nih.gov/", "", regex=False)

In [None]:
def get_entrez_ptype(pmid):
    """Return the first publication type PubMed lists for ``pmid``.

    The record is fetched with ``Entrez.efetch`` and parsed with
    ``Entrez.read``; the first entry of the article's "PublicationTypeList"
    is returned as a string.

    Returns None instead of raising when ``pmid`` is missing (None / NaN),
    when the response holds no "PubmedArticle" entry, or when the article has
    no publication types. Network and parsing errors still propagate.
    """
    # a missing pmid cannot be looked up
    if pmid is None or (isinstance(pmid, float) and pmid != pmid):
        return None

    # retmode is set explicitly because Entrez.read parses XML
    stream = Entrez.efetch(db="pubmed", id=pmid, retmode="xml", retmax="1")
    try:
        record = Entrez.read(stream)
    finally:
        # always release the HTTP handle, even if parsing fails
        stream.close()

    articles = record.get("PubmedArticle") or []
    if not articles:
        return None

    publication_types = articles[0]["MedlineCitation"]["Article"].get("PublicationTypeList") or []
    return str(publication_types[0]) if publication_types else None

In [None]:
# Query Entrez once per distinct clinical-article pmid (every call is a network
# round-trip) and broadcast the answer back to all rows sharing that pmid.
unique_clinical_pmids = df_pmids_nona["pmid"].drop_duplicates()
ptype_by_pmid = dict(zip(unique_clinical_pmids, unique_clinical_pmids.map(get_entrez_ptype)))
df_pmids_nona["publication_type"] = df_pmids_nona["pmid"].map(ptype_by_pmid)

In [None]:
df_pmids_nona.to_csv("4_clinical_trials_using_AF.csv")

In [None]:
# get the counts of each level and publication type pair
output_types = df_pmids_nona.groupby(["level", "publication_type"]).size().reset_index(name='counts')

output_types.to_csv("5_output_types.csv")

In [None]:
alphafold_data[alphafold_data["parent_pmid"]=="35637307"]["id"]

In [None]:
alphafold_data[alphafold_data["pmid"] == "34718731"]

In [None]:
alphafold_data[(alphafold_data["parent_pmid"] == "34718731") & (alphafold_data["level"]==3)]

In [None]:
df_pmids[df_pmids["pmids"] == 34718731]

In [None]:
# match the pmid with the pmid in alphafold_data
df_pmids_nona["pmid"] = df_pmids_nona["pmid"].astype(str)
df_pmids_nona["paper_pmid"] = df_pmids_nona["paper_pmid"].astype(str)

df_pmids_nona = df_pmids_nona.merge(alphafold_data_exploded[["parent_pmid", "pmid", "intent"]], left_on=["paper_pmid", "pmid"], right_on=["parent_pmid", "pmid"], how="left")

In [None]:
df_pmids_nona["parent_pmid"].value_counts()

In [None]:
df_pmids_nona[df_pmids_nona["intent"]=="result"]

In [None]:
alphafold_data_exploded

In [None]:
df_pmids_nona

In [None]:
df_pmids_nona.intent.value_counts()

#### Doing all chains

In [None]:
stream = Entrez.efetch(db="pubmed", id="34265844", retmax="1")
record = Entrez.read(stream)

## Clinical Trials from NIH

In [None]:
clinical_trials_links_to_papers = catalog.load("nih.data_processing.clinical_trials_links_to_papers.intermediate")

In [None]:
clinical_trials_links_to_papers["ref_pmid"] = clinical_trials_links_to_papers["ref_pmid"].fillna("0")
clinical_trials_links_to_papers["ref_pmid"] = clinical_trials_links_to_papers["ref_pmid"].astype(np.int64).astype(str)

In [None]:
doi_matches = (
    alphafold_data
    .dropna(subset=['doi'])
    .merge(clinical_trials_links_to_papers.dropna(subset=['ref_doi']), how="inner", left_on="doi", right_on="ref_doi")
)

# Find matches based on pmid
pmid_matches = (
    alphafold_data
    .dropna(subset=['pmid'])
    .merge(clinical_trials_links_to_papers.dropna(subset=['ref_pmid']), how="inner", left_on="pmid", right_on="ref_pmid")
)

# Concat pmid and doi matches and removes duplicates
papers_that_cite_af_cited_in_ct = pd.concat([doi_matches, pmid_matches]).drop_duplicates(subset="id")

In [None]:
# explode strength
papers_that_cite_af_cited_in_ct =papers_that_cite_af_cited_in_ct.explode("strength") 

papers_that_cite_af_cited_in_ct["intent"] = papers_that_cite_af_cited_in_ct["strength"].apply(lambda x: x["intent"] if x else None)
papers_that_cite_af_cited_in_ct["context"] = papers_that_cite_af_cited_in_ct["strength"].apply(lambda x: x["context"] if x else None)

# filter rows to have unique ids. We should prioritise rows with intent being result or methodology, then followed by background, then followed by other
# NOTE: the priority is obtained by sorting the intent strings in descending
# alphabetical order ("result" > "methodology" > "background" > ""), so it
# depends on the spelling of the labels; drop_duplicates then keeps the first
# row found for each id.
papers_that_cite_af_cited_in_ct = papers_that_cite_af_cited_in_ct.sort_values("intent", ascending=False).drop_duplicates(subset="id").reset_index(drop=True)

In [None]:
papers_that_cite_af_cited_in_ct["publication_type"] = papers_that_cite_af_cited_in_ct["pmid"].apply(get_entrez_ptype)

In [None]:
z = papers_that_cite_af_cited_in_ct[
    ['id', 'doi', 'pmid', 'level',
       'publication_date', 'mesh_terms', 'cited_by_count', 'authorships',
       'parent_level', 'ref_citation', "publication_type"]
]
z.to_csv("clinical_trials_citations.csv")

In [None]:
df_pmids_strong = transform_long(df_copy_full, ["result", "methodology"])
df_pmids_all = transform_long(df_copy_full, ["result", "methodology", "background"])

In [None]:
df_pmids["pmids"]

In [None]:
papers_that_cite_af_cited_in_ct["pmid"]

In [None]:
# check how many of the pmid are in df_pmids
df_pmids_nona[df_pmids_nona["pmid"].isin(papers_that_cite_af_cited_in_ct["pmid"])]["pmid"].to_list()

In [None]:
# check which citing papers (paper_pmid) are themselves cited in clinical trials.
# Both sides are compared as strings: paper_pmid is cast to str in an earlier
# cell, so testing it against ints could never match, and casting the trial
# pmids to int fails when any of them is missing.
df_pmids_nona[
    df_pmids_nona["paper_pmid"].astype(str).isin(
        papers_that_cite_af_cited_in_ct["pmid"].dropna().astype(str)
    )
]

In [None]:
# check how many of the pmid are in df_pmids
df_pmids_strong[df_pmids_strong["pmids"].isin(papers_that_cite_af_cited_in_ct["pmid"].astype(int))]

In [None]:
# check how many of the pmid are in df_pmids
df_pmids_all[df_pmids_all["pmids"].isin(papers_that_cite_af_cited_in_ct["pmid"].astype(int))]

In [None]:
df_pmids[df_pmids["pmids"] == 37089814]

In [None]:
papers_that_cite_af_cited_in_ct

In [None]:
papers_that_cite_af_cited_in_ct["strength"].iloc[3]

### Exploring labs outputs

In [4]:
pyi = catalog.load("lab.data_collection.publications.raw")

In [5]:
pyi.keys()

[1;35mdict_keys[0m[1m([0m[1m[[0m[32m's0'[0m, [32m's1'[0m, [32m's10'[0m, [32m's11'[0m, [32m's12'[0m, [32m's13'[0m, [32m's14'[0m, [32m's15'[0m, [32m's16'[0m, [32m's17'[0m, [32m's18'[0m, [32m's19'[0m, [32m's2'[0m, [32m's20'[0m, [32m's21'[0m, [32m's22'[0m, [32m's23'[0m, [32m's24'[0m, [32m's25'[0m, [32m's26'[0m, [32m's27'[0m, [32m's28'[0m, [32m's29'[0m, [32m's3'[0m, [32m's30'[0m, [32m's31'[0m, [32m's32'[0m, [32m's33'[0m, [32m's34'[0m, [32m's35'[0m, [32m's36'[0m, [32m's37'[0m, [32m's38'[0m, [32m's39'[0m, [32m's4'[0m, [32m's40'[0m, [32m's41'[0m, [32m's42'[0m, [32m's43'[0m, [32m's44'[0m, [32m's45'[0m, [32m's46'[0m, [32m's47'[0m, [32m's48'[0m, [32m's49'[0m, [32m's5'[0m, [32m's50'[0m, [32m's51'[0m, [32m's52'[0m, [32m's53'[0m, [32m's54'[0m, [32m's55'[0m, [32m's56'[0m, [32m's57'[0m, [32m's58'[0m, [32m's59'[0m, [32m's6'[0m, [32m's60'[0m, [32m's61'[0m, [32m's62'[0m, [32m's63

In [6]:
z = pyi["s0"]()

In [11]:
z["A5000005088"][0]


[1m{[0m
    [32m'id'[0m: [32m'https://openalex.org/W2166180044'[0m,
    [32m'doi'[0m: [32m'https://doi.org/10.1038/nm.2375'[0m,
    [32m'display_name'[0m: [32m'A clinical microchip for evaluation of single immune cells reveals high functional heterogeneity in phenotypically similar T cells'[0m,
    [32m'title'[0m: [32m'A clinical microchip for evaluation of single immune cells reveals high functional heterogeneity in phenotypically similar T cells'[0m,
    [32m'publication_date'[0m: [32m'2011-05-22'[0m,
    [32m'abstract'[0m: [32m'Cellular immunity has an inherent high level of functional heterogeneity. Capturing the full spectrum of these functions requires analysis of large numbers of effector molecules from single cells. We report a microfluidic platform designed for highly multiplexed [0m[32m([0m[32mmore than ten proteins[0m[32m)[0m[32m, reliable, sample-efficient [0m[32m([0m[32m∼1 × 10[0m[32m([0m[32m4[0m[32m)[0m[32m cells[0m[32m)[0m