In [4]:
import pandas as pd
pmid_dataset = pd.read_csv('/data/jx4237data/TKG/TKG_JCDL/Bridge2AI_2m/pmid_dataset.csv.gz')

In [6]:
import numpy as np
# Load both embedding archives. np.load returns an NpzFile that keeps the
# underlying file open, so each archive is opened in a `with` block and the
# arrays are extracted before the handle is closed.
with np.load('/data/jx4237data/Graph-CoT/Pipeline/updated_data/tkg_embeddings_all.npz') as tkg_embeddings_all:
    embeddings = tkg_embeddings_all['embeddings']
    ids = tkg_embeddings_all['ids']

with np.load('/data/jx4237data/TKG/TKG_JCDL/Bridge2AI_10k/tkg_updated_360k_embeddings_all.npz') as tkg_embeddings_new:
    new_embeddings = tkg_embeddings_new['embeddings']
    new_ids = tkg_embeddings_new['ids']

# Key both lookups by plain `str` so ids from the two archives compare equal
# and match the string PMIDs used for lookups later on.
embeddings_dict = {str(id_): embedding for id_, embedding in zip(ids, embeddings)}
new_embeddings_dict = {str(id_): embedding for id_, embedding in zip(new_ids, new_embeddings)}

# When an id appears in both archives, the entry from the second archive wins.
combined_embeddings_dict = {**embeddings_dict, **new_embeddings_dict}

In [17]:
list(combined_embeddings_dict.keys())[0]

'12108406'

In [19]:
# PMIDs are used as dictionary keys against string ids, so cast the column to str.
pmid_dataset = pmid_dataset.astype({'PMID': str})

In [22]:
# Map every dataset name to the list of PMIDs that mention it.
dataset_pmids = {
    dataset_name: group['PMID'].tolist()
    for dataset_name, group in pmid_dataset.groupby('dataset')
}

In [23]:
import numpy as np

# Represent each dataset by the element-wise sum of the embeddings of the
# papers that mention it. A PMID without an embedding raises KeyError.
dataset_embeddings = {}
for dataset, pmids in dataset_pmids.items():
    # Use a dedicated name here: rebinding `embeddings` would silently replace
    # the full embedding array loaded from the .npz archive in an earlier cell.
    paper_vectors = [combined_embeddings_dict[pmid] for pmid in pmids]
    dataset_embeddings[dataset] = np.sum(paper_vectors, axis=0)

In [31]:
# Free-text descriptions of the three datasets that have no PMID mentions.
# They are embedded with a text model further down, so the wording (including
# its typos) is model input and is deliberately left untouched.

# Protein-Protein Interaction Dataset (The Krogan Lab)
protein_protein_interaction_Dataset = 'protein_protein_interaction dataset Protein-Protein Interaction Analysis Proteins typically do not function alone, but in physical or functional interaction with other proteins, forming macromolecular complexes. The complex cellular network of protein interactions is highly organized in time and space, and adapts dynamically to external and internal perturbations to define the cell’s functional state. Consequently, characterizing protein interaction networks and their dynamic changes in response to perturbations can better our understanding of protein function. In the Krogan lab, we use a variety of different techniques to study PPIs: Affinity-Purification mass spectrometry (AP-MS): Requires an affinity tagged bait (3xFLAG, Strep, GFP, etc.). Endogenous protein immunoprecipitation: When we are only interested in identifying or quantifying interaction partners for a single protein and we have a good antibody for that protein. Proximity-Labeling MS (APEX-MS): Relies on fusion of a labeling enzyme (APEX, BirA, etc.) to the protein of interest. This method labels everything in ~10-20nm proximity of your fusion protein, but direct binding proteins, as well as those only in spatial proximity. Can be better for identifying transient interaction partners, at the cost of decreased specificity. We often use affinity purification combined with quantitative mass spectrometry methods to characterize protein interactions networks. We apply these approaches towards understanding how viruses hijack the cellular machinery for replication and infection, and how genetic mutations cause rewiring of proteins interactions networks leading to the development of cancer or neuronal, cardiac or psyciatric disorders.'
# Genetic Perturbations Dataset (Mali Lab)
# A single-quoted literal cannot span a physical line break, so the text is
# written as two adjacent literals; the original break is kept as an explicit "\n".
Genetic_Perturbations_single_cell_dataset = (
    'Genetic Perturbations (single cell) dataset Mapping Cellular Reprogramming via Pooled Overexpression Screens with Paired Fitness and Single-Cell RNA-Sequencing Readout Discovering reprogramming factors for cell fate conversion is a challenging process. Here, we demonstrate a highthroughput, high-content overexpression screening method, employing a coupled single-cell RNA-seq and fitness readout, to screen transcription factor overexpression effects on pluripotent stem cells under multiple growth conditions. From the screens, we can dissect transcriptomic responses, construct genetic co-regulatory networks, and identify reprogramming factors. We also demonstrate application of the method to systematically screen mutant forms of proteins and whole gene families. Highlights d Transcription factor overexpression in hPSCs is screened by an scRNA-seq-based method d Transcriptomic responses enable the construction of genetic co-regulatory networks d Fitness readout identifies ETV2 as a reprogramming factor to an endothelial-like state d Screening method is also applied to mutant proteins and whole gene families. Understanding the effects of genetic perturbations on the cellular state has been challenging using traditional pooled screens, which typically rely on the delivery of a single perturbation per cell and unidimensional phenotypic readouts. Here, we use barcoded open reading frame overexpression libraries coupled with single-cell RNA sequencing to assay cell state and fitness, a technique we call SEUSS (scalable functional screening by sequencing). Using SEUSS, we perturbed hPSCs with a library of developmentally critical transcription factors (TFs) and assayed the impact of TF overexpression on fitness and transcriptomic states. We further leveraged the versatility of the ORF library approach to assay mutant genes and whole gene families. \n'
    'From the transcriptomic responses, we built genetic co-regulatory networks to identify altered gene modules and found that KLF4 and SNAI2 drive opposing effects along the epithelial-mesenchymal transition axis. From the fitness responses, we identified ETV2 as a driver of reprogramming toward an endothelial-like state.'
)
# Protein Imaging Dataset (Lundberg Lab)
protein_image_dataset = 'A global multiscale map of protein assemblies from integration of protein interactions and images Cells regulate growth and function through a hierarchical structure of subcellular protein assemblies, in which alterations can result in cellular dysfunction leading to disease such as cancer. Much of this structure remains uncharted, resulting in recent efforts to map subcellular organization at different physical scales. Here, we report a global architectural map of human cancer cell protein assemblies spanning 10−9 to 10−5 nm, based on integration of near-proteome-wide affinity purification mass spectrometry-based protein interactions and immunofluorescent imaging in U-2 OS osteosarcoma cancer cells. The U-2 OS multi-scale integrated cell map places >5000 proteins into 270 distinct subcellular protein assemblies across biological scales, representing approximately half of expressed proteins. There are known organelles and protein complexes recovered in the map, as well as 152 putative assemblies, such as a putative interferon signaling complex consisting of a serine protease and STAT transcription factors. The map also incorporates 128 previously uncharacterized proteins into protein assemblies, such as C18orf21 association with a canonical RNAse complex. Protein subsystems in the U-2 OS map were evaluated for the potential for downstream integrative structural modeling, where available structural data (e.g. crosslinking, Protein Data Bank structures, AlphaFold predictions) are combined in a Bayesian framework to model the physical structures of protein complexes. Using this framework, we created an integrated structural model of new proteins interacting with the Rag-Ragulator complex, which regulates MAPK and mTOR signaling. In summary, the global proteome architecture map provides a resource for cellular biology discovery and the systematic determination of protein functions and structures.'

In [33]:
import os

from transformers import AutoTokenizer
from adapters import AutoAdapterModel
import torch

# Process-level settings: turn off tokenizers parallelism and expose only
# CUDA device 0 to this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Load the SPECTER2 base model and its tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_aug2023refresh_base')
model = AutoAdapterModel.from_pretrained('allenai/specter2_aug2023refresh_base')

# Load the adapter under the local name "proximity" and make it the active one,
# then place the model on the GPU when one is available (CPU otherwise).
model.load_adapter("allenai/specter2_aug2023refresh", source="hf", load_as="proximity", set_active=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# The order of `docs` defines the row order of `batch_embeddings` below.
# Texts longer than 512 tokens are truncated; shorter ones are padded to the batch maximum.
docs = [protein_protein_interaction_Dataset, Genetic_Perturbations_single_cell_dataset, protein_image_dataset]
inputs = tokenizer(list(docs), padding=True, truncation=True,
                   return_tensors="pt", return_token_type_ids=False, max_length=512)

# Move all input tensors to the same device as the model
inputs = {key: value.to(device) for key, value in inputs.items()}

with torch.no_grad():  # Inference only: no gradients are tracked
    output = model(**inputs)
    # Keep the hidden state at sequence position 0 for every document
    # (presumably the [CLS] token; confirm against the model card) and bring it back to the CPU.
    batch_embeddings = output.last_hidden_state[:, 0, :].cpu()

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 7646.86it/s]


In [43]:
# Register the three text-derived embeddings under their display names.
# The tuple order must match the order of the documents passed to the tokenizer.
text_dataset_names = (
    'Protein-Protein Interaction Dataset (The Krogan Lab)',
    'Genetic Perturbations Dataset (Mali Lab)',
    'Protein Imaging Dataset (Lundberg Lab)',
)
for row, text_dataset_name in enumerate(text_dataset_names):
    dataset_embeddings[text_dataset_name] = batch_embeddings[row].numpy()

In [44]:
# Give every dataset a synthetic numeric id, counting up from 123000000000 in
# the insertion order of `dataset_embeddings`.
# NOTE(review): nothing here checks these ids against existing author ids; confirm they cannot collide.
index_to_dataset = dict(enumerate(dataset_embeddings, start=123000000000))

In [45]:
# Reverse lookup: dataset name -> synthetic id.
dataset_to_index = dict(zip(index_to_dataset.values(), index_to_dataset.keys()))

In [48]:
# Save the dataset-name -> embedding mapping so later cells can reload it.
import pickle

# Use a context manager so the file is flushed and closed deterministically.
with open('dataset_embeddings.pkl', 'wb') as pkl_file:
    pickle.dump(dataset_embeddings, pkl_file)

In [50]:
# Save the synthetic-id -> dataset-name mapping; the context manager closes the file.
with open('index_to_dataset.pkl', 'wb') as pkl_file:
    pickle.dump(index_to_dataset, pkl_file)

In [1]:
import pickle
# Reload the two mappings written by the earlier cells. Context managers make
# sure the file handles are closed once the data is read.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open('dataset_embeddings.pkl', 'rb') as pkl_file:
    dataset_embeddings = pickle.load(pkl_file)
with open('index_to_dataset.pkl', 'rb') as pkl_file:
    index_to_dataset = pickle.load(pkl_file)

In [2]:
import pandas as pd
authors = pd.read_csv('/data/jx4237data/projects/web_page/talentKnowledgeGraph/work/data/processed_authors_more_than_single_recent_pub.csv')

In [5]:
pmid_dataset

Unnamed: 0,PMID,Dataset_Mention,Source,dataset
0,27985,MSDD,NAR,MSDD
1,41450,MSDD,NAR,MSDD
2,153152,Nred,NAR,NRED
3,699045,enzyme nomenclature,NAR,Enzyme Nomenclature
4,812599,Sisyphus,NAR,SISYPHUS
...,...,...,...,...
52576,38093336,PubMed,NAR,PubMed
52577,38093369,PubMed,NAR,PubMed
52578,38095056,ClinicalTrials.gov,NIH,ClinicalTrials.gov
52579,38095903,ClinicalTrials.gov,NIH,ClinicalTrials.gov


In [6]:
# Reverse lookup: dataset name -> synthetic id.
dataset_to_index = dict(zip(index_to_dataset.values(), index_to_dataset.keys()))

In [None]:
pmid_dataset['dataset']

In [8]:
authors['is_author'] = 1

In [9]:
authors

Unnamed: 0,id,FullName,BeginYear,PaperNum,CM4AI,RecentYear,Index,is_author
0,225,Baba Inusa,2005,102,,2024,0,1
1,878,Rutger Schutten,2017,5,,2022,1,1
2,1098,Vardit Ravitsky (CM4AI),2002,192,1.0,2023,2,1
3,1352,Ursula Grohmann,1988,158,,2023,3,1
4,1592,Sharon R Lewin,1993,373,,2023,4,1
...,...,...,...,...,...,...,...,...
28395,843771129831,Jillian Mohan (CM4AI),2012,37,1.0,2024,28395,1
28396,83284832748327,Jiawei Xu (CM4AI),2021,5,1.0,2024,28396,1
28397,873811147837,Swathi Thaker (CM4AI),2005,23,1.0,2024,28397,1
28398,23553111988,Xiaoyu Zhao (CM4AI),2010,11,1.0,2024,28398,1


In [17]:
# Build one placeholder row per dataset, using the same columns as the authors
# table, so datasets and authors can share a single table.
n_datasets = len(index_to_dataset)
placeholder_values = {
    'BeginYear': 2020,
    'PaperNum': 100,
    'CM4AI': 0,
    'RecentYear': 0,
    'Index': 0,
    'is_author': 0,  # marks the row as a dataset rather than an author
}

datasets_data = {
    'id': list(index_to_dataset),
    'FullName': list(index_to_dataset.values()),
}
for column, value in placeholder_values.items():
    datasets_data[column] = [value] * n_datasets

# Dataset rows go after the author rows; the row index is rebuilt from 0.
df_datasets = pd.concat([authors, pd.DataFrame(datasets_data)], ignore_index=True)

In [18]:
df_datasets['Index'] =df_datasets.index

In [19]:
df_datasets

Unnamed: 0,id,FullName,BeginYear,PaperNum,CM4AI,RecentYear,Index,is_author
0,225,Baba Inusa,2005,102,,2024,0,1
1,878,Rutger Schutten,2017,5,,2022,1,1
2,1098,Vardit Ravitsky (CM4AI),2002,192,1.0,2023,2,1
3,1352,Ursula Grohmann,1988,158,,2023,3,1
4,1592,Sharon R Lewin,1993,373,,2023,4,1
...,...,...,...,...,...,...,...,...
29536,123000001136,wFleaBase,2020,100,0.0,0,29536,0
29537,123000001137,yMGV - Yeast microarray global viewer,2020,100,0.0,0,29537,0
29538,123000001138,Protein-Protein Interaction Dataset (The Kroga...,2020,100,0.0,0,29538,0
29539,123000001139,Genetic Perturbations Dataset (Mali Lab),2020,100,0.0,0,29539,0


In [20]:
df_datasets.to_csv('/data/jx4237data/projects/web_page/cm4ai_tkg/work/data/author_dataset.csv', index=False)

# Dataset collaborators: authors linked to each dataset

In [21]:
import pandas as pd
paper_author_2m = pd.read_csv('/data/jx4237data/TKG/TKG_JCDL/Bridge2AI_2m/paper_author.csv.gz')


In [22]:
paper_author_2m.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17311660 entries, 0 to 17311659
Data columns (total 5 columns):
 #   Column     Dtype
---  ------     -----
 0   PMID       int64
 1   Au_Order   int64
 2   AID        int64
 3   AuthorNum  int64
 4   PubYear    int64
dtypes: int64(5)
memory usage: 660.4 MB


In [23]:
import pandas as pd
pmid_dataset = pd.read_csv('/data/jx4237data/TKG/TKG_JCDL/Bridge2AI_2m/pmid_dataset.csv.gz')

In [24]:
# Translate each dataset name to its synthetic id; names with no id map to a missing value.
pmid_dataset['dataset_id'] = pmid_dataset['dataset'].map(dataset_to_index.get)

In [34]:
all_author_pd = pd.read_csv('/data/jx4237data/projects/web_page/cm4ai_tkg/work/data/author_dataset_filtered.csv')
aid_set = all_author_pd['id'].to_list()

In [35]:
# NOTE(review): this repeats the assignment made when all_author_pd was loaded; the result is unchanged.
# Despite its name, aid_set is a list and may contain repeated ids.
aid_set = all_author_pd['id'].to_list()

In [36]:
len(aid_set)

29531

In [46]:
# Ids of rows that are real authors (not dataset placeholders) and flagged as CM4AI.
is_cm4ai_author = all_author_pd['CM4AI'].eq(1) & all_author_pd['is_author'].eq(1)
cm4ai_authors = all_author_pd.loc[is_cm4ai_author, 'id'].to_list()

In [38]:
paper_author_2m = paper_author_2m[paper_author_2m['AID'].isin(aid_set)]

In [39]:
# Attach dataset information to every (paper, author) row that shares a PMID.
merged_df = paper_author_2m.merge(pmid_dataset, on='PMID')

# For each dataset id, collect the distinct author ids in first-seen order.
unique_aids_by_dataset = merged_df.groupby('dataset_id')['AID'].unique()
dataset_id2users = {
    dataset_id: aids.tolist()
    for dataset_id, aids in unique_aids_by_dataset.to_dict().items()
}

In [51]:
# Datasets with no authors found through the PMID merge fall back to the
# CM4AI author list.
missing_dataset_ids = set(index_to_dataset) - set(dataset_id2users)
for dataset_id in missing_dataset_ids:  # avoid shadowing the builtin `id`
    # Give each dataset its own copy so editing one list cannot affect the others.
    dataset_id2users[dataset_id] = list(cm4ai_authors)

In [55]:
import pandas as pd
import pickle
import gzip

with gzip.open("author_collaborators.pkl.gz", "rb") as f:
    author_collab = pickle.load(f)

In [57]:
author_collab = { **author_collab, **dataset_id2users }

In [58]:
# Save the merged author/dataset collaborator mapping; the context manager closes the file.
with open('author_dataset_collaborators.pkl', 'wb') as pkl_file:
    pickle.dump(author_collab, pkl_file)

In [60]:
authors = pd.read_csv('/data/jx4237data/projects/web_page/cm4ai_tkg/work/data/author_dataset_filtered.csv')
author_set = set(authors['id'].to_list())
def clean_author_collab(author_collab, author_set):
    """Restrict a collaborator mapping, in place, to ids present in author_set.

    Entries whose key is not in author_set are deleted, and every remaining
    collaborator list is replaced by a new list holding only members of
    author_set, in their original order. Returns None.
    """
    # Snapshot the stale keys first: a dict cannot change size while it is iterated.
    stale_keys = [owner for owner in author_collab if owner not in author_set]
    for owner in stale_keys:
        author_collab.pop(owner)

    # Rebinding existing keys keeps the dict size fixed, so this is safe during iteration.
    for owner, members in author_collab.items():
        author_collab[owner] = [member for member in members if member in author_set]

# Clean the dictionary
clean_author_collab(author_collab, author_set)

In [63]:
import json
file_name = 'author_collab_dataset.json'

# Serialise the cleaned collaborator mapping as indented JSON and write it out.
serialized_collab = json.dumps(author_collab, indent=4)
with open(file_name, 'w') as json_file:
    json_file.write(serialized_collab)

In [65]:
# Re-key the dataset embeddings by synthetic id instead of by dataset name.
dataset_index_embeddings = {
    dataset_id: dataset_embeddings[dataset_name]
    for dataset_id, dataset_name in index_to_dataset.items()
}

In [67]:
# Load the author embeddings and keep only the ids present in author_set.
with gzip.open("author_embeddings.pkl.gz", "rb") as f:
    author_ebd = pickle.load(f)
author_ebd_filtered = {
    author_id: author_ebd[author_id]
    for author_id in author_ebd
    if author_id in author_set
}

import numpy as np

def get_top_n_similar_authors(author_ebd_filtered, dataset_id, N=5,
                              dataset_ebd=None, collaborators=None):
    """Return the ids of the N authors most similar to a dataset.

    Similarity is the cosine similarity between the dataset embedding and
    each author embedding. Ids already listed as collaborators of the
    dataset, and any entry keyed by the dataset id itself, are skipped.

    Args:
        author_ebd_filtered: Mapping of author id -> 1-D embedding vector.
        dataset_id: Key of the target dataset in the dataset embeddings.
        N: Maximum number of author ids to return.
        dataset_ebd: Optional mapping of dataset id -> embedding vector.
            Defaults to the notebook-level ``dataset_index_embeddings``.
        collaborators: Optional mapping of id -> iterable of ids to exclude.
            Defaults to the notebook-level ``author_collab``. A dataset with
            no entry is treated as having no collaborators.

    Returns:
        A list of at most N author ids, most similar first. Ties keep the
        iteration order of ``author_ebd_filtered``. Empty when the dataset
        embedding has zero norm.

    Raises:
        ValueError: If ``dataset_id`` has no embedding.
    """
    if dataset_ebd is None:
        dataset_ebd = dataset_index_embeddings
    if collaborators is None:
        collaborators = author_collab

    if dataset_id not in dataset_ebd:
        raise ValueError(f"Dataset ID {dataset_id} not found in the dataset embeddings.")

    target_embedding = dataset_ebd[dataset_id]
    # Loop-invariant: compute the target norm once instead of once per author.
    target_norm = np.linalg.norm(target_embedding)
    if target_norm == 0:
        # Cosine similarity is undefined for a zero vector; there is nothing to rank.
        return []

    # Set membership is O(1) per author; the dataset's own id is excluded as well.
    excluded = set(collaborators.get(dataset_id, ()))
    excluded.add(dataset_id)

    similarities = []
    for other_id, other_embedding in author_ebd_filtered.items():
        if other_id in excluded:
            continue
        other_norm = np.linalg.norm(other_embedding)
        if other_norm == 0:
            # Skip zero vectors: they would yield NaN and corrupt the sort order.
            continue
        cosine_similarity = np.dot(target_embedding, other_embedding) / (target_norm * other_norm)
        similarities.append((other_id, cosine_similarity))

    # Stable sort, highest similarity first.
    similarities.sort(key=lambda pair: pair[1], reverse=True)
    return [author_id for author_id, _ in similarities[:N]]


In [68]:
import csv
import gzip
from tqdm import tqdm

import numpy as np

# Stream the recommendations into a gzip-compressed CSV, one row per
# (dataset id, recommended author id) pair.
with gzip.open('dataset_author_recommendations.csv.gz', 'wt', newline='') as gzfile:
    writer = csv.writer(gzfile)
    writer.writerow(['DatasetID', 'Recommended AID'])  # header row

    # For every dataset id, fetch up to 150 of the most similar authors and
    # write them in ranked order.
    for datasetid in tqdm(dataset_index_embeddings.keys()):
        recommended_authors = get_top_n_similar_authors(author_ebd_filtered, datasetid, N=150)
        writer.writerows([datasetid, recommended_id] for recommended_id in recommended_authors)

100%|██████████| 1141/1141 [04:26<00:00,  4.29it/s]


In [69]:
# Reload the recommendations, rename the author column, and overwrite the same file.
df = (
    pd.read_csv('dataset_author_recommendations.csv.gz', compression='gzip')
    .rename(columns={'Recommended AID': 'recommended_author_id'})
)
df.to_csv('dataset_author_recommendations.csv.gz', index=False, compression='gzip')

In [70]:
df

Unnamed: 0,DatasetID,recommended_author_id
0,123000000000,9846538
1,123000000000,9383306
2,123000000000,9808746
3,123000000000,3902484
4,123000000000,1195250
...,...,...
171145,123000001140,11929809
171146,123000001140,1275250
171147,123000001140,3342808
171148,123000001140,5371637
