[This notebook shows an example using toy GO terms]

Rank all GO biological processes by the similarity with the LLM term. 

* Percentile: the % of other GO names that have a smaller semantic similarity with the GPT-4 name than the assigned GO name does



In [3]:
from semanticSimFunctions import getSentenceEmbedding
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import pickle
from sklearn.metrics.pairwise import cosine_similarity

In [1]:
import pandas as pd

# Read the GO term table (the first CSV column becomes the index) and print
# its (rows, columns) shape as a quick check that the load worked.
GO_TERMS_CSV = 'data/go_terms.csv'
all_go = pd.read_csv(GO_TERMS_CSV, index_col=0)
print(all_go.shape)


(11943, 4)


## Step 1: get the word embeddings for all the GO terms (only needs to be run once)

In [2]:
## Create an embedding for every GO term description and save them all to disk
import pickle

import pandas as pd
from transformers import AutoTokenizer, AutoModel

from semanticSimFunctions import getSentenceEmbedding

# The tokenizer and the model are loaded from the same pretrained checkpoint.
SAPBERT_CHECKPOINT = 'cambridgeltl/SapBERT-from-PubMedBERT-fulltext'
SapBERT_tokenizer = AutoTokenizer.from_pretrained(SAPBERT_CHECKPOINT)
SapBERT_model = AutoModel.from_pretrained(SAPBERT_CHECKPOINT)

all_go = pd.read_csv('data/go_terms.csv', index_col=0)
all_go_terms = all_go['Term_Description'].tolist()

# Term description -> embedding, with the value returned by
# getSentenceEmbedding converted via .numpy(). The description is the dict
# key, so a description that occurs more than once keeps only its last
# embedding.
all_go_terms_embeddings_dict = {
    term: getSentenceEmbedding(term, SapBERT_tokenizer, SapBERT_model).numpy()
    for term in all_go_terms
}

with open('data/all_go_terms_embeddings_dict.pkl', 'wb') as out_file:
    pickle.dump(all_go_terms_embeddings_dict, out_file, protocol=pickle.HIGHEST_PROTOCOL)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [3]:
## Create and save the embeddings for the GO terms of the CC and MF branches
import pickle

import pandas as pd
from transformers import AutoTokenizer, AutoModel

from semanticSimFunctions import getSentenceEmbedding

# The tokenizer and the model are loaded from the same pretrained checkpoint.
SAPBERT_CHECKPOINT = 'cambridgeltl/SapBERT-from-PubMedBERT-fulltext'
SapBERT_tokenizer = AutoTokenizer.from_pretrained(SAPBERT_CHECKPOINT)
SapBERT_model = AutoModel.from_pretrained(SAPBERT_CHECKPOINT)

for branch in ('CC', 'MF'):
    # Each branch has its own term table and gets its own embedding pickle.
    all_go = pd.read_csv(f'data/GO_term_analysis/CC_MF_branch/{branch}_go_terms.csv', index_col=0)
    all_go_terms = all_go['Term_Description'].tolist()

    # Term description -> embedding converted via .numpy(); a repeated
    # description keeps only its last embedding because it is the dict key.
    all_go_terms_embeddings_dict = {
        term: getSentenceEmbedding(term, SapBERT_tokenizer, SapBERT_model).numpy()
        for term in all_go_terms
    }

    with open(f'data/GO_term_analysis/CC_MF_branch/{branch}_go_terms_embeddings_dict.pkl', 'wb') as out_file:
        pickle.dump(all_go_terms_embeddings_dict, out_file, protocol=pickle.HIGHEST_PROTOCOL)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [4]:
# Check that the embeddings were saved correctly: reload the pickle and
# print how many entries it holds.
import pickle

EMBEDDINGS_PKL = 'data/all_go_terms_embeddings_dict.pkl'
with open(EMBEDDINGS_PKL, 'rb') as in_file:
    all_go_terms_embeddings_dict = pickle.load(in_file)

n_embedded_terms = len(all_go_terms_embeddings_dict)
print(n_embedded_terms)
# all_go_terms_embeddings_dict['cellular response to DNA damage stimulus']

11943


In [5]:
# Check that the per-branch embeddings were saved correctly: reload each
# pickle and print how many entries it holds.
import pickle

for branch in ('CC', 'MF'):
    branch_pkl = f'data/GO_term_analysis/CC_MF_branch/{branch}_go_terms_embeddings_dict.pkl'
    with open(branch_pkl, 'rb') as in_file:
        all_go_terms_embeddings_dict = pickle.load(in_file)
    n_embedded_terms = len(all_go_terms_embeddings_dict)
    print(n_embedded_terms)
    

1677
3399


## Step 2: iterate through each GO term and its corresponding LLM term, rank the similarity scores between the LLM term and all GO terms, and find where the true GO term falls in that list


When running on the 1000 gene sets, the Python script rank_GOterm_LLM_sim_rand.py was run in the background:

 ```
 python rank_GOterm_LLM_sim_rand.py --input_file data/GO_term_analysis/LLM_processed_selected_1000_go_terms.tsv --emb_file data/all_go_terms_embeddings_dict.pkl --topn 3 --output_file data/GO_term_analysis/simrank_LLM_processed_selected_1000_go_terms.tsv --background_file data/GO_term_analysis/all_go_sim_scores_gpt4.txt
 ```
The code below is just an example.

In [3]:
# Toy example: run rank_GOterm_LLM_sim_rand.py inside this kernel on the toy
# input file, using the saved GO-term embedding pickle and --topn 3; the
# output and background file paths are passed as arguments.
# (Keep comments on their own line: %run passes the rest of its line as arguments.)
%run rank_GOterm_LLM_sim_rand.py --input_file ./data/GO_term_analysis/LLM_processed_toy_example_w_contamination_gpt_4.tsv --emb_file data/all_go_terms_embeddings_dict.pkl --topn 3 --output_file ./data/GO_term_analysis/simrank_LLM_processed_toy_example.tsv --background_file data/GO_term_analysis/all_go_sim_scores_toy.txt

  0%|          | 0/11 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


454


  9%|▉         | 1/11 [00:08<01:25,  8.56s/it]

2468
5539


 18%|█▊        | 2/11 [00:16<01:14,  8.24s/it]

11652
45


 27%|██▋       | 3/11 [00:23<01:02,  7.85s/it]

6117
156


 36%|███▋      | 4/11 [00:31<00:55,  7.88s/it]

10720
917


 45%|████▌     | 5/11 [00:39<00:46,  7.79s/it]

5592
10945


 55%|█████▍    | 6/11 [00:46<00:38,  7.66s/it]

3046
11


 64%|██████▎   | 7/11 [00:55<00:31,  7.88s/it]

6049
1185


 73%|███████▎  | 8/11 [01:03<00:23,  7.91s/it]

11189
5832


 82%|████████▏ | 9/11 [01:10<00:15,  7.86s/it]

11104
97


 91%|█████████ | 10/11 [01:18<00:07,  7.81s/it]

11626
Saved progress after 10 rows.
9919


100%|██████████| 11/11 [01:26<00:00,  7.83s/it]

9919
DONE





In [1]:
from glob import glob

# Collect the per-model input files of the 100-set model comparison.
files = glob('data/GO_term_analysis/model_compare/LLM_processed_model_compare_100set*.tsv')
# print(files)
for file in files:
    # The model tag is the last two '_'-separated tokens of the file name
    # (directory and everything after the first '.' removed).
    file_stem = file.split('/')[-1].split('.')[0]
    model = '_'.join(file_stem.split('_')[-2:])
    print(model)
    # gpt_4 is skipped; for every other model print the ranking command.
    if model != 'gpt_4':
        print(f'python rank_GOterm_LLM_sim_rand.py --input_file {file} --emb_file data/all_go_terms_embeddings_dict.pkl --topn 3 --output_file data/GO_term_analysis/model_compare/sim_rank_LLM_processed_model_compare_100set_{model}.tsv')
    

gpt_4
gemini_pro
python rank_GOterm_LLM_sim_rand.py --input_file data/GO_term_analysis/model_compare/LLM_processed_model_compare_100set_gemini_pro.tsv --emb_file data/all_go_terms_embeddings_dict.pkl --topn 3 --output_file data/GO_term_analysis/model_compare/sim_rank_LLM_processed_model_compare_100set_gemini_pro.tsv
mixtral_instruct
python rank_GOterm_LLM_sim_rand.py --input_file data/GO_term_analysis/model_compare/LLM_processed_model_compare_100set_mixtral_instruct.tsv --emb_file data/all_go_terms_embeddings_dict.pkl --topn 3 --output_file data/GO_term_analysis/model_compare/sim_rank_LLM_processed_model_compare_100set_mixtral_instruct.tsv
llama2_70b
python rank_GOterm_LLM_sim_rand.py --input_file data/GO_term_analysis/model_compare/LLM_processed_model_compare_100set_llama2_70b.tsv --emb_file data/all_go_terms_embeddings_dict.pkl --topn 3 --output_file data/GO_term_analysis/model_compare/sim_rank_LLM_processed_model_compare_100set_llama2_70b.tsv
gpt_35
python rank_GOterm_LLM_sim_rand.p

In [4]:
from glob import glob

# Print the ranking command for the CC and MF branches. The input file of
# each branch is located with a wildcard pattern.
for branch in ['CC', 'MF']:
    pattern = f'data/GO_term_analysis/CC_MF_branch/LLM_processed_selected_1000*{branch}terms.tsv'
    # glob gives no ordering guarantee; sort so that the file picked below is
    # the same on every run if more than one file matches.
    file = sorted(glob(pattern))
    if not file:
        # Name the pattern instead of failing with a bare IndexError on file[0].
        raise FileNotFoundError(f'no input file matches {pattern}')

    print(f'python rank_GOterm_LLM_sim_rand.py --input_file {file[0]} --emb_file data/GO_term_analysis/CC_MF_branch/{branch}_go_terms_embeddings_dict.pkl --topn 3 --output_file data/GO_term_analysis/CC_MF_branch/sim_rank_LLM_processed_selected_1000_go_{branch}terms.tsv --background_file data/GO_term_analysis/CC_MF_branch/{branch}_go_sim_scores.txt')
    

python rank_GOterm_LLM_sim_rand.py --input_file data/GO_term_analysis/CC_MF_branch/LLM_processed_selected_1000_go_CCterms.tsv --emb_file data/GO_term_analysis/CC_MF_branch/CC_go_terms_embeddings_dict.pkl --topn 3 --output_file data/GO_term_analysis/CC_MF_branch/sim_rank_LLM_processed_selected_1000_go_CCterms.tsv --background_file data/GO_term_analysis/CC_MF_branch/CC_go_sim_scores.txt
python rank_GOterm_LLM_sim_rand.py --input_file data/GO_term_analysis/CC_MF_branch/LLM_processed_selected_1000_go_MFterms.tsv --emb_file data/GO_term_analysis/CC_MF_branch/MF_go_terms_embeddings_dict.pkl --topn 3 --output_file data/GO_term_analysis/CC_MF_branch/sim_rank_LLM_processed_selected_1000_go_MFterms.tsv --background_file data/GO_term_analysis/CC_MF_branch/MF_go_sim_scores.txt


In [5]:
# Sanity check: load the toy-example ranking output (first column as index)
# and display its first rows.
toy_simrank_tsv = 'data/GO_term_analysis/simrank_LLM_processed_toy_example.tsv'
df = pd.read_csv(toy_simrank_tsv, sep='\t', index_col=0)
df.head()

Unnamed: 0_level_0,Genes,Gene_Count,Term_Description,50perc_contaminated_Genes,100perc_contaminated_Genes,gpt_4_default Name,gpt_4_default Analysis,gpt_4_default Score,gpt_4_50perc_contaminated Name,gpt_4_50perc_contaminated Analysis,...,gpt_4_100perc_contaminated Score,LLM_name_GO_term_sim,sim_rank,true_GO_term_sim_percentile,random_GO_name,random_go_llm_sim,random_sim_rank,random_sim_percentile,top_3_hits,top_3_sim
GO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GO:0045940,STAR WNT4 ADM APOE CES1 STARD4 NR1D1 TNF FSHB ...,25,positive regulation of steroid metabolic process,WNT4 IL1A FSHB DAB2 IFNG ABCG1 APOA1 STAR CGA ...,CALD1 VPS29 MTRF1 MVP GTF2I CC2D1B EPHA7 SPACA...,Lipid Metabolism and Hormonal Regulation,"1. Several proteins in this system, such as ST...",0.92,System of unrelated proteins,The provided list of proteins encompasses a wi...,...,0.0,0.476209,454,0.961986,regulation of post-transcriptional gene silenc...,0.35424,2468,0.793352,regulation of lipid metabolic process|lipid ho...,0.8357932|0.8153909|0.8071369
GO:0010757,PLAU CTSZ THBS1 SERPINF2 SERPINE1 SERPINE2 CPB...,8,negative regulation of plasminogen activation,CPB2 SERPINE1 PLAU THBS1 ZNF737 SMDT1 XPNPEP2 ...,ADAMTS17 CEP63 LRRC18 SUPT3H FUBP1 KCNK4 RPL27...,Regulation of Fibrinolysis and Extracellular M...,1. PLAU (urokinase-type plasminogen activator)...,0.92,Regulation of Fibrinolysis and Extracellular M...,"1. CPB2, also known as carboxypeptidase B2 or ...",...,0.0,0.281846,5539,0.536214,negative regulation of T cell costimulation,0.12564,11652,0.024366,regulation of fibrinolysis|regulation of tissu...,0.84587914|0.73236907|0.6881492
GO:2000136,GNG5 TBX5 ISL1 RBPJ CTNNB1 NOTCH1 SMAD4 EYA1 B...,18,regulation of cell proliferation involved in h...,MKS1 BMP10 EYA1 SMAD4 HAND2 GNG5 RBPJ SIX1 ENG...,MGMT GPHN BANK1 NDP IL1F10 IKBKG PARD3 INCENP ...,Cardiac Development and Regulation,"1. TBX5, ISL1, TBX3, HAND2, and GNG5 are trans...",0.92,Cardiac Development and Signaling Pathway Inte...,"1. MKS1, EYA1, SIX1, and RBPJ are part of a de...",...,0.0,0.702308,45,0.996232,regulation of post-transcriptional gene silenc...,0.296795,6117,0.487817,regulation of cardiac muscle tissue developmen...,0.8482114|0.8481006|0.83557206
GO:0002433,PLPP4 LYN PRKCE APPL1 PRKCD FYN VAV1 YES1 MYO1...,22,immune response-regulating cell surface recept...,ABL1 VAV3 APPL2 LYN FGR SYK PRKCE PRKCD PLPP4 ...,JAML PRKCSH PIM1 EID2 EPO UBE4A MRPL9 ASB18 SE...,Signal Transduction and Actin Cytoskeleton Reg...,"1. The Src family kinases (SFKs), including LY...",0.92,Signal Transduction and Cytoskeletal Regulatio...,"1. ABL1, LYN, FGR, SYK, HCK, and SRC are non-r...",...,0.0,0.549966,156,0.986938,negative regulation of peptide secretion,0.191806,10720,0.102403,regulation of signal transduction|actin cytosk...,0.7147157|0.709465|0.7046939
GO:1990874,DBH NF1 ERN1 MMP2 HPGD IGFBP5 TGFB3 DDIT3 MAP3...,61,vascular associated smooth muscle cell prolife...,MAP3K7 PDGFB HPGD CDKN1A IGFBP5 EFEMP2 FGF9 TG...,GARNL3 OR1J1 SPP2 USP17L2 ARMH3 ANKRD13A HELZ ...,Cellular Stress Response and Tumor Suppression,"1. Several proteins in this system, such as ER...",0.85,Vascular and Tissue Remodeling,"1. Several proteins in this system, such as PD...",...,0.0,0.430808,917,0.923219,hydrocarbon catabolic process,0.30675,5592,0.531776,cellular response to stress|integrated stress ...,0.7512536|0.7383063|0.71038234


### Check the rank similarity result of the 1000 gene sets 

In [2]:
import pandas as pd

SIMRANK_TSV = 'data/GO_term_analysis/simrank_LLM_processed_selected_1000_go_terms.tsv'
UNNAMED_LABEL = 'System of unrelated proteins'
# Worst possible rank, assigned to unnamed sets: the max number of GO terms
# (11943, the size of the GO term table / embedding dictionary loaded above).
N_GO_TERMS = 11943

rank_sim_df = pd.read_csv(SIMRANK_TSV, sep='\t')

## number of duplicated rows, by GO id and by analysis text
print(rank_sim_df.duplicated(subset=['GO']).sum())
print(rank_sim_df.duplicated(subset=['gpt_4_default Analysis']).sum())

# number of rows with a missing percentile
print(rank_sim_df['true_GO_term_sim_percentile'].isna().sum())

# how many gpt4 names are 'System of unrelated proteins'
is_unnamed = rank_sim_df['gpt_4_default Name'] == UNNAMED_LABEL
print(is_unnamed.sum())

# for each of them set the semantic similarity to 0, the rank to the max
# number of GO terms, and the percentile to 0
rank_sim_df.loc[is_unnamed, 'LLM_name_GO_term_sim'] = 0
rank_sim_df.loc[is_unnamed, 'sim_rank'] = N_GO_TERMS
rank_sim_df.loc[is_unnamed, 'true_GO_term_sim_percentile'] = 0

print((rank_sim_df['true_GO_term_sim_percentile'] == 0).sum())

# NOTE: this overwrites the file that was read above. The assignments are
# constants, so re-running the cell on the already-processed file writes the
# same values again.
rank_sim_df.to_csv(SIMRANK_TSV, sep='\t', index=False)


0
0
0
26
26


In [8]:
# Keep only the columns listed here, in this order; columns not listed
# (the contaminated-gene ones) are dropped.
ordered_columns = [
    'GO', 'Genes', 'Gene_Count', 'Term_Description',
    'gpt_4_default Name', 'gpt_4_default Analysis',
    'gpt_4_default Score', 'gpt_4_default Score Bin',
    'LLM_name_GO_term_sim', 'sim_rank', 'true_GO_term_sim_percentile',
    'random_GO_name', 'random_go_llm_sim', 'random_sim_rank',
    'random_sim_percentile', 'top_3_hits', 'top_3_sim',
]
rank_sim_df = rank_sim_df[ordered_columns]
rank_sim_df.columns

Index(['GO', 'Genes', 'Gene_Count', 'Term_Description', 'gpt_4_default Name',
       'gpt_4_default Analysis', 'gpt_4_default Score',
       'gpt_4_default Score Bin', 'LLM_name_GO_term_sim', 'sim_rank',
       'true_GO_term_sim_percentile', 'random_GO_name', 'random_go_llm_sim',
       'random_sim_rank', 'random_sim_percentile', 'top_3_hits', 'top_3_sim'],
      dtype='object')

In [7]:
# Write rank_sim_df to the 1000-gene-set similarity-rank TSV
# (tab-separated, without the index column).
simrank_tsv = 'data/GO_term_analysis/simrank_LLM_processed_selected_1000_go_terms.tsv'
rank_sim_df.to_csv(simrank_tsv, sep='\t', index=False)

In [24]:

## half point of the similarity distribution
percentile = rank_sim_df['true_GO_term_sim_percentile']
rank_sim_sorted = rank_sim_df.sort_values(by='true_GO_term_sim_percentile', ascending=False)
# Position of the last row of the top half of the sorted table. Derived from
# the table length instead of the hard-coded 500-1, which is only right for
# exactly 1000 rows (for 1000 rows this is still 499).
half_idx = (len(rank_sim_sorted) - 1) // 2
print('half of the sample have the percentile score higher than: ',rank_sim_sorted.iloc[half_idx]['true_GO_term_sim_percentile'])

## number of GO terms in top 10% of similarities
# 1 - percentile is the fraction of GO names that are NOT less similar, so a
# small value means the true GO term ranks near the top.
print('number of GO terms in top 10%: ', (1 - percentile <= 0.1).sum())
print('number of GO terms in bottom 10%: ', (1 - percentile >= 0.9).sum())

## number of GO terms ranked top 10 of similarities
print('number of GO terms ranked top 10: ', (rank_sim_df['sim_rank'] <= 10).sum())

half of the sample have the percentile score higher than:  0.9794858913170896
number of GO terms in top 10%:  685
number of GO terms in bottom 10%:  41
number of GO terms ranked top 10:  123


In [10]:
# Pick one example GO term and compute the similarity between its top 1st hit
# and the name of the GO term itself.
df = pd.read_csv('data/GO_term_analysis/simrank_LLM_processed_selected_1000_go_terms.tsv', sep='\t')
example = 'GO:0010897'
example_rows = df.loc[df['GO'] == example]
# top_3_hits is a '|'-separated string; the first entry is the best hit
example_top_1st_hit = example_rows['top_3_hits'].values[0].split('|')[0]
example_name = example_rows['Term_Description'].values[0]
print(example_top_1st_hit, '\n', example_name)

# Look both names up in the saved embedding dictionary.
with open('data/all_go_terms_embeddings_dict.pkl', 'rb') as in_file:
    all_go_terms_embeddings_dict = pickle.load(in_file)
top_hit_embedding = all_go_terms_embeddings_dict[example_top_1st_hit]
go_term_embedding = all_go_terms_embeddings_dict[example_name]

# cosine_similarity returns a matrix; [0][0] takes its first entry
# (presumably each stored embedding is a single-row 2D array - TODO confirm).
cosine_similarity(top_hit_embedding, go_term_embedding)[0][0]


lipid metabolic process 
 negative regulation of triglyceride catabolic process


0.40103704

In [25]:
# Build a table of the gene sets whose GPT-4 name is 'System of unrelated proteins'
unnamed_mask = rank_sim_df['gpt_4_default Name'] == 'System of unrelated proteins'
unnamed_sets = rank_sim_df.loc[unnamed_mask]
print(len(unnamed_sets)) ## 26 out of the 1000 GO terms are unnamed

# summary statistics of the gene-set sizes
gene_count_summary = unnamed_sets['Gene_Count'].describe()
print(gene_count_summary) # the unnamed sets are not dependent on the size

unnamed_sets.to_csv('data/GO_term_analysis/sets_no_gpt4name.tsv', sep='\t', index=False)

26
count    26.000000
mean     23.500000
std      25.842987
min       3.000000
25%       5.250000
50%      11.000000
75%      32.750000
max      84.000000
Name: Gene_Count, dtype: float64


In [29]:
# Drop the unnamed sets, then count how many of the remaining sets still have
# a GPT-4 score of exactly 0.
has_name = rank_sim_df['gpt_4_default Name'] != 'System of unrelated proteins'
rank_sim_df_filter = rank_sim_df.loc[has_name]
n_zero_score = sum(rank_sim_df_filter['gpt_4_default Score'] == 0)
print(n_zero_score)

1


### MF and CC branch similarity

In [1]:
import pandas as pd

# GO term pool size range. The gene sets were picked with term size 3-100, so
# it makes more sense to filter the GO term pool to that size range as well.
# These used to be assigned to `min` / `max`, which shadows the Python
# builtins for every cell run afterwards in the same kernel.
MIN_GENE_COUNT = 3
MAX_GENE_COUNT = 100
UNNAMED_LABEL = 'System of unrelated proteins'

for branch in ['CC', 'MF']:
    print(branch)
    all_go = pd.read_csv(f'data/GO_term_analysis/CC_MF_branch/{branch}_go_terms.csv', index_col=0)
    # keep the GO terms whose size lies in the range (both bounds inclusive)
    in_size_range = (all_go['Gene_Count'] >= MIN_GENE_COUNT) & (all_go['Gene_Count'] <= MAX_GENE_COUNT)
    filtered_go = all_go[in_size_range]
    # worst possible rank = number of GO terms in the filtered pool
    max_rank = filtered_go.shape[0]

    branch_tsv = f'data/GO_term_analysis/CC_MF_branch/sim_rank_LLM_processed_selected_1000_go_{branch}terms.tsv'
    rank_sim_df = pd.read_csv(branch_tsv, sep='\t')

    ## number of duplicated rows, by GO id and by analysis text
    print(rank_sim_df.duplicated(subset=['GO']).sum())
    print(rank_sim_df.duplicated(subset=['gpt_4_default Analysis']).sum())

    # number of rows with a missing percentile
    print(rank_sim_df['true_GO_term_sim_percentile'].isna().sum())

    # how many gpt4 names are 'System of unrelated proteins', and how many
    # sets have a GPT-4 score of 0
    is_unnamed = rank_sim_df['gpt_4_default Name'] == UNNAMED_LABEL
    print(is_unnamed.sum())
    print((rank_sim_df['gpt_4_default Score'] == 0).sum())

    # for each unnamed set, set the semantic similarity to 0, the rank to the
    # max number of GO terms, and the percentile to 0
    rank_sim_df.loc[is_unnamed, 'LLM_name_GO_term_sim'] = 0
    rank_sim_df.loc[is_unnamed, 'sim_rank'] = max_rank
    rank_sim_df.loc[is_unnamed, 'true_GO_term_sim_percentile'] = 0

    print((rank_sim_df['true_GO_term_sim_percentile'] == 0).sum())

    # NOTE: this overwrites the file that was read above; the values written
    # for the unnamed sets do not depend on the previous file contents.
    rank_sim_df.to_csv(branch_tsv, sep='\t', index=False)


CC
0
0
0
17
17
17
MF
0
0
0
8
8
8
