In [1]:
import pandas as pd
import numpy as np
from utils import extract_gene_symbol
all_genes = []
with open("./GeneLists/all_genes.txt","r") as f:
    all_genes = f.readlines()
f.close()
all_genes = [text.strip() for text in all_genes]


In [3]:
import random
random.seed(1234)
def process_msigdb_data_file(filename, all_genes):
    data = pd.read_csv(filename, sep = "\t")
    for index, row in data.iterrows():
        if row['num_of_genes'] <= 50:
            pos_genes = row['pos_rel_genes'].split(",")
            pos_genes = ["Gene_"+ gene for gene in pos_genes]
            neg_genes = row['neg_rel_genes'].split(",")
            neg_genes = ["Gene_"+ gene for gene in neg_genes]
            pos_genes = np.intersect1d(all_genes, pos_genes)
            neg_genes = np.intersect1d(all_genes, neg_genes)
            n = min(len(pos_genes), len(neg_genes))
            data.iloc[index, 1] = n
            data.iloc[index, 2] = ",".join(random.sample(set(pos_genes), n))
            data.iloc[index, 3] = ",".join(random.sample(set(neg_genes), n))
    return data


In [None]:
# process msigdb dataset to fit our gene tags
data = process_msigdb_data_file("./Datasets/c2.all.v6.2.entrez_term-gene_dataset.tsv",all_genes)
data = data[(data.num_of_genes <= 50) & (data.num_of_genes >= 10)]
print(data.shape)
print(data.head())
data.to_csv("./Datasets/c2.all.v6.2.entrez_term-gene_dataset.tsv", index=False, sep ="\t")

In [80]:
# process msigdb dataset to fit our gene tags
data = process_msigdb_data_file("./Datasets/c4.all.v6.2.entrez_term-gene_dataset.tsv",all_genes)
data = data[(data.num_of_genes <= 50) & (data.num_of_genes >= 10)]
print(data.shape)
print(data.head())
data.to_csv("./Datasets/c4.all.v6.2.entrez_term-gene_dataset.tsv", index=False, sep ="\t")

since Python 3.9 and will be removed in a subsequent version.
  data.iloc[index, 2] = ",".join(random.sample(set(pos_genes), n))
since Python 3.9 and will be removed in a subsequent version.
  data.iloc[index, 3] = ",".join(random.sample(set(neg_genes), n))


(141, 4)
            id  num_of_genes  \
1    GNF2_SELL            10   
4    GNF2_MCL1            17   
16   GNF2_CD97            13   
17  MODULE_329            16   
18  GNF2_CKS1B            12   

                                        pos_rel_genes  \
1   Gene_834,Gene_4067,Gene_5062,Gene_4615,Gene_92...   
4   Gene_54918,Gene_5791,Gene_3689,Gene_6402,Gene_...   
16  Gene_3716,Gene_3587,Gene_5788,Gene_101,Gene_36...   
17  Gene_8048,Gene_2702,Gene_2314,Gene_59,Gene_800...   
18  Gene_55143,Gene_890,Gene_5901,Gene_3832,Gene_2...   

                                        neg_rel_genes  
1   Gene_4738,Gene_10472,Gene_7444,Gene_25932,Gene...  
4   Gene_214,Gene_3918,Gene_5756,Gene_84666,Gene_6...  
16  Gene_8692,Gene_8111,Gene_623,Gene_2737,Gene_38...  
17  Gene_636,Gene_1910,Gene_10551,Gene_1647,Gene_1...  
18  Gene_390,Gene_26762,Gene_6657,Gene_1896,Gene_5...  


In [8]:
from utils import cosine
def compute_geneset_similarity(geneset, model, standardize = False):
    pairs = [(i,j) for i in geneset for j in geneset]
    sims = []
    for pair in pairs:
        sims.append(cosine(model[pair[0]],model[pair[1]]))
    if standardize:
        sims = standardize_similarity(sims)
    return np.mean(sims)

def standardize_similarity(sims):
    avg = np.mean(sims)
    sd = np.std(sims)
    sims = (sims - avg) / sd
    sims = (sims - min(sims))/(max(sims) - min(sims))
    return sims

def compute_dataset_similarity(datafile, model, standardize = False):
    data = pd.read_csv(datafile, sep = "\t")
    pos_sims = []
    neg_sims = []
    for index,row in data.iterrows():
        pos_set = row['pos_rel_genes'].split(",")
        neg_set = row['neg_rel_genes'].split(",")
        pos_sims.append(compute_geneset_similarity(pos_set, model = model, standardize = standardize))
        neg_sims.append(compute_geneset_similarity(neg_set, model = model, standardize = standardize))
    return np.mean(pos_sims), np.mean(neg_sims), np.mean((np.array(pos_sims) - np.array(neg_sims)) * 100) 
    



In [5]:
from gensim.models  import KeyedVectors, Word2Vec
from utils import load_embedding
from time import time

t = time()
w2v_cbow = load_embedding("./WordVectors/Computed/word2vec_cbow.bin", binary=True)
print("Time to load cbow embeddings in mins: ", round(((time() - t)/60.0),4))

t = time()
w2v_sg = load_embedding("./WordVectors/Computed/word2vec_skipgram.bin", binary=True)
print("Time to load skipgram embeddings in mins: ", round(((time() - t)/60.0),4))

embedding loaded from ./WordVectors/Computed/word2vec_cbow.bin
Time to load cbow embeddings in mins:  0.0034
embedding loaded from ./WordVectors/Computed/word2vec_skipgram.bin
Time to load skipgram embeddings in mins:  0.003


In [15]:
model = w2v_cbow
avg_pos_sim, avg_neg_sim, perc_diff = compute_dataset_similarity("./Datasets/c2.all.v6.2.entrez_term-gene_dataset.tsv", model=model, standardize=False)
print("[CBOW, C2] ===== > Avg pos sim: {0:.4f}\t Avg neg sim: {1:.4f}\t Diff: {2:.4f}%".format(avg_pos_sim, avg_neg_sim, perc_diff))
avg_pos_sim, avg_neg_sim, perc_diff = compute_dataset_similarity("./Datasets/c4.all.v6.2.entrez_term-gene_dataset.tsv", model=model, standardize=False)
print("[CBOW, C4] ===== > Avg pos sim: {0:.4f}\t Avg neg sim: {1:.4f}\t Diff: {2:.4f}%".format(avg_pos_sim, avg_neg_sim, perc_diff))

[CBOW, C2] ===== > Avg pos sim: 0.4178	 Avg neg sim: 0.3270	 Diff: 9.0729%
[CBOW, C4] ===== > Avg pos sim: 0.4305	 Avg neg sim: 0.3293	 Diff: 10.1182%


In [14]:
model = w2v_sg
avg_pos_sim, avg_neg_sim, perc_diff = compute_dataset_similarity("./Datasets/c2.all.v6.2.entrez_term-gene_dataset.tsv", model=model, standardize=False)
print("[SG, C2] ===== > Avg pos sim: {0:.4f}\t Avg neg sim: {1:.4f}\t Diff: {2:.4f}%".format(avg_pos_sim, avg_neg_sim, perc_diff))
avg_pos_sim, avg_neg_sim, perc_diff = compute_dataset_similarity("./Datasets/c4.all.v6.2.entrez_term-gene_dataset.tsv", model=model, standardize=False)
print("[SG, C4] ===== > Avg pos sim: {0:.4f}\t Avg neg sim: {1:.4f}\t Diff: {2:.4f}%".format(avg_pos_sim, avg_neg_sim, perc_diff))

[SG, C2] ===== > Avg pos sim: 0.4247	 Avg neg sim: 0.3303	 Diff: 9.4422%
[SG, C4] ===== > Avg pos sim: 0.4331	 Avg neg sim: 0.3324	 Diff: 10.0699%


In [85]:
def read_genes_from_file(filename):
    genes = []
    with open(filename, "r") as f:
        genes = f.readlines()
    f.close()
    genes = [gene.strip() for gene in genes]
    return genes


In [88]:
collagen_genes = np.intersect1d(read_genes_from_file("./GeneLists/collagen_genes.txt"), all_genes)
m1m2_genes = np.intersect1d(read_genes_from_file("./GeneLists/m1m2_genes.txt"), all_genes)
fibrinogen_genes = np.intersect1d(read_genes_from_file("./GeneLists/fibrinogen_genes.txt"), all_genes)
muscle_reg_genes = np.intersect1d(read_genes_from_file("./GeneLists/muscle_regeneration_genes.txt"), all_genes)
neural_reg_genes = np.intersect1d(read_genes_from_file("./GeneLists/neural_regeneration_genes.txt"), all_genes)

In [96]:
model = w2v_cbow
print("[CBOW, Collagens] ====> Avg Sim: {0: .2f}", compute_geneset_similarity(collagen_genes, model, True))
print("[CBOW, Fibrigens] ====> Avg Sim: {0: .2f}", compute_geneset_similarity(fibrinogen_genes, model, True))
print("[CBOW, M1/M2] ====> Avg Sim: {0: .2f}", compute_geneset_similarity(m1m2_genes, model, True))
print("[CBOW, Muscle Reg] ====> Avg Sim: {0: .2f}", compute_geneset_similarity(muscle_reg_genes, model, True))
print("[CBOW, Neural_reg] ====> Avg Sim: {0: .2f}", compute_geneset_similarity(neural_reg_genes, model, True))

[CBOW, Collagens] ====> Avg Sim: {0: .2f} 0.547803
[CBOW, Fibrigens] ====> Avg Sim: {0: .2f} 0.43907523
[CBOW, M1/M2] ====> Avg Sim: {0: .2f} 0.49272108
[CBOW, Muscle Reg] ====> Avg Sim: {0: .2f} 0.4087264
[CBOW, Neural_reg] ====> Avg Sim: {0: .2f} 0.421225


In [97]:
model = w2v_sg
print("[CBOW, Collagens] ====> Avg Sim: {0: .2f}", compute_geneset_similarity(collagen_genes, model, True))
print("[CBOW, Fibrigens] ====> Avg Sim: {0: .2f}", compute_geneset_similarity(fibrinogen_genes, model, True))
print("[CBOW, M1/M2] ====> Avg Sim: {0: .2f}", compute_geneset_similarity(m1m2_genes, model, True))
print("[CBOW, Muscle Reg] ====> Avg Sim: {0: .2f}", compute_geneset_similarity(muscle_reg_genes, model, True))
print("[CBOW, Neural_reg] ====> Avg Sim: {0: .2f}", compute_geneset_similarity(neural_reg_genes, model, True))

[CBOW, Collagens] ====> Avg Sim: {0: .2f} 0.5546585
[CBOW, Fibrigens] ====> Avg Sim: {0: .2f} 0.37193477
[CBOW, M1/M2] ====> Avg Sim: {0: .2f} 0.49427536
[CBOW, Muscle Reg] ====> Avg Sim: {0: .2f} 0.4087264
[CBOW, Neural_reg] ====> Avg Sim: {0: .2f} 0.37672463
