In [None]:
from net2rank.utils import H5Loader
import os
import pandas as pd

In [3]:
# Load the embedding file through the project's H5Loader.
# NOTE(review): the file name suggests 64-dimensional node2vec embeddings for
# taxon 9606 (human) — inferred from the path only, confirm against the loader.
embeddings = H5Loader('../data/9606.node2vec64.h5')
# Total number of proteins exposed by the loader, i.e. the protein universe
# against which the training negatives are counted further down.
num_proteins = len(embeddings.proteins)

In [3]:
# Training-set summary: one row per (disease, data type) with the number of
# positive proteins and the number of remaining proteins (negatives).

# For these diseases only the '.integrated.' file is used and the number of
# positives is fixed instead of being counted from the file. The same cut-offs
# are applied to the leading rows of these files when the Jaccard indices are
# computed later in the notebook.
integrated_num_pos = {
    'atopic_dermatitis': 1000,
    'ulcerative_colitis': 1300,
}

# Take the protein universe straight from the embeddings rather than from the
# `num_proteins` global: another cell reassigns that name, so re-running this
# cell out of order would otherwise silently produce wrong negative counts.
total_proteins = len(embeddings.proteins)

records = []
for file in os.listdir('../data/train'):

    if not file.endswith('.tsv'):
        continue

    # File names look like <disease>.<data type>[.<more>].tsv
    disease_name, data_type = file.split('.')[:2]

    if disease_name in integrated_num_pos:
        if data_type != 'integrated':
            continue
        num_pos = integrated_num_pos[disease_name]
    else:
        # Every line of the file counts as one positive protein; the context
        # manager closes the handle (the previous open().readlines() leaked it).
        with open(f'../data/train/{file}') as handle:
            num_pos = sum(1 for _ in handle)

    num_neg = total_proteins - num_pos

    # 'focal_epilepsy' -> 'Focal Epilepsy' for display
    disease_name = disease_name.replace('_', ' ').title()

    records.append((disease_name, data_type, num_pos, num_neg))

pd.DataFrame(records, columns=['disease', 'type', 'num_pos', 'num_neg'])

Unnamed: 0,disease,type,num_pos,num_neg
0,Focal Epilepsy,rnaseq,1106,18516
1,Melanoma,mutations,83,19539
2,Atopic Dermatitis,integrated,1000,18622
3,Ulcerative Colitis,integrated,1300,18322
4,Diffuse Large B-Cell Lymphoma,mutations,86,19536
5,Colorectal Adenocarcinoma,mutations,82,19540


In [4]:
# Test-set summary: for every .tsv file in ../data/test count how many rows
# carry label 1 (positives) and label 0 (negatives).
records = []
tsv_names = [name for name in os.listdir('../data/test') if name.endswith('.tsv')]
for tsv_name in tsv_names:
    # The files have no header row; the columns are named here.
    labels = pd.read_csv(
        f'../data/test/{tsv_name}',
        sep='\t',
        header=None,
        names=['protein', 'label'],
    )['label']

    positives = int((labels == 1).sum())
    negatives = int((labels == 0).sum())

    # 'focal_epilepsy.<...>.tsv' -> 'Focal Epilepsy'
    pretty_name = tsv_name.split('.')[0].replace('_', ' ').title()
    records.append((pretty_name, positives, negatives))

pd.DataFrame(records, columns=['disease', 'num_pos', 'num_neg'])

Unnamed: 0,disease,num_pos,num_neg
0,Focal Epilepsy,121,121
1,Aortic Aneurysm,98,98
2,Colorectal Adenocarcinoma,49,49
3,Melanoma,404,404
4,Ulcerative Colitis,209,209
5,Atopic Dermatitis,188,188
6,Diffuse Large B-Cell Lymphoma,274,274


In [5]:
def jaccard(first, second):
    """Return the Jaccard index |first & second| / |first | second| of two sets.

    Two empty sets yield 0.0 instead of raising ZeroDivisionError.
    """
    union = first | second
    if not union:
        return 0.0
    return len(first & second) / len(union)


def read_line_set(path):
    """Return the set of non-blank lines of the file at `path`.

    Line terminators are removed so that an entry on a final line without a
    trailing newline still matches the same entry in another file, and blank
    lines are ignored so they cannot count as a shared member. The file is
    closed on exit.
    """
    with open(path) as handle:
        return {line.rstrip('\r\n') for line in handle if line.strip()}


## jaccard index of two inflammatory diseases
# Only the leading 1000 / 1300 rows of the integrated files are used; the same
# numbers are hard-coded as num_pos in the training-set summary.
ad_set = set(pd.read_csv('../data/train/atopic_dermatitis.integrated.tsv', sep='\t')['protein'][:1000])
uc_set = set(pd.read_csv('../data/train/ulcerative_colitis.integrated.tsv', sep='\t')['protein'][:1300])
jaccard_index = jaccard(ad_set, uc_set)
print(f'Jaccard index of Atopic Dermatitis and Ulcerative Colitis: {jaccard_index:.4f}')

# jaccard index of three cancers
# Each whole line of these files is treated as one set member.
melanoma_set = read_line_set('../data/train/melanoma.mutations.intogen.tsv')
colorectal_set = read_line_set('../data/train/colorectal_adenocarcinoma.mutations.intogen.tsv')
lymphoma_set = read_line_set('../data/train/diffuse_large_b-cell_lymphoma.mutations.intogen.tsv')
# jaccard index of any two cancers
melanoma_colorectal_jaccard = jaccard(melanoma_set, colorectal_set)
melanoma_lymphoma_jaccard = jaccard(melanoma_set, lymphoma_set)
colorectal_lymphoma_jaccard = jaccard(colorectal_set, lymphoma_set)
print(f'Jaccard index of Melanoma and Colorectal Adenocarcinoma: {melanoma_colorectal_jaccard:.4f}')
print(f'Jaccard index of Melanoma and Diffuse Large B-cell Lymphoma: {melanoma_lymphoma_jaccard:.4f}')
print(f'Jaccard index of Colorectal Adenocarcinoma and Diffuse Large B-cell Lymphoma: {colorectal_lymphoma_jaccard:.4f}')

Jaccard index of Atopic Dermatitis and Ulcerative Colitis: 0.0604
Jaccard index of Melanoma and Colorectal Adenocarcinoma: 0.1301
Jaccard index of Melanoma and Diffuse Large B-cell Lymphoma: 0.0903
Jaccard index of Colorectal Adenocarcinoma and Diffuse Large B-cell Lymphoma: 0.0701


In [3]:
# Prediction-results summary: for every disease directory under
# ../results/test_results read the FPR-0.05 prediction table and count the
# predicted proteins per category.
results_dir = '../results/test_results'
records = []
for disease_name in os.listdir(results_dir):
    # Each disease lives in its own sub-directory; skip stray files such as
    # .DS_Store instead of failing on a non-existent results path.
    if not os.path.isdir(os.path.join(results_dir, disease_name)):
        continue
    file_path = f'{results_dir}/{disease_name}/{disease_name}_fpr_0.05_prediction_results.tsv'
    df = pd.read_csv(file_path, sep='\t')

    # Deliberately NOT called `num_proteins`: that global holds the number of
    # proteins in the embeddings and is read by the training-set summary cell.
    num_predicted = len(df)

    # One pass over the category column instead of four filtered copies.
    category_counts = df['category'].value_counts()
    num_both = int(category_counts.get('both', 0))
    num_novel = int(category_counts.get('novel', 0))
    # Rows categorised as 'both' belong to the training AND the text-mining
    # group, so they are added to each of the two totals.
    num_training = int(category_counts.get('training', 0)) + num_both
    num_text_mining = int(category_counts.get('text_mining', 0)) + num_both

    records.append((disease_name, num_predicted, num_training, num_text_mining, num_both, num_novel))
df_results = pd.DataFrame(records, columns=['disease', 'num_proteins', 'num_training', 'num_text_mining','both', 'num_novel'])
# order the diseases by this order
# (reindex yields an all-NaN row for a listed disease that has no results and
# drops any result whose directory name is not listed here)
disease_order = ['atopic_dermatitis', 'ulcerative_colitis',
                 'colorectal_adenocarcinoma', 'diffuse_large_b-cell_lymphoma',
                 'melanoma', 'focal_epilepsy', 'aortic_aneurysm',]
df_results = df_results.set_index('disease').reindex(disease_order).reset_index()
df_results

Unnamed: 0,disease,num_proteins,num_training,num_text_mining,both,num_novel
0,atopic_dermatitis,1717,370,157,68,1258
1,ulcerative_colitis,2647,448,160,50,2089
2,colorectal_adenocarcinoma,2213,76,46,13,2104
3,diffuse_large_b-cell_lymphoma,3183,85,266,53,2885
4,melanoma,3320,81,236,12,3015
5,focal_epilepsy,1329,280,67,13,995
6,aortic_aneurysm,996,67,45,7,891
