This notebook contains the code that was used to perform the ablation study.  This is part of our project to reproduce the paper titled: "Text2Mol: Cross-Modal Molecule Retrieval with Natural Language Queries" by Carl Edwards, ChengXiang Zhai, and Heng Ji (see [1]).

- Course: CS598 Deep Learning for Healthcare (Spring 2024)
- Project Team:
  1. Emily Meyer (emilynm4@illinois.edu)
  2. Anna Wysocka (annamw2@illinois.edu)
  3. Jake Kugel (jakugel2@illinois.edu)

Approximate runtime: 30 minutes (no GPU used)

In [None]:
# Record notebook start time (wall-clock seconds since the epoch, as returned by time.time()).
# NOTE(review): presumably compared with a later time.time() call to report the total
# runtime - confirm against the end of the notebook.
import time
start_time = time.time()

In [None]:
# CHANGE START
# This cell downloads data needed for this notebook, and stores it in the runtime's local folder, /content.
# It is safe to run this cell more than once, the download and unzip will be skipped after the initial run.
import gdown
import os

def download(id, zip_name, folder_name):
    """
    Downloads and unzips a zip file (must be shared as public) from Google Drive.
    If folder_name already exists, the download and unzip are skipped.

    The archive is extracted into a temporary sibling folder first and only renamed
    to folder_name once extraction has finished, so an interrupted run never leaves
    a half-filled folder_name behind that later runs would mistake for a complete one.

    Parameters:
    id (str): The ID of the file to download.  This can be found in the URL of the file.
              (The name shadows the builtin id(); it is kept so keyword callers still work.)
    zip_name (str): The name and path of the zip file to download.
    folder_name (str): The name of the folder to extract the zip file to. If the folder does not exist, it will be created.

    Returns:
    None

    Raises:
    FileNotFoundError: If the download did not produce zip_name.
    """
    # Local imports keep this cell self-contained (it runs before the main import cell).
    import shutil
    import zipfile

    if os.path.exists(folder_name):
        print(f'The archive {zip_name} has already been downloaded and extracted to {folder_name}, skipping.')
        return

    gdown.download(id=id, output=zip_name)
    if not os.path.exists(zip_name):
        # Fail with a clear message instead of a confusing error from the unzip/remove steps.
        raise FileNotFoundError(f'Download of file id {id} did not create {zip_name}.')

    # Extract with the standard library instead of shelling out to `unzip`: no shell
    # quoting issues with the paths and no per-file output flooding the cell.
    staging_dir = os.path.normpath(folder_name) + '.partial'
    shutil.rmtree(staging_dir, ignore_errors=True)  # leftovers from an interrupted run
    with zipfile.ZipFile(zip_name) as archive:
        archive.extractall(staging_dir)
    os.rename(staging_dir, folder_name)

    os.remove(zip_name)
    print('Download and unzip complete!')

# Fetch the project archive from Google Drive and extract it to /content/text2mol
# (download() skips the work when that folder already exists).
download('1rezM24hhHwMoRWIushQ28po7PoYbw6Er', "/content/text2mol.zip", "/content/text2mol")
# Make the extracted folder the working directory so that the relative "inputs/..."
# paths used by the following cells resolve inside it.
os.chdir('/content/text2mol')
# CHANGE END

Downloading...
From (original): https://drive.google.com/uc?id=1rezM24hhHwMoRWIushQ28po7PoYbw6Er
From (redirected): https://drive.google.com/uc?id=1rezM24hhHwMoRWIushQ28po7PoYbw6Er&confirm=t&uuid=e595734f-20ea-49a7-b829-51b0330dc045
To: /content/text2mol.zip
100%|██████████| 1.87G/1.87G [00:19<00:00, 97.8MB/s]


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/text2mol/graph-data/raw/91858170.graph  
  inflating: /content/text2mol/graph-data/raw/91858171.graph  
  inflating: /content/text2mol/graph-data/raw/91858178.graph  
  inflating: /content/text2mol/graph-data/raw/91858184.graph  
  inflating: /content/text2mol/graph-data/raw/91858190.graph  
  inflating: /content/text2mol/graph-data/raw/91858192.graph  
  inflating: /content/text2mol/graph-data/raw/91858201.graph  
  inflating: /content/text2mol/graph-data/raw/91858205.graph  
  inflating: /content/text2mol/graph-data/raw/91858207.graph  
  inflating: /content/text2mol/graph-data/raw/91858209.graph  
  inflating: /content/text2mol/graph-data/raw/91858213.graph  
  inflating: /content/text2mol/graph-data/raw/91858217.graph  
  inflating: /content/text2mol/graph-data/raw/91858218.graph  
  inflating: /content/text2mol/graph-data/raw/91858220.graph  
  inflating: /content/text2mol/graph-data/raw/9185822

In [None]:
import os
import shutil

import numpy as np

import matplotlib.pyplot as plt

import math

In [None]:
# One directory of saved embeddings per trained model ("embedding space").
dir1 = "inputs/GCN1/embeddings/"
dir2 = "inputs/GCN2/embeddings/"
dir3 = "inputs/GCN3/embeddings/"
dir4 = "inputs/MLP1/embeddings/"
dir5 = "inputs/MLP2/embeddings/"
dir6 = "inputs/MLP3/embeddings/"

# CHANGE START
dir7 = "inputs/GCN4/embeddings/"
dir8 = "inputs/MLP4/embeddings/"
# CHANGE END


def _load_embedding_space(directory):
    """Load the nine arrays stored for one embedding space.

    Parameters:
    directory (str): Folder prefix (including the trailing slash) holding the .npy files.

    Returns:
    tuple: (cids_train, cids_val, cids_test,
            chem_embeddings_train, chem_embeddings_val, chem_embeddings_test,
            text_embeddings_train, text_embeddings_val, text_embeddings_test)
    """
    splits = ("train", "val", "test")
    # NOTE(review): allow_pickle=True unpickles the CID files, which can execute
    # arbitrary code - only load archives from a trusted source.
    cids = [np.load(directory + f"cids_{split}.npy", allow_pickle=True) for split in splits]
    chem = [np.load(directory + f"chem_embeddings_{split}.npy") for split in splits]
    text = [np.load(directory + f"text_embeddings_{split}.npy") for split in splits]
    return (*cids, *chem, *text)


(cids_train1, cids_val1, cids_test1,
 chem_embeddings_train1, chem_embeddings_val1, chem_embeddings_test1,
 text_embeddings_train1, text_embeddings_val1, text_embeddings_test1) = _load_embedding_space(dir1)

(cids_train2, cids_val2, cids_test2,
 chem_embeddings_train2, chem_embeddings_val2, chem_embeddings_test2,
 text_embeddings_train2, text_embeddings_val2, text_embeddings_test2) = _load_embedding_space(dir2)

(cids_train3, cids_val3, cids_test3,
 chem_embeddings_train3, chem_embeddings_val3, chem_embeddings_test3,
 text_embeddings_train3, text_embeddings_val3, text_embeddings_test3) = _load_embedding_space(dir3)

(cids_train4, cids_val4, cids_test4,
 chem_embeddings_train4, chem_embeddings_val4, chem_embeddings_test4,
 text_embeddings_train4, text_embeddings_val4, text_embeddings_test4) = _load_embedding_space(dir4)

(cids_train5, cids_val5, cids_test5,
 chem_embeddings_train5, chem_embeddings_val5, chem_embeddings_test5,
 text_embeddings_train5, text_embeddings_val5, text_embeddings_test5) = _load_embedding_space(dir5)

(cids_train6, cids_val6, cids_test6,
 chem_embeddings_train6, chem_embeddings_val6, chem_embeddings_test6,
 text_embeddings_train6, text_embeddings_val6, text_embeddings_test6) = _load_embedding_space(dir6)

# CHANGE START
(cids_train7, cids_val7, cids_test7,
 chem_embeddings_train7, chem_embeddings_val7, chem_embeddings_test7,
 text_embeddings_train7, text_embeddings_val7, text_embeddings_test7) = _load_embedding_space(dir7)

(cids_train8, cids_val8, cids_test8,
 chem_embeddings_train8, chem_embeddings_val8, chem_embeddings_test8,
 text_embeddings_train8, text_embeddings_val8, text_embeddings_test8) = _load_embedding_space(dir8)
# CHANGE END

#Reorder (this is very important):
# Every embedding space stores its rows in its own CID order. The evaluation cells
# index the correct molecule purely by row position, so spaces 2-8 are permuted here
# to follow the CID order of space 1.

def _alignment_indexes(reference_cids, space_cids):
    """Return, for each CID in reference_cids, its row position in space_cids.

    Equivalent to [space_cids.tolist().index(c) for c in reference_cids] (first
    occurrence wins), but uses a dict so the cost is linear instead of quadratic.

    Parameters:
    reference_cids (np.ndarray): CIDs in the target order (space 1).
    space_cids (np.ndarray): CIDs in the order stored for the space being aligned.

    Returns:
    list of int: Row indexes into the arrays of the space being aligned.

    Raises:
    ValueError: If a reference CID does not occur in space_cids.
    """
    first_position = {}
    for position, cid in enumerate(space_cids.tolist()):
        # setdefault keeps the first occurrence, matching list.index().
        first_position.setdefault(cid, position)
    try:
        return [first_position[cid] for cid in reference_cids.tolist()]
    except KeyError as missing:
        raise ValueError(f"CID {missing.args[0]!r} is missing from the embedding space being aligned") from None


indexes = _alignment_indexes(cids_train1, cids_train2)
indexes_val = _alignment_indexes(cids_val1, cids_val2)
indexes_test = _alignment_indexes(cids_test1, cids_test2)

cids_train2 = cids_train2[indexes]
cids_val2 = cids_val2[indexes_val]
cids_test2 = cids_test2[indexes_test]

chem_embeddings_train2 = chem_embeddings_train2[indexes]
text_embeddings_train2 = text_embeddings_train2[indexes]
chem_embeddings_val2 = chem_embeddings_val2[indexes_val]
text_embeddings_val2 = text_embeddings_val2[indexes_val]
chem_embeddings_test2 = chem_embeddings_test2[indexes_test]
text_embeddings_test2 = text_embeddings_test2[indexes_test]


indexes = _alignment_indexes(cids_train1, cids_train3)
indexes_val = _alignment_indexes(cids_val1, cids_val3)
indexes_test = _alignment_indexes(cids_test1, cids_test3)

cids_train3 = cids_train3[indexes]
cids_val3 = cids_val3[indexes_val]
cids_test3 = cids_test3[indexes_test]

chem_embeddings_train3 = chem_embeddings_train3[indexes]
text_embeddings_train3 = text_embeddings_train3[indexes]
chem_embeddings_val3 = chem_embeddings_val3[indexes_val]
text_embeddings_val3 = text_embeddings_val3[indexes_val]
chem_embeddings_test3 = chem_embeddings_test3[indexes_test]
text_embeddings_test3 = text_embeddings_test3[indexes_test]


indexes = _alignment_indexes(cids_train1, cids_train4)
indexes_val = _alignment_indexes(cids_val1, cids_val4)
indexes_test = _alignment_indexes(cids_test1, cids_test4)

cids_train4 = cids_train4[indexes]
cids_val4 = cids_val4[indexes_val]
cids_test4 = cids_test4[indexes_test]

chem_embeddings_train4 = chem_embeddings_train4[indexes]
text_embeddings_train4 = text_embeddings_train4[indexes]
chem_embeddings_val4 = chem_embeddings_val4[indexes_val]
text_embeddings_val4 = text_embeddings_val4[indexes_val]
chem_embeddings_test4 = chem_embeddings_test4[indexes_test]
text_embeddings_test4 = text_embeddings_test4[indexes_test]


indexes = _alignment_indexes(cids_train1, cids_train5)
indexes_val = _alignment_indexes(cids_val1, cids_val5)
indexes_test = _alignment_indexes(cids_test1, cids_test5)

cids_train5 = cids_train5[indexes]
cids_val5 = cids_val5[indexes_val]
cids_test5 = cids_test5[indexes_test]

chem_embeddings_train5 = chem_embeddings_train5[indexes]
text_embeddings_train5 = text_embeddings_train5[indexes]
chem_embeddings_val5 = chem_embeddings_val5[indexes_val]
text_embeddings_val5 = text_embeddings_val5[indexes_val]
chem_embeddings_test5 = chem_embeddings_test5[indexes_test]
text_embeddings_test5 = text_embeddings_test5[indexes_test]


indexes = _alignment_indexes(cids_train1, cids_train6)
indexes_val = _alignment_indexes(cids_val1, cids_val6)
indexes_test = _alignment_indexes(cids_test1, cids_test6)

cids_train6 = cids_train6[indexes]
cids_val6 = cids_val6[indexes_val]
cids_test6 = cids_test6[indexes_test]

chem_embeddings_train6 = chem_embeddings_train6[indexes]
text_embeddings_train6 = text_embeddings_train6[indexes]
chem_embeddings_val6 = chem_embeddings_val6[indexes_val]
text_embeddings_val6 = text_embeddings_val6[indexes_val]
chem_embeddings_test6 = chem_embeddings_test6[indexes_test]
text_embeddings_test6 = text_embeddings_test6[indexes_test]

# CHANGE START
indexes = _alignment_indexes(cids_train1, cids_train7)
indexes_val = _alignment_indexes(cids_val1, cids_val7)
indexes_test = _alignment_indexes(cids_test1, cids_test7)

cids_train7 = cids_train7[indexes]
cids_val7 = cids_val7[indexes_val]
cids_test7 = cids_test7[indexes_test]

chem_embeddings_train7 = chem_embeddings_train7[indexes]
text_embeddings_train7 = text_embeddings_train7[indexes]
chem_embeddings_val7 = chem_embeddings_val7[indexes_val]
text_embeddings_val7 = text_embeddings_val7[indexes_val]
chem_embeddings_test7 = chem_embeddings_test7[indexes_test]
text_embeddings_test7 = text_embeddings_test7[indexes_test]

indexes = _alignment_indexes(cids_train1, cids_train8)
indexes_val = _alignment_indexes(cids_val1, cids_val8)
indexes_test = _alignment_indexes(cids_test1, cids_test8)

cids_train8 = cids_train8[indexes]
cids_val8 = cids_val8[indexes_val]
cids_test8 = cids_test8[indexes_test]

chem_embeddings_train8 = chem_embeddings_train8[indexes]
text_embeddings_train8 = text_embeddings_train8[indexes]
chem_embeddings_val8 = chem_embeddings_val8[indexes_val]
text_embeddings_val8 = text_embeddings_val8[indexes_val]
chem_embeddings_test8 = chem_embeddings_test8[indexes_test]
text_embeddings_test8 = text_embeddings_test8[indexes_test]
# CHANGE END

# Per space, stack the molecule embeddings of all three splits row-wise.
# Row order is train, then val, then test; the evaluation cells rely on this
# order when they look up the correct molecule by position.
all_chem_embbedings1 = np.concatenate(
    [chem_embeddings_train1, chem_embeddings_val1, chem_embeddings_test1], axis=0)
all_chem_embbedings2 = np.concatenate(
    [chem_embeddings_train2, chem_embeddings_val2, chem_embeddings_test2], axis=0)
all_chem_embbedings3 = np.concatenate(
    [chem_embeddings_train3, chem_embeddings_val3, chem_embeddings_test3], axis=0)
all_chem_embbedings4 = np.concatenate(
    [chem_embeddings_train4, chem_embeddings_val4, chem_embeddings_test4], axis=0)
all_chem_embbedings5 = np.concatenate(
    [chem_embeddings_train5, chem_embeddings_val5, chem_embeddings_test5], axis=0)
all_chem_embbedings6 = np.concatenate(
    [chem_embeddings_train6, chem_embeddings_val6, chem_embeddings_test6], axis=0)

# CHANGE START
all_chem_embbedings7 = np.concatenate(
    [chem_embeddings_train7, chem_embeddings_val7, chem_embeddings_test7], axis=0)
all_chem_embbedings8 = np.concatenate(
    [chem_embeddings_train8, chem_embeddings_val8, chem_embeddings_test8], axis=0)
# CHANGE END


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def memory_efficient_similarity_matrix_custom(func, embedding1, embedding2, chunk_size = 1000):
    """Lazily compute func(embedding1, embedding2) one block of rows at a time.

    This is a generator: nothing is computed until it is iterated, and it can be
    iterated only once.

    Parameters:
    func (callable): Called as func(rows_of_embedding1, embedding2) for each block.
    embedding1 (array): 2-D array whose rows are processed in consecutive blocks.
    embedding2 (array): Passed unchanged to every call of func.
    chunk_size (int): Maximum number of embedding1 rows per block.

    Yields:
    The value returned by func for each block, in row order. The last block may
    hold fewer than chunk_size rows.
    """
    total_rows = embedding1.shape[0]
    n_blocks = int(np.ceil(total_rows / chunk_size))

    start = 0
    for _ in range(n_blocks):
        # Clamp the final block to the number of available rows.
        stop = min(start + chunk_size, total_rows)
        yield func(embedding1[start:stop, :], embedding2)
        start += chunk_size


def _cosine_chunks(text_embeddings, chem_embeddings):
    """Return the lazy, chunked cosine-similarity generator for one split of one space."""
    return memory_efficient_similarity_matrix_custom(cosine_similarity, text_embeddings, chem_embeddings)


# One generator per (space, split): descriptions of that split against ALL molecules
# of the same space. Generators are single-use, so this cell must be re-run before
# any evaluation cell is run a second time.
text_chem_cos1 = _cosine_chunks(text_embeddings_train1, all_chem_embbedings1)
text_chem_cos_val1 = _cosine_chunks(text_embeddings_val1, all_chem_embbedings1)
text_chem_cos_test1 = _cosine_chunks(text_embeddings_test1, all_chem_embbedings1)

text_chem_cos2 = _cosine_chunks(text_embeddings_train2, all_chem_embbedings2)
text_chem_cos_val2 = _cosine_chunks(text_embeddings_val2, all_chem_embbedings2)
text_chem_cos_test2 = _cosine_chunks(text_embeddings_test2, all_chem_embbedings2)

text_chem_cos3 = _cosine_chunks(text_embeddings_train3, all_chem_embbedings3)
text_chem_cos_val3 = _cosine_chunks(text_embeddings_val3, all_chem_embbedings3)
text_chem_cos_test3 = _cosine_chunks(text_embeddings_test3, all_chem_embbedings3)

text_chem_cos4 = _cosine_chunks(text_embeddings_train4, all_chem_embbedings4)
text_chem_cos_val4 = _cosine_chunks(text_embeddings_val4, all_chem_embbedings4)
text_chem_cos_test4 = _cosine_chunks(text_embeddings_test4, all_chem_embbedings4)

text_chem_cos5 = _cosine_chunks(text_embeddings_train5, all_chem_embbedings5)
text_chem_cos_val5 = _cosine_chunks(text_embeddings_val5, all_chem_embbedings5)
text_chem_cos_test5 = _cosine_chunks(text_embeddings_test5, all_chem_embbedings5)

text_chem_cos6 = _cosine_chunks(text_embeddings_train6, all_chem_embbedings6)
text_chem_cos_val6 = _cosine_chunks(text_embeddings_val6, all_chem_embbedings6)
text_chem_cos_test6 = _cosine_chunks(text_embeddings_test6, all_chem_embbedings6)

# CHANGE START
text_chem_cos7 = _cosine_chunks(text_embeddings_train7, all_chem_embbedings7)
text_chem_cos_val7 = _cosine_chunks(text_embeddings_val7, all_chem_embbedings7)
text_chem_cos_test7 = _cosine_chunks(text_embeddings_test7, all_chem_embbedings7)

text_chem_cos8 = _cosine_chunks(text_embeddings_train8, all_chem_embbedings8)
text_chem_cos_val8 = _cosine_chunks(text_embeddings_val8, all_chem_embbedings8)
text_chem_cos_test8 = _cosine_chunks(text_embeddings_test8, all_chem_embbedings8)
# CHANGE END

In [None]:
# Split sizes. Every embedding space was reordered to follow space 1, so space 1's
# CID arrays are used consistently as the reference for all three sizes.
n_train = len(cids_train1)
n_val = len(cids_val1)
n_test = len(cids_test1)
n = n_train + n_val + n_test

# Position of the first val / test molecule among the stacked
# (train, val, test) molecule embeddings.
offset_val = n_train
offset_test = n_train + n_val

# CIDs in the same stacked order, so cids_all[p] names the molecule at position p.
cids_all = np.concatenate((cids_train1, cids_val1, cids_test1), axis = 0)

In [None]:
# Accumulator for the per-space molecule ranks of the training descriptions:
# one row per training description, one column per molecule (train + val + test).
# Each training evaluation cell adds its ranks to this array.
# NOTE(review): np.zeros defaults to float64, so this allocates n_train * n * 8 bytes -
# confirm that this fits in the runtime's memory.
tr_ranks_avg = np.zeros((n_train, n))

In [None]:
# Accumulators for the per-space molecule ranks of the validation and test
# descriptions: one row per description, one column per molecule (train + val + test).
# Each val / test evaluation cell adds its ranks to the matching array.
val_avg_ranks = np.zeros((n_val, n))
test_avg_ranks = np.zeros((n_test, n))

In [None]:
#For space 1:
# Rank all molecules for every training description in embedding space 1, add the
# ranks to the shared accumulator tr_ranks_avg, and report retrieval metrics.
# text_chem_cos1 is a single-use generator: re-create it before re-running this cell.

# 1-based rank of the correct molecule for each training description in space 1.
tr_correct1 = np.zeros(len(cids_train1))

# NOTE(review): these counters are never updated in this cell; they are kept only in
# case a later cell reads them - confirm and remove if unused.
hits_at_one = 0
hits_at_ten = 0
hits_at_100 = 0
hits_at_500 = 0
hits_at_1000 = 0
ranks1 = []
j = 0 #keep track of all loops
for emb in text_chem_cos1:
    for k in range(emb.shape[0]):
        cid_locs = np.argsort(emb[k,:])[::-1]
        ranks = np.argsort(cid_locs) #rank is actually double argsort.

        tr_ranks_avg[j,:] = tr_ranks_avg[j,:] + ranks

        # Training molecules come first in the stacked molecule matrix, so the
        # correct molecule for description j sits at column j.
        rank = ranks[j] + 1
        tr_correct1[j] = rank
        ranks1.append(rank)


        j += 1
        if (j) % 1000 == 0: print((j), "train processed.")

ranks1 = np.array(ranks1)

print()
print("Training Mean rank:", np.mean(ranks1))
print("Hits at 1:", np.mean(ranks1 <= 1))
print("Hits at 10:", np.mean(ranks1 <= 10))
print("Hits at 100:", np.mean(ranks1 <= 100))
print("Hits at 500:", np.mean(ranks1 <= 500))
print("Hits at 1000:", np.mean(ranks1 <= 1000))

print("Training MRR:", np.mean(1/ranks1))

1000 train processed.
2000 train processed.
3000 train processed.
4000 train processed.
5000 train processed.
6000 train processed.
7000 train processed.
8000 train processed.
9000 train processed.
10000 train processed.
11000 train processed.
12000 train processed.
13000 train processed.
14000 train processed.
15000 train processed.
16000 train processed.
17000 train processed.
18000 train processed.
19000 train processed.
20000 train processed.
21000 train processed.
22000 train processed.
23000 train processed.
24000 train processed.
25000 train processed.
26000 train processed.

Training Mean rank: 3.7838533777643137
Hits at 1: 0.42562859739472886
Hits at 10: 0.9326340502877916
Hits at 100: 0.9996591941835807
Hits at 500: 1.0
Hits at 1000: 1.0
Trainng MRR: 0.5978082812849088


In [None]:

# Validation split, embedding space 1: rank all molecules for every description,
# add the ranks to val_avg_ranks, and report retrieval metrics.
ranks_val1 = []
for sim_chunk in text_chem_cos_val1:
    for sim_row in sim_chunk:
        query = len(ranks_val1)  # row index of this description within the split
        # Double argsort: position of every molecule in the descending-similarity order.
        descending = np.argsort(sim_row)[::-1]
        mol_ranks = np.argsort(descending)

        val_avg_ranks[query, :] += mol_ranks

        # The matching molecule sits offset_val columns in (after the train molecules).
        ranks_val1.append(mol_ranks[query + offset_val] + 1)

        processed = len(ranks_val1)
        if processed % 1000 == 0:
            print(processed, "val processed.")


ranks_val1 = np.array(ranks_val1)

print()
print("Val Mean rank:", np.mean(ranks_val1))
for cutoff in (1, 10, 100, 500, 1000):
    print(f"Hits at {cutoff}:", np.mean(ranks_val1 <= cutoff))

print("Validation MRR:", np.mean(1/ranks_val1))

1000 val processed.
2000 val processed.
3000 val processed.

Val Mean rank: 29.38018782187216
Hits at 1: 0.3232353832172069
Hits at 10: 0.8218721599515298
Hits at 100: 0.9733414116934263
Hits at 500: 0.9933353529233565
Hits at 1000: 0.9966676764616783
Validation MRR: 0.4868585047434113


In [None]:


# Test split, embedding space 1: rank all molecules for every description,
# add the ranks to test_avg_ranks, and report retrieval metrics.
ranks_test1 = []
for sim_chunk in text_chem_cos_test1:
    for sim_row in sim_chunk:
        query = len(ranks_test1)  # row index of this description within the split
        # Double argsort: position of every molecule in the descending-similarity order.
        descending = np.argsort(sim_row)[::-1]
        mol_ranks = np.argsort(descending)

        test_avg_ranks[query, :] += mol_ranks

        # The matching molecule sits offset_test columns in (after train and val molecules).
        ranks_test1.append(mol_ranks[query + offset_test] + 1)

        processed = len(ranks_test1)
        if processed % 1000 == 0:
            print(processed, "test processed.")

ranks_test1 = np.array(ranks_test1)

print()
print("Test Mean rank:", np.mean(ranks_test1))
for cutoff in (1, 10, 100, 500, 1000):
    print(f"Hits at {cutoff}:", np.mean(ranks_test1 <= cutoff))

print("Test MRR:", np.mean(1/ranks_test1))

1000 test processed.
2000 test processed.
3000 test processed.

Test Mean rank: 26.039684943956377
Hits at 1: 0.3374734928809452
Hits at 10: 0.8288397455316571
Hits at 100: 0.97364435019691
Hits at 500: 0.9936382914268403
Hits at 1000: 0.9963647379581945
Test MRR: 0.4998553337137297


In [None]:
#For space 2:
# Rank all molecules for every training description in embedding space 2, add the
# ranks to tr_ranks_avg, and report retrieval metrics.

ranks2 = []
for sim_chunk in text_chem_cos2:
    for sim_row in sim_chunk:
        query = len(ranks2)  # row index of this description within the split
        # Double argsort: position of every molecule in the descending-similarity order.
        descending = np.argsort(sim_row)[::-1]
        mol_ranks = np.argsort(descending)

        tr_ranks_avg[query, :] += mol_ranks

        # Training molecules come first, so the matching molecule is at column `query`.
        ranks2.append(mol_ranks[query] + 1)

        processed = len(ranks2)
        if processed % 1000 == 0:
            print(processed, "train processed.")

ranks2 = np.array(ranks2)

print()
print("Training Mean rank:", np.mean(ranks2))
for cutoff in (1, 10, 100, 500, 1000):
    print(f"Hits at {cutoff}:", np.mean(ranks2 <= cutoff))

print("Training MRR:", np.mean(1/ranks2))

1000 train processed.
2000 train processed.
3000 train processed.
4000 train processed.
5000 train processed.
6000 train processed.
7000 train processed.
8000 train processed.
9000 train processed.
10000 train processed.
11000 train processed.
12000 train processed.
13000 train processed.
14000 train processed.
15000 train processed.
16000 train processed.
17000 train processed.
18000 train processed.
19000 train processed.
20000 train processed.
21000 train processed.
22000 train processed.
23000 train processed.
24000 train processed.
25000 train processed.
26000 train processed.

Training Mean rank: 3.715427143289912
Hits at 1: 0.43615571039079065
Hits at 10: 0.9336186004241139
Hits at 100: 0.9994319903059679
Hits at 500: 1.0
Hits at 1000: 1.0
Training MRR: 0.6069887371158339


In [None]:

# Validation split, embedding space 2: rank all molecules for every description,
# add the ranks to val_avg_ranks, and report retrieval metrics.
ranks_val2 = []
for sim_chunk in text_chem_cos_val2:
    for sim_row in sim_chunk:
        query = len(ranks_val2)  # row index of this description within the split
        # Double argsort: position of every molecule in the descending-similarity order.
        descending = np.argsort(sim_row)[::-1]
        mol_ranks = np.argsort(descending)

        val_avg_ranks[query, :] += mol_ranks

        # The matching molecule sits offset_val columns in (after the train molecules).
        ranks_val2.append(mol_ranks[query + offset_val] + 1)

        processed = len(ranks_val2)
        if processed % 1000 == 0:
            print(processed, "val processed.")


ranks_val2 = np.array(ranks_val2)

print()
print("Val Mean rank:", np.mean(ranks_val2))
for cutoff in (1, 10, 100, 500, 1000):
    print(f"Hits at {cutoff}:", np.mean(ranks_val2 <= cutoff))

print("Validation MRR:", np.mean(1/ranks_val2))

1000 val processed.
2000 val processed.
3000 val processed.

Val Mean rank: 29.148742805210542
Hits at 1: 0.334747046349591
Hits at 10: 0.8209633444410784
Hits at 100: 0.9724325961829748
Hits at 500: 0.9939412299303242
Hits at 1000: 0.996970614965162
Validation MRR: 0.4991055092653233


In [None]:

# Test split, embedding space 2: rank all molecules for every description,
# add the ranks to test_avg_ranks, and report retrieval metrics.
ranks_test2 = []
for sim_chunk in text_chem_cos_test2:
    for sim_row in sim_chunk:
        query = len(ranks_test2)  # row index of this description within the split
        # Double argsort: position of every molecule in the descending-similarity order.
        descending = np.argsort(sim_row)[::-1]
        mol_ranks = np.argsort(descending)

        test_avg_ranks[query, :] += mol_ranks

        # The matching molecule sits offset_test columns in (after train and val molecules).
        ranks_test2.append(mol_ranks[query + offset_test] + 1)

        processed = len(ranks_test2)
        if processed % 1000 == 0:
            print(processed, "test processed.")

ranks_test2 = np.array(ranks_test2)

print()
print("Test Mean rank:", np.mean(ranks_test2))
for cutoff in (1, 10, 100, 500, 1000):
    print(f"Hits at {cutoff}:", np.mean(ranks_test2 <= cutoff))

print("Test MRR:", np.mean(1/ranks_test2))

1000 test processed.
2000 test processed.
3000 test processed.

Test Mean rank: 25.815813389881853
Hits at 1: 0.34413813995758863
Hits at 10: 0.8270221145107544
Hits at 100: 0.97364435019691
Hits at 500: 0.9924265374129052
Hits at 1000: 0.9957588609512269
Test MRR: 0.504086855686544


In [None]:
#For space 3:
# Rank all molecules for every training description in embedding space 3, add the
# ranks to tr_ranks_avg, and report retrieval metrics.

ranks3 = []
for sim_chunk in text_chem_cos3:
    for sim_row in sim_chunk:
        query = len(ranks3)  # row index of this description within the split
        # Double argsort: position of every molecule in the descending-similarity order.
        descending = np.argsort(sim_row)[::-1]
        mol_ranks = np.argsort(descending)

        tr_ranks_avg[query, :] += mol_ranks

        # Training molecules come first, so the matching molecule is at column `query`.
        ranks3.append(mol_ranks[query] + 1)

        processed = len(ranks3)
        if processed % 1000 == 0:
            print(processed, "train processed.")

ranks3 = np.array(ranks3)

print()
print("Training Mean rank:", np.mean(ranks3))
for cutoff in (1, 10, 100, 500, 1000):
    print(f"Hits at {cutoff}:", np.mean(ranks3 <= cutoff))

print("Training MRR:", np.mean(1/ranks3))

1000 train processed.
2000 train processed.
3000 train processed.
4000 train processed.
5000 train processed.
6000 train processed.
7000 train processed.
8000 train processed.
9000 train processed.
10000 train processed.
11000 train processed.
12000 train processed.
13000 train processed.
14000 train processed.
15000 train processed.
16000 train processed.
17000 train processed.
18000 train processed.
19000 train processed.
20000 train processed.
21000 train processed.
22000 train processed.
23000 train processed.
24000 train processed.
25000 train processed.
26000 train processed.

Training Mean rank: 3.7252726446531352
Hits at 1: 0.4282035746743411
Hits at 10: 0.9344138139957588
Hits at 100: 0.9995455922447744
Hits at 500: 1.0
Hits at 1000: 1.0
Training MRR: 0.6001099259851892


In [None]:

# Validation split, embedding space 3: rank all molecules for every description,
# add the ranks to val_avg_ranks, and report retrieval metrics.
ranks_val3 = []
for sim_chunk in text_chem_cos_val3:
    for sim_row in sim_chunk:
        query = len(ranks_val3)  # row index of this description within the split
        # Double argsort: position of every molecule in the descending-similarity order.
        descending = np.argsort(sim_row)[::-1]
        mol_ranks = np.argsort(descending)

        val_avg_ranks[query, :] += mol_ranks

        # The matching molecule sits offset_val columns in (after the train molecules).
        ranks_val3.append(mol_ranks[query + offset_val] + 1)

        processed = len(ranks_val3)
        if processed % 1000 == 0:
            print(processed, "val processed.")


ranks_val3 = np.array(ranks_val3)

print()
print("Val Mean rank:", np.mean(ranks_val3))
for cutoff in (1, 10, 100, 500, 1000):
    print(f"Hits at {cutoff}:", np.mean(ranks_val3 <= cutoff))

print("Validation MRR:", np.mean(1/ranks_val3))

1000 val processed.
2000 val processed.
3000 val processed.

Val Mean rank: 27.35049984853075
Hits at 1: 0.32414419872765826
Hits at 10: 0.8146016358679188
Hits at 100: 0.9700090881551046
Hits at 500: 0.9915177219024538
Hits at 1000: 0.9960617994547107
Validation MRR: 0.48746504602420726


In [None]:
# Spot check of one validation description (row 1396): list all CIDs ordered by the
# rank totals accumulated in val_avg_ranks so far (lowest total first), then print the
# 1-based position of CID '45359507' in that ordering.
# NOTE(review): presumably '45359507' is the correct molecule for row 1396 - confirm.
ensemble_order = np.argsort(val_avg_ranks[1396,:])
tmp = cids_all[ensemble_order]
print(tmp)
matches = np.where(tmp == '45359507')[0]
print(matches[0] + 1)

['70698233' '50908215' '70678760' ... '364' '97328' '3301']
5796


In [None]:

# Test split, embedding space 3: rank all molecules for every description,
# add the ranks to test_avg_ranks, and report retrieval metrics.
ranks_test3 = []
for sim_chunk in text_chem_cos_test3:
    for sim_row in sim_chunk:
        query = len(ranks_test3)  # row index of this description within the split
        # Double argsort: position of every molecule in the descending-similarity order.
        descending = np.argsort(sim_row)[::-1]
        mol_ranks = np.argsort(descending)

        test_avg_ranks[query, :] += mol_ranks

        # The matching molecule sits offset_test columns in (after train and val molecules).
        ranks_test3.append(mol_ranks[query + offset_test] + 1)

        processed = len(ranks_test3)
        if processed % 1000 == 0:
            print(processed, "test processed.")

ranks_test3 = np.array(ranks_test3)

print()
print("Test Mean rank:", np.mean(ranks_test3))
for cutoff in (1, 10, 100, 500, 1000):
    print(f"Hits at {cutoff}:", np.mean(ranks_test3 <= cutoff))

print("Test MRR:", np.mean(1/ranks_test3))

1000 test processed.
2000 test processed.
3000 test processed.

Test Mean rank: 25.154195698273252
Hits at 1: 0.33838230839139655
Hits at 10: 0.830354438049076
Hits at 100: 0.9730384731899424
Hits at 500: 0.9933353529233565
Hits at 1000: 0.996970614965162
Test MRR: 0.5008305760925637


In [None]:
#For space 4:
# Rank all molecules for every training description in embedding space 4, add the
# ranks to tr_ranks_avg, and report retrieval metrics.

ranks4 = []
for sim_chunk in text_chem_cos4:
    for sim_row in sim_chunk:
        query = len(ranks4)  # row index of this description within the split
        # Double argsort: position of every molecule in the descending-similarity order.
        descending = np.argsort(sim_row)[::-1]
        mol_ranks = np.argsort(descending)

        tr_ranks_avg[query, :] += mol_ranks

        # Training molecules come first, so the matching molecule is at column `query`.
        ranks4.append(mol_ranks[query] + 1)

        processed = len(ranks4)
        if processed % 1000 == 0:
            print(processed, "train processed.")

ranks4 = np.array(ranks4)

print()
print("Training Mean rank:", np.mean(ranks4))
for cutoff in (1, 10, 100, 500, 1000):
    print(f"Hits at {cutoff}:", np.mean(ranks4 <= cutoff))

print("Training MRR:", np.mean(1/ranks4))

1000 train processed.
2000 train processed.
3000 train processed.
4000 train processed.
5000 train processed.
6000 train processed.
7000 train processed.
8000 train processed.
9000 train processed.
10000 train processed.
11000 train processed.
12000 train processed.
13000 train processed.
14000 train processed.
15000 train processed.
16000 train processed.
17000 train processed.
18000 train processed.
19000 train processed.
20000 train processed.
21000 train processed.
22000 train processed.
23000 train processed.
24000 train processed.
25000 train processed.
26000 train processed.

Training Mean rank: 3.5492275068161163
Hits at 1: 0.44160860345349895
Hits at 10: 0.9382005452893063
Hits at 100: 1.0
Hits at 500: 1.0
Hits at 1000: 1.0
Training MRR: 0.6111997206165773


In [None]:

# Validation split, embedding space 4: rank all molecules for every description,
# add the ranks to val_avg_ranks, and report retrieval metrics.
ranks_val4 = []
for sim_chunk in text_chem_cos_val4:
    for sim_row in sim_chunk:
        query = len(ranks_val4)  # row index of this description within the split
        # Double argsort: position of every molecule in the descending-similarity order.
        descending = np.argsort(sim_row)[::-1]
        mol_ranks = np.argsort(descending)

        val_avg_ranks[query, :] += mol_ranks

        # The matching molecule sits offset_val columns in (after the train molecules).
        ranks_val4.append(mol_ranks[query + offset_val] + 1)

        processed = len(ranks_val4)
        if processed % 1000 == 0:
            print(processed, "val processed.")


ranks_val4 = np.array(ranks_val4)

print()
print("Val Mean rank:", np.mean(ranks_val4))
for cutoff in (1, 10, 100, 500, 1000):
    print(f"Hits at {cutoff}:", np.mean(ranks_val4 <= cutoff))

print("Validation MRR:", np.mean(1/ranks_val4))

1000 val processed.
2000 val processed.
3000 val processed.

Val Mean rank: 30.87943047561345
Hits at 1: 0.3314147228112693
Hits at 10: 0.8370190851257194
Hits at 100: 0.9730384731899424
Hits at 500: 0.99121478339897
Hits at 1000: 0.9963647379581945
Validation MRR: 0.49951891835195567


In [None]:

# Test split, embedding space 4: rank all molecules for every description,
# add the ranks to test_avg_ranks, and report retrieval metrics.
ranks_test4 = []
for sim_chunk in text_chem_cos_test4:
    for sim_row in sim_chunk:
        query = len(ranks_test4)  # row index of this description within the split
        # Double argsort: position of every molecule in the descending-similarity order.
        descending = np.argsort(sim_row)[::-1]
        mol_ranks = np.argsort(descending)

        test_avg_ranks[query, :] += mol_ranks

        # The matching molecule sits offset_test columns in (after train and val molecules).
        ranks_test4.append(mol_ranks[query + offset_test] + 1)

        processed = len(ranks_test4)
        if processed % 1000 == 0:
            print(processed, "test processed.")

ranks_test4 = np.array(ranks_test4)

print()
print("Test Mean rank:", np.mean(ranks_test4))
for cutoff in (1, 10, 100, 500, 1000):
    print(f"Hits at {cutoff}:", np.mean(ranks_test4 <= cutoff))

print("Test MRR:", np.mean(1/ranks_test4))

1000 test processed.
2000 test processed.
3000 test processed.

Test Mean rank: 27.783398970009088
Hits at 1: 0.33383823083913966
Hits at 10: 0.8409572856710088
Hits at 100: 0.9760678582247804
Hits at 500: 0.9924265374129052
Hits at 1000: 0.9954559224477431
Test MRR: 0.5027490140721781


In [None]:
# Space 5: retrieval metrics on the training split.
# NOTE(review): text_chem_cos5 is presumably a sequence of 2-D similarity
# blocks (one row per query, one column per candidate) — confirm where it is built.
ranks5 = []
j = 0  # running query index across all blocks
for emb in text_chem_cos5:
    for k in range(emb.shape[0]):
        # Column indices ordered from highest to lowest score for this query.
        order = np.argsort(emb[k, :])[::-1]
        # Inverting that ordering yields the 0-based rank of every column.
        ranks = np.argsort(order)

        # Add this space's ranks to the shared per-query totals.
        tr_ranks_avg[j, :] = tr_ranks_avg[j, :] + ranks

        # 1-based rank of column j (presumably the true match for query j).
        ranks5.append(ranks[j] + 1)

        j += 1
        if j % 1000 == 0:
            print(j, "train processed.")

ranks5 = np.array(ranks5)

print()
print("Training Mean rank:", np.mean(ranks5))
for label, cutoff in (
    ("Hits at 1:", 1),
    ("Hits at 10:", 10),
    ("Hits at 100:", 100),
    ("Hits at 500:", 500),
    ("Hits at 1000:", 1000),
):
    # Fraction of queries whose recorded rank is within the cutoff.
    print(label, np.mean(ranks5 <= cutoff))

# Mean reciprocal rank over all training queries.
print("Training MRR:", np.mean(1/ranks5))

1000 train processed.
2000 train processed.
3000 train processed.
4000 train processed.
5000 train processed.
6000 train processed.
7000 train processed.
8000 train processed.
9000 train processed.
10000 train processed.
11000 train processed.
12000 train processed.
13000 train processed.
14000 train processed.
15000 train processed.
16000 train processed.
17000 train processed.
18000 train processed.
19000 train processed.
20000 train processed.
21000 train processed.
22000 train processed.
23000 train processed.
24000 train processed.
25000 train processed.
26000 train processed.

Training Mean rank: 3.6999394122993032
Hits at 1: 0.4267267494698576
Hits at 10: 0.9336564677370494
Hits at 100: 0.9999621326870646
Hits at 500: 1.0
Hits at 1000: 1.0
Training MRR: 0.5997151843584806


In [None]:

# Retrieval metrics on the validation split for space 5 (per the variable suffix).
# NOTE(review): text_chem_cos_val5 is presumably a sequence of 2-D similarity
# blocks (one row per query, one column per candidate) — confirm where it is built.
ranks_val5 = []
j = 0  # running query index across all blocks
for emb in text_chem_cos_val5:
    for k in range(emb.shape[0]):
        # Column indices ordered from highest to lowest score for this query.
        order = np.argsort(emb[k, :])[::-1]
        # Inverting that ordering yields the 0-based rank of every column.
        ranks = np.argsort(order)

        # Add this space's ranks to the shared per-query totals.
        val_avg_ranks[j, :] = val_avg_ranks[j, :] + ranks

        # 1-based rank of column j + offset_val (presumably the true match for query j).
        ranks_val5.append(ranks[j + offset_val] + 1)

        j += 1
        if j % 1000 == 0:
            print(j, "val processed.")

ranks_val5 = np.array(ranks_val5)

print()
print("Val Mean rank:", np.mean(ranks_val5))
for label, cutoff in (
    ("Hits at 1:", 1),
    ("Hits at 10:", 10),
    ("Hits at 100:", 100),
    ("Hits at 500:", 500),
    ("Hits at 1000:", 1000),
):
    # Fraction of queries whose recorded rank is within the cutoff.
    print(label, np.mean(ranks_val5 <= cutoff))

# Mean reciprocal rank over all validation queries.
print("Validation MRR:", np.mean(1/ranks_val5))

1000 val processed.
2000 val processed.
3000 val processed.

Val Mean rank: 29.025143895789157
Hits at 1: 0.32838533777643136
Hits at 10: 0.8267191760072705
Hits at 100: 0.9697061496516207
Hits at 500: 0.9906089063920024
Hits at 1000: 0.9954559224477431
Validation MRR: 0.4913466068478193


In [None]:

# Retrieval metrics on the test split for space 5 (per the variable suffix).
# NOTE(review): text_chem_cos_test5 is presumably a sequence of 2-D similarity
# blocks (one row per query, one column per candidate) — confirm where it is built.
ranks_test5 = []
j = 0  # running query index across all blocks
for emb in text_chem_cos_test5:
    for k in range(emb.shape[0]):
        # Column indices ordered from highest to lowest score for this query.
        order = np.argsort(emb[k, :])[::-1]
        # Inverting that ordering yields the 0-based rank of every column.
        ranks = np.argsort(order)

        # Add this space's ranks to the shared per-query totals.
        test_avg_ranks[j, :] = test_avg_ranks[j, :] + ranks

        # 1-based rank of column j + offset_test (presumably the true match for query j).
        ranks_test5.append(ranks[j + offset_test] + 1)

        j += 1
        if j % 1000 == 0:
            print(j, "test processed.")

ranks_test5 = np.array(ranks_test5)

print()
print("Test Mean rank:", np.mean(ranks_test5))
for label, cutoff in (
    ("Hits at 1:", 1),
    ("Hits at 10:", 10),
    ("Hits at 100:", 100),
    ("Hits at 500:", 500),
    ("Hits at 1000:", 1000),
):
    # Fraction of queries whose recorded rank is within the cutoff.
    print(label, np.mean(ranks_test5 <= cutoff))

# Mean reciprocal rank over all test queries.
print("Test MRR:", np.mean(1/ranks_test5))

1000 test processed.
2000 test processed.
3000 test processed.

Test Mean rank: 25.854589518327778
Hits at 1: 0.34232050893668586
Hits at 10: 0.8282338685246895
Hits at 100: 0.9748561042108452
Hits at 500: 0.9924265374129052
Hits at 1000: 0.9945471069372918
Test MRR: 0.5034708663289943


In [None]:
# Space 6: retrieval metrics on the training split.
# NOTE(review): text_chem_cos6 is presumably a sequence of 2-D similarity
# blocks (one row per query, one column per candidate) — confirm where it is built.
ranks6 = []
j = 0  # running query index across all blocks
for emb in text_chem_cos6:
    for k in range(emb.shape[0]):
        # Column indices ordered from highest to lowest score for this query.
        order = np.argsort(emb[k, :])[::-1]
        # Inverting that ordering yields the 0-based rank of every column.
        ranks = np.argsort(order)

        # Add this space's ranks to the shared per-query totals.
        tr_ranks_avg[j, :] = tr_ranks_avg[j, :] + ranks

        # 1-based rank of column j (presumably the true match for query j).
        ranks6.append(ranks[j] + 1)

        j += 1
        if j % 1000 == 0:
            print(j, "train processed.")

ranks6 = np.array(ranks6)

print()
print("Training Mean rank:", np.mean(ranks6))
for label, cutoff in (
    ("Hits at 1:", 1),
    ("Hits at 10:", 10),
    ("Hits at 100:", 100),
    ("Hits at 500:", 500),
    ("Hits at 1000:", 1000),
):
    # Fraction of queries whose recorded rank is within the cutoff.
    print(label, np.mean(ranks6 <= cutoff))

# Mean reciprocal rank over all training queries.
print("Training MRR:", np.mean(1/ranks6))

1000 train processed.
2000 train processed.
3000 train processed.
4000 train processed.
5000 train processed.
6000 train processed.
7000 train processed.
8000 train processed.
9000 train processed.
10000 train processed.
11000 train processed.
12000 train processed.
13000 train processed.
14000 train processed.
15000 train processed.
16000 train processed.
17000 train processed.
18000 train processed.
19000 train processed.
20000 train processed.
21000 train processed.
22000 train processed.
23000 train processed.
24000 train processed.
25000 train processed.
26000 train processed.

Training Mean rank: 3.6095501363223264
Hits at 1: 0.4347546198121781
Hits at 10: 0.9359663738261133
Hits at 100: 0.999924265374129
Hits at 500: 1.0
Hits at 1000: 1.0
Training MRR: 0.606411704411532


In [None]:

# Retrieval metrics on the validation split for space 6 (per the variable suffix).
# NOTE(review): text_chem_cos_val6 is presumably a sequence of 2-D similarity
# blocks (one row per query, one column per candidate) — confirm where it is built.
ranks_val6 = []
j = 0  # running query index across all blocks
for emb in text_chem_cos_val6:
    for k in range(emb.shape[0]):
        # Column indices ordered from highest to lowest score for this query.
        order = np.argsort(emb[k, :])[::-1]
        # Inverting that ordering yields the 0-based rank of every column.
        ranks = np.argsort(order)

        # Add this space's ranks to the shared per-query totals.
        val_avg_ranks[j, :] = val_avg_ranks[j, :] + ranks

        # 1-based rank of column j + offset_val (presumably the true match for query j).
        ranks_val6.append(ranks[j + offset_val] + 1)

        j += 1
        if j % 1000 == 0:
            print(j, "val processed.")

ranks_val6 = np.array(ranks_val6)

print()
print("Val Mean rank:", np.mean(ranks_val6))
for label, cutoff in (
    ("Hits at 1:", 1),
    ("Hits at 10:", 10),
    ("Hits at 100:", 100),
    ("Hits at 500:", 500),
    ("Hits at 1000:", 1000),
):
    # Fraction of queries whose recorded rank is within the cutoff.
    print(label, np.mean(ranks_val6 <= cutoff))

# Mean reciprocal rank over all validation queries.
print("Validation MRR:", np.mean(1/ranks_val6))

1000 val processed.
2000 val processed.
3000 val processed.

Val Mean rank: 28.72432596182975
Hits at 1: 0.3398970009088155
Hits at 10: 0.8279309300212057
Hits at 100: 0.9697061496516207
Hits at 500: 0.9921235989094214
Hits at 1000: 0.9960617994547107
Validation MRR: 0.503280663610302


In [None]:

# Retrieval metrics on the test split for space 6 (per the variable suffix).
# NOTE(review): text_chem_cos_test6 is presumably a sequence of 2-D similarity
# blocks (one row per query, one column per candidate) — confirm where it is built.
ranks_test6 = []
j = 0  # running query index across all blocks
for emb in text_chem_cos_test6:
    for k in range(emb.shape[0]):
        # Column indices ordered from highest to lowest score for this query.
        order = np.argsort(emb[k, :])[::-1]
        # Inverting that ordering yields the 0-based rank of every column.
        ranks = np.argsort(order)

        # Add this space's ranks to the shared per-query totals.
        test_avg_ranks[j, :] = test_avg_ranks[j, :] + ranks

        # 1-based rank of column j + offset_test (presumably the true match for query j).
        ranks_test6.append(ranks[j + offset_test] + 1)

        j += 1
        if j % 1000 == 0:
            print(j, "test processed.")

ranks_test6 = np.array(ranks_test6)

print()
print("Test Mean rank:", np.mean(ranks_test6))
for label, cutoff in (
    ("Hits at 1:", 1),
    ("Hits at 10:", 10),
    ("Hits at 100:", 100),
    ("Hits at 500:", 500),
    ("Hits at 1000:", 1000),
):
    # Fraction of queries whose recorded rank is within the cutoff.
    print(label, np.mean(ranks_test6 <= cutoff))

# Mean reciprocal rank over all test queries.
print("Test MRR:", np.mean(1/ranks_test6))

1000 test processed.
2000 test processed.
3000 test processed.

Test Mean rank: 21.968494395637684
Hits at 1: 0.35383217206907
Hits at 10: 0.8448954862162981
Hits at 100: 0.9742502272038777
Hits at 500: 0.9945471069372918
Hits at 1000: 0.9972735534686459
Test MRR: 0.514169838664429


In [None]:
# CHANGE START
# Space 7 (using training data): retrieval metrics.
# NOTE(review): text_chem_cos7 is presumably a sequence of 2-D similarity
# blocks (one row per query, one column per candidate) — confirm where it is built.
ranks7 = []
j = 0  # running query index across all blocks
for emb in text_chem_cos7:
    for k in range(emb.shape[0]):
        # Column indices ordered from highest to lowest score for this query.
        order = np.argsort(emb[k, :])[::-1]
        # Inverting that ordering yields the 0-based rank of every column.
        ranks = np.argsort(order)

        # Add this space's ranks to the shared per-query totals.
        tr_ranks_avg[j, :] = tr_ranks_avg[j, :] + ranks

        # 1-based rank of column j (presumably the true match for query j).
        ranks7.append(ranks[j] + 1)

        j += 1
        if j % 1000 == 0:
            print(j, "train processed.")

ranks7 = np.array(ranks7)

print()
print("Training Mean rank:", np.mean(ranks7))
for label, cutoff in (
    ("Hits at 1:", 1),
    ("Hits at 10:", 10),
    ("Hits at 100:", 100),
    ("Hits at 500:", 500),
    ("Hits at 1000:", 1000),
):
    # Fraction of queries whose recorded rank is within the cutoff.
    print(label, np.mean(ranks7 <= cutoff))

# Mean reciprocal rank over all training queries.
print("Training MRR:", np.mean(1/ranks7))

# CHANGE END

1000 train processed.
2000 train processed.
3000 train processed.
4000 train processed.
5000 train processed.
6000 train processed.
7000 train processed.
8000 train processed.
9000 train processed.
10000 train processed.
11000 train processed.
12000 train processed.
13000 train processed.
14000 train processed.
15000 train processed.
16000 train processed.
17000 train processed.
18000 train processed.
19000 train processed.
20000 train processed.
21000 train processed.
22000 train processed.
23000 train processed.
24000 train processed.
25000 train processed.
26000 train processed.

Training Mean rank: 3.7364435019691005
Hits at 1: 0.4250227203877613
Hits at 10: 0.9311950923962435
Hits at 100: 0.9998863980611936
Hits at 500: 1.0
Hits at 1000: 1.0
Training MRR: 0.5971187137536587


In [None]:
# CHANGE START
# Space 7 (using validation data): retrieval metrics.
# NOTE(review): text_chem_cos_val7 is presumably a sequence of 2-D similarity
# blocks (one row per query, one column per candidate) — confirm where it is built.
ranks_val7 = []
j = 0  # running query index across all blocks
for emb in text_chem_cos_val7:
    for k in range(emb.shape[0]):
        # Column indices ordered from highest to lowest score for this query.
        order = np.argsort(emb[k, :])[::-1]
        # Inverting that ordering yields the 0-based rank of every column.
        ranks = np.argsort(order)

        # Add this space's ranks to the shared per-query totals.
        val_avg_ranks[j, :] = val_avg_ranks[j, :] + ranks

        # 1-based rank of column j + offset_val (presumably the true match for query j).
        ranks_val7.append(ranks[j + offset_val] + 1)

        j += 1
        if j % 1000 == 0:
            print(j, "val processed.")

ranks_val7 = np.array(ranks_val7)

print()
print("Val Mean rank:", np.mean(ranks_val7))
for label, cutoff in (
    ("Hits at 1:", 1),
    ("Hits at 10:", 10),
    ("Hits at 100:", 100),
    ("Hits at 500:", 500),
    ("Hits at 1000:", 1000),
):
    # Fraction of queries whose recorded rank is within the cutoff.
    print(label, np.mean(ranks_val7 <= cutoff))

# Mean reciprocal rank over all validation queries.
print("Validation MRR:", np.mean(1/ranks_val7))
# CHANGE END

1000 val processed.
2000 val processed.
3000 val processed.

Val Mean rank: 32.90487730990609
Hits at 1: 0.3262647682520448
Hits at 10: 0.810057558315662
Hits at 100: 0.9684943956376856
Hits at 500: 0.99121478339897
Hits at 1000: 0.9948500454407755
Validation MRR: 0.4859228792865391


In [None]:
# CHANGE START
# Space 7 (using test data): retrieval metrics.
# NOTE(review): text_chem_cos_test7 is presumably a sequence of 2-D similarity
# blocks (one row per query, one column per candidate) — confirm where it is built.
ranks_test7 = []
j = 0  # running query index across all blocks
for emb in text_chem_cos_test7:
    for k in range(emb.shape[0]):
        # Column indices ordered from highest to lowest score for this query.
        order = np.argsort(emb[k, :])[::-1]
        # Inverting that ordering yields the 0-based rank of every column.
        ranks = np.argsort(order)

        # Add this space's ranks to the shared per-query totals.
        test_avg_ranks[j, :] = test_avg_ranks[j, :] + ranks

        # 1-based rank of column j + offset_test (presumably the true match for query j).
        ranks_test7.append(ranks[j + offset_test] + 1)

        j += 1
        if j % 1000 == 0:
            print(j, "test processed.")

ranks_test7 = np.array(ranks_test7)

print()
print("Test Mean rank:", np.mean(ranks_test7))
for label, cutoff in (
    ("Hits at 1:", 1),
    ("Hits at 10:", 10),
    ("Hits at 100:", 100),
    ("Hits at 500:", 500),
    ("Hits at 1000:", 1000),
):
    # Fraction of queries whose recorded rank is within the cutoff.
    print(label, np.mean(ranks_test7 <= cutoff))

# Mean reciprocal rank over all test queries.
print("Test MRR:", np.mean(1/ranks_test7))
# CHANGE END

1000 test processed.
2000 test processed.
3000 test processed.

Test Mean rank: 26.340805816419266
Hits at 1: 0.337776431384429
Hits at 10: 0.8185398364132082
Hits at 100: 0.9745531657073614
Hits at 500: 0.9930324144198728
Hits at 1000: 0.9963647379581945
Test MRR: 0.4964937560063823


In [None]:
# CHANGE START
# Space 8 (using training data): retrieval metrics.
# NOTE(review): text_chem_cos8 is presumably a sequence of 2-D similarity
# blocks (one row per query, one column per candidate) — confirm where it is built.
ranks8 = []
j = 0  # running query index across all blocks
for emb in text_chem_cos8:
    for k in range(emb.shape[0]):
        # Column indices ordered from highest to lowest score for this query.
        order = np.argsort(emb[k, :])[::-1]
        # Inverting that ordering yields the 0-based rank of every column.
        ranks = np.argsort(order)

        # Add this space's ranks to the shared per-query totals.
        tr_ranks_avg[j, :] = tr_ranks_avg[j, :] + ranks

        # 1-based rank of column j (presumably the true match for query j).
        ranks8.append(ranks[j] + 1)

        j += 1
        if j % 1000 == 0:
            print(j, "train processed.")

ranks8 = np.array(ranks8)

print()
print("Training Mean rank:", np.mean(ranks8))
for label, cutoff in (
    ("Hits at 1:", 1),
    ("Hits at 10:", 10),
    ("Hits at 100:", 100),
    ("Hits at 500:", 500),
    ("Hits at 1000:", 1000),
):
    # Fraction of queries whose recorded rank is within the cutoff.
    print(label, np.mean(ranks8 <= cutoff))

# Mean reciprocal rank over all training queries.
print("Training MRR:", np.mean(1/ranks8))

# CHANGE END

1000 train processed.
2000 train processed.
3000 train processed.
4000 train processed.
5000 train processed.
6000 train processed.
7000 train processed.
8000 train processed.
9000 train processed.
10000 train processed.
11000 train processed.
12000 train processed.
13000 train processed.
14000 train processed.
15000 train processed.
16000 train processed.
17000 train processed.
18000 train processed.
19000 train processed.
20000 train processed.
21000 train processed.
22000 train processed.
23000 train processed.
24000 train processed.
25000 train processed.
26000 train processed.

Training Mean rank: 3.596334444107846
Hits at 1: 0.4384656164798546
Hits at 10: 0.9377461375340805
Hits at 100: 0.9999621326870646
Hits at 500: 1.0
Hits at 1000: 1.0
Training MRR: 0.6086929781573421


In [None]:
# CHANGE START
# Space 8 (using validation data): retrieval metrics.
# NOTE(review): text_chem_cos_val8 is presumably a sequence of 2-D similarity
# blocks (one row per query, one column per candidate) — confirm where it is built.
ranks_val8 = []
j = 0  # running query index across all blocks
for emb in text_chem_cos_val8:
    for k in range(emb.shape[0]):
        # Column indices ordered from highest to lowest score for this query.
        order = np.argsort(emb[k, :])[::-1]
        # Inverting that ordering yields the 0-based rank of every column.
        ranks = np.argsort(order)

        # Add this space's ranks to the shared per-query totals.
        val_avg_ranks[j, :] = val_avg_ranks[j, :] + ranks

        # 1-based rank of column j + offset_val (presumably the true match for query j).
        ranks_val8.append(ranks[j + offset_val] + 1)

        j += 1
        if j % 1000 == 0:
            print(j, "val processed.")

ranks_val8 = np.array(ranks_val8)

print()
print("Val Mean rank:", np.mean(ranks_val8))
for label, cutoff in (
    ("Hits at 1:", 1),
    ("Hits at 10:", 10),
    ("Hits at 100:", 100),
    ("Hits at 500:", 500),
    ("Hits at 1000:", 1000),
):
    # Fraction of queries whose recorded rank is within the cutoff.
    print(label, np.mean(ranks_val8 <= cutoff))

# Mean reciprocal rank over all validation queries.
print("Validation MRR:", np.mean(1/ranks_val8))
# CHANGE END

1000 val processed.
2000 val processed.
3000 val processed.

Val Mean rank: 30.926991820660405
Hits at 1: 0.3417146319297183
Hits at 10: 0.8291426840351409
Hits at 100: 0.9706149651620721
Hits at 500: 0.9906089063920024
Hits at 1000: 0.9960617994547107
Validation MRR: 0.5034985999968774


In [None]:
# CHANGE START
# Space 8 (using test data): retrieval metrics.
# NOTE(review): text_chem_cos_test8 is presumably a sequence of 2-D similarity
# blocks (one row per query, one column per candidate) — confirm where it is built.
ranks_test8 = []
j = 0  # running query index across all blocks
for emb in text_chem_cos_test8:
    for k in range(emb.shape[0]):
        # Column indices ordered from highest to lowest score for this query.
        order = np.argsort(emb[k, :])[::-1]
        # Inverting that ordering yields the 0-based rank of every column.
        ranks = np.argsort(order)

        # Add this space's ranks to the shared per-query totals.
        test_avg_ranks[j, :] = test_avg_ranks[j, :] + ranks

        # 1-based rank of column j + offset_test (presumably the true match for query j).
        ranks_test8.append(ranks[j + offset_test] + 1)

        j += 1
        if j % 1000 == 0:
            print(j, "test processed.")

ranks_test8 = np.array(ranks_test8)

print()
print("Test Mean rank:", np.mean(ranks_test8))
for label, cutoff in (
    ("Hits at 1:", 1),
    ("Hits at 10:", 10),
    ("Hits at 100:", 100),
    ("Hits at 500:", 500),
    ("Hits at 1000:", 1000),
):
    # Fraction of queries whose recorded rank is within the cutoff.
    print(label, np.mean(ranks_test8 <= cutoff))

# Mean reciprocal rank over all test queries.
print("Test MRR:", np.mean(1/ranks_test8))
# CHANGE END

1000 test processed.
2000 test processed.
3000 test processed.

Test Mean rank: 22.768857921841867
Hits at 1: 0.334747046349591
Hits at 10: 0.8324750075734626
Hits at 100: 0.9784913662526508
Hits at 500: 0.9948500454407755
Hits at 1000: 0.9981823689790973
Test MRR: 0.5012808306175419


# Rerank from sum

In [None]:

# Re-rank the training split from the ranks summed across spaces.
# tr_ranks_avg holds, per query row, the element-wise sum of the 0-based ranks
# added by each per-space training cell above.
# The intermediate is deliberately NOT called `sorted`: assigning to that name
# would shadow Python's builtin sorted() for every later cell in the kernel.
tr_order = np.argsort(tr_ranks_avg)  # per-row column order, smallest summed rank first
# A second argsort turns the ordering back into per-column ranks; diagonal entry j
# is the new rank of column j for query j, and +1 makes it 1-based.
new_tr_ranks = np.diag(np.argsort(tr_order)) + 1

print(np.mean(new_tr_ranks))
print("%1:", np.mean(new_tr_ranks <= 1))
print("%10:", np.mean(new_tr_ranks <= 10))
print("%100:", np.mean(new_tr_ranks <= 100))

# new_tr_ranks is already an ndarray, so no extra np.array() wrapper is needed.
print("Trainng MRR:", np.mean(1/new_tr_ranks))


2.532755225689185
%1: 0.5624431990305968
%10: 0.9718645864889427
%100: 1.0
Trainng MRR: 0.7125535926418244


In [None]:


# Re-rank the validation split from the ranks summed across spaces.
# val_avg_ranks holds, per query row, the element-wise sum of the 0-based ranks
# added by each per-space validation cell above.
# The intermediate is deliberately NOT called `sorted`: assigning to that name
# would shadow Python's builtin sorted() for every later cell in the kernel.
val_order = np.argsort(val_avg_ranks)  # per-row column order, smallest summed rank first
# A second argsort turns the ordering back into per-column ranks (+1 -> 1-based).
val_final_ranks = np.argsort(val_order) + 1
# Slicing columns offset_val:offset_test lines the block up so that diagonal
# entry j is the final rank of column offset_val + j for query j.
new_val_ranks = np.diag(val_final_ranks[:, offset_val:offset_test])

print(np.mean(new_val_ranks))
print("%1:", np.mean(new_val_ranks <= 1))
print("%10:", np.mean(new_val_ranks <= 10))
print("%100:", np.mean(new_val_ranks <= 100))

# new_val_ranks is already an ndarray, so no extra np.array() wrapper is needed.
print("Validation MRR:", np.mean(1/new_val_ranks))


20.578915480157526
%1: 0.42108451984247197
%10: 0.8912450772493183
%100: 0.9794001817631021
Validation MRR: 0.5832640411758


In [None]:

# Re-rank the test split from the ranks summed across spaces.
# test_avg_ranks holds, per query row, the element-wise sum of the 0-based ranks
# added by each per-space test cell above.
# The intermediate is deliberately NOT called `sorted`: assigning to that name
# would shadow Python's builtin sorted() for every later cell in the kernel.
test_order = np.argsort(test_avg_ranks)  # per-row column order, smallest summed rank first
# A second argsort turns the ordering back into per-column ranks (+1 -> 1-based).
test_final_ranks = np.argsort(test_order) + 1
# Slicing columns from offset_test onward lines the block up so that diagonal
# entry j is the final rank of column offset_test + j for query j.
new_test_ranks = np.diag(test_final_ranks[:, offset_test:])

print(np.mean(new_test_ranks))
print("%1:", np.mean(new_test_ranks <= 1))
print("%10:", np.mean(new_test_ranks <= 10))
print("%100:", np.mean(new_test_ranks <= 100))

print("Test MRR:", np.mean(1/new_test_ranks))


15.76279915177219
%1: 0.445925477128143
%10: 0.8873068767040291
%100: 0.9860648288397456
Test MRR: 0.5994412861701751


In [None]:
# Record the notebook end time and print the total wall-clock duration.
# start_time is captured with time.time() in the notebook's first code cell,
# so the difference is the elapsed time in seconds since that cell ran.
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time:.2f} seconds")

Elapsed time: 1918.55 seconds
