In [12]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import BallTree
import faiss
import scipy

In [2]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from Bio import SeqIO

sequence_file = '/mnt/home/thamamsy/ceph/scrap/cafa/embeddings/Train/train_sequences.fasta'
record_ids = []
record_seqs = []

with open(sequence_file) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        record_ids.append(record.id)
        record_seqs.append(str(record.seq))
        
lengths = [len(record_seqs[i]) for i in range(len(record_seqs))]
record_df = pd.DataFrame({'Id': record_ids, 'Seq': record_seqs, 'Length': lengths})
record_df_below_seq = record_df[record_df['Length'] < 2500]
sequences = list(record_df_below_seq['Seq'].values)

In [3]:
len(record_ids)

142246

In [4]:
#record_df_below_seq

In [5]:
len(sequences)

140478

In [6]:
#record_df

In [6]:
train_terms = pd.read_csv("/mnt/home/thamamsy/ceph/scrap/cafa/embeddings/Train/train_terms.tsv", sep = '\t')

In [7]:
train_terms

Unnamed: 0,EntryID,term,aspect
0,A0A009IHW8,GO:0008152,BPO
1,A0A009IHW8,GO:0034655,BPO
2,A0A009IHW8,GO:0072523,BPO
3,A0A009IHW8,GO:0044270,BPO
4,A0A009IHW8,GO:0006753,BPO
...,...,...,...
5363858,X5L565,GO:0050649,MFO
5363859,X5L565,GO:0016491,MFO
5363860,X5M5N0,GO:0005515,MFO
5363861,X5M5N0,GO:0005488,MFO


In [8]:
len(train_terms.term.unique())

31466

In [9]:
train_embeddings = np.load('../../cafa/tm_vec_train.npy')
#test_embeddings =
# Now lets convert embeddings numpy array(train_embeddings) into pandas dataframe.
column_num = train_embeddings.shape[1]
train_df = pd.DataFrame(train_embeddings, columns = ["Column_" + str(i) for i in range(1, column_num+1)], 
                        index = record_df_below_seq.Id)
print(train_df.shape)

(140478, 512)


In [34]:
df = train_df.sample(frac=1)
train_df1 = df.iloc[:len(train_df) * 8//10]
test_df1 = df.iloc[len(train_df) * 8//10:]
# Convert the DataFrame to a NumPy array
data_array = train_df1.to_numpy(dtype='float32')
# Build the BallTree index
index = faiss.IndexFlatL2(data_array.shape[1])  # L2 distance is used for the BallTree
index.add(data_array)

# Convert the test DataFrame to a NumPy array
test_array = test_df1.to_numpy(dtype='float32')

# Perform nearest neighbor search with the BallTree index
k = 1  # Number of nearest neighbors to find
distances, indices = index.search(test_array, k)

# distances: array of shape (num_queries, k) containing the distances to the k nearest neighbors
# indices: array of shape (num_queries, k) containing the indices of the k nearest neighbors in the original data

# Now we have the nearest neighbors' indices and distances for each test sample.
# access the corresponding data points in the original DataFrame as follows:
nearest_neighbors_df = train_df1.iloc[indices[:, 0]]

In [83]:
nearest_neighbors_df

Unnamed: 0_level_0,Column_1,Column_2,Column_3,Column_4,Column_5,Column_6,Column_7,Column_8,Column_9,Column_10,...,Column_503,Column_504,Column_505,Column_506,Column_507,Column_508,Column_509,Column_510,Column_511,Column_512
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q2F922,-1.418940,-0.868099,0.715212,-0.672144,0.046930,0.566642,0.840719,0.838151,-0.647730,-0.061730,...,-0.400778,1.029336,1.317459,-1.138005,-0.659701,-0.547679,0.671604,0.199940,0.812962,0.983339
P0ACG4,-1.733634,-0.479552,1.123491,0.057367,-0.634735,-1.466390,0.357748,1.296579,-0.587296,-0.469951,...,-1.340290,1.005502,0.003859,-1.507618,0.280576,-1.495717,-1.114776,-0.456751,-0.372399,0.187828
F1R9G5,-2.483528,0.219796,0.762714,1.209975,-0.766722,0.962693,-1.068817,0.773379,-0.532433,0.604654,...,-0.476525,0.786754,0.283060,-0.039663,-1.207704,-1.472482,0.797554,0.197788,-0.705257,0.442182
Q5K4L6,1.685514,-0.298108,2.870711,-2.682812,-2.824287,2.353664,-1.630231,1.031535,1.416975,1.903122,...,-0.482224,3.592381,-1.787926,3.761530,-2.827449,1.468741,-2.740567,2.957629,1.207672,0.893299
Q8IBE5,-0.809251,-0.783404,2.162811,1.196074,-0.427149,0.887493,-0.490968,2.279024,0.047895,1.043612,...,-0.478227,1.042970,0.695211,-2.052660,0.107395,-1.556247,1.185257,0.018740,-1.315058,0.763899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9PW80,1.765709,-3.269270,-0.964447,0.027551,0.942651,0.620011,1.629530,4.563704,-3.255459,2.323085,...,-2.144457,3.306501,1.052426,1.227748,-1.187039,-2.034149,-1.309131,-0.604811,-1.172306,0.282283
Q9W1H5,1.118758,-0.587482,0.230328,0.334979,-0.331582,-0.022242,2.250997,1.375120,-0.762648,1.055281,...,0.118732,0.144925,0.908904,-1.419138,-0.539025,-2.019536,-0.685582,-0.401297,0.064833,-1.198326
Q5ISE2,-0.208676,-0.881484,0.749949,-0.604001,0.918492,-0.747613,0.890717,-0.070157,-0.812664,0.010005,...,-0.077104,1.578127,1.115955,-1.540212,-0.692428,-2.317833,0.882409,-2.047596,1.114108,-0.725852
O23164,-1.311845,1.542497,1.396345,0.729738,-0.308683,0.762246,0.606386,0.787085,-0.007480,0.159916,...,-0.765548,-0.056224,0.564101,-1.035323,-0.513762,-1.000527,0.143445,-0.702304,-0.729195,0.201860


In [89]:
lookup = pd.DataFrame({'query': test_df1.index, 'nearest': nearest_neighbors_df.index})
annots = train_terms.set_index('EntryID').loc[list(set(train_terms.EntryID) & set(lookup.nearest))]
annots = annots.reset_index()
annots = pd.merge(annots, lookup, left_on='EntryID', right_on='nearest')
annots['value'] = 1
col_lookup = pd.DataFrame({'colid': np.arange(len(set(annots.term)))}, index=annots.term.unique())
row_lookup = pd.DataFrame({'rowid': np.arange(len(nearest_neighbors_df.index))}, index=nearest_neighbors_df.index)
annots = pd.merge(annots, row_lookup, left_on='nearest', right_index=True)
annots = pd.merge(annots, col_lookup, left_on='term', right_index=True)
data = annots.value.values
i, j = annots.rowid.values, annots.colid.values
mat = scipy.sparse.coo_matrix((data, (i,j)), shape=(len(row_lookup), len(col_lookup)))

In [90]:
mat

<28096x21713 sparse matrix of type '<class 'numpy.int64'>'
	with 1675738 stored elements in COOrdinate format>

In [76]:
lookup = pd.DataFrame({'query': test_df1.index, 'nearest': nearest_neighbors_df.index})
annots = train_terms.set_index('EntryID').loc[list(set(train_terms.EntryID) & set(lookup['query']))]
annots = annots.reset_index()
annots = pd.merge(annots, lookup, left_on='EntryID', right_on='query')
annots['value'] = 1
col_lookup = pd.DataFrame({'colid': np.arange(len(set(annots.term)))}, index=annots.term.unique())
row_lookup = pd.DataFrame({'rowid': np.arange(len(set(annots['query'])))}, index=annots['query'].unique())
annots = pd.merge(annots, row_lookup, left_on='EntryID', right_index=True)
annots = pd.merge(annots, col_lookup, left_on='term', right_index=True)
data = annots.value.values
i, j = annots.rowid.values, annots.colid.values
mat2 = scipy.sparse.coo_matrix((data, (i,j)), shape=(len(row_lookup), len(col_lookup)))

In [77]:
mat2

<28096x22424 sparse matrix of type '<class 'numpy.int64'>'
	with 1053986 stored elements in COOrdinate format>

In [96]:
mat.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [97]:
mat2.toarray()

array([[1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0]])

In [95]:
average_precision_score(mat2.toarray(), mat.toarray())

0.0018003157925305505