In [14]:
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score

from IPython.display import Audio, display
def allDone():
    """Embed an auto-playing audio clip (a remote cat-meow WAV) in the cell output.

    Presumably meant as an audible "finished" cue after a long-running cell.
    The clip is fetched from an external URL, so it needs network access.
    """
    meow_wav = 'http://www.wavsource.com/snds_2020-03-30_7102365145747638/animals/cat_meow2.wav'
    clip = Audio(url=meow_wav, autoplay=True)
    display(clip)
# allDone()

# Seed both global RNGs. scikit-learn falls back to NumPy's global RNG whenever
# `random_state=None`, so seeding only Python's `random` module would leave
# those code paths unseeded.
SEED = 666
random.seed(SEED)
np.random.seed(SEED)

In [15]:
# Keyword sets shared by the reads below: identifier lists are parsed as
# unicode strings, membership matrices as comma-separated integers.
_as_text = dict(dtype='U')
_as_int_matrix = dict(delimiter=',', dtype=int)

# Training split: motif identifiers and the motif-by-family matrix.
train_motifs = np.genfromtxt('data__for_nerds/train_motifs.csv', **_as_text)
train_motifxFamMatrix = np.genfromtxt('data__for_nerds/train_motifxFamMatrix.csv', **_as_int_matrix)

# Test split, same layout as the training files.
test_motifs = np.genfromtxt('data__for_nerds/new_test_motifs.csv', **_as_text)
test_motifxFamMatrix = np.genfromtxt('data__for_nerds/new_test_motifxFamMatrix.csv', **_as_int_matrix)

# Family names; the cells below index the matrix columns in this order.
fams = np.genfromtxt('data__for_nerds/fams.csv', **_as_text)

In [16]:
from sklearn.model_selection import train_test_split

# Pooled views of both splits: train rows first, then test rows.
all_motifs = np.hstack((train_motifs, test_motifs))
all_motifxFamMatrix = np.vstack((train_motifxFamMatrix, test_motifxFamMatrix))

# Report (rows, columns) of each split's motif-by-family matrix.
for split_matrix in (train_motifxFamMatrix, test_motifxFamMatrix):
    print(split_matrix.shape)

(6415, 8)
(1179, 8)


In [157]:
# Alternative embedding source, kept for reference:
# embedding = np.genfromtxt('data__for_nerds/protvec_embedding_avg.csv',delimiter=',',dtype=float)
embedding = np.genfromtxt('MODELS_siamese/emb_37_embedding.csv',delimiter=',',dtype=float)

# The split below is purely positional: the first len(train_motifs) rows are
# taken as the training embedding and everything after as the test embedding.
# Fail loudly if the file does not hold exactly one row per motif, otherwise
# the mismatch would only surface later inside scikit-learn.
n_train = len(train_motifs)
assert len(embedding) == n_train + len(test_motifs), (
    "embedding has %d rows, expected %d train + %d test motifs"
    % (len(embedding), n_train, len(test_motifs))
)

train_embedding = embedding[:n_train]
test_embedding = embedding[n_train:]

print(train_embedding.shape)
print(test_embedding.shape)

(6415, 330)
(1179, 330)


In [158]:
#### SILHOUETTE SCORES #####

# One silhouette score per family: the test embedding is scored against that
# family's column of the test motif-by-family matrix used as cluster labels.
allSilhouettes = [
    silhouette_score(test_embedding, test_motifxFamMatrix[:, famIdx],
                     metric='euclidean', sample_size=None, random_state=None)
    for famIdx in range(len(fams))
]
allSilhouettes_array = np.array(allSilhouettes)

# Summary statistics across families, rounded to three decimals.
mean_silhouette = np.mean(allSilhouettes_array).round(3)
median_silhouette = np.median(allSilhouettes_array).round(3)
print("mean silhouette:", mean_silhouette)
print("median silhouette", median_silhouette)

mean silhouette: 0.0
median silhouette 0.0


In [152]:
def getKnnResults(fIdx, myAverage, k=None):
    """Fit a k-nearest-neighbours classifier for one family and score it on the test split.

    The classifier is trained on the module-level `train_embedding` with column
    `fIdx` of `train_motifxFamMatrix` as labels, then evaluated on
    `test_embedding` against the same column of `test_motifxFamMatrix`.

    Parameters
    ----------
    fIdx : int
        Column index (family) selecting the label vector in both matrices.
    myAverage : str or None
        Forwarded as `average` to `precision_recall_fscore_support`.
    k : int, optional
        Number of neighbours. When omitted, the square root of the number of
        rows in `embedding` is used; note that `embedding` holds the train
        and the test rows together.

    Returns
    -------
    tuple
        `(precision, recall, fscore, support, y_score)` where the first four
        come from `precision_recall_fscore_support` and `y_score` is the
        `predict_proba` output for the test embedding.
    """
    if k is None:
        ## best k is num of instances ^ 1/2
        ## https://stackoverflow.com/questions/18110951/how-to-determine-k-value-for-the-k-nearest-neighbours-algorithm-for-a-matrix-in
        k = int(len(embedding)**(1/2))

    X_tr = train_embedding
    X_te = test_embedding
    y_tr = train_motifxFamMatrix.T[fIdx]
    y_te = test_motifxFamMatrix.T[fIdx]

    kNN = KNeighborsClassifier(n_neighbors=k)
    kNN.fit(X_tr, y_tr)

    # Hard predictions feed the precision/recall summary; the class
    # probabilities are handed back to the caller.
    y_pred = kNN.predict(X_te)
    y_score = kNN.predict_proba(X_te)

    precision, recall, fscore, support = score(y_te, y_pred, average=myAverage)
    return (precision, recall, fscore, support, y_score)

In [153]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_recall_fscore_support as score

# Collect the test-set class-probability table of every per-family kNN model.
# The summary metrics are unpacked but not stored; after the loop they hold
# the values for the last family only.
thisY_score = []
for fIdx, fam in enumerate(fams):
    knnResult = getKnnResults(fIdx, "macro")
    precision, recall, fscore, support, myY_score = knnResult
    thisY_score.append(myY_score)

In [154]:
# Stack the per-family probability tables into a single array
# (family, test motif, class).
thisY_score = np.array(thisY_score)

# Keep column 1 of every table and transpose so that rows are test motifs and
# columns are families, matching the layout of the label matrix.
# NOTE(review): column 1 is presumably the probability of label 1 - confirm
# against the classifier's `classes_` ordering.
y_score = np.transpose(thisY_score[:, :, 1])
y_test = test_motifxFamMatrix

In [155]:
from sklearn.metrics import roc_curve, auc
import numpy as np

# `scipy.interp` was only a deprecated alias of `numpy.interp` and can no
# longer be imported from current SciPy releases, so `np.interp` is called
# directly below.

print("AUROC results:")
fpr = dict()
tpr = dict()
roc_auc = dict()

# Per-family ROC curve and area, one column of labels/scores at a time.
for i in range(len(fams)):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
    print(fams[i],roc_auc[i].round(3))


# Micro-average: flatten every (motif, family) pair into one binary problem.
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Macro-average: interpolate each family's TPR onto the union of all
# false-positive-rate grid points, then average the curves.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(len(fams))]))

mean_tpr = np.zeros_like(all_fpr)
for i in range(len(fams)):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= len(fams)

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

print("macro auroc:",roc_auc["macro"].round(3),
      "\nmicro auroc:",roc_auc["micro"].round(3))

AUROC results:
PKA 0.844
AKT 0.876
CDK 0.882
MAPK 0.88
SRC 1.0
CK2 0.914
PKC 0.875
PIKK 0.858
macro auroc: 0.891 
micro auroc: 0.917


In [156]:
from itertools import chain
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# Per-family precision-recall curves and average precision, keyed by the
# family's column index; the averaged scores are added under "micro"/"macro".
precision, recall, average_precision = {}, {}, {}

print("AVERAGE Precision (AP) results")
for i, famName in enumerate(fams):
    famTruth = y_test[:, i]
    famScores = y_score[:, i]
    precision[i], recall[i], _ = precision_recall_curve(famTruth, famScores)
    average_precision[i] = average_precision_score(famTruth, famScores)
    print(famName, average_precision[i].round(3))


# Averages over all families, computed on the full label/score matrices.
for avgKind in ("micro", "macro"):
    average_precision[avgKind] = average_precision_score(y_test, y_score, average=avgKind)

print("avg AP (macro):", average_precision["macro"].round(3),
      "\navg AP (micro):", average_precision["micro"].round(3))

AVERAGE Precision (AP) results
PKA 0.592
AKT 0.419
CDK 0.51
MAPK 0.569
SRC 1.0
CK2 0.747
PKC 0.638
PIKK 0.617
avg AP (macro): 0.636 
avg AP (micro): 0.711
