In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score

from IPython.display import Audio, display
def allDone(url='http://www.wavsource.com/snds_2020-03-30_7102365145747638/animals/cat_meow2.wav'):
    """Emit an auto-playing audio widget, e.g. to signal that a long cell finished.

    Parameters
    ----------
    url : str, optional
        Address of the sound file handed to ``IPython.display.Audio``.
        Defaults to the clip the notebook originally hard-coded, so calling
        ``allDone()`` with no arguments behaves as before.
    """
    # autoplay=True makes the clip start as soon as the output is rendered.
    display(Audio(url=url, autoplay=True))
# allDone()

# Imported here because no earlier line of the notebook brings `random` or
# `torch` into scope; without these the statements below raise NameError on a
# freshly started kernel.
import random

import torch

# Seed every random number generator the notebook has access to.
random.seed(666)
np.random.seed(666)
torch.manual_seed(666)

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

In [2]:
# Read the three input tables from data__for_nerds/.
# Later cells index the rows of motifxFamMatrix with positions drawn from
# range(len(motifs)) and its columns with positions in fams.
motifs = np.genfromtxt(
    'data__for_nerds/motifs.csv',
    dtype='U',
)
motifxFamMatrix = np.genfromtxt(
    'data__for_nerds/motifxFamMatrix.csv',
    delimiter=',',
    dtype=int,
)
fams = np.genfromtxt(
    'data__for_nerds/fams.csv',
    dtype='U',
)

# Report how many motifs were loaded.
print(len(motifs))

7866


In [37]:
# Split the motif row indices: 15% are held out for testing, then 20% of the
# remainder becomes the validation set. Both splits use a fixed random_state.
X_train, X_test = train_test_split(range(len(motifs)), test_size=0.15, random_state=666)
X_train, X_val = train_test_split(X_train, test_size=0.2, random_state=666)

# The three index sets must cover every motif exactly once.
assert len(X_train) + len(X_val) + len(X_test) == len(motifs), \
    "train/val/test sizes do not add up to the number of motifs"
assert len(set(X_train) | set(X_val) | set(X_test)) == len(motifs), \
    "train/val/test index sets overlap"

# NOTE(review): X_train is rebound by the second split, so the first number
# printed here is the size AFTER removing the validation rows. The output stored
# with this cell starts with 6686, which equals 7866 - 1180 (the size before the
# validation split), and the three stored numbers sum to more than 7866, so the
# stored output looks stale -- re-run and confirm.
print(len(X_train), len(X_val), len(X_test))

6686 1338 1180


In [50]:
# Alternative embedding file, kept for reference:
# embedding = np.genfromtxt('data__for_nerds/protvec_embedding.csv',delimiter=',',dtype=float)
embedding = np.genfromtxt('emb_05_embedding.csv',delimiter=',',dtype=float)

# Rows of `embedding` are selected with indices drawn from range(len(motifs)),
# so the file must contain exactly one row per motif; a longer file would
# otherwise be accepted silently.
assert len(embedding) == len(motifs), \
    "embedding has %d rows but there are %d motifs" % (len(embedding), len(motifs))

print(embedding.shape)

(7866, 1300)


In [51]:
#### SILHOUETTE SCORES #####

# One score per family, computed on the test rows only: column `famIdx` of the
# label matrix is used as the cluster assignment for that family
# (presumably 0/1 membership -- confirm against the CSV).
allSilhouettes = [
    silhouette_score(
        embedding[X_test],
        motifxFamMatrix[X_test][:, famIdx],
        metric='euclidean',
        sample_size=None,
        random_state=None,
    )
    for famIdx in range(len(fams))
]
allSilhouettes_array = np.asarray(allSilhouettes)

# Summary statistics over the per-family scores, rounded to three decimals.
print("mean silhouette:", allSilhouettes_array.mean().round(3))
print("median silhouette", np.median(allSilhouettes_array).round(3))

mean silhouette: 0.112
median silhouette 0.06


In [52]:
def getKnnResults(fIdx, myAverage):
    """Fit a k-NN classifier for one family and evaluate it on the test split.

    Reads the notebook-level names ``embedding``, ``motifxFamMatrix``,
    ``X_train`` and ``X_test``.

    Parameters
    ----------
    fIdx : int
        Column of ``motifxFamMatrix`` whose values are used as the labels.
    myAverage : str or None
        Forwarded as ``average=`` to ``precision_recall_fscore_support``.

    Returns
    -------
    tuple
        ``(precision, recall, fscore, support, y_score)``: the four values
        returned by ``precision_recall_fscore_support`` for the test rows,
        followed by the classifier's ``predict_proba`` output for those rows.
    """
    # Function-scope imports: the notebook only imports these names in a later
    # cell, so importing here keeps the function usable regardless of the
    # order in which cells are run.
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import precision_recall_fscore_support as score

    X_tr = embedding[X_train]
    X_te = embedding[X_test]
    y_tr = motifxFamMatrix[X_train].T[fIdx]
    y_te = motifxFamMatrix[X_test].T[fIdx]

    ## best k is num of instances ^ 1/2
    ## https://stackoverflow.com/questions/18110951/how-to-determine-k-value-for-the-k-nearest-neighbours-algorithm-for-a-matrix-in
    # NOTE(review): the instance count used here is the whole embedding, not
    # just the training rows -- confirm that is intended.
    k = int(len(embedding)**(1/2))
    # A neighbour query cannot ask for more neighbours than there are fitted
    # samples, and needs at least one; clamp so tiny datasets do not fail.
    k = max(1, min(k, len(X_tr)))

    kNN = KNeighborsClassifier(n_neighbors=k)
    kNN.fit(X_tr, y_tr)

    y_pred = kNN.predict(X_te)
    y_score = kNN.predict_proba(X_te)

    precision, recall, fscore, support = score(y_te,y_pred,average=myAverage)
    return (precision, recall, fscore, support, y_score)

In [53]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_recall_fscore_support as score

# Result accumulators. resPrecision / resRecall are created here but are not
# filled by the loop below.
resPrecision, resRecall = [], []
myResPrecision, myResRecall = [], []
# predict_proba output per family (original note: "for only one file !!!!")
thisY_score = []

# Evaluate one macro-averaged k-NN model per family and keep its precision,
# recall and class-probability scores.
for fIdx, fam in enumerate(fams):
    precision, recall, fscore, support, myY_score = getKnnResults(fIdx, "macro")
    myResPrecision.append(precision)
    myResRecall.append(recall)
    thisY_score.append(myY_score)

In [54]:
# Stack the per-family predict_proba outputs into a single array.
thisY_score = np.array((thisY_score))

# The slice below takes column 1 of every family's probability matrix, which
# only exists when each classifier saw at least two classes during fitting.
# Fail with a clear message instead of an opaque indexing error.
assert thisY_score.ndim == 3 and thisY_score.shape[2] >= 2, \
    "expected (n_families, n_test, n_classes>=2) scores, got shape %s" % (thisY_score.shape,)

# After the transpose: one row per test sample, one column per family.
y_score = thisY_score[:,:,1].T
y_test = motifxFamMatrix[X_test]

In [55]:
from sklearn.metrics import roc_curve, auc
# `scipy.interp` was only a deprecated alias of `numpy.interp`, and recent SciPy
# releases no longer export it; import the NumPy function under the same name.
from numpy import interp
import numpy as np


# Per-family ROC curves and areas, keyed by family index.
print("AUROC results:")
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(len(fams)):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
    print(fams[i],roc_auc[i].round(3))


# Micro average: pool every (sample, family) pair into one ROC curve.
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Macro average: interpolate each family's curve onto the union of all
# false-positive-rate points, then average the true-positive rates.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(len(fams))]))

mean_tpr = np.zeros_like(all_fpr)
for i in range(len(fams)):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= len(fams)

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

print("macro auroc:",roc_auc["macro"].round(3),
      "\nmicro auroc:",roc_auc["micro"].round(3))

AUROC results:
PKA 0.837
AKT 0.859
CDK 0.861
MAPK 0.881
SRC 1.0
CK2 0.917
PKC 0.839
PIKK 0.804
macro auroc: 0.875 
micro auroc: 0.909


In [56]:
from itertools import chain
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# Precision/recall curves and average precision (AP) per family, keyed by
# family index; the "micro" and "macro" keys hold the aggregate AP values.
precision, recall, average_precision = {}, {}, {}

print("AVERAGE Precision (AP) results")
for i, famLabel in enumerate(fams):
    colTruth, colScore = y_test[:, i], y_score[:, i]
    precision[i], recall[i], _ = precision_recall_curve(colTruth, colScore)
    average_precision[i] = average_precision_score(colTruth, colScore)
    print(famLabel, average_precision[i].round(3))


# Aggregate AP over all families, stored in the same order as before
# (micro first, then macro).
for avgMode in ("micro", "macro"):
    average_precision[avgMode] = average_precision_score(y_test, y_score, average=avgMode)

print("avg AP (macro):", average_precision["macro"].round(3),
      "\navg AP (micro):", average_precision["micro"].round(3))

AVERAGE Precision (AP) results
PKA 0.507
AKT 0.367
CDK 0.401
MAPK 0.55
SRC 1.0
CK2 0.679
PKC 0.54
PIKK 0.328
avg AP (macro): 0.547 
avg AP (micro): 0.647
