In [1]:
import keras
from keras import backend as K
from keras.models import load_model
import numpy as np
import pandas as pd
import deepmirna_utils as deep_utils

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
## util functions
def load_names(filename):
     return np.load(filename)['arr_0']


def load_image_data(filename):
     X = np.load(filename)['arr_0']
    
     if K.image_data_format() == 'channels_first':
          X = np.swapaxes(X, 1, 3)

     X = X.astype('float32')
     if np.amax(X) > 1:
          X /= 255
          
     return X


def load_labels(filename):
    labels = np.load(filename)['arr_0']
    return keras.utils.to_categorical(labels, 2)


In [3]:
mirbase_names_filename = "../datasets/benchmark/mirbase_names.npz"
images_filename = "../datasets/benchmark/mirbase_images.npz"
labels_filename = "../datasets/benchmark/mirbase_labels.npz"

In [4]:
mirbase_names = np.load(mirbase_names_filename)['arr_0'].tolist()
mirbase_names = [name for name in mirbase_names]
labels = load_labels(labels_filename)
categorical_labels = np.argmax(labels, axis=1)
images = load_image_data(images_filename)

In [5]:
def extract_prefixes(names):
    prefixes = {}
    
    for name in names:
        prefix = name.lower().split("-")[0]
        if prefix in prefixes:
            prefixes[prefix] += 1
        else:
            prefixes[prefix] = 1
    
    return prefixes

In [6]:
allprefixes = extract_prefixes(mirbase_names)
species200 = {prefix:allprefixes[prefix] for prefix in allprefixes.keys() if allprefixes[prefix] >= 200}

In [7]:
print("number of species: {}".format(len(allprefixes)))
print("number of species with at least 200 entries: {}".format(len(species200)))

number of species: 223
number of species with at least 200 entries: 45


In [8]:
for sorted_tupples in sorted(species200.items(), key=lambda x: x[1], reverse=True): 
    print("{}: {}".format(sorted_tupples[0], sorted_tupples[1]))

hsa: 1881
mmu: 1193
bta: 808
gga: 740
eca: 715
mtr: 670
ptr: 655
ppy: 642
mml: 619
osa: 592
gma: 573
efu: 502
rno: 495
cfa: 495
bmo: 487
mdo: 460
oan: 396
ssc: 382
ssa: 371
ppc: 354
ptc: 352
ggo: 352
cin: 348
dre: 346
ath: 324
bdi: 317
aca: 282
ipu: 281
gra: 269
chi: 267
dme: 256
ame: 254
cel: 250
tgu: 247
pma: 244
ppt: 229
stu: 224
tca: 220
cbn: 214
dps: 210
mdm: 206
aly: 205
sbi: 205
prd: 200
cgr: 200


# Predictions

In [9]:
def performance_per_species(predictions, names, prefix_count):
    correct_predictions = {}
    for prefix in prefix_count:
        correct_predictions[prefix] = 0
    
    for prediction, name in zip(predictions, names):
        prefix = name.lower().split("-")[0]
        if prefix in prefix_count:
            if prediction == 1:
                correct_predictions[prefix] += 1
    
    for prefix in correct_predictions:
        correct_predictions[prefix] = (1.0 * correct_predictions[prefix]) / prefix_count[prefix] 
        
    return correct_predictions

In [10]:
model = load_model("../models/fine_tuned_cnn.h5")





In [11]:
predictions = model.predict(images)
categorical_predictions = np.argmax(predictions, axis=1)

In [12]:
np.sum(categorical_predictions)

26660

In [13]:
dl_acc_species200_dict = performance_per_species(categorical_predictions,
                                                 mirbase_names, 
                                                 species200)

In [14]:
species200_dlpredictions_tmp = [(key, dl_acc_species200_dict[key]) for key in dl_acc_species200_dict]

In [15]:
species200_dlpredictions_tmp

[('ptc', 0.9460227272727273),
 ('ssc', 0.8743455497382199),
 ('ppt', 0.9388646288209607),
 ('chi', 0.9138576779026217),
 ('bta', 0.9257425742574258),
 ('ppy', 0.9299065420560748),
 ('cbn', 1.0),
 ('hsa', 0.9835194045720361),
 ('dme', 0.93359375),
 ('mdo', 0.9847826086956522),
 ('ame', 0.7755905511811023),
 ('rno', 0.9696969696969697),
 ('prd', 0.99),
 ('eca', 0.9188811188811189),
 ('osa', 0.8918918918918919),
 ('bmo', 0.6735112936344969),
 ('gma', 0.93717277486911),
 ('ppc', 0.9745762711864406),
 ('tgu', 0.854251012145749),
 ('cgr', 0.93),
 ('sbi', 0.9609756097560975),
 ('gra', 0.8996282527881041),
 ('stu', 0.9910714285714286),
 ('mtr', 0.8492537313432836),
 ('tca', 0.8409090909090909),
 ('dre', 0.9624277456647399),
 ('mmu', 0.9329421626152556),
 ('pma', 0.9262295081967213),
 ('ath', 0.9351851851851852),
 ('aly', 0.975609756097561),
 ('ptr', 0.9541984732824428),
 ('cfa', 0.9434343434343434),
 ('ggo', 0.9034090909090909),
 ('ipu', 0.9822064056939501),
 ('oan', 0.9343434343434344),
 ('dp

In [16]:
species200_dlpredictions = pd.DataFrame.from_dict(species200_dlpredictions_tmp)

In [17]:
species200_dlpredictions.columns = ['Acronym', 'fine-tuned-CNN']
species200_dlpredictions

Unnamed: 0,Acronym,fine-tuned-CNN
0,ptc,0.946023
1,ssc,0.874346
2,ppt,0.938865
3,chi,0.913858
4,bta,0.925743
5,ppy,0.929907
6,cbn,1.0
7,hsa,0.983519
8,dme,0.933594
9,mdo,0.984783


In [18]:
species200_dlpredictions= species200_dlpredictions.sort_values(by=['Acronym'])
species200_dlpredictions

Unnamed: 0,Acronym,fine-tuned-CNN
39,aca,0.992908
29,aly,0.97561
10,ame,0.775591
28,ath,0.935185
37,bdi,0.908517
15,bmo,0.673511
4,bta,0.925743
6,cbn,1.0
36,cel,0.928
31,cfa,0.943434


## Load ML predictions

In [19]:
mlpredictions_filename = "MiRBasepredictions.xlsx"
mlpredictions_df = pd.read_excel(mlpredictions_filename)
mlpredictions_df

Unnamed: 0,Acronym,Genus,Species,HairpinNumber,ConsensusRule,ConsensusDT,ConsensusNB,ConsensusModel,AverageDT,AverageNB,...,BurgtNB,BentwichNB,BatuwitaNB,ChenNB,XuNB,XueNB,NgNB,GaoNB,DingNB,LopesNB
0,hsa,Homo,sapiens,1881,76.40,98.56,86.12,97.13,97.08,82.62,...,80.49,91.87,85.96,77.78,71.35,65.39,86.02,77.03,88.20,85.81
1,mmu,Mus,musculus,1193,59.09,86.59,82.23,76.36,82.82,77.03,...,73.34,92.04,78.63,76.19,62.28,56.50,82.90,70.91,83.57,79.13
2,mmu*,Mus,musculus,380,83.95,95.79,93.42,94.21,95.00,91.05,...,90.26,97.63,92.11,88.95,80.53,74.74,95.00,85.00,94.21,92.63
3,bta,Bos,taurus,808,65.72,84.41,80.45,78.22,81.68,77.60,...,79.46,87.50,80.69,76.36,66.83,61.26,83.29,71.78,82.43,79.95
4,gga,Gallus,gallus,740,54.86,83.92,77.43,73.78,79.59,72.84,...,72.30,88.65,77.70,72.30,54.19,47.43,81.49,65.27,82.70,77.97
5,eca,Equus,caballus,715,56.36,92.45,83.50,77.34,87.27,80.14,...,73.85,91.19,81.82,73.29,53.43,47.13,78.32,80.14,85.31,85.87
6,mtr,Medicago,truncatula,670,73.73,90.45,84.48,82.54,86.27,82.99,...,80.15,96.87,80.30,94.03,67.61,67.76,88.96,81.94,85.07,77.16
7,ptr,Pan,troglodytes,655,78.32,94.81,88.24,90.23,92.67,86.72,...,84.89,93.44,86.87,85.50,72.52,67.79,89.31,80.31,89.01,88.40
8,ppy,Pongo,pygmaeus,642,76.95,91.90,86.60,88.94,89.56,84.58,...,82.71,91.90,85.67,82.40,71.81,66.36,87.69,78.50,88.63,86.76
9,mml,Macaca,mulatta,619,80.29,94.83,88.85,91.76,91.76,86.75,...,84.81,92.73,88.05,83.20,75.28,69.79,89.98,80.29,90.95,89.18


In [20]:
mlpredictions_df.sort_values(by=['HairpinNumber'], ascending=False)
species200_mlpredictions = mlpredictions_df[mlpredictions_df['HairpinNumber'] >= 200]
species200_mlpredictions

Unnamed: 0,Acronym,Genus,Species,HairpinNumber,ConsensusRule,ConsensusDT,ConsensusNB,ConsensusModel,AverageDT,AverageNB,...,BurgtNB,BentwichNB,BatuwitaNB,ChenNB,XuNB,XueNB,NgNB,GaoNB,DingNB,LopesNB
0,hsa,Homo,sapiens,1881,76.4,98.56,86.12,97.13,97.08,82.62,...,80.49,91.87,85.96,77.78,71.35,65.39,86.02,77.03,88.2,85.81
1,mmu,Mus,musculus,1193,59.09,86.59,82.23,76.36,82.82,77.03,...,73.34,92.04,78.63,76.19,62.28,56.5,82.9,70.91,83.57,79.13
2,mmu*,Mus,musculus,380,83.95,95.79,93.42,94.21,95.0,91.05,...,90.26,97.63,92.11,88.95,80.53,74.74,95.0,85.0,94.21,92.63
3,bta,Bos,taurus,808,65.72,84.41,80.45,78.22,81.68,77.6,...,79.46,87.5,80.69,76.36,66.83,61.26,83.29,71.78,82.43,79.95
4,gga,Gallus,gallus,740,54.86,83.92,77.43,73.78,79.59,72.84,...,72.3,88.65,77.7,72.3,54.19,47.43,81.49,65.27,82.7,77.97
5,eca,Equus,caballus,715,56.36,92.45,83.5,77.34,87.27,80.14,...,73.85,91.19,81.82,73.29,53.43,47.13,78.32,80.14,85.31,85.87
6,mtr,Medicago,truncatula,670,73.73,90.45,84.48,82.54,86.27,82.99,...,80.15,96.87,80.3,94.03,67.61,67.76,88.96,81.94,85.07,77.16
7,ptr,Pan,troglodytes,655,78.32,94.81,88.24,90.23,92.67,86.72,...,84.89,93.44,86.87,85.5,72.52,67.79,89.31,80.31,89.01,88.4
8,ppy,Pongo,pygmaeus,642,76.95,91.9,86.6,88.94,89.56,84.58,...,82.71,91.9,85.67,82.4,71.81,66.36,87.69,78.5,88.63,86.76
9,mml,Macaca,mulatta,619,80.29,94.83,88.85,91.76,91.76,86.75,...,84.81,92.73,88.05,83.2,75.28,69.79,89.98,80.29,90.95,89.18


In [21]:
species200_mlpredictions.columns.values

array(['Acronym', 'Genus', 'Species', 'HairpinNumber', 'ConsensusRule',
       'ConsensusDT', 'ConsensusNB', 'ConsensusModel', 'AverageDT',
       'AverageNB', 'GudysDT', 'JiangDT', 'RitchieDT', 'BurgtDT',
       'BentwichDT', 'BatuwitaDT', 'ChenDT', 'XuDT', 'XueDT', 'NgDT',
       'GaoDT', 'DingDT', 'LopesDT', 'GudysNB', 'JiangNB', 'RitchieNB',
       'BurgtNB', 'BentwichNB', 'BatuwitaNB', 'ChenNB', 'XuNB', 'XueNB',
       'NgNB', 'GaoNB', 'DingNB', 'LopesNB'], dtype=object)

In [22]:
selected_columns = ['Acronym', 'AverageDT', 'ConsensusNB']
selected_mlpredictions = species200_mlpredictions[selected_columns]
selected_mlpredictions = selected_mlpredictions.sort_values(by=['Acronym'])
selected_mlpredictions['AverageDT'] = selected_mlpredictions['AverageDT'] / 100.0
selected_mlpredictions['ConsensusNB'] = selected_mlpredictions['ConsensusNB'] / 100.0

## Merge and store ML and DL predictions

In [23]:

species200_performance = pd.merge(species200_dlpredictions,
                                  selected_mlpredictions, 
                                  on='Acronym', how='inner')

In [24]:
sorted_species200_performance = species200_performance.sort_values(by=['fine-tuned-CNN'], 
                                                                   ascending=False)

In [25]:
sorted_species200_performance.to_csv("species200_models_performance.csv", index=False)