In [1]:
from __future__ import print_function

import numpy as np

## Select indices of entries not shared with the hsa train/test sets

In [2]:
# Paths of the .npz archives that hold the three ID arrays used below.
allmirbase_ids_filename, hsa_train_ids_filename, hsa_test_ids_filename = (
    "unique_allmirbase_ids.npz",
    "hsa_train_ids.npz",
    "hsa_test_ids.npz",
)

In [3]:
# Read the first (unnamed) array, stored under the key 'arr_0', from each
# archive. np.load returns a lazily-reading NpzFile for .npz inputs; using it
# as a context manager closes the underlying file as soon as the array has
# been materialised instead of leaving the handle to garbage collection.
with np.load(allmirbase_ids_filename) as npz_file:
    allmirbase_ids = npz_file['arr_0']
with np.load(hsa_train_ids_filename) as npz_file:
    hsa_train_ids = npz_file['arr_0']
with np.load(hsa_test_ids_filename) as npz_file:
    hsa_test_ids = npz_file['arr_0']

In [4]:
# Quick look at the three ID arrays. The print order is kept as-is: shape
# first for allmirbase, contents first for the two hsa arrays.
print(allmirbase_ids.shape, allmirbase_ids, sep="\n")
print(hsa_train_ids, hsa_train_ids.shape, sep="\n")
print(hsa_test_ids, hsa_test_ids.shape, sep="\n")

(49602,)
[b'sma-mir-8411' b'efu-mir-9315' b'xtr-mir-133a' ... b'>hsa1_19977'
 b'>hsa1_5471' b'>hsa1_14329']
(49602,)
[b'>hsa2_22771' b'1368' b'random_seq_from_cds__NO_41742' ...
 b'>hsa2_18140' b'316' b'random_seq_from_cds__NO_58583']
(2562,)
[b'hsa-mir-592' b'hsa-mir-7114' b'358' ... b'370' b'hsa-mir-4472-2'
 b'hsa-mir-3960']
(1098,)


In [5]:
# Pool the hsa train and test IDs into one array (joined along the first
# axis, which is np.concatenate's default).
all_hsa_ids = np.concatenate([hsa_train_ids, hsa_test_ids])
print(all_hsa_ids, all_hsa_ids.shape, sep="\n")

[b'>hsa2_22771' b'1368' b'random_seq_from_cds__NO_41742' ... b'370'
 b'hsa-mir-4472-2' b'hsa-mir-3960']
(3660,)


In [6]:
# IDs that occur both in allmirbase_ids and in the pooled hsa IDs.
# A single intersect1d call yields both the sorted unique shared values and,
# for each of them, the index of its FIRST occurrence in allmirbase_ids
# (the second return value when return_indices=True). The third return value
# (indices into all_hsa_ids) is not needed and is sliced away.
shared_ids, shared_ids_indices = np.intersect1d(
    allmirbase_ids, all_hsa_ids, return_indices=True
)[:2]
print(shared_ids)
print(shared_ids.shape)
# NOTE: first-occurrence indices only; if allmirbase_ids ever contains a
# repeated ID, later copies are not listed here.
print(shared_ids_indices)
print(shared_ids_indices.shape)

[b'100' b'1002' b'1005' ... b'random_seq_from_cds__NO_9503'
 b'random_seq_from_cds__NO_9589' b'random_seq_from_cds__NO_9774']
(3046,)
[33445 33173 32446 ... 31685 25352 25470]
(3046,)


In [7]:
# Positions in allmirbase_ids whose ID does NOT appear in all_hsa_ids.
# A membership mask (np.isin with invert=True) drops every occurrence of a
# shared ID, whereas subtracting intersect1d's first-occurrence indices would
# silently keep any later duplicate. The result is ascending, and identical
# to the index-difference approach whenever allmirbase_ids has no duplicates.
selected_indices = np.arange(len(allmirbase_ids))[
    np.isin(allmirbase_ids, all_hsa_ids, invert=True)
]

In [8]:
# Show the retained row indices and how many there are.
print(selected_indices, selected_indices.shape, sep="\n")

[    0     1     2 ... 49599 49600 49601]
(46556,)


## Select nonhsa data

In [9]:
# Load the data array stored under 'arr_0'. The NpzFile returned by np.load
# is used as a context manager so the archive is closed right after reading.
allmirbase_data_filename = "unique_allmirbase_data.npz"
with np.load(allmirbase_data_filename) as npz_file:
    allmirbase_data = npz_file['arr_0']

In [10]:
# Keep only the rows (first axis) at the selected indices.
nonhsa_allmirbase_data = allmirbase_data[selected_indices]
# Drop the reference to the full array; it is not used after this point.
allmirbase_data = None

In [11]:
# Inspect the filtered data (NumPy abbreviates large arrays) and its shape.
print(nonhsa_allmirbase_data, nonhsa_allmirbase_data.shape, sep="\n")

[[[[255. 255. 255.]
   [255. 255. 255.]
   [255. 255. 255.]
   ...
   [255. 255. 255.]
   [255. 255. 255.]
   [255. 255. 255.]]

  [[255. 255. 255.]
   [255. 255. 255.]
   [255. 255. 255.]
   ...
   [255. 255. 255.]
   [255. 255. 255.]
   [255. 255. 255.]]

  [[255. 255. 255.]
   [255. 255. 255.]
   [255. 255. 255.]
   ...
   [255. 255. 255.]
   [255. 255. 255.]
   [255. 255. 255.]]

  ...

  [[255. 255. 255.]
   [255. 255. 255.]
   [255. 255. 255.]
   ...
   [255. 255. 255.]
   [255. 255. 255.]
   [255. 255. 255.]]

  [[255. 255. 255.]
   [255. 255. 255.]
   [255. 255. 255.]
   ...
   [255. 255. 255.]
   [255. 255. 255.]
   [255. 255. 255.]]

  [[255. 255. 255.]
   [255. 255. 255.]
   [255. 255. 255.]
   ...
   [255. 255. 255.]
   [255. 255. 255.]
   [255. 255. 255.]]]


 [[[255. 255. 255.]
   [255. 255. 255.]
   [255. 255. 255.]
   ...
   [255. 255. 255.]
   [255. 255. 255.]
   [255. 255. 255.]]

  [[255. 255. 255.]
   [255. 255. 255.]
   [255. 255. 255.]
   ...
   [255. 255. 255.]
 

In [12]:
# Write the filtered data to a compressed archive. The key is spelled out as
# 'arr_0', the same name NumPy assigns to the first positional array.
np.savez_compressed("nonhsa_allmirbase_data.npz", arr_0=nonhsa_allmirbase_data)

In [14]:
# The filtered data has been saved; drop the reference to it.
nonhsa_allmirbase_data = None

## Select nonhsa labels

In [15]:
# Load the label array stored under 'arr_0'. The NpzFile returned by np.load
# is used as a context manager so the archive is closed right after reading.
allmirbase_labels_filename = "unique_allmirbase_labels.npz"
with np.load(allmirbase_labels_filename) as npz_file:
    allmirbase_labels = npz_file['arr_0']

In [16]:
# Keep only the label rows at the selected indices, then show them.
nonhsa_allmirbase_labels = allmirbase_labels[selected_indices]
print(nonhsa_allmirbase_labels, nonhsa_allmirbase_labels.shape, sep="\n")

[[1]
 [1]
 [1]
 ...
 [0]
 [0]
 [0]]
(46556, 1)


In [18]:
# Write the filtered labels under the key 'arr_0', then drop both label
# arrays from the namespace.
np.savez_compressed("nonhsa_allmirbase_labels.npz", arr_0=nonhsa_allmirbase_labels)
allmirbase_labels = nonhsa_allmirbase_labels = None

## Select nonhsa ids

In [22]:
# (Re)load the full ID array stored under 'arr_0'. The NpzFile returned by
# np.load is used as a context manager so the archive is closed right after
# reading.
allmirbase_ids_filename = "unique_allmirbase_ids.npz"
with np.load(allmirbase_ids_filename) as npz_file:
    allmirbase_ids = npz_file['arr_0']

In [23]:
# Keep only the IDs at the selected indices, then show them.
nonhsa_allmirbase_ids = allmirbase_ids[selected_indices]
print(nonhsa_allmirbase_ids, nonhsa_allmirbase_ids.shape, sep="\n")

[b'sma-mir-8411' b'efu-mir-9315' b'xtr-mir-133a' ... b'>hsa1_19977'
 b'>hsa1_5471' b'>hsa1_14329']
(46556,)


In [24]:
# Write the filtered IDs under the key 'arr_0', then drop both ID arrays
# from the namespace.
np.savez_compressed("nonhsa_allmirbase_ids.npz", arr_0=nonhsa_allmirbase_ids)
allmirbase_ids = nonhsa_allmirbase_ids = None

## Select nonhsa categories

In [26]:
# Load the category array stored under 'arr_0'. The NpzFile returned by
# np.load is used as a context manager so the archive is closed right after
# reading.
allmirbase_categories_filename = "unique_allmirbase_categories.npz"
with np.load(allmirbase_categories_filename) as npz_file:
    allmirbase_categories = npz_file['arr_0']

In [27]:
# Keep only the category entries at the selected indices, then show them.
nonhsa_allmirbase_categories = allmirbase_categories[selected_indices]
print(nonhsa_allmirbase_categories, nonhsa_allmirbase_categories.shape, sep="\n")

[1 1 1 ... 3 3 3]
(46556,)


In [28]:
# Write the filtered categories under the key 'arr_0', then drop both
# category arrays from the namespace.
np.savez_compressed("nonhsa_allmirbase_categories.npz", arr_0=nonhsa_allmirbase_categories)
allmirbase_categories = nonhsa_allmirbase_categories = None