In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from bioinf_learn.neighbors import MinHashClassifier
from bioinf_learn.neighbors import WtaHashClassifier
from bioinf_learn.util import neighborhood_accuracy

In [2]:
from eden.converter.rna.rnafold import rnafold_to_eden
from eden.converter.fasta import fasta_to_sequence
from eden.graph import Vectorizer
from itertools import islice
import numpy as np
from scipy.sparse import vstack

def rfam_uri(family_id):
    """Return the Rfam alignment URL for ``family_id``.

    The family id is substituted twice: once in the path and once as the
    ``acc`` query parameter. The URL requests ``format=fastau``.
    """
    template = 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0'
    return template % (family_id, family_id)

def rfam_to_matrix(rfam_id, n_max=50, complexity=2, nbits=10):
    """Vectorize up to ``n_max`` sequences of one Rfam family.

    Parameters
    ----------
    rfam_id : family identifier passed to ``rfam_uri``.
    n_max : maximum number of sequences taken from the sequence iterator.
    complexity, nbits : forwarded unchanged to ``Vectorizer``.

    Returns
    -------
    Whatever ``Vectorizer.transform`` returns for the graphs
    (callers in this file read ``.shape[0]`` and stack it with
    ``scipy.sparse.vstack`` -- presumably a sparse matrix; TODO confirm).
    """
    # Only the first n_max records are consumed from the sequence iterator.
    sequences = list(islice(fasta_to_sequence(rfam_uri(rfam_id)), n_max))
    folded_graphs = rnafold_to_eden(sequences)
    feature_extractor = Vectorizer(complexity=complexity, nbits=nbits, positional=True)
    return feature_extractor.transform(folded_graphs)

def rfam_data(rfam_ids, n_max=300, complexity=3, nbits=13):
    """Build a stacked feature matrix and label vector for several families.

    Parameters
    ----------
    rfam_ids : sequence of family identifiers; the position of an id in this
        sequence is used as the class label of all its rows.
    n_max, complexity, nbits : forwarded unchanged to ``rfam_to_matrix``.

    Returns
    -------
    (data_matrix, targets) : the per-family matrices stacked vertically in
        CSR format, and a numpy array holding one label per stacked row.
    """
    family_matrices = [
        rfam_to_matrix(family, n_max=n_max, complexity=complexity, nbits=nbits)
        for family in rfam_ids
    ]
    # One label per row; the label is the index of the family in rfam_ids.
    labels = []
    for class_index, family_matrix in enumerate(family_matrices):
        labels.extend([class_index] * family_matrix.shape[0])
    return vstack(family_matrices, format="csr"), np.array(labels)

In [3]:
# Rfam families to load; a family's position in this list is its class label.
rfam_ids = [
    'RF00004', 'RF00005', 'RF00015', 'RF00020',
    'RF00026', 'RF00169', 'RF00380', 'RF00386',
    'RF01051', 'RF01055', 'RF01234', 'RF01699',
    'RF01701', 'RF01705', 'RF01731', 'RF01734',
    'RF01745', 'RF01750', 'RF01942', 'RF01998',
    'RF02005', 'RF02012', 'RF02034',
]
# At most 50 sequences are taken per family.
X, y = rfam_data(rfam_ids, n_max=50, complexity=3, nbits=16)

In [32]:
# MinHash classifier configuration, one keyword per line.
# rangeK_wta is not passed here (it was commented out in the original call).
# NOTE(review): "sigificant" is the keyword exactly as the original call spells
# it -- presumably it matches the library signature; verify before "fixing" it.
minHash = MinHashClassifier(
    number_of_hash_functions=100,
    max_bin_size=90,
    shingle_size=2,
    similarity=False,
    minimal_blocks_in_common=1,
    number_of_cores=4,
    prune_inverse_index=14,
    store_value_with_least_sigificant_bit=2,
    excess_factor=13,
    prune_inverse_index_after_instance=0.5,
    remove_hash_function_with_less_entries_as=0,
    shingle=0,
    block_size=4,
    cpu_gpu_load_balancing=0.0,
    gpu_hashing=1,
)

In [33]:
# WTA-Hash classifier configuration, one keyword per line.
# NOTE(review): "sigificant" is the keyword exactly as the original call spells
# it -- presumably it matches the library signature; verify before "fixing" it.
wtaHash = WtaHashClassifier(
    number_of_hash_functions=168,
    max_bin_size=47,
    shingle_size=2,
    rangeK_wta=19,
    similarity=False,
    minimal_blocks_in_common=1,
    number_of_cores=4,
    prune_inverse_index=2,
    store_value_with_least_sigificant_bit=1,
    excess_factor=11,
    prune_inverse_index_after_instance=0.5,
    remove_hash_function_with_less_entries_as=0,
    shingle=0,
    block_size=1,
    cpu_gpu_load_balancing=0.0,
)

In [34]:
# Brute-force Euclidean k-NN from scikit-learn, used as the reference classifier.
nearest_NeighborsClassifier = KNeighborsClassifier(
    n_jobs=4,
    n_neighbors=10,
    algorithm='brute',
    metric='euclidean',
)

In [35]:
# Fit both hash classifiers (MinHash and WTA-Hash) on the same X, y.
minHash.fit(X, y)
wtaHash.fit(X, y)


In [36]:
# Fit the brute-force scikit-learn k-NN baseline on the same X, y.
nearest_NeighborsClassifier.fit(X, y)

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=4, n_neighbors=10, p=2,
           weights='uniform')

In [37]:
# 10-nearest-neighbor queries for every row of X (the data the classifiers
# were fit on), once with fast=True and once with fast=False per classifier.
kneighbors_minHash_fast = minHash.kneighbors(
    X, n_neighbors=10, fast=True, return_distance=False)
kneighbors_minHash = minHash.kneighbors(
    X, n_neighbors=10, fast=False, return_distance=False)
kneighbors_wtaHash_fast = wtaHash.kneighbors(
    X, n_neighbors=10, fast=True, return_distance=False)
kneighbors_wtaHash = wtaHash.kneighbors(
    X, n_neighbors=10, fast=False, return_distance=False)


In [38]:
# Neighbor indices from the brute-force baseline for the same queries;
# used as the reference when the hash classifiers' neighborhoods are scored.
sklearn_kneighbors = nearest_NeighborsClassifier.kneighbors(X, n_neighbors=10, return_distance=False)

In [11]:
# Show the baseline neighbor-index matrix (one row of indices per query).
print sklearn_kneighbors

[[   0    2   14 ...,   49    1   34]
 [   1    4   34 ...,   16   43    5]
 [   2    0 1047 ...,  821   45  809]
 ..., 
 [1147 1146 1109 ..., 1107 1135 1104]
 [1148 1104 1116 ..., 1109 1146 1105]
 [1149 1117 1146 ..., 1135 1134 1144]]


In [39]:
print "Accuracy MinHash fast: ", neighborhood_accuracy(kneighbors_minHash_fast, sklearn_kneighbors)
print "Accuracy MinHash: ", neighborhood_accuracy(kneighbors_minHash, sklearn_kneighbors)
print "Accuracy WTA-Hash fast:", neighborhood_accuracy(kneighbors_wtaHash_fast, sklearn_kneighbors)
print "Accuracy WTA-Hash:", neighborhood_accuracy(kneighbors_wtaHash, sklearn_kneighbors)

Accuracy MinHash fast:  0.200608695652
Accuracy MinHash:  0.844434782609
Accuracy WTA-Hash fast: 0.262608695652
Accuracy WTA-Hash: 0.804086956522


In [40]:
# Class predictions for every row of X from both hash classifiers,
# once with fast=True and once with fast=False.
predict_minHash_fast = minHash.predict(
    X, n_neighbors=10, fast=True)
predict_minHash = minHash.predict(
    X, n_neighbors=10, fast=False)
predict_wtaHash_fast = wtaHash.predict(
    X, n_neighbors=10, fast=True)
predict_wtaHash = wtaHash.predict(
    X, n_neighbors=10, fast=False)


In [28]:
# Baseline class predictions for the same queries.
predict_sklearn = nearest_NeighborsClassifier.predict(X)


In [16]:
# Class-probability estimates for every row of X from both hash classifiers.
# NOTE: predict__proba_minHash (double underscore) is spelled this way on
# purpose here -- the comparison cell later in the notebook reads that name.
predict_proba_minHash_fast = minHash.predict_proba(
    X, n_neighbors=10, fast=True)
predict__proba_minHash = minHash.predict_proba(
    X, n_neighbors=10, fast=False)
predict_proba_wtaHash_fast = wtaHash.predict_proba(
    X, n_neighbors=10, fast=True)
predict_proba_wtaHash = wtaHash.predict_proba(
    X, n_neighbors=10, fast=False)


In [17]:
# Baseline class probabilities; the hash classifiers' probabilities are
# compared against these row by row.
predict_proba_sklearn = nearest_NeighborsClassifier.predict_proba(X)

In [19]:
def _mean_row_accuracy(predicted_proba, reference_proba, scale=10):
    """Mean per-row ``accuracy_score`` between two class-probability matrices.

    Parameters
    ----------
    predicted_proba, reference_proba : iterables of per-sample probability
        rows; rows are paired positionally with ``zip``, so comparison stops
        at the shorter of the two.
    scale : factor each row is multiplied by before comparison. The default
        of 10 is the value the original cell hard-coded; it equals the
        n_neighbors=10 used elsewhere in this notebook (presumably to turn
        vote fractions into vote counts -- TODO confirm).

    Returns
    -------
    float : the sum of the row scores divided by the number of rows actually
        compared, or 0.0 when no rows were compared.
    """
    row_scores = [
        accuracy_score(predicted_row * scale, reference_row * scale)
        for predicted_row, reference_row in zip(predicted_proba, reference_proba)
    ]
    if not row_scores:
        return 0.0
    # Divide by the number of compared rows, not by the length of an
    # unrelated array, so the mean stays correct if the inputs ever differ.
    return sum(row_scores) / float(len(row_scores))


# Same labels and order as before, so the printed report is unchanged.
proba_report = [
    ("Prediction proba accuracy MinHash Fast", predict_proba_minHash_fast),
    ("Prediction proba accuracy MinHash", predict__proba_minHash),
    ("Prediction proba accuracy WTA-Hash Fast", predict_proba_wtaHash_fast),
    ("Prediction proba accuracy WTA-Hash", predict_proba_wtaHash),
]
for report_label, hash_proba in proba_report:
    # accuracy_score_ is still assigned at module level, as in the original cell.
    accuracy_score_ = _mean_row_accuracy(hash_proba, predict_proba_sklearn)
    # Single-argument form prints the same text under Python 2 and Python 3.
    print("%s %s" % (report_label, accuracy_score_))
 

Prediction proba accuracy MinHash Fast 0.719130434783
Prediction proba accuracy MinHash 0.939470699433
Prediction proba accuracy WTA-Hash Fast 0.740869565217
Prediction proba accuracy WTA-Hash 0.927145557656


In [45]:
print minHash.score(X, y, fast=True)
print minHash.score(X, y, fast=False)
print wtaHash.score(X, y, fast=True)
print wtaHash.score(X, y, fast=False)

0.411304347826
0.573913043478
0.479130434783
0.594782608696


In [23]:
# Baseline accuracy on X, y -- the same data the classifier was fit on.
print nearest_NeighborsClassifier.score(X,y)

0.568695652174
