# SpectralClustering and DBSCAN with minHash

In [1]:
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.datasets import make_classification
from sklearn.cluster import DBSCAN
from scipy.sparse import csr_matrix
from scipy import stats
import hashlib
from random import randint
import numpy as np

from eden.converter.graph.gspan import gspan_to_eden
from eden.graph import Vectorizer

In [2]:
#code for making artificial dataset
import random
def random_string(length,alphabet_list):
    rand_str = ''.join(random.choice(alphabet_list) for i in range(length))
    return rand_str

def perturb(seed,alphabet_list,p=0.5):
    seq=''
    for c in seed:
        if random.random() < p: c = random.choice(alphabet_list)
        seq += c
    return seq

def make_artificial_dataset(alphabet='ACGU', motives=None, motif_length=6, 
                            sequence_length=100, n_sequences=1000, n_motives=2, p=0.2):
    alphabet_list=[c for c in alphabet]
    
    if motives is None:
        motives=[]
        for i in range(n_motives):
            motives.append(random_string(motif_length,alphabet_list))
    else:
        motif_length = len(motives[0])
        n_motives = len(motives)
        
    flanking_length = (sequence_length - motif_length ) / 2
    n_seq_per_motif = n_sequences / n_motives

    counter=0
    seqs=[]
    for i in range(n_seq_per_motif):
        for j in range(n_motives):
            left_flanking = random_string(flanking_length,alphabet_list)
            right_flanking = random_string(flanking_length,alphabet_list)
            noisy_motif = perturb(motives[j],alphabet_list,p)
            seq = left_flanking + noisy_motif + right_flanking
            seqs.append(('>ID%d'%counter,seq))
            counter += 1
    return motives, seqs

In [3]:
#setup parameters
alphabet='ACGU'
motives=['AAAAAAAAAA','CCCCCCCCCC','GGGGGGGGGG','UUUUUUUUUU']
sequence_length=100
n_sequences=100
p=0.3

#make dataset
motives_2, seqs = make_artificial_dataset(alphabet=alphabet,motives=motives,
                                        sequence_length=sequence_length,n_sequences=n_sequences,p=p)

In [13]:
seqs_list = []
for motif in seqs:
    seqs_list.append(motif[1])

In [11]:
vectorizer = Vectorizer(complexity=3, nbits=20)
dataset_sparse = vectorizer.transform(seqs_list)

AttributeError: 'str' object has no attribute 'number_of_nodes'

In [10]:
%matplotlib inline
%load_ext memory_profiler
print(__doc__)

import time

import numpy as np

from sklearn.metrics import accuracy_score

from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph

from neighborsMinHash.clustering import MinHashClustering 
from neighborsMinHash import MinHash
from memory_profiler import memory_usage


clustering_names = [
    'SpectralClustering', 'MinHashSpectralClustering', 'DBSCAN', 'MinHashDBSCAN']

# original algorithms
spectral = cluster.SpectralClustering(n_clusters=2, eigen_solver='arpack',
                                      affinity="nearest_neighbors", n_neighbors=5)
dbscan = cluster.DBSCAN(eps=.2, metric="euclidean")

# objects used for algorithms with precomputed minHash
minHash0 = MinHash(n_neighbors=5)
minHash1 = MinHash(n_neighbors=5)

spectral_precomputed = cluster.SpectralClustering(n_clusters=2, eigen_solver='arpack',
                                      affinity="precomputed", n_neighbors=5)

dbscan_precomputed = cluster.DBSCAN(eps=.2, metric='precomputed')


minHashClusteringSpectralClustering = MinHashClustering(minHash0, spectral_precomputed)
minHashClusteringDBSCAN = MinHashClustering(minHash1, dbscan_precomputed)

clustering_algorithms=[spectral, minHashClusteringSpectralClustering, dbscan, minHashClusteringDBSCAN]
    
X = dataset_sparse
for name, algorithm in zip(clustering_names, clustering_algorithms):
    print "\n"
    t0 = time.time()
    %time %memit y_pred = algorithm.fit_predict(X)
    t1 = time.time()
    
    y_pred = y_pred.astype(np.int)
    print name, ":\tAccuracy: ", float("{0:.2f}".format(adjusted_rand_score(y_true, y_pred)))

Automatically created module for IPython interactive environment


NameError: name 'predictions' is not defined