In [1]:
%matplotlib inline
%load_ext memory_profiler
from sklearn.neighbors import NearestNeighbors
from sklearn.random_projection import SparseRandomProjection
from bioinf_learn.util import neighborhood_accuracy
from eden.converter.graph.gspan import gspan_to_eden
from eden.graph import Vectorizer

In [2]:
graphs = gspan_to_eden( 'http://www.bioinf.uni-freiburg.de/~costa/bursi.gspan' )
vectorizer = Vectorizer( r=2,d=5 )
%time %memit datasetBursi = vectorizer.transform( graphs )

peak memory: 482.49 MiB, increment: 388.84 MiB
CPU times: user 35.4 s, sys: 220 ms, total: 35.6 s
Wall time: 36.4 s


In [3]:
print "Shape: ", datasetBursi.shape
print "Approximate number of non-zero features: ", datasetBursi.nnz / float(datasetBursi.shape[0])
print "Sparsity: ", datasetBursi.nnz / float(datasetBursi.shape[0]) / datasetBursi.shape[1]

Shape:  (4337, 1048577)
Approximate number of non-zero features:  373.168549689
Sparsity:  0.000355880922134


In [7]:
# Target dimensionalities for the random projections, largest first.
PROJECTION_DIMS = (10000, 1000, 800, 600, 400, 100)

# Project the Bursi feature matrix once per target size instead of repeating the
# same two statements for every size. Each projector is created with
# random_state=1, exactly as before, so every run uses the same seed.
projected_by_dim = {}
for n_components in PROJECTION_DIMS:
    data_projection = SparseRandomProjection(n_components=n_components, random_state=1)
    projected_by_dim[n_components] = data_projection.fit_transform(datasetBursi)

# Keep the individual names that the following cells refer to.
# (As before, data_projection ends up bound to the last projector, n_components=100.)
dataset_dense_10000 = projected_by_dim[10000]
dataset_dense_1000 = projected_by_dim[1000]
dataset_dense_800 = projected_by_dim[800]
dataset_dense_600 = projected_by_dim[600]
dataset_dense_400 = projected_by_dim[400]
dataset_dense_100 = projected_by_dim[100]


In [13]:
print "Projeted to 10000 dimensions:\n"
print "Shape: ", dataset_dense_10000.shape
print "Approximate number of non-zero features: ", dataset_dense_10000.nnz / float(dataset_dense_10000.shape[0])
print "Sparsity: ", dataset_dense_10000.nnz / float(dataset_dense_10000.shape[0]) / dataset_dense_10000.shape[1]
print "\n"

print "Projeted to 1000 dimensions:\n"
print "Shape: ", dataset_dense_1000.shape
print "Approximate number of non-zero features: ", dataset_dense_1000.nnz / float(dataset_dense_1000.shape[0])
print "Sparsity: ", dataset_dense_1000.nnz / float(dataset_dense_1000.shape[0]) / dataset_dense_1000.shape[1]
print "\n"

print "Projeted to 800 dimensions:\n"
print "Shape: ", dataset_dense_800.shape
print "Approximate number of non-zero features: ", dataset_dense_800.nnz / float(dataset_dense_800.shape[0])
print "Sparsity: ", dataset_dense_800.nnz / float(dataset_dense_800.shape[0]) / dataset_dense_800.shape[1]
print "\n"

print "Projeted to 600 dimensions:\n"
print "Shape: ", dataset_dense_600.shape
print "Approximate number of non-zero features: ", dataset_dense_600.nnz / float(dataset_dense_600.shape[0])
print "Sparsity: ", dataset_dense_600.nnz / float(dataset_dense_600.shape[0]) / dataset_dense_600.shape[1]
print "\n"

print "Projeted to 400 dimensions:\n"
print "Shape: ", dataset_dense_400.shape
print "Approximate number of non-zero features: ", dataset_dense_400.nnz / float(dataset_dense_400.shape[0])
print "Sparsity: ", dataset_dense_400.nnz / float(dataset_dense_400.shape[0]) / dataset_dense_400.shape[1]
print "\n"

print "Projeted to 100 dimensions:\n"
print "Shape: ", dataset_dense_100.shape
print "Approximate number of non-zero features: ", dataset_dense_100.nnz / float(dataset_dense_100.shape[0])
print "Sparsity: ", dataset_dense_100.nnz / float(dataset_dense_100.shape[0]) / dataset_dense_100.shape[1]

Projeted to 10000 dimensions:

Shape:  (4337, 10000)
Approximate number of non-zero features:  2913.90246714
Sparsity:  0.291390246714


Projeted to 1000 dimensions:

Shape:  (4337, 1000)
Approximate number of non-zero features:  286.160018446
Sparsity:  0.286160018446


Projeted to 800 dimensions:

Shape:  (4337, 800)
Approximate number of non-zero features:  229.502190454
Sparsity:  0.286877738068


Projeted to 600 dimensions:

Shape:  (4337, 600)
Approximate number of non-zero features:  175.481438783
Sparsity:  0.292469064638


Projeted to 400 dimensions:

Shape:  (4337, 400)
Approximate number of non-zero features:  119.386903389
Sparsity:  0.298467258474


Projeted to 100 dimensions:

Shape:  (4337, 100)
Approximate number of non-zero features:  29.6795019599
Sparsity:  0.296795019599


In [5]:
exact = NearestNeighbors()
exact.fit(datasetBursi)
%time %memit exactNeighbors = exact.kneighbors(return_distance=False)

peak memory: 857.61 MiB, increment: 436.17 MiB
CPU times: user 4.45 s, sys: 148 ms, total: 4.6 s
Wall time: 4.71 s


In [6]:
randomProjection = NearestNeighbors()
randomProjection.fit(dataset_dense_10000)
%time %memit randomNeighbors = randomProjection.kneighbors(return_distance=False)

print "Accuracy with projection to 10000 dimensions: ", neighborhood_accuracy(exactNeighbors, randomNeighbors)

peak memory: 955.72 MiB, increment: 390.56 MiB
CPU times: user 1min 6s, sys: 152 ms, total: 1min 6s
Wall time: 1min 6s
Accuracy with projection to 10000 dimensions:  0.903020521098


In [8]:
randomProjection = NearestNeighbors()
randomProjection.fit(dataset_dense_1000)
%time %memit randomNeighbors = randomProjection.kneighbors(return_distance=False)

print "Accuracy with projection to 1000 dimensions: ",neighborhood_accuracy(exactNeighbors, randomNeighbors)

peak memory: 852.28 MiB, increment: 419.16 MiB
CPU times: user 6.85 s, sys: 88 ms, total: 6.94 s
Wall time: 7.05 s
Accuracy with projection to 1000 dimensions:  0.753792944432


In [9]:
randomProjection = NearestNeighbors()
randomProjection.fit(dataset_dense_800)
%time %memit randomNeighbors = randomProjection.kneighbors(return_distance=False)

print "Accuracy with projection to 800 dimensions: ",neighborhood_accuracy(exactNeighbors, randomNeighbors)

peak memory: 866.98 MiB, increment: 433.84 MiB
CPU times: user 5.6 s, sys: 120 ms, total: 5.72 s
Wall time: 5.83 s
Accuracy with projection to 800 dimensions:  0.731657827992


In [10]:
randomProjection = NearestNeighbors()
randomProjection.fit(dataset_dense_600)
%time %memit randomNeighbors = randomProjection.kneighbors(return_distance=False)

print "Accuracy with projection to 600 dimensions: ",neighborhood_accuracy(exactNeighbors, randomNeighbors)

peak memory: 800.81 MiB, increment: 367.67 MiB
CPU times: user 4.56 s, sys: 84 ms, total: 4.65 s
Wall time: 4.76 s
Accuracy with projection to 600 dimensions:  0.697855660595


In [11]:
randomProjection = NearestNeighbors()
randomProjection.fit(dataset_dense_400)
%time %memit randomNeighbors = randomProjection.kneighbors(return_distance=False)

print "Accuracy with projection to 400 dimensions: ",neighborhood_accuracy(exactNeighbors, randomNeighbors)

peak memory: 834.54 MiB, increment: 401.39 MiB
CPU times: user 3.38 s, sys: 116 ms, total: 3.49 s
Wall time: 3.61 s
Accuracy with projection to 400 dimensions:  0.646253170394


In [12]:
randomProjection = NearestNeighbors()
randomProjection.fit(dataset_dense_100)
%time %memit randomNeighbors = randomProjection.kneighbors(return_distance=False)

print "Accuracy with projection to 100 dimensions: ", neighborhood_accuracy(exactNeighbors, randomNeighbors)

peak memory: 855.54 MiB, increment: 422.39 MiB
CPU times: user 1.58 s, sys: 72 ms, total: 1.65 s
Wall time: 1.76 s
Accuracy with projection to 100 dimensions:  0.438044731381
