## Overview

This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is the Databricks File System, which allows you to store data for querying inside of Databricks. This notebook assumes that you already have a file inside of DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [2]:
import numpy as np
import os
import struct
import pandas as pd
import time
from time import process_time
from itertools import chain 
import gc

In [3]:
# Which corpus to benchmark on: 'sift' or 'smallSift'.
dataset = 'sift'

# For the chosen corpus set:
#   url        - location of the .tar.gz archive
#   paths      - local file the archive is downloaded to
#   trainPath  - archive member holding the base vectors (.fvecs)
#   queryPath  - archive member holding the query vectors (.fvecs)
#   groundPath - archive member holding the ground-truth neighbour ids (.ivecs)
if dataset == 'sift':
  url = "ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz"
  paths =  '/tmp/sift.tar.gz' 
  trainPath = 'sift/sift_base.fvecs'
  queryPath = 'sift/sift_query.fvecs'
  groundPath = 'sift/sift_groundtruth.ivecs'
elif dataset == 'smallSift':
  url = 'ftp://ftp.irisa.fr/local/texmex/corpus/siftsmall.tar.gz'
  paths =  '/tmp/siftsmall.tar.gz' 
  trainPath = 'siftsmall/siftsmall_base.fvecs'
  queryPath = 'siftsmall/siftsmall_query.fvecs'
  groundPath = 'siftsmall/siftsmall_groundtruth.ivecs'
else:
  # Fail here instead of with a NameError in a later cell.
  raise ValueError("unknown dataset %r; expected 'sift' or 'smallSift'" % (dataset,))
  

In [4]:
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded, so import it explicitly.
import urllib.request
# Download the archive at `url` to the local file `paths`.
urllib.request.urlretrieve(url, paths)

In [5]:
def get_zippedFvecs(pathToGz,memeber):
    """Read one .fvecs member of a gzipped tar archive into a float array.

    The member is a sequence of records. Each record starts with a 4-byte
    integer holding the number of dimensions d, followed by d 4-byte floats.
    The dimension is taken from the first record and assumed to be the same
    for all records.

    Args:
        pathToGz: path of the .tar.gz archive.
        memeber: name of the member inside the archive.

    Returns:
        numpy float64 array of shape (numOfVectors, numOfDimensions).

    Raises:
        KeyError: if the archive has no member with that name (raised by
            tarfile's getmember).
        ValueError: if the member is not a regular file, has no valid
            dimension header, or ends before the expected number of bytes.
    """
    import tarfile
    import struct
    import numpy as np

    # Context managers make sure the archive and the member stream are closed
    # even when parsing fails.
    with tarfile.open(pathToGz, 'r:gz') as archive:
        m = archive.getmember(memeber)
        file = archive.extractfile(m)
        if file is None:
            raise ValueError('%r is not a regular file in %r' % (memeber, pathToGz))
        with file:
            #first 4 bytes of every vector indicate number of dimensions
            header = file.read(4)
            if len(header) < 4:
                raise ValueError('%r is too short to hold a dimension header' % (memeber,))
            # '<' pins little-endian byte order instead of the platform's native order.
            numOfDimensions = struct.unpack('<i', header)[0]
            if numOfDimensions < 0:
                raise ValueError('%r has a negative dimension header' % (memeber,))
            #each record is the 4-byte indicator plus 4 bytes per component
            recordBytes = 4 + 4*numOfDimensions
            numOfVectors = m.size // recordBytes
            vectors = np.zeros((numOfVectors,numOfDimensions))
            #return to the beginning
            file.seek(0)
            # Parse many records per read; the chunk bound keeps the temporary
            # buffer small compared with the result array.
            recordsPerChunk = max(1, (16 * 1024 * 1024) // recordBytes)
            filled = 0
            while filled < numOfVectors:
                count = min(recordsPerChunk, numOfVectors - filled)
                buf = file.read(count * recordBytes)
                if len(buf) != count * recordBytes:
                    raise ValueError('%r ended unexpectedly' % (memeber,))
                block = np.frombuffer(buf, dtype='<f4').reshape(count, numOfDimensions + 1)
                # column 0 is the per-record dimension indicator, drop it
                vectors[filled:filled + count] = block[:, 1:]
                filled += count
    return vectors

In [6]:

def get_zippedIvecs(pathToGz,memeber):
    """Read one .ivecs member of a gzipped tar archive into an integer array.

    The member is a sequence of records. Each record starts with a 4-byte
    integer holding the number of dimensions d, followed by d 4-byte integers.
    The dimension is taken from the first record and assumed to be the same
    for all records.

    Args:
        pathToGz: path of the .tar.gz archive.
        memeber: name of the member inside the archive.

    Returns:
        numpy array of dtype ``int`` and shape (numOfVectors, numOfDimensions).

    Raises:
        KeyError: if the archive has no member with that name (raised by
            tarfile's getmember).
        ValueError: if the member is not a regular file, has no valid
            dimension header, or ends before the expected number of bytes.
    """
    import tarfile
    import struct
    import numpy as np

    # Context managers make sure the archive and the member stream are closed
    # even when parsing fails.
    with tarfile.open(pathToGz, 'r:gz') as archive:
        m = archive.getmember(memeber)
        file = archive.extractfile(m)
        if file is None:
            raise ValueError('%r is not a regular file in %r' % (memeber, pathToGz))
        with file:
            #first 4 bytes of every vector indicate number of dimensions
            header = file.read(4)
            if len(header) < 4:
                raise ValueError('%r is too short to hold a dimension header' % (memeber,))
            # '<' pins little-endian byte order instead of the platform's native order.
            numOfDimensions = struct.unpack('<i', header)[0]
            if numOfDimensions < 0:
                raise ValueError('%r has a negative dimension header' % (memeber,))
            #each record is the 4-byte indicator plus 4 bytes per component
            recordBytes = 4 + 4*numOfDimensions
            numOfVectors = m.size // recordBytes
            vectors = np.zeros((numOfVectors,numOfDimensions), int)
            #return to the beginning
            file.seek(0)
            # Parse many records per read; the chunk bound keeps the temporary
            # buffer small compared with the result array.
            recordsPerChunk = max(1, (16 * 1024 * 1024) // recordBytes)
            filled = 0
            while filled < numOfVectors:
                count = min(recordsPerChunk, numOfVectors - filled)
                buf = file.read(count * recordBytes)
                if len(buf) != count * recordBytes:
                    raise ValueError('%r ended unexpectedly' % (memeber,))
                block = np.frombuffer(buf, dtype='<i4').reshape(count, numOfDimensions + 1)
                # column 0 is the per-record dimension indicator, drop it
                vectors[filled:filled + count] = block[:, 1:]
                filled += count
    return vectors


In [7]:
def returnRecall(result, test):
    """Fraction of ground-truth neighbours that the search returned.

    For every row i, the ids in ``result[i]`` are intersected (as sets) with
    the ids in ``test[i]``. The intersection sizes are summed and divided by
    ``test.size``, the total number of ground-truth entries.

    Args:
        result: 2-D array of returned neighbour ids, one row per query.
        test: 2-D array of ground-truth neighbour ids, one row per query.

    Returns:
        The recall as a float; 0.0 when ``test`` is empty.
    """
    # Nothing to recall against; avoids a division by zero below.
    if test.size == 0:
        return 0.0
    numOfTrueNeighbours = 0
    #for every result vector we check how many right neighbours were identified
    for i in range(result.shape[0]):
        numOfTrueNeighbours += len(set(result[i].tolist()) & set(test[i].tolist()))
    # Divide once after the loop, so an empty `result` yields 0.0 instead of
    # an UnboundLocalError.
    return numOfTrueNeighbours / test.size

In [8]:
def fillIfNotAllAreFound(result, k=100, fill=-1):
    """Pad every neighbour list that has fewer than ``k`` entries.

    The lists are extended in place, so callers that keep using the object
    they passed in see the padded rows as well.

    Args:
        result: sequence of per-query neighbour lists.
        k: length every list should have at least (default 100).
        fill: value appended to short lists (default -1).

    Returns:
        The same ``result`` object that was passed in.
    """
    for neighbours in result:
        missing = k - len(neighbours)
        if missing > 0:
            neighbours.extend([fill] * missing)
    return result

In [9]:
# Base vectors (the indexes are built from these) and query vectors, both
# read from the downloaded archive in that order.
train, query = (get_zippedFvecs(paths, member) for member in (trainPath, queryPath))
# Ground-truth neighbour indices, one row per query point.
groundTruth = get_zippedIvecs(paths, groundPath)

In [10]:
# Remove the downloaded archive. Using `paths` keeps this correct for the
# 'smallSift' dataset as well and does not depend on shell automagic.
if os.path.exists(paths):
    os.remove(paths)

In [11]:
# Number of neighbours.
k = 100

# Result columns: the benchmark cells append one entry per tested configuration.
algorithm = []
reacll = []
avgdistances = []

# Timings measured with process_time().
construciotnTimes = []
searchTimes = []

# Timings measured with the second clock used in the benchmark cells.
constructionClocks = []
searchClocks = []

# Labels appended next to the clock timings by the Annoy and HNSW cells.
clockAlg = []

In [12]:
#Annoy
from annoy import AnnoyIndex
# Benchmark Annoy for several values passed to build(); every setting appends
# one entry to each result column.
for trs in [5,15,30,60,80]:
    
    f = train.shape[1]
    t = AnnoyIndex(f, 'euclidean')
    
    # Index construction. time.clock() was removed in Python 3.8, so the
    # "clock" columns use time.perf_counter() (elapsed time) next to
    # process_time() (CPU time of this process).
    startClock = time.perf_counter()
    startTime = process_time()
    for i in range(train.shape[0]):
        t.add_item(i,train[i])
    t.build(trs)
    end_time = process_time()
    constructionTime = end_time - startTime
    endClock = time.perf_counter()
    constructionClock= endClock - startClock
    
    
    # Search: 100 neighbours (ids and distances) for every query vector.
    rez = []
    dist = []
    startClock = time.perf_counter()
    startTime = process_time()
    for q in query:
        res,d = t.get_nns_by_vector(q, 100, include_distances=True)
        rez.append(res)
        dist.append(d)
    end_time = process_time()
    searchTime = end_time - startTime
    endClock = time.perf_counter()
    searchClock= endClock - startClock
    
        
    # Pad short answers so the rows form a rectangular array.
    result = fillIfNotAllAreFound(rez)
    
    result = np.asanyarray(result)
    annoyRecall = returnRecall(result, groundTruth)  
    avgDist = np.mean(list(chain.from_iterable(dist)))
    
    reacll.append(annoyRecall)
    algorithm.append('Annoy-trees-'+str(trs))
    construciotnTimes.append(constructionTime)
    searchTimes.append(searchTime)
    avgdistances.append(avgDist)
    searchClocks.append(searchClock)
    constructionClocks.append(constructionClock)
    clockAlg.append('Annoy-trees-'+str(trs))
    # NOTE(review): the same file name is used for every setting, so each
    # iteration overwrites the previously saved index.
    t.save('annoyIndex90.ann')
    # Free the index and the per-run buffers before the next setting.
    del t
    del rez
    del dist
    del result
    gc.collect()
#______________________________________________#


In [13]:
import nmslib
# Benchmark nmslib's HNSW index for several values of M; every setting appends
# one entry to each result column.
for MMAX in [5,8,15,30,38]:
    hnsw = nmslib.init(method='hnsw', space='l2')
    
    # Index construction. time.clock() was removed in Python 3.8, so the
    # "clock" columns use time.perf_counter() (elapsed time) next to
    # process_time() (CPU time of this process).
    startClock = time.perf_counter()
    startTime = process_time()
    hnsw.addDataPointBatch(train)
    hnsw.createIndex({'delaunay_type':1, 'M':MMAX})
    end_time = process_time()
    constructionTime = end_time - startTime
    endClock = time.perf_counter()
    constructionClock= endClock - startClock
    
    
    
    # get the 100 nearest neighbours of every query point
    # using a pool of 2 threads to compute
    startClock = time.perf_counter()
    startTime = process_time()
    neighbours = hnsw.knnQueryBatch(query, k=100, num_threads=2)
    end_time = process_time()
    searchTime = end_time - startTime
    endClock = time.perf_counter()
    searchClock= endClock - startClock
    
    # Each batch entry is indexed as (ids, distances).
    rez =[]
    dist =[]
    for i in neighbours:
        rez.append(list(i[0]))
        dist.append(list(i[1]))
    
    # Pads `rez` in place, which is why `rez` can be converted below.
    result = fillIfNotAllAreFound(rez)
      
    result = np.array(rez)
    hnswRecall = returnRecall(result, groundTruth)
    # NOTE(review): the sqrt presumably converts squared L2 distances reported
    # by this index - confirm against the nmslib version in use.
    avgDist = np.mean(np.sqrt(list(chain.from_iterable(dist))))
    
    reacll.append(hnswRecall)
    algorithm.append('HNSW-M-'+str(MMAX))
    construciotnTimes.append(constructionTime)
    searchTimes.append(searchTime)
    avgdistances.append(avgDist)
    constructionClocks.append(constructionClock)
    searchClocks.append(searchClock)
    clockAlg.append('HNSW-M-'+str(MMAX))
    
    # Free the index and the per-run buffers before the next setting.
    del hnsw
    del rez
    del dist
    del result
    del neighbours
    gc.collect()


In [14]:
import nmslib
vptree = nmslib.init(method='vptree', space='l2')

# The index is built once; its construction timings are reused for every
# query-time setting below. time.clock() was removed in Python 3.8, so the
# "clock" columns use time.perf_counter() (elapsed time) next to
# process_time() (CPU time of this process).
startClock = time.perf_counter()
startTime = process_time()
vptree.addDataPointBatch(train)
vptree.createIndex({'bucketSize' : 10000,'selectPivotAttempts':10})
end_time = process_time()
constructionTime = end_time - startTime
endClock = time.perf_counter()
constructionClock= endClock - startClock

# get the 100 nearest neighbours of every query point
# using a pool of 2 threads to compute
for maxLeave in [2,10,20,25,30]:
  
    vptree.setQueryTimeParams({'maxLeavesToVisit':maxLeave,'alphaLeft':1.1,'alphaRight':1.1})
    startClock = time.perf_counter()
    startTime = process_time()
    neighbours = vptree.knnQueryBatch(query,k=100, num_threads=2 )
    end_time = process_time()
    searchTime = end_time - startTime
    endClock = time.perf_counter()
    searchClock= endClock - startClock
    
    
    # Each batch entry is indexed as (ids, distances).
    rez =[]
    dist = []
    for i in neighbours:
        rez.append(list(i[0]))
        dist.append(list(i[1]))
        
    # Pad short answers so the rows form a rectangular array.
    rez = fillIfNotAllAreFound(rez)    
    
    result = np.asanyarray(rez)
    
    vptreeRecall = returnRecall(result, groundTruth)
    avgDist = np.mean(list(chain.from_iterable(dist)))
    
    reacll.append(vptreeRecall)
    algorithm.append('vp-Tree-10k-mL'+str(maxLeave))
    construciotnTimes.append(constructionTime)
    searchTimes.append(searchTime)
    constructionClocks.append(constructionClock)
    searchClocks.append(searchClock)
    avgdistances.append(avgDist)
    # Free the per-run buffers before the next setting.
    del rez
    del dist
    del result
    gc.collect()

#vptree.saveIndex('vptreeIndex.ann')    
del vptree
gc.collect()



In [15]:
# Snapshot of the result columns collected so far, one row per benchmarked
# configuration.
compareResults = pd.DataFrame({
    'algorithm': algorithm,
    'constructionTime': construciotnTimes,
    'searchTime': searchTimes,
    'recall': reacll,
    'avgDistance': avgdistances,
    'constructionClocks': constructionClocks,
    'searchClocks': searchClocks,
})

In [16]:
# Echo the snapshot as this cell's output.
compareResults

Unnamed: 0,algorithm,constructionTime,searchTime,recall,avgDistance,constructionClocks,searchClocks
0,Annoy-trees-5,35.925213,1.785309,0.283496,254.695136,35.925349,1.785417
1,Annoy-trees-15,62.3893,4.010613,0.522061,243.330537,62.389346,4.010644
2,Annoy-trees-30,102.713781,7.045646,0.684114,239.467808,102.713839,7.045668
3,Annoy-trees-60,185.833591,11.553568,0.81582,237.402157,185.833631,11.553589
4,Annoy-trees-80,244.179122,14.592285,0.857391,236.893282,244.179166,14.592305
5,HNSW-M-5,633.889558,1.161902,0.362857,251.358978,633.889681,1.16214
6,HNSW-M-8,856.312431,1.43825,0.482229,244.77327,856.312482,1.438297
7,HNSW-M-15,1347.664978,2.08432,0.624448,240.395767,1347.665029,2.08436
8,HNSW-M-30,2519.088754,3.152285,0.766756,237.858261,2519.088794,3.15233
9,HNSW-M-38,3269.567375,3.626972,0.806364,237.350479,3269.567412,3.627011


In [17]:
# Show the same snapshot through display().
display(compareResults)

algorithm,constructionTime,searchTime,recall,avgDistance,constructionClocks,searchClocks
Annoy-trees-5,35.925213361,1.7853086229999915,0.283496,254.6951362941704,35.925349000000004,1.7854169999999954
Annoy-trees-15,62.389300217,4.010613428999989,0.522061,243.330537024456,62.38934599999999,4.010643999999985
Annoy-trees-30,102.71378096700002,7.045646438999995,0.684114,239.46780847164536,102.71383899999998,7.045667999999978
Annoy-trees-60,185.833590822,11.55356832800004,0.81582,237.40215683092495,185.83363100000005,11.553588999999988
Annoy-trees-80,244.17912214200004,14.592285040000093,0.857391,236.8932823667984,244.179166,14.59230500000001
HNSW-M-5,633.889558266,1.161902234000081,0.362857,251.3589782714844,633.889681,1.1621399999999085
HNSW-M-8,856.3124313319997,1.4382503929996346,0.482229,244.7732696533203,856.3124820000003,1.4382970000001478
HNSW-M-15,1347.6649782549998,2.0843196880000505,0.624448,240.3957672119141,1347.6650290000002,2.084359999999833
HNSW-M-30,2519.088754243,3.1522851899999296,0.766756,237.85826110839844,2519.088794,3.1523299999998926
HNSW-M-38,3269.567374601,3.626972136000404,0.806364,237.35047912597656,3269.5674119999994,3.627011000000493


In [18]:
from sklearn.neighbors import NearestNeighbors

# Exhaustive search baseline. time.clock() was removed in Python 3.8, so the
# "clock" columns use time.perf_counter() (elapsed time) next to
# process_time() (CPU time of this process).
startClock = time.perf_counter()
startTime = process_time() 
nbrs = NearestNeighbors(n_neighbors=100, algorithm='brute').fit(train)
end_time = process_time()
constructionTime = end_time - startTime
endClock = time.perf_counter()
constructionClock= endClock - startClock

# Search: 100 neighbours (distances and ids) for every query vector.
startClock = time.perf_counter()
startTime = process_time() 
dist,result = nbrs.kneighbors(query,return_distance=True)
end_time = process_time()
searchTime = end_time - startTime
endClock = time.perf_counter()
searchClock= endClock - startClock


bruteRecall = returnRecall(result, groundTruth)
# Mean over queries of the mean neighbour distance per query.
avgDist = np.mean(np.mean(dist,axis=1))


reacll.append(bruteRecall)
algorithm.append('linear force')
construciotnTimes.append(constructionTime)
searchTimes.append(searchTime)
avgdistances.append(avgDist)
constructionClocks.append(constructionClock)
searchClocks.append(searchClock)


In [19]:
from sklearn.neighbors import KDTree

# k-d tree. time.clock() was removed in Python 3.8, so the "clock" columns use
# time.perf_counter() (elapsed time) next to process_time() (CPU time of this
# process).
startClock = time.perf_counter()
startTime = process_time()
kdt = KDTree(train, metric='euclidean', leaf_size = 10000)
end_time = process_time()
constructionTime = end_time - startTime
endClock = time.perf_counter()
constructionClock= endClock - startClock

# Restart the clock as well, otherwise searchClock would also contain the
# construction phase.
startClock = time.perf_counter()
startTime = process_time()
dist, result = kdt.query(query, k=100, return_distance=True)
end_time = process_time()
searchTime = end_time - startTime
endClock = time.perf_counter()
searchClock= endClock - startClock

kdTreeRecall = returnRecall(result, groundTruth)
avgDist = np.mean(dist)
ktreeparams = kdt.get_tree_stats()
reacll.append(kdTreeRecall)
algorithm.append('k-D')
construciotnTimes.append(constructionTime)
searchTimes.append(searchTime)
avgdistances.append(avgDist)
constructionClocks.append(constructionClock)
searchClocks.append(searchClock)


#BallTree
from sklearn.neighbors import BallTree

startClock = time.perf_counter()
startTime = process_time()
bt = BallTree(train, metric='euclidean')
end_time = process_time()
constructionTime = end_time - startTime
endClock = time.perf_counter()
constructionClock= endClock - startClock

# Restart the clock as well, otherwise searchClock would also contain the
# construction phase.
startClock = time.perf_counter()
startTime = process_time()
dist, result = bt.query(query, k=100, return_distance=True)
end_time = process_time()
searchTime = end_time - startTime
endClock = time.perf_counter()
searchClock= endClock - startClock

ballTreeRecall = returnRecall(result, groundTruth)
avgDist = np.mean(dist)
ballTreeparams = bt.get_tree_stats()
reacll.append(ballTreeRecall)
algorithm.append('ball-tree')
construciotnTimes.append(constructionTime)
searchTimes.append(searchTime)
avgdistances.append(avgDist)
constructionClocks.append(constructionClock)
searchClocks.append(searchClock)


In [20]:
# Snapshot of the result columns collected so far, one row per benchmarked
# configuration.
newComparison = pd.DataFrame({
    'algorithm': algorithm,
    'constructionTime': construciotnTimes,
    'searchTime': searchTimes,
    'recall': reacll,
    'avgDistance': avgdistances,
    'constructionClocks': constructionClocks,
    'searchClocks': searchClocks,
})

In [21]:
# Echo the snapshot as this cell's output.
newComparison

Unnamed: 0,algorithm,constructionTime,searchTime,recall,avgDistance,constructionClocks,searchClocks
0,Annoy-trees-5,35.925213,1.785309,0.283496,254.695136,35.925349,1.785417
1,Annoy-trees-15,62.3893,4.010613,0.522061,243.330537,62.389346,4.010644
2,Annoy-trees-30,102.713781,7.045646,0.684114,239.467808,102.713839,7.045668
3,Annoy-trees-60,185.833591,11.553568,0.81582,237.402157,185.833631,11.553589
4,Annoy-trees-80,244.179122,14.592285,0.857391,236.893282,244.179166,14.592305
5,HNSW-M-5,633.889558,1.161902,0.362857,251.358978,633.889681,1.16214
6,HNSW-M-8,856.312431,1.43825,0.482229,244.77327,856.312482,1.438297
7,HNSW-M-15,1347.664978,2.08432,0.624448,240.395767,1347.665029,2.08436
8,HNSW-M-30,2519.088754,3.152285,0.766756,237.858261,2519.088794,3.15233
9,HNSW-M-38,3269.567375,3.626972,0.806364,237.350479,3269.567412,3.627011


In [22]:
# Show the same snapshot through display().
display(newComparison)

algorithm,constructionTime,searchTime,recall,avgDistance,constructionClocks,searchClocks
Annoy-trees-5,35.925213361,1.7853086229999915,0.283496,254.6951362941704,35.925349000000004,1.7854169999999954
Annoy-trees-15,62.389300217,4.010613428999989,0.522061,243.330537024456,62.38934599999999,4.010643999999985
Annoy-trees-30,102.71378096700002,7.045646438999995,0.684114,239.46780847164536,102.71383899999998,7.045667999999978
Annoy-trees-60,185.833590822,11.55356832800004,0.81582,237.40215683092495,185.83363100000005,11.553588999999988
Annoy-trees-80,244.17912214200004,14.592285040000093,0.857391,236.8932823667984,244.179166,14.59230500000001
HNSW-M-5,633.889558266,1.161902234000081,0.362857,251.3589782714844,633.889681,1.1621399999999085
HNSW-M-8,856.3124313319997,1.4382503929996346,0.482229,244.7732696533203,856.3124820000003,1.4382970000001478
HNSW-M-15,1347.6649782549998,2.0843196880000505,0.624448,240.3957672119141,1347.6650290000002,2.084359999999833
HNSW-M-30,2519.088754243,3.1522851899999296,0.766756,237.85826110839844,2519.088794,3.1523299999998926
HNSW-M-38,3269.567374601,3.626972136000404,0.806364,237.35047912597656,3269.5674119999994,3.627011000000493


In [23]:
# Snapshot of the result columns collected so far, one row per benchmarked
# configuration.
newestComparison = pd.DataFrame({
    'algorithm': algorithm,
    'constructionTime': construciotnTimes,
    'searchTime': searchTimes,
    'recall': reacll,
    'avgDistance': avgdistances,
    'constructionClocks': constructionClocks,
    'searchClocks': searchClocks,
})

In [24]:
# Echo the snapshot as this cell's output.
newestComparison

Unnamed: 0,algorithm,constructionTime,searchTime,recall,avgDistance,constructionClocks,searchClocks
0,Annoy-trees-5,35.925213,1.785309,0.283496,254.695136,35.925349,1.785417
1,Annoy-trees-15,62.3893,4.010613,0.522061,243.330537,62.389346,4.010644
2,Annoy-trees-30,102.713781,7.045646,0.684114,239.467808,102.713839,7.045668
3,Annoy-trees-60,185.833591,11.553568,0.81582,237.402157,185.833631,11.553589
4,Annoy-trees-80,244.179122,14.592285,0.857391,236.893282,244.179166,14.592305
5,HNSW-M-5,633.889558,1.161902,0.362857,251.358978,633.889681,1.16214
6,HNSW-M-8,856.312431,1.43825,0.482229,244.77327,856.312482,1.438297
7,HNSW-M-15,1347.664978,2.08432,0.624448,240.395767,1347.665029,2.08436
8,HNSW-M-30,2519.088754,3.152285,0.766756,237.858261,2519.088794,3.15233
9,HNSW-M-38,3269.567375,3.626972,0.806364,237.350479,3269.567412,3.627011


In [25]:
# Show the same snapshot through display().
display(newestComparison)

algorithm,constructionTime,searchTime,recall,avgDistance,constructionClocks,searchClocks
Annoy-trees-5,35.925213361,1.7853086229999915,0.283496,254.6951362941704,35.925349000000004,1.7854169999999954
Annoy-trees-15,62.389300217,4.010613428999989,0.522061,243.330537024456,62.38934599999999,4.010643999999985
Annoy-trees-30,102.71378096700002,7.045646438999995,0.684114,239.46780847164536,102.71383899999998,7.045667999999978
Annoy-trees-60,185.833590822,11.55356832800004,0.81582,237.40215683092495,185.83363100000005,11.553588999999988
Annoy-trees-80,244.17912214200004,14.592285040000093,0.857391,236.8932823667984,244.179166,14.59230500000001
HNSW-M-5,633.889558266,1.161902234000081,0.362857,251.3589782714844,633.889681,1.1621399999999085
HNSW-M-8,856.3124313319997,1.4382503929996346,0.482229,244.7732696533203,856.3124820000003,1.4382970000001478
HNSW-M-15,1347.6649782549998,2.0843196880000505,0.624448,240.3957672119141,1347.6650290000002,2.084359999999833
HNSW-M-30,2519.088754243,3.1522851899999296,0.766756,237.85826110839844,2519.088794,3.1523299999998926
HNSW-M-38,3269.567374601,3.626972136000404,0.806364,237.35047912597656,3269.5674119999994,3.627011000000493


In [26]:
# Drop the k-d tree and ball tree, then run a garbage collection pass.
del kdt, bt
gc.collect()

In [28]:
# `vptree` is already deleted at the end of the VP-tree benchmark cell, so
# deleting it again here would raise NameError on a top-to-bottom run.
del nbrs
gc.collect()

In [29]:
# Import only the two names this cell uses instead of a star import.
from pyflann import FLANN, set_distance_type
# Parameter dicts returned by build_index, one per target precision.
para = []
for tp in [0.3,0.6,0.7,0.85]:
    
    flann = FLANN()
    set_distance_type('euclidean')

    
    # Index construction with FLANN's 'autotuned' algorithm. time.clock() was
    # removed in Python 3.8, so the "clock" columns use time.perf_counter()
    # (elapsed time) next to process_time() (CPU time of this process).
    startClock = time.perf_counter()
    startTime = process_time()
    flannparams = flann.build_index(train, algorithm ='autotuned', target_precision=tp, build_weight=0 ,memory_weight=0, sample_fraction=0.01)
    end_time = process_time()
    constructionTime = end_time - startTime
    endClock = time.perf_counter()
    constructionClock= endClock - startClock
    
    # Search: 100 neighbours (ids and distances) for every query vector.
    startClock = time.perf_counter()
    startTime = process_time()
    result, dist = flann.nn_index(query, 100)
    end_time = process_time()
    searchTime = end_time - startTime
    endClock = time.perf_counter()
    searchClock= endClock - startClock
    
    result = fillIfNotAllAreFound(result)    
    
    result = np.asanyarray(result)
    
    rkdDflannRecall = returnRecall(result, groundTruth)
    # NOTE(review): the sqrt presumably converts squared distances returned by
    # nn_index - confirm against the pyflann version in use.
    avgDist = np.mean(np.sqrt(dist))
    para.append(flannparams)
 
    reacll.append(rkdDflannRecall)
    # The label starts with the algorithm named in the returned parameters.
    algorithm.append(flannparams['algorithm']+'-flann-build005')
    construciotnTimes.append(constructionTime)
    searchTimes.append(searchTime)
    avgdistances.append(avgDist)
    searchClocks.append(searchClock)
    constructionClocks.append(constructionClock)

    

In [30]:
# Snapshot of the result columns collected so far, one row per benchmarked
# configuration.
cc = pd.DataFrame({
    'algorithm': algorithm,
    'constructionTime': construciotnTimes,
    'searchTime': searchTimes,
    'recall': reacll,
    'avgDistance': avgdistances,
    'constructionClocks': constructionClocks,
    'searchClocks': searchClocks,
})

In [31]:
# Echo the snapshot as this cell's output.
cc

Unnamed: 0,algorithm,constructionTime,searchTime,recall,avgDistance,constructionClocks,searchClocks
0,Annoy-trees-5,35.925213,1.785309,0.283496,254.695136,35.925349,1.785417
1,Annoy-trees-15,62.3893,4.010613,0.522061,243.330537,62.389346,4.010644
2,Annoy-trees-30,102.713781,7.045646,0.684114,239.467808,102.713839,7.045668
3,Annoy-trees-60,185.833591,11.553568,0.81582,237.402157,185.833631,11.553589
4,Annoy-trees-80,244.179122,14.592285,0.857391,236.893282,244.179166,14.592305
5,HNSW-M-5,633.889558,1.161902,0.362857,251.358978,633.889681,1.16214
6,HNSW-M-8,856.312431,1.43825,0.482229,244.77327,856.312482,1.438297
7,HNSW-M-15,1347.664978,2.08432,0.624448,240.395767,1347.665029,2.08436
8,HNSW-M-30,2519.088754,3.152285,0.766756,237.858261,2519.088794,3.15233
9,HNSW-M-38,3269.567375,3.626972,0.806364,237.350479,3269.567412,3.627011


In [32]:
# Inspect the parameter dicts collected by the FLANN benchmark cell.
para

In [33]:
# Show the same snapshot through display().
display(cc)

algorithm,constructionTime,searchTime,recall,avgDistance,constructionClocks,searchClocks
Annoy-trees-5,35.925213361,1.7853086229999915,0.283496,254.6951362941704,35.925349000000004,1.7854169999999954
Annoy-trees-15,62.389300217,4.010613428999989,0.522061,243.330537024456,62.38934599999999,4.010643999999985
Annoy-trees-30,102.71378096700002,7.045646438999995,0.684114,239.46780847164536,102.71383899999998,7.045667999999978
Annoy-trees-60,185.833590822,11.55356832800004,0.81582,237.40215683092495,185.83363100000005,11.553588999999988
Annoy-trees-80,244.17912214200004,14.592285040000093,0.857391,236.8932823667984,244.179166,14.59230500000001
HNSW-M-5,633.889558266,1.161902234000081,0.362857,251.3589782714844,633.889681,1.1621399999999085
HNSW-M-8,856.3124313319997,1.4382503929996346,0.482229,244.7732696533203,856.3124820000003,1.4382970000001478
HNSW-M-15,1347.6649782549998,2.0843196880000505,0.624448,240.3957672119141,1347.6650290000002,2.084359999999833
HNSW-M-30,2519.088754243,3.1522851899999296,0.766756,237.85826110839844,2519.088794,3.1523299999998926
HNSW-M-38,3269.567374601,3.626972136000404,0.806364,237.35047912597656,3269.5674119999994,3.627011000000493


In [35]:
import falconn 
par = falconn.LSHConstructionParameters()
# Start from FALCONN's default parameters for this data set size and dimension.
param = falconn.get_default_parameters(num_points = len(train), dimension = len(train[0]), distance = falconn.DistanceFunction.EuclideanSquared )
print(param.lsh_family, param.l, param.k)
# Keep the defaults for reference, then override the number of tables.
tables = param.l
hashes = param.k
param.l = 30
para = []

# The loop variable is deliberately not called `k`, so the notebook-level
# `k = 100` (number of neighbours) is not overwritten.
for numHashes in [int(hashes*1.5)]:#[hashes,int(hashes*1.5)]:

      
    # Index construction. time.clock() was removed in Python 3.8, so the
    # "clock" columns use time.perf_counter() (elapsed time) next to
    # process_time() (CPU time of this process).
    startClock = time.perf_counter()
    startTime = process_time()
    param.k = numHashes
    lsh = falconn.LSHIndex(param)
    lsh.setup(train)
    indexlsh = lsh.construct_query_object()
    end_time = process_time()
    constructionTime = end_time - startTime
    endClock = time.perf_counter()
    constructionClock= endClock - startClock
  
    # One index, several probe counts; the construction timings are reused
    # for every probe setting.
    for numProbes in [param.l, int(param.l*2), int(param.l*3),int(param.l*5)]:
        
        indexlsh.set_num_probes(numProbes)
        print('lsh-l'+str(param.l)+'k'+str(param.k)+'t'+str(numProbes))

        rez = []
        startClock = time.perf_counter()
        startTime = process_time()
        for q in query:
            res = indexlsh.find_k_nearest_neighbors(q, 100)
            rez.append(res)
        
        end_time = process_time()
        searchTime = end_time - startTime
        endClock = time.perf_counter()
        searchClock= endClock - startClock
          
        # Pad short answers so the rows form a rectangular array.
        result = fillIfNotAllAreFound(rez) 
        result = np.asanyarray(result)
        lshReacll = returnRecall(result, groundTruth)
        # No distances are collected in this cell; 0 is a placeholder.
        avgDist = 0
      
        # NOTE(review): this appends the same `param` object every time, so
        # all entries of `para` refer to one object.
        para.append(param)
        reacll.append(lshReacll)
        algorithm.append('lsh-l'+str(param.l)+'k'+str(param.k)+'t'+str(numProbes))
        construciotnTimes.append(constructionTime)
        searchTimes.append(searchTime)
        avgdistances.append(avgDist)
        searchClocks.append(searchClock)
        constructionClocks.append(constructionClock)
       

In [36]:
# Inspect the parameter objects collected by the LSH benchmark cell.
para

In [37]:
# Snapshot of the result columns collected so far, shown through display().
poboljsaniLSH = pd.DataFrame({
    'algorithm': algorithm,
    'constructionTime': construciotnTimes,
    'searchTime': searchTimes,
    'recall': reacll,
    'avgDistance': avgdistances,
    'constructionClocks': constructionClocks,
    'searchClocks': searchClocks,
})
display(poboljsaniLSH)

algorithm,constructionTime,searchTime,recall,avgDistance,constructionClocks,searchClocks
lsh-l30k4t30,271.972700738,79.64140974200001,0.747726,0,271.972808,79.64151500000003
lsh-l30k4t60,271.972700738,121.09292509899996,0.848569,0,271.972808,121.092964
lsh-l30k4t90,271.972700738,149.08656326999994,0.895185,0,271.972808,149.086596
lsh-l30k4t150,271.972700738,191.076414013,0.938571,0,271.972808,191.07644900000005


In [38]:
# NOTE(review): `proveraLSH2` is not assigned anywhere in the visible notebook
# (only `proveraLSH`, in the following cell). Presumably it is leftover kernel
# state; unless it is defined elsewhere, this cell raises NameError on a fresh
# kernel - confirm or remove.
display(proveraLSH2)

algorithm,constructionTime,searchTime,recall,avgDistance,constructionClocks,searchClocks
lsh-l11k3t11,0.0024736520000061,0.0159324349999963,0.752697,0,0.0025420000000053,0.0159639999999967
lsh-l11k3t22,0.0024736520000061,0.0213098459999514,0.863618,0,0.0025420000000053,0.0213370000000168
lsh-l11k3t33,0.0024736520000061,0.0198753070000066,0.910783,0,0.0025420000000053,0.019896000000017
lsh-l11k4t11,0.0039320779999343,0.0031262520000154,0.47131,0,0.0039649999999937,0.0031600000000935
lsh-l11k4t22,0.0039320779999343,0.0045100349999529,0.594254,0,0.0039649999999937,0.0045240000000603
lsh-l11k4t33,0.0039320779999343,0.0048694909999085,0.666236,0,0.0039649999999937,0.0048839999999472
lsh-l11k3t11,43.37625548799997,0.0119298489998982,0.752697,0,43.37638599999991,0.0119499999998424
lsh-l11k3t22,43.37625548799997,0.0179355060001853,0.863618,0,43.37638599999991,0.017955000000029
lsh-l11k3t33,43.37625548799997,0.0206418460002169,0.910783,0,43.37638599999991,0.0206630000000132
lsh-l11k4t11,97.128795329,0.0031401460000779,0.47131,0,97.1288199999999,0.003152


In [39]:
# Snapshot of the result columns collected so far, echoed as the cell output.
proveraLSH = pd.DataFrame({
    'algorithm': algorithm,
    'constructionTime': construciotnTimes,
    'searchTime': searchTimes,
    'recall': reacll,
    'avgDistance': avgdistances,
    'constructionClocks': constructionClocks,
    'searchClocks': searchClocks,
})
proveraLSH

Unnamed: 0,algorithm,constructionTime,searchTime,recall,avgDistance,constructionClocks,searchClocks
0,lsh-l11k3t11,0.002474,0.015932,0.752697,0,0.002542,0.015964
1,lsh-l11k3t22,0.002474,0.02131,0.863618,0,0.002542,0.021337
2,lsh-l11k3t33,0.002474,0.019875,0.910783,0,0.002542,0.019896
3,lsh-l11k4t11,0.003932,0.003126,0.47131,0,0.003965,0.00316
4,lsh-l11k4t22,0.003932,0.00451,0.594254,0,0.003965,0.004524
5,lsh-l11k4t33,0.003932,0.004869,0.666236,0,0.003965,0.004884


In [40]:
# Snapshot of the result columns collected so far, one row per benchmarked
# configuration.
sift1MFinal = pd.DataFrame({
    'algorithm': algorithm,
    'constructionTime': construciotnTimes,
    'searchTime': searchTimes,
    'recall': reacll,
    'avgDistance': avgdistances,
    'constructionClocks': constructionClocks,
    'searchClocks': searchClocks,
})

In [41]:
# Show the snapshot through display().
display(sift1MFinal)

algorithm,constructionTime,searchTime,recall,avgDistance,constructionClocks,searchClocks
Annoy-trees-5,35.925213361,1.7853086229999915,0.283496,254.6951362941704,35.925349000000004,1.7854169999999954
Annoy-trees-15,62.389300217,4.010613428999989,0.522061,243.330537024456,62.38934599999999,4.010643999999985
Annoy-trees-30,102.71378096700002,7.045646438999995,0.684114,239.46780847164536,102.71383899999998,7.045667999999978
Annoy-trees-60,185.833590822,11.55356832800004,0.81582,237.40215683092495,185.83363100000005,11.553588999999988
Annoy-trees-80,244.17912214200004,14.592285040000093,0.857391,236.8932823667984,244.179166,14.59230500000001
HNSW-M-5,633.889558266,1.161902234000081,0.362857,251.3589782714844,633.889681,1.1621399999999085
HNSW-M-8,856.3124313319997,1.4382503929996346,0.482229,244.7732696533203,856.3124820000003,1.4382970000001478
HNSW-M-15,1347.6649782549998,2.0843196880000505,0.624448,240.3957672119141,1347.6650290000002,2.084359999999833
HNSW-M-30,2519.088754243,3.1522851899999296,0.766756,237.85826110839844,2519.088794,3.1523299999998926
HNSW-M-38,3269.567374601,3.626972136000404,0.806364,237.35047912597656,3269.5674119999994,3.627011000000493


In [42]:
# Run a garbage collection pass; the return value is echoed as the cell output.
gc.collect()

In [43]:
# Echo the current contents of `para` (last reassigned in the LSH benchmark cell).
para