## Overview

This notebook shows you how to create and query a table or DataFrame from a file that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) (the Databricks File System) lets you store data for querying inside Databricks. This notebook assumes that the file you want to read is already in DBFS.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [2]:
import numpy as np
import os
import struct
import pandas as pd
import time
from time import process_time
from itertools import chain 
import gc

In [3]:
# Which TEXMEX corpus to use: 'sift' or 'smallSift'.
dataset = 'sift'

# For the selected corpus: download URL, local archive path, and the base / query /
# ground-truth members inside the archive.
# NOTE(review): the 'sift' branch was empty in the original cell (a syntax error); the
# values below mirror the 'smallSift' layout using the TEXMEX naming -- verify them
# against the extracted archive.
if dataset == 'sift':
  url = 'ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz'
  paths =  '/tmp/sift.tar.gz'
  trainPath = 'sift/sift_base.fvecs'
  queryPath = 'sift/sift_query.fvecs'
  groundPath = 'sift/sift_groundtruth.ivecs'
elif dataset == 'smallSift':
  url = 'ftp://ftp.irisa.fr/local/texmex/corpus/siftsmall.tar.gz'
  paths =  '/tmp/siftsmall.tar.gz' 
  trainPath = 'siftsmall/siftsmall_base.fvecs'
  queryPath = 'siftsmall/siftsmall_query.fvecs'
  groundPath = 'siftsmall/siftsmall_groundtruth.ivecs'
else:
  # Fail here rather than with a NameError in whichever cell first reads the paths.
  raise ValueError('unknown dataset: %r' % (dataset,))
  

In [4]:
  # Fashion-MNIST image archives (gzip-compressed IDX files) for the train and test splits.
  # trainMember / testnMember are not read by any later visible cell; the download and
  # parsing cells repeat the file names as string literals.
  # NOTE(review): the whole cell is indented; presumably it relies on the notebook
  # front-end stripping common leading whitespace -- confirm.
  trainurl = 'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz'
  trainMember = 'train-images-idx3-ubyte.gz'
  testurl = 'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz'
  testnMember = 'test-images-idx3-ubyte.gz'
  

In [5]:
# Import the submodule explicitly: a bare `import urllib` does not guarantee that
# `urllib.request` has been loaded.
import urllib.request

# Download the training images into the working directory; the cell output is the
# (filename, headers) pair returned by urlretrieve.
urllib.request.urlretrieve(trainurl, 'train-images-idx3-ubyte.gz')



In [6]:
# Download the test images into the working directory; uses the `urllib` name imported
# by the previous cell and `testurl` from the URL cell.
urllib.request.urlretrieve(testurl, 'test-images-idx3-ubyte.gz')

In [7]:
def load_mnist(fn):
    """Read a gzip-compressed IDX file into a 2-D array, one flattened entry per row.

    The header is parsed as: two zero bytes, a one-byte type code, a one-byte dimension
    count, then one big-endian uint32 per dimension.  The payload follows immediately.

    Parameters
    ----------
    fn : str or path-like
        Path of the ``.gz`` file to parse.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(dimensions[0], product of the remaining dimensions)``.
        Integer payloads are returned with NumPy's default integer dtype and
        floating-point payloads as float64 (the dtypes obtained when an array is built
        from Python scalars).  A 1-D file yields a single column.

    Raises
    ------
    AssertionError
        If the magic prefix is not zero or the type code is not recognised.
    ValueError
        If the header declares no dimensions or the payload is shorter than announced.
    """
    import gzip
    import struct
    import numpy

    print('parsing vectors in %s...' % fn)
    # Type code -> big-endian NumPy dtype.  Width and signedness match the struct
    # formats used previously ("!B", "!b", "!H", "!I", "!f", "!d").
    # NOTE(review): 0x0B / 0x0C are decoded as unsigned here; confirm against the IDX
    # specification if signed short/int files are ever loaded.
    type_code_info = {
        0x08: '>u1',
        0x09: '>i1',
        0x0B: '>u2',
        0x0C: '>u4',
        0x0D: '>f4',
        0x0E: '>f8'
    }
    # The context manager closes the gzip handle even if parsing fails.
    with gzip.open(fn) as f:
        magic, type_code, dim_count = struct.unpack("!hBB", f.read(4))
        assert magic == 0
        assert type_code in type_code_info

        dimensions = [struct.unpack("!I", f.read(4))[0]
                      for _ in range(dim_count)]
        if not dimensions:
            raise ValueError('IDX header in %s declares no dimensions' % fn)

        entry_count = dimensions[0]
        # Plain integer product: stays an int (usable as a size) even when there are no
        # trailing dimensions, and avoids numpy.product, which NumPy 2.0 removed.
        entry_size = 1
        for extent in dimensions[1:]:
            entry_size *= extent

        dtype = numpy.dtype(type_code_info[type_code])
        expected = entry_count * entry_size * dtype.itemsize
        payload = f.read(expected)

    if len(payload) != expected:
        raise ValueError('truncated IDX payload in %s: expected %d bytes, got %d'
                         % (fn, expected, len(payload)))

    # Decode the whole payload in one vectorised call instead of one struct.unpack per value.
    vectors = numpy.frombuffer(payload, dtype=dtype).reshape(entry_count, entry_size)
    return vectors.astype(float if dtype.kind == 'f' else int)

In [8]:

# Parse the downloaded training archive; load_mnist returns one flattened entry per row.
train = load_mnist('train-images-idx3-ubyte.gz')

In [9]:
# Parse the downloaded test archive the same way as the training archive.
test = load_mnist('test-images-idx3-ubyte.gz')

In [10]:
# The test vectors are used as the query set by the benchmark cells below
# (same array object, not a copy).
query = test


In [11]:
from sklearn.neighbors import NearestNeighbors

# Exact reference answer: brute-force 100-nearest-neighbour search of every query
# vector against the training set.  `groundTruth` holds the neighbour indices and
# `dist` the matching distances.
nbrs = NearestNeighbors(algorithm='brute', n_neighbors=100)
nbrs = nbrs.fit(train)
dist, groundTruth = nbrs.kneighbors(query, return_distance=True)

In [12]:
# Sanity check: number of rows in the ground-truth index array.
len(groundTruth)

In [13]:
def returnRecall(result, test):
    """Return the fraction of ground-truth neighbours that the search recovered.

    Parameters
    ----------
    result : numpy.ndarray
        2-D array; row ``i`` holds the neighbour ids returned for query ``i``.
    test : numpy.ndarray
        2-D array; row ``i`` holds the true neighbour ids for query ``i``.

    Returns
    -------
    float
        Total number of ids shared between matching rows, divided by ``test.size``.
        Returns 0.0 when ``test`` is empty or ``result`` has no rows, instead of
        failing on an unbound variable or a division by zero.
    """
    if test.size == 0:
        return 0.0
    # Row-wise set intersection; the ratio is computed once, after the loop, rather
    # than being recomputed on every iteration.
    matched = sum(
        len(set(result[i].tolist()) & set(test[i].tolist()))
        for i in range(result.shape[0])
    )
    return matched / test.size

In [14]:
def fillIfNotAllAreFound(result, k=100, fill=-1):
    """Pad every neighbour list to ``k`` entries so the lists form a rectangular array.

    Parameters
    ----------
    result : list of list
        One list of neighbour ids per query.  Short lists are extended IN PLACE.
    k : int, optional
        Target length of every list (default 100, the value previously hard-coded).
    fill : optional
        Placeholder appended to short lists (default -1).

    Returns
    -------
    list of list
        The same ``result`` object, after padding.
    """
    for neighbours in result:
        missing = k - len(neighbours)
        if missing > 0:
            neighbours.extend([fill] * missing)
    return result

In [15]:
# Number of neighbours requested per query.
k = 100

# Shared result columns: every benchmark cell appends one value per configuration, and
# the DataFrame cells later zip these lists into a table.
algorithm, clockAlg = [], []                   # configuration labels
construciotnTimes, searchTimes = [], []        # process_time() deltas
constructionClocks, searchClocks = [], []      # clock deltas
reacll, avgdistances = [], []                  # recall and mean returned distance

In [16]:
#Annoy
from annoy import AnnoyIndex

# Benchmark Annoy for several forest sizes.  For each size: build an index over
# `train`, answer every vector in `query`, then append recall, mean returned distance
# and the timings to the shared result lists.
for trs in [5,15,30,60,80]:

    f = train.shape[1]
    t = AnnoyIndex(f, 'euclidean')

    # time.clock() was removed in Python 3.8.  perf_counter() now supplies the
    # elapsed-time ("clock") reading, process_time() the CPU-time reading.
    startClock = time.perf_counter()
    startTime = process_time()
    for i in range(train.shape[0]):
        t.add_item(i, train[i])
    t.build(trs)
    end_time = process_time()
    constructionTime = end_time - startTime
    endClock = time.perf_counter()
    constructionClock = endClock - startClock

    rez = []
    dist = []
    startClock = time.perf_counter()
    startTime = process_time()
    for q in query:
        res, d = t.get_nns_by_vector(q, 100, include_distances=True)
        rez.append(res)
        dist.append(d)
    end_time = process_time()
    searchTime = end_time - startTime
    endClock = time.perf_counter()
    searchClock = endClock - startClock

    # Pad short answers with -1 so the id lists form a rectangular array; `dist` is
    # left unpadded, so the mean below covers only distances actually returned.
    result = fillIfNotAllAreFound(rez)
    result = np.asanyarray(result)
    annoyRecall = returnRecall(result, groundTruth)
    avgDist = np.mean(list(chain.from_iterable(dist)))

    reacll.append(annoyRecall)
    algorithm.append('Annoy-trees-'+str(trs))
    construciotnTimes.append(constructionTime)
    searchTimes.append(searchTime)
    avgdistances.append(avgDist)
    searchClocks.append(searchClock)
    constructionClocks.append(constructionClock)
    clockAlg.append('Annoy-trees-'+str(trs))
    # Same file name on every iteration, so only the last index stays on disk.
    t.save('annoyIndex90.ann')

    # Release the index and per-run buffers before the next configuration.
    del t
    del rez
    del dist
    del result
    gc.collect()
#______________________________________________#


In [17]:
# Echo the training array as the cell output.
train

In [18]:
import falconn 

# Benchmark FALCONN LSH.  Start from the library's default parameters for this data
# size, use 10% more tables, then vary the number of hash functions (outer loop) and
# the number of probes (inner loop).
par = falconn.LSHConstructionParameters()  # not referenced again in this cell
param = falconn.get_default_parameters(num_points = len(train), dimension = len(train[0]), distance = falconn.DistanceFunction.EuclideanSquared )
print(param.lsh_family, param.l, param.k)
tables = param.l
hashes = param.k
param.l = int(1.1*tables)
para = []

# The loop variable used to be called `k`, which overwrote the notebook-level
# `k = 100` (neighbours per query) defined in the accumulator cell.
for numHashes in [hashes,int(hashes*1.5)]:

    # time.clock() was removed in Python 3.8.  perf_counter() now supplies the
    # elapsed-time ("clock") reading, process_time() the CPU-time reading.
    startClock = time.perf_counter()
    startTime = process_time()
    param.k = numHashes
    lsh = falconn.LSHIndex(param)
    lsh.setup(train.astype(float))
    indexlsh = lsh.construct_query_object()
    end_time = process_time()
    constructionTime = end_time - startTime
    endClock = time.perf_counter()
    constructionClock = endClock - startClock

    # One index per hash count; its construction timings are repeated on every
    # probe-count row recorded below.
    for numProbes in [param.l, int(param.l*2), int(param.l*3)]:

        indexlsh.set_num_probes(numProbes)
        print('lsh-l'+str(param.l)+'k'+str(param.k)+'t'+str(numProbes))

        rez = []
        startClock = time.perf_counter()
        startTime = process_time()
        for q in query.astype(float):
            res = indexlsh.find_k_nearest_neighbors(q, 100)
            rez.append(res)

        end_time = process_time()
        searchTime = end_time - startTime
        endClock = time.perf_counter()
        searchClock = endClock - startClock

        # Pad short answers with -1 so the id lists form a rectangular array.
        result = fillIfNotAllAreFound(rez)
        result = np.asanyarray(result)
        lshReacll = returnRecall(result, groundTruth)
        # No distances are collected from this query call; 0 is a placeholder.
        avgDist = 0

        # NOTE(review): the same `param` object is appended on every iteration, so
        # all entries of `para` alias one object whose `k` keeps changing.
        para.append(param)
        reacll.append(lshReacll)
        algorithm.append('lsh-l'+str(param.l)+'k'+str(param.k)+'t'+str(numProbes))
        construciotnTimes.append(constructionTime)
        searchTimes.append(searchTime)
        avgdistances.append(avgDist)
        searchClocks.append(searchClock)
        constructionClocks.append(constructionClock)
       

In [19]:
# Snapshot of everything recorded so far, one row per benchmarked configuration.
compareResults = pd.DataFrame({
    'algorithm': algorithm,
    'constructionTime': construciotnTimes,
    'searchTime': searchTimes,
    'recall': reacll,
    'avgDistance': avgdistances,
    'constructionClocks': constructionClocks,
    'searchClocks': searchClocks,
})

In [20]:
# Render the results table built in the previous cell.
display(compareResults)

algorithm,constructionTime,searchTime,recall,avgDistance,constructionClocks,searchClocks
Annoy-trees-5,8.982965498999988,4.797598034000032,0.67838,1241.721611303587,8.983090000000004,4.797702000000015
Annoy-trees-15,10.87679562599999,7.219462340000007,0.86692,1215.5960190068952,10.876841000000011,7.219483999999966
Annoy-trees-30,13.314588160000028,9.834121665999987,0.932996,1209.9398430200176,13.314630000000022,9.834142000000044
Annoy-trees-60,18.17091997199998,13.355988042999968,0.967775,1207.600877423704,18.17096200000003,13.356008000000031
Annoy-trees-80,21.322231387000045,15.746612435000031,0.976586,1207.0850207127055,21.32227499999999,15.746634000000029
lsh-l11k2t11,13.271754456000052,139.51641106499994,0.905265,0.0,13.271883000000004,139.516521
lsh-l11k2t22,13.271754456000052,177.62432485199997,0.940478,0.0,13.271883000000004,177.624368
lsh-l11k2t33,13.271754456000052,200.03349291200004,0.955608,0.0,13.271883000000004,200.03353199999992
lsh-l11k3t11,38.793524296999976,39.69285104299991,0.655829,0.0,38.79355099999998,39.69289900000001
lsh-l11k3t22,38.793524296999976,56.65326545800008,0.733962,0.0,38.79355099999998,56.65331100000003


In [21]:
import nmslib

# Benchmark nmslib HNSW for several values of M.  For each value: build the index over
# `train`, answer all queries in one batch, then append recall, mean distance and
# timings to the shared result lists.
for MMAX in [5,8,15,30,38]:
    hnsw = nmslib.init(method='hnsw', space='l2')

    # time.clock() was removed in Python 3.8.  perf_counter() now supplies the
    # elapsed-time ("clock") reading, process_time() the CPU-time reading.
    startClock = time.perf_counter()
    startTime = process_time()
    hnsw.addDataPointBatch(train)
    hnsw.createIndex({'delaunay_type':1, 'M':MMAX})
    end_time = process_time()
    constructionTime = end_time - startTime
    endClock = time.perf_counter()
    constructionClock = endClock - startClock

    # Batch query for all query vectors, requesting 2 worker threads.
    startClock = time.perf_counter()
    startTime = process_time()
    neighbours = hnsw.knnQueryBatch(query, k=100, num_threads=2)
    end_time = process_time()
    searchTime = end_time - startTime
    endClock = time.perf_counter()
    searchClock = endClock - startClock

    # Each batch entry is indexed as (ids, distances); split them into two lists.
    rez = []
    dist = []
    for i in neighbours:
        rez.append(list(i[0]))
        dist.append(list(i[1]))

    # fillIfNotAllAreFound pads `rez` in place, so the array is built from padded rows.
    result = fillIfNotAllAreFound(rez)
    result = np.array(rez)
    hnswRecall = returnRecall(result, groundTruth)
    # NOTE(review): sqrt is applied to the returned distances, presumably because this
    # index reports squared L2 -- confirm against the nmslib version in use.
    avgDist = np.mean(np.sqrt(list(chain.from_iterable(dist))))

    reacll.append(hnswRecall)
    algorithm.append('HNSW-M-'+str(MMAX))
    construciotnTimes.append(constructionTime)
    searchTimes.append(searchTime)
    avgdistances.append(avgDist)
    constructionClocks.append(constructionClock)
    searchClocks.append(searchClock)
    clockAlg.append('HNSW-M-'+str(MMAX))

    # Release the index and per-run buffers before the next configuration.
    del hnsw
    del rez
    del dist
    del result
    del neighbours
    gc.collect()


In [22]:
from sklearn.neighbors import NearestNeighbors

# Baseline: exact brute-force search, timed the same way as the approximate methods.
# time.clock() was removed in Python 3.8.  perf_counter() now supplies the elapsed-time
# ("clock") reading, process_time() the CPU-time reading.
startClock = time.perf_counter()
startTime = process_time()
nbrs = NearestNeighbors(n_neighbors=100, algorithm='brute').fit(train)
end_time = process_time()
constructionTime = end_time - startTime
endClock = time.perf_counter()
constructionClock = endClock - startClock

startClock = time.perf_counter()
startTime = process_time()
dist, result = nbrs.kneighbors(query, return_distance=True)
end_time = process_time()
searchTime = end_time - startTime
endClock = time.perf_counter()
searchClock = endClock - startClock

# `groundTruth` was produced by the same estimator settings on the same data, so this
# recall measures the baseline against itself.
bruteRecall = returnRecall(result, groundTruth)
avgDist = np.mean(np.mean(dist, axis=1))

reacll.append(bruteRecall)
algorithm.append('linear force')
construciotnTimes.append(constructionTime)
searchTimes.append(searchTime)
avgdistances.append(avgDist)
constructionClocks.append(constructionClock)
searchClocks.append(searchClock)


In [23]:
# Rebuild the results table from the shared lists, now including every row appended
# since the previous snapshot.
newComparison = pd.DataFrame({
    'algorithm': algorithm,
    'constructionTime': construciotnTimes,
    'searchTime': searchTimes,
    'recall': reacll,
    'avgDistance': avgdistances,
    'constructionClocks': constructionClocks,
    'searchClocks': searchClocks,
})

In [24]:
# Render the refreshed results table.
display(newComparison)

algorithm,constructionTime,searchTime,recall,avgDistance,constructionClocks,searchClocks
Annoy-trees-5,8.982965498999988,4.797598034000032,0.67838,1241.721611303587,8.983090000000004,4.797702000000015
Annoy-trees-15,10.87679562599999,7.219462340000007,0.86692,1215.5960190068952,10.876841000000011,7.219483999999966
Annoy-trees-30,13.314588160000028,9.834121665999987,0.932996,1209.9398430200176,13.314630000000022,9.834142000000044
Annoy-trees-60,18.17091997199998,13.355988042999968,0.967775,1207.600877423704,18.17096200000003,13.356008000000031
Annoy-trees-80,21.322231387000045,15.746612435000031,0.976586,1207.0850207127055,21.32227499999999,15.746634000000029
lsh-l11k2t11,13.271754456000052,139.51641106499994,0.905265,0.0,13.271883000000004,139.516521
lsh-l11k2t22,13.271754456000052,177.62432485199997,0.940478,0.0,13.271883000000004,177.624368
lsh-l11k2t33,13.271754456000052,200.03349291200004,0.955608,0.0,13.271883000000004,200.03353199999992
lsh-l11k3t11,38.793524296999976,39.69285104299991,0.655829,0.0,38.79355099999998,39.69289900000001
lsh-l11k3t22,38.793524296999976,56.65326545800008,0.733962,0.0,38.79355099999998,56.65331100000003


In [25]:
# NOTE(review): `cc` is not assigned in any cell visible in this notebook; presumably
# it was left over from an earlier kernel session -- confirm, or replace it with one of
# the tables built above.
display(cc)

algorithm,constructionTime,searchTime,recall,avgDistance,constructionClocks,searchClocks
Annoy-trees-5,35.925213361,1.7853086229999915,0.283496,254.6951362941704,35.925349000000004,1.7854169999999954
Annoy-trees-15,62.389300217,4.010613428999989,0.522061,243.330537024456,62.38934599999999,4.010643999999985
Annoy-trees-30,102.71378096700002,7.045646438999995,0.684114,239.46780847164536,102.71383899999998,7.045667999999978
Annoy-trees-60,185.833590822,11.55356832800004,0.81582,237.40215683092495,185.83363100000005,11.553588999999988
Annoy-trees-80,244.17912214200004,14.592285040000093,0.857391,236.8932823667984,244.179166,14.59230500000001
HNSW-M-5,633.889558266,1.161902234000081,0.362857,251.3589782714844,633.889681,1.1621399999999085
HNSW-M-8,856.3124313319997,1.4382503929996346,0.482229,244.7732696533203,856.3124820000003,1.4382970000001478
HNSW-M-15,1347.6649782549998,2.0843196880000505,0.624448,240.3957672119141,1347.6650290000002,2.084359999999833
HNSW-M-30,2519.088754243,3.1522851899999296,0.766756,237.85826110839844,2519.088794,3.1523299999998926
HNSW-M-38,3269.567374601,3.626972136000404,0.806364,237.35047912597656,3269.5674119999994,3.627011000000493


In [26]:
import nmslib

# Benchmark the nmslib VP-tree: build one index, then vary `maxLeavesToVisit` at query
# time.  The single set of construction timings is repeated on every recorded row.
vptree = nmslib.init(method='vptree', space='l2')

# time.clock() was removed in Python 3.8.  perf_counter() now supplies the elapsed-time
# ("clock") reading, process_time() the CPU-time reading.
startClock = time.perf_counter()
startTime = process_time()
vptree.addDataPointBatch(train)
vptree.createIndex({'bucketSize' : 10000,'selectPivotAttempts':10})
end_time = process_time()
constructionTime = end_time - startTime
endClock = time.perf_counter()
constructionClock = endClock - startClock

for maxLeave in [2,10,20,25,30]:

    vptree.setQueryTimeParams({'maxLeavesToVisit':maxLeave,'alphaLeft':1.1,'alphaRight':1.1})
    # Batch query for all query vectors, requesting 2 worker threads.
    startClock = time.perf_counter()
    startTime = process_time()
    neighbours = vptree.knnQueryBatch(query, k=100, num_threads=2)
    end_time = process_time()
    searchTime = end_time - startTime
    endClock = time.perf_counter()
    searchClock = endClock - startClock

    # Each batch entry is indexed as (ids, distances); split them into two lists.
    rez = []
    dist = []
    for i in neighbours:
        rez.append(list(i[0]))
        dist.append(list(i[1]))

    # Pad short answers with -1 so the id lists form a rectangular array.
    rez = fillIfNotAllAreFound(rez)

    result = np.asanyarray(rez)

    vptreeRecall = returnRecall(result, groundTruth)
    avgDist = np.mean(list(chain.from_iterable(dist)))

    reacll.append(vptreeRecall)
    algorithm.append('vp-Tree-10k-mL'+str(maxLeave))
    construciotnTimes.append(constructionTime)
    searchTimes.append(searchTime)
    constructionClocks.append(constructionClock)
    searchClocks.append(searchClock)
    avgdistances.append(avgDist)

    # Release per-run buffers before the next setting.
    del rez
    del dist
    del result
    gc.collect()

#vptree.saveIndex('vptreeIndex.ann')    
del vptree
gc.collect()



In [27]:
# Results table built from the shared lists after the runs above.
sift = pd.DataFrame({
    'algorithm': algorithm,
    'constructionTime': construciotnTimes,
    'searchTime': searchTimes,
    'recall': reacll,
    'avgDistance': avgdistances,
    'constructionClocks': constructionClocks,
    'searchClocks': searchClocks,
})

In [28]:
# Render the `sift` results table.
display(sift)

algorithm,constructionTime,searchTime,recall,avgDistance,constructionClocks,searchClocks
Annoy-trees-5,8.982965498999988,4.797598034000032,0.67838,1241.721611303587,8.983090000000004,4.797702000000015
Annoy-trees-15,10.87679562599999,7.219462340000007,0.86692,1215.5960190068952,10.876841000000011,7.219483999999966
Annoy-trees-30,13.314588160000028,9.834121665999987,0.932996,1209.9398430200176,13.314630000000022,9.834142000000044
Annoy-trees-60,18.17091997199998,13.355988042999968,0.967775,1207.600877423704,18.17096200000003,13.356008000000031
Annoy-trees-80,21.322231387000045,15.746612435000031,0.976586,1207.0850207127055,21.32227499999999,15.746634000000029
lsh-l11k2t11,13.271754456000052,139.51641106499994,0.905265,0.0,13.271883000000004,139.516521
lsh-l11k2t22,13.271754456000052,177.62432485199997,0.940478,0.0,13.271883000000004,177.624368
lsh-l11k2t33,13.271754456000052,200.03349291200004,0.955608,0.0,13.271883000000004,200.03353199999992
lsh-l11k3t11,38.793524296999976,39.69285104299991,0.655829,0.0,38.79355099999998,39.69289900000001
lsh-l11k3t22,38.793524296999976,56.65326545800008,0.733962,0.0,38.79355099999998,56.65331100000003


In [29]:

# Annoy parameter sweep over (number of trees, search_k).  search_k is set to
# factor * k * numTrees for the factors 0.8, 0.9, 1.1 and 1.2.
k = 100

# Parameter values per run, kept in step with the rows appended to the shared lists.
searchKparam = []
numTreesParam = []

clockAlg = []

for example in [(trees, search) for trees in [1,5,10,30] for search in [0.8,0.9,1.1,1.2]]:
    numTrees = example[0]
    searchK = int(example[1] * k * numTrees)

    numTreesParam.append(numTrees)
    searchKparam.append(searchK)

    f = train.shape[1]
    t = AnnoyIndex(f, 'euclidean')

    # time.clock() was removed in Python 3.8.  perf_counter() now supplies the
    # elapsed-time ("clock") reading, process_time() the CPU-time reading.
    startClock = time.perf_counter()
    startTime = process_time()
    for i in range(train.shape[0]):
        t.add_item(i, train[i])
    t.build(numTrees)
    end_time = process_time()
    constructionTime = end_time - startTime
    endClock = time.perf_counter()
    constructionClock = endClock - startClock

    rez = []
    dist = []
    startClock = time.perf_counter()
    startTime = process_time()
    for q in query:
        res, d = t.get_nns_by_vector(q, 100, search_k = searchK, include_distances=True)
        rez.append(res)
        dist.append(d)
    end_time = process_time()
    searchTime = end_time - startTime
    endClock = time.perf_counter()
    searchClock = endClock - startClock

    # Pad short answers with -1 so the id lists form a rectangular array.
    result = fillIfNotAllAreFound(rez)

    result = np.asanyarray(result)
    annoyRecall = returnRecall(result, groundTruth)
    avgDist = np.mean(list(chain.from_iterable(dist)))

    reacll.append(annoyRecall)
    # The label carries only the tree count; the matching search_k for this row is
    # the entry appended to `searchKparam` above.
    algorithm.append('Annoy-trees-'+str(numTrees))
    construciotnTimes.append(constructionTime)
    searchTimes.append(searchTime)
    avgdistances.append(avgDist)
    searchClocks.append(searchClock)
    constructionClocks.append(constructionClock)

    # Release the index and per-run buffers before the next configuration.
    del t
    del rez
    del dist
    del result
    gc.collect()
    
    


In [30]:
# Results table built from the shared lists after the Annoy parameter sweep; it holds
# every row appended to those lists so far, not only the sweep rows.
annoy = pd.DataFrame({
    'algorithm': algorithm,
    'constructionTime': construciotnTimes,
    'searchTime': searchTimes,
    'recall': reacll,
    'avgDistance': avgdistances,
    'constructionClocks': constructionClocks,
    'searchClocks': searchClocks,
})

In [31]:
# Render the `annoy` results table.
display(annoy)

algorithm,constructionTime,searchTime,recall,avgDistance,constructionClocks,searchClocks
Annoy-trees-5,8.982965498999988,4.797598034000032,0.67838,1241.721611303587,8.983090000000004,4.797702000000015
Annoy-trees-15,10.87679562599999,7.219462340000007,0.86692,1215.5960190068952,10.876841000000011,7.219483999999966
Annoy-trees-30,13.314588160000028,9.834121665999987,0.932996,1209.9398430200176,13.314630000000022,9.834142000000044
Annoy-trees-60,18.17091997199998,13.355988042999968,0.967775,1207.600877423704,18.17096200000003,13.356008000000031
Annoy-trees-80,21.322231387000045,15.746612435000031,0.976586,1207.0850207127055,21.32227499999999,15.746634000000029
lsh-l11k2t11,13.271754456000052,139.51641106499994,0.905265,0.0,13.271883000000004,139.516521
lsh-l11k2t22,13.271754456000052,177.62432485199997,0.940478,0.0,13.271883000000004,177.624368
lsh-l11k2t33,13.271754456000052,200.03349291200004,0.955608,0.0,13.271883000000004,200.03353199999992
lsh-l11k3t11,38.793524296999976,39.69285104299991,0.655829,0.0,38.79355099999998,39.69289900000001
lsh-l11k3t22,38.793524296999976,56.65326545800008,0.733962,0.0,38.79355099999998,56.65331100000003


In [32]:

# HNSW parameter sweep over (delaunay_type, M).
# Parameter values per run, kept in step with the rows appended to the shared lists.
MMAXparam = []
dgraphParam = []


import nmslib

for example in  [(dgraph, MMAX) for dgraph in [0,1,2,3] for MMAX in [2,4,6,8,10,12]]:

    hnsw = nmslib.init(method='hnsw', space='l2')

    dgraph = example[0]
    MMAX = example[1]

    MMAXparam.append(example[1])
    dgraphParam.append(example[0])

    # time.clock() was removed in Python 3.8.  perf_counter() now supplies the
    # elapsed-time ("clock") reading, process_time() the CPU-time reading.
    startClock = time.perf_counter()
    startTime = process_time()
    hnsw.addDataPointBatch(train)
    hnsw.createIndex({'delaunay_type':dgraph, 'M':MMAX})
    end_time = process_time()
    constructionTime = end_time - startTime
    endClock = time.perf_counter()
    constructionClock = endClock - startClock

    # Batch query for all query vectors, requesting 2 worker threads.
    startClock = time.perf_counter()
    startTime = process_time()
    neighbours = hnsw.knnQueryBatch(query, k=100, num_threads=2)
    end_time = process_time()
    searchTime = end_time - startTime
    endClock = time.perf_counter()
    searchClock = endClock - startClock

    # Each batch entry is indexed as (ids, distances); split them into two lists.
    rez = []
    dist = []
    for i in neighbours:
        rez.append(list(i[0]))
        dist.append(list(i[1]))

    # fillIfNotAllAreFound pads `rez` in place, so the array is built from padded rows.
    result = fillIfNotAllAreFound(rez)
    result = np.array(rez)
    hnswRecall = returnRecall(result, groundTruth)
    # NOTE(review): sqrt is applied to the returned distances, presumably because this
    # index reports squared L2 -- confirm against the nmslib version in use.
    avgDist = np.mean(np.sqrt(list(chain.from_iterable(dist))))

    reacll.append(hnswRecall)
    # The label carries only M; the matching delaunay_type for this row is the entry
    # appended to `dgraphParam` above.
    algorithm.append('HNSW-M-'+str(MMAX))
    construciotnTimes.append(constructionTime)
    searchTimes.append(searchTime)
    avgdistances.append(avgDist)
    constructionClocks.append(constructionClock)
    searchClocks.append(searchClock)

    # Release the index and per-run buffers before the next configuration.
    del hnsw
    del rez
    del dist
    del result
    del neighbours
    gc.collect()

In [33]:
# Results table built from the shared lists after the HNSW parameter sweep; it holds
# every row appended to those lists so far, not only the sweep rows.
hnsw = pd.DataFrame({
    'algorithm': algorithm,
    'constructionTime': construciotnTimes,
    'searchTime': searchTimes,
    'recall': reacll,
    'avgDistance': avgdistances,
    'constructionClocks': constructionClocks,
    'searchClocks': searchClocks,
})

In [34]:
# Render the `hnsw` results table.
display(hnsw)

algorithm,constructionTime,searchTime,recall,avgDistance,constructionClocks,searchClocks
Annoy-trees-5,8.982965498999988,4.797598034000032,0.67838,1241.721611303587,8.983090000000004,4.797702000000015
Annoy-trees-15,10.87679562599999,7.219462340000007,0.86692,1215.5960190068952,10.876841000000011,7.219483999999966
Annoy-trees-30,13.314588160000028,9.834121665999987,0.932996,1209.9398430200176,13.314630000000022,9.834142000000044
Annoy-trees-60,18.17091997199998,13.355988042999968,0.967775,1207.600877423704,18.17096200000003,13.356008000000031
Annoy-trees-80,21.322231387000045,15.746612435000031,0.976586,1207.0850207127055,21.32227499999999,15.746634000000029
lsh-l11k2t11,13.271754456000052,139.51641106499994,0.905265,0.0,13.271883000000004,139.516521
lsh-l11k2t22,13.271754456000052,177.62432485199997,0.940478,0.0,13.271883000000004,177.624368
lsh-l11k2t33,13.271754456000052,200.03349291200004,0.955608,0.0,13.271883000000004,200.03353199999992
lsh-l11k3t11,38.793524296999976,39.69285104299991,0.655829,0.0,38.79355099999998,39.69289900000001
lsh-l11k3t22,38.793524296999976,56.65326545800008,0.733962,0.0,38.79355099999998,56.65331100000003


In [35]:
from memory_profiler import profile

@profile
def abc():
    """Fill an Annoy index with every training vector, under the memory profiler.

    NOTE(review): the original `def` had no body, which is a syntax error.  The body
    below reproduces the indented statements of the following cell on the assumption
    that they were meant to be this function's body -- confirm.

    Returns
    -------
    AnnoyIndex
        The populated index; no trees have been built on it yet.
    """
    f = train.shape[1]
    t = AnnoyIndex(f, 'euclidean')
    for i in range(train.shape[0]):
        t.add_item(i, train[i])
    return t
    

    

In [36]:
    # Create a Euclidean Annoy index sized to the training vectors and add every
    # training row under its row number.  No trees are built in this cell.
    # NOTE(review): the whole cell is indented; presumably it relies on the notebook
    # front-end stripping common leading whitespace -- confirm.
    f = train.shape[1]
    t = AnnoyIndex(f, 'euclidean')
    for i in range(train.shape[0]):
        t.add_item(i,train[i])
   

In [37]:
# Call build(10) on `t` -- presumably the Annoy index filled in the previous cell,
# giving a forest of 10 trees; confirm `t` has not been rebound by another cell.
t.build(10)

In [38]:
# `sys` is not imported by any visible earlier cell, so import it here.
import sys

# getsizeof reports only the memory directly attributed to the object itself.
# NOTE(review): for an extension-type index this likely excludes the natively
# allocated index data -- treat the number as a lower bound.
sys.getsizeof(t)

In [39]:
# Parameter grid recorded by the HNSW sweep: one row per (M, delaunay_type) run.
parahnsw = pd.DataFrame({
    'MMAXparam': MMAXparam,
    'dgraphParam': dgraphParam,
})

In [40]:
# Render the HNSW parameter grid.
display(parahnsw)

MMAXparam,dgraphParam
2,0
4,0
6,0
8,0
10,0
12,0
2,1
4,1
6,1
8,1


In [41]:


# Parameter grid recorded by the Annoy sweep: one row per (search_k, number of trees) run.
anpar = pd.DataFrame({
    'searchKparam': searchKparam,
    'numTreesParam': numTreesParam,
})
display(anpar)

searchKparam,numTreesParam
80,1
90,1
110,1
120,1
400,5
450,5
550,5
600,5
800,10
900,10
