# IndexHNSWFlat (Hierarchical Navigable Small World graph exploration)
`IndexHNSWFlat` builds an HNSW graph on top of a flat index, which serves as the underlying storage: it gives quick access to the database vectors and abstracts away the compression / decompression of vectors. HNSW depends on a few important parameters:

- M is the number of neighbors used in the graph. A larger M is more accurate but uses more memory
- efConstruction is the depth of exploration at add time
- efSearch is the depth of exploration of the search

https://github.com/facebookresearch/faiss/wiki/Faiss-indexes


In [37]:
import faiss                   
import numpy as np
import pandas as pd
import time
import tlsh
from tools import *

# Settings shared by every experiment in this notebook:
#   d - dimension of the vectors handed to the index
#   k - how many nearest neighbours to return per query
#   M - number of neighbours used in the HNSW graph; a larger M is more
#       accurate but uses more memory
d, k, M = 70, 3, 10

## Dummy Data Testing

In [21]:
# Create the dummy dataset with gen_data() (imported from tools; implementation not shown).
# NOTE(review): the arguments look like (number of vectors, dimension) - confirm in tools.
xb = gen_data(10000, d)
xb[:1]  # cell result: display the first vector

array([[ 9.,  6.,  4.,  5.,  9.,  0.,  3.,  3.,  1., 15.,  0., 15., 11.,
         7.,  7.,  6., 10., 11., 10., 14.,  2.,  1.,  7.,  3.,  3.,  7.,
         7.,  2., 15.,  3.,  2.,  3.,  3.,  2.,  2.,  4.,  3.,  5.,  8.,
        11., 11., 14., 12., 10., 13.,  4., 13.,  3., 13., 14.,  4., 10.,
        15.,  6.,  4.,  4., 11.,  1., 13.,  9.,  8.,  1., 13., 11., 14.,
        12., 13.,  3.,  2., 14.]], dtype=float32)

In [22]:
# Build an HNSW index over flat storage for d-dimensional vectors, with M graph neighbours.
index = faiss.IndexHNSWFlat(d, M)   # build the index
index.add(xb)                  # add vectors to the index
# Self-query: search with the very vectors that were just added.
# Per the Faiss API, search() returns distances (D) and neighbour ids (I), one row per query.
D, I = index.search(xb, k) # sanity check
print(I[:5])  # neighbour ids of the first five queries
# print(D[:1])

[[   0 7413 9948]
 [   1 4230 4577]
 [   2 2959 8332]
 [   3 6691 7357]
 [   4 7459 9390]]


In [4]:
# Score the neighbour ids with evaluate() (imported from tools; implementation not shown).
# The recorded output of this cell reports "recall at index" and "missing rate".
evaluate(I, pd.DataFrame(xb), k)

 recall at index:  0.8365,  missing rate: 0.0000


In [5]:
# Rescale the dummy vectors by the constant 15, then repeat the build / self-query
# experiment on the rescaled data.
# NOTE(review): 15 is presumably the largest component value, which would map the
# data into [0, 1] - confirm against gen_data.
# Re-running this cell divides xb again, so run it once per freshly generated dataset.
xb = np.divide(xb, 15)

index = faiss.IndexHNSWFlat(d, M)
index.add(xb)

# Self-query with the stored vectors and show the neighbour ids of the first five rows.
D, I = index.search(xb, k)
print(I[:5])

[[   0 6321 4788]
 [   1 5371 6247]
 [   2 6707 8238]
 [1170 2303 7737]
 [   4 8390 6516]]


In [6]:
# Score the neighbour ids found on the rescaled vectors with evaluate()
# (imported from tools; implementation not shown). The recorded output of this
# cell reports "recall at index" and "missing rate".
evaluate(I, pd.DataFrame(xb), k)

 recall at index:  0.8327,  missing rate: 0.0000


## Malware Data Testing

In [34]:
# Load the malware sample table; the first row of the CSV holds the column names.
csv_path = "../data/malware_bazaar.csv"
df = pd.read_csv(csv_path, header=0)
print(df.shape)
df.head(1)  # cell result: preview the first record

(132134, 3)


Unnamed: 0,sha1_hash,tlsh,signature
0,003411d0a9610cfe8a027a364b46c489fa034502,AF74AD89B6257A65DE3A727411C78FC1B994D007602253...,Quakbot


### 70 Hex chars as Input

In [15]:
# Turn every TLSH digest into a numeric vector with convert_to_array()
# (imported from tools; implementation not shown), then stack the per-row
# vectors into a single float32 matrix for Faiss.
tlsh_vectors = df['tlsh'].apply(convert_to_array)
xb = np.array(tlsh_vectors.tolist(), dtype='float32')
print(xb.shape)

(132134, 70)


In [9]:
# Build an HNSW index over flat storage for the d-dimensional malware vectors.
index = faiss.IndexHNSWFlat(d, M)   # build the index
index.add(xb)                  # add vectors to the index
# Self-query: search with the very vectors that were just added.
# Per the Faiss API, search() returns distances (D) and neighbour ids (I), one row per query.
D, I = index.search(xb, k) # sanity check
print(I[:5])  # neighbour ids of the first five queries

[[     0  55801  85151]
 [     1  24118  54676]
 [     2 116970  60316]
 [110150      3 115061]
 [ 45416 110092      4]]


In [10]:
# Score the neighbour ids against the malware table with evaluate() (imported from
# tools; implementation not shown). With the extra True argument the recorded output
# also reports "recall at label".
# NOTE(review): presumably the flag enables label-based scoring - confirm in tools.
evaluate(I, df, k, True)

 recall at index:  0.8136,  missing rate: 0.0000
 recall at label:  0.9264


### 140 Buckets as Input

In [38]:
# Re-encode every TLSH digest with to_hist() (imported from tools; implementation
# not shown), passing the bucket count as its extra argument, then stack the
# per-row results into a single float32 matrix.
n_buckets = 140
xb = df['tlsh'].apply(to_hist, args=(n_buckets,))
xb = np.array(xb.tolist()).astype('float32')
print(xb.shape)

# Size the index from the data itself instead of the hard-coded d*2, so the index
# dimension cannot drift out of sync if the bucket count above is changed.
index = faiss.IndexHNSWFlat(xb.shape[1], M)
index.add(xb)

# Self-query with the stored vectors and show the neighbour ids of the first five rows.
D, I = index.search(xb, k)
print(I[:5])

(132134, 140)
[[     0  97650  67487]
 [     1  24118  25610]
 [     2  22140  70506]
 [110150      3 115061]
 [ 45416 110092      4]]


In [12]:
# Score the neighbour ids from the 140-bucket index against the malware table with
# evaluate() (imported from tools; implementation not shown). With the extra True
# argument the recorded output also reports "recall at label".
# NOTE(review): presumably the flag enables label-based scoring - confirm in tools.
evaluate(I, df, k, True)

 recall at index:  0.8166,  missing rate: 0.0000
 recall at label:  0.9218


## Vector/TLSH Comparison by Searching Each Other

Check whether vectors/TLSH digests occur together when each is used to search for the other (top and bottom N).

In [39]:
# Widen the search to the 200 nearest neighbours of every stored vector.
# NOTE: this rebinds the notebook-wide k, so any earlier cell re-run after this
# one will also use k = 200.
k = 200
# Uses whatever index and xb the previously executed cell left in scope.
D, I = index.search(xb, k) # sanity check
print(I[:2])  # neighbour ids of the first two queries

[[     0  16621  40668  85151 117504  42881  99714  67487 123632  55801
   97650  69675   1556   2781  97684 122300  21355  40376  14007  78986
   83262  44959  86953  11763 128790   8805  75815 116081  85421  55732
  116232 118391  12636  92715   5733  75061  78558 104323  90970  94497
   14947  68448  97065  17965  32556   7239  68944  83854  52152  87769
   80598  43809 102693  46712  46651  15175 117562 108608  55084  29713
   27660  36071  29519  82274 122987  76150  23909 116935 120115   2775
   32594  88107   6043 105494  49134 115364  53273  41115  14278  67285
    2322  31013 130574  97105  15141  87562 121293  71740   3494  78755
  128484  29030  91916   7965 120871  76204  19831  90452  48491 131330
   25194  75701  33160 103132  58927  66667  77212  30099   6634  64479
    2917   1053  86901  18084  53431 112267 124283 129669  56753  39742
   17571   9414  76597 124927  18117  20455  45493  84817  57445  72750
   20093  10967  15543  20999 111640  70280  84323  71599  16353

In [42]:
# Query the index with four hand-picked database rows and list the k nearest
# neighbours of each one.
probe_rows = [0, 16621, 5828, 81481]
query = xb[probe_rows]
D, I = index.search(query, k)
print(I[:4])

[[     0  16621  40668  85151 117504  42881  99714  67487 123632  55801
   97650  69675   1556   2781  97684 122300  21355  40376  14007  78986
   83262  44959  86953  11763 128790   8805  75815 116081  85421  55732
  116232 118391  12636  92715   5733  75061  78558 104323  90970  94497
   14947  68448  97065  17965  32556   7239  68944  83854  52152  87769
   80598  43809 102693  46712  46651  15175 117562 108608  55084  29713
   27660  36071  29519  82274 122987  76150  23909 116935 120115   2775
   32594  88107   6043 105494  49134 115364  53273  41115  14278  67285
    2322  31013 130574  97105  15141  87562 121293  71740   3494  78755
  128484  29030  91916   7965 120871  76204  19831  90452  48491 131330
   25194  75701  33160 103132  58927  66667  77212  30099   6634  64479
    2917   1053  86901  18084  53431 112267 124283 129669  56753  39742
   17571   9414  76597 124927  18117  20455  45493  84817  57445  72750
   20093  10967  15543  20999 111640  70280  84323  71599  16353