# IndexBinaryFlat
The "flat" binary index performs an exhaustive search.

The exhaustive search is carefully optimized especially for 256-bit vectors that are quite common. The Hamming distance computations are optimized using popcount CPU instructions.

Batching is applied on the query and the database side to avoid cache misses.

The values of hamming_batch_size and faiss::IndexBinaryFlat#query_batch_size can be customized to adjust the batch sizes but the default values were found to be close to optimal for a large range of settings.

https://github.com/facebookresearch/faiss/wiki/Binary-indexes

In [1]:
import faiss                   
import numpy as np
import pandas as pd
import time
import tlsh
from tools import *

In [15]:
nb = 100
d = 256

db = np.empty((nb, d // 8), dtype='uint8')
print(db.shape)
print(db[:1])

nq = 100
queries = np.empty((nq, d // 8), dtype='uint8')

# Initializing index.
index = faiss.IndexBinaryFlat(d)

# Adding the database vectors.
index.add(db)

# Number of nearest neighbors to retrieve per query vector.
k = 3
# Querying the index
D, I = index.search(db, k)  #sanity test
print(I[:5])
print(D[:5])
# D, I = inde
# D, I = index.search(queries, k)

(100, 32)
[[ 80 194  43  41  40 127   0   0  80 194  43  41  40 127   0   0 128  99
   88 146 120  85   0   0 128  99  88 146 120  85   0   0]]
[[ 0  4 96]
 [ 1  4 13]
 [ 2 14  5]
 [ 3 15  6]
 [ 4  1 16]]
[[ 0 73 74]
 [ 0 21 35]
 [ 0 24 26]
 [ 0 28 28]
 [ 0 21 27]]


In [18]:
nb = 100
d = 64

db = np.empty((nb, d//8), dtype='uint8')
print(db.shape)
print(db[:1])

# Initializing index.
index = faiss.IndexBinaryFlat(d)

# Adding the database vectors.
index.add(db)

# Number of nearest neighbors to retrieve per query vector.
k = 3
# Querying the index
D, I = index.search(db, k)  #sanity test
print(I[:5])
print(D[:5])
# D, I = inde
# D, I = index.search(queries, k)

(100, 8)
[[224  93  67 146 120  85   0   0]]
[[ 0  4  1]
 [ 1  5  4]
 [ 2  4  1]
 [ 3 75 76]
 [ 4  2  1]]
[[ 0 20 22]
 [ 0  0  4]
 [ 0  3  7]
 [ 0 24 25]
 [ 0  3  4]]


# Below are IndexLSH

# Dummy Data Testing

In [145]:
import secrets
def gen_data(N):
    hash_list = []
    for i in range(N):
        hash = secrets.token_hex(int(d/2))
        hash_list.append(convert_to_array(hash))
    return np.array(hash_list).astype('float32')
  
xb = gen_data(10000, d)
xb[:1]

array([[ 4., 11.,  6.,  1.,  2., 15.,  9.,  3.,  8.,  3.,  9., 13.,  3.,
        11., 12.,  8.,  1., 15.,  8.,  1., 15.,  6., 11.,  5.,  5., 13.,
         0.,  0., 14., 10.,  8.,  0.,  4.,  7., 13., 11.,  1., 11.,  6.,
        13.,  6.,  0.,  3.,  1.,  9., 13.,  2.,  8.,  5.,  0.,  3., 15.,
        12.,  1.,  3.,  8.,  2.,  2.,  2.,  5., 11., 15.,  8., 13.,  9.,
         2., 11.,  9.,  6., 11.]], dtype=float32)

In [None]:
d = 70                         # dimension
n_bits = 2*d 

In [148]:
index = faiss.IndexLSH(d, n_bits)   # build the index
# index.train(xb)
index.add(xb)                  # add vectors to the index
D, I = index.search(xb, k) # sanity check
print(I[:5])
# print(D)

[[ 0 80 49]
 [ 1 41 32]
 [ 2 32 25]
 [ 3 20 21]
 [ 4 92 41]]


In [149]:
evaluate(I, pd.DataFrame(xb))

dataframe shape : 100,70
 recall at 1:  1.0000,  missing rate: 0.0000


In [152]:
xb = (xb/15)

index = faiss.IndexLSH(d, n_bits)   # build the index
index.add(xb)                  # add vectors to the index
D, I = index.search(xb, k) # sanity check
print(I[:5])
# print(D)

[[ 0 80 49]
 [ 1 41 32]
 [ 2 32 25]
 [ 3 20 21]
 [ 4 92 41]]


In [153]:
evaluate(I, pd.DataFrame(xb))

dataframe shape : 100,70
 recall at 1:  1.0000,  missing rate: 0.0000


# Malware Data Testing

In [154]:
df=pd.read_csv("malware_bazaar.csv",header=0)
print(df.shape)
df[:1]

(132134, 3)


Unnamed: 0,sha1_hash,tlsh,signature
0,003411d0a9610cfe8a027a364b46c489fa034502,AF74AD89B6257A65DE3A727411C78FC1B994D007602253...,Quakbot


## 70 Hex chars as Input

In [156]:
xb = df['tlsh'].apply(convert_to_array)
xb = np.array(xb.tolist()).astype('float32')
print(xb.shape)

(132134, 70)


In [157]:
index = faiss.IndexLSH(d, n_bits)   # build the index
index.add(xb)                  # add vectors to the index
D, I = index.search(xb, k) # sanity check
print(I[:5])

[[ 1053  1190     0]
 [    1 24118 35005]
 [    2 89364  3926]
 [10600 11197     3]
 [18609 19442     4]]


In [158]:
evaluate(I, pd.DataFrame(xb))

dataframe shape : 132134,70
 recall at 1:  0.7907,  missing rate: 0.0000


## 140 Buckets as Input (Header + Body)

In [211]:
def to_hist(t, length):
    buckets = np.zeros(length, dtype=np.int32)
    for idx, (sixteen, one) in enumerate(zip(t[0::2], t[1::2])):
        buckets[4 * idx] = (int(sixteen,16) & 0x0C) >> 2
        buckets[4 * idx + 1] = (int(sixteen,16) & 0x03)
        buckets[4 * idx + 2] = (int(one,16) & 0x0C) >> 2
        buckets[4 * idx + 3] = (int(one,16) & 0x03)
    return buckets

In [219]:
xb = df['tlsh'].apply(to_hist, args=(140,))
xb = np.array(xb.tolist()).astype('float32')
print(xb.shape)

d = 140                         # dimension
n_bits = 2*d 

index = faiss.IndexLSH(d, n_bits)   # build the index
index.add(xb)                  # add vectors to the index
D, I = index.search(xb, k) # sanity check
print(I[:5])
evaluate(I, pd.DataFrame(xb))

(132134, 140)
[[     0 127168   2781]
 [     1  24118  67562]
 [     2  25683  53636]
 [     3 110150  11197]
 [     4  45416  19442]]
dataframe shape : 132134,140
 recall at 1:  0.9477,  missing rate: 0.0000
