# Binary LSH codes
In Faiss, `IndexLSH` is just a Flat index with binary codes. The database vectors and query vectors are hashed into binary codes, which are then compared using Hamming distances.

https://github.com/facebookresearch/faiss/wiki/Faiss-indexes

In [1]:
import faiss                   
import numpy as np
import pandas as pd
import time
import tlsh
from tools import *

## Dummy Data Testing

In [2]:
# Shared parameters for the experiments that follow.
d, k = 70, 3      # d: vector dimension; k: how many nearest neighbours to retrieve per query
n_bits = d * 2    # passed as the second argument to faiss.IndexLSH

In [3]:
# Build the dummy database by calling gen_data with 10000 and d.
# NOTE(review): gen_data is presumably provided by the `from tools import *` star-import,
# and presumably returns 10000 rows of d float32 features (the preview shows one
# float32 row of 70 values) — confirm against tools.
xb = gen_data(10000, d)
xb[:1]  # preview the first row

array([[ 0.,  0.,  2.,  9., 13., 15., 13.,  9.,  4.,  2.,  9., 14.,  9.,
        10.,  6., 11.,  2.,  8., 11.,  8., 12.,  9.,  3.,  0., 13.,  4.,
         9.,  9., 10.,  8.,  8.,  3.,  6.,  3., 10.,  5.,  6.,  7.,  5.,
        12.,  5., 10.,  7.,  5., 15.,  7.,  1.,  4.,  8.,  1.,  9.,  3.,
         0.,  6.,  2., 13.,  4., 13., 12., 12.,  8.,  9., 10.,  0.,  3.,
        12.,  5.,  9.,  5.,  0.]], dtype=float32)

In [4]:
# Index the dummy vectors with Faiss IndexLSH, then query the index with the same vectors.
index = faiss.IndexLSH(d, n_bits)   # build the index from the dimension d and the code length n_bits
# index.train(xb)                   # left disabled: this cell does not call train() before add()
index.add(xb)                  # add vectors to the index
D, I = index.search(xb, k) # sanity check: every query vector is itself stored in the index
print(I[:5])                   # rows of I for the first five queries (k entries each)

[[   0 5456 4563]
 [   1 8159 2763]
 [   2 6386 6294]
 [   3 6070 1246]
 [   4  115 6466]]


In [5]:
# Score the search result I with the evaluate helper; xb is wrapped in a DataFrame for the call.
# NOTE(review): evaluate is not defined in this notebook (presumably from tools); its summary is the cell output.
evaluate(I, pd.DataFrame(xb), k)

 recall at index:  1.0000,  missing rate: 0.0000


In [6]:
# Repeat the dummy-data experiment after dividing every feature by 15.
# NOTE(review): this rebinds xb, so re-running the cell divides by 15 again (non-idempotent).
# Consider a separate name (e.g. xb_scaled) if the unscaled values are needed later.
xb = (xb/15)

index = faiss.IndexLSH(d, n_bits)   # build a fresh index with the same d and n_bits
index.add(xb)                  # add the rescaled vectors to the index
D, I = index.search(xb, k) # sanity check: query with the same rescaled vectors
print(I[:5])                   # rows of I for the first five queries

[[   0 5456 4563]
 [   1 8159 2763]
 [   2 6386 6294]
 [   3 6070 1246]
 [   4  115 6466]]


In [7]:
# Score the search result for the rescaled vectors; same evaluate call as for the unscaled run.
evaluate(I, pd.DataFrame(xb), k)

 recall at index:  1.0000,  missing rate: 0.0000


## Malware Data Testing

In [4]:
# Read the malware CSV (first line is the header row), report its size,
# and display the first record as the cell result.
df = pd.read_csv("../data/malware_bazaar.csv", header=0)
print(df.shape)
df.head(1)

(132134, 3)


Unnamed: 0,sha1_hash,tlsh,signature
0,003411d0a9610cfe8a027a364b46c489fa034502,AF74AD89B6257A65DE3A727411C78FC1B994D007602253...,Quakbot


### 70 Hex chars as Input

In [9]:
# Convert every value of the 'tlsh' column with convert_to_array, then stack the
# results into a single float32 NumPy array.
# NOTE(review): convert_to_array is not defined here (presumably from tools); the printed
# shape below shows 70 values per row — confirm the exact encoding.
xb = df['tlsh'].apply(convert_to_array)
xb = np.array(xb.tolist()).astype('float32')   # list of per-row results -> float32 array
print(xb.shape)

(132134, 70)


In [10]:
# Index the malware vectors and query the index with the same vectors.
# d and n_bits are not set in this cell: with top-to-bottom execution they still hold
# the values from the parameters cell (d = 70, n_bits = 2*d).
index = faiss.IndexLSH(d, n_bits)   # build the index
index.add(xb)                  # add vectors to the index
D, I = index.search(xb, k) # sanity check: every query vector is itself stored in the index
print(I[:5])                   # rows of I for the first five queries

[[ 1053  1190     0]
 [    1 24118 35005]
 [    2 89364  3926]
 [10600 11197     3]
 [18609 19442     4]]


In [11]:
# Score the search result against the malware DataFrame; unlike the dummy-data calls,
# a fourth argument (True) is passed and the output gains a "recall at label" line.
# NOTE(review): presumably the flag enables a label comparison — confirm in tools.evaluate.
evaluate(I, df, k, True)

 recall at index:  0.7907,  missing rate: 0.0000
 recall at label:  0.9996


### 128 Buckets as Input (TLSH Body)

In [12]:
# 128-bucket experiment: drop the first 6 characters of every 'tlsh' string, convert
# the remainder with to_hist (extra argument 128), and stack the results into a
# float32 array for Faiss.
xb = df['tlsh'].str.slice(start=6).apply(to_hist, args=(128,))
xb = np.array(xb.tolist()).astype('float32')
print(xb.shape)

d = 128                         # dimension
# Code length actually given to the index. The cell used to set n_bits = 2*d and then
# pass n_bits*4, so the variable under-reported the real code length (8*d = 1024 bits).
n_bits = 8*d
index = faiss.IndexLSH(d, n_bits)   # build the index (same 1024-bit codes as before)
index.add(xb)                  # add vectors to the index
D, I = index.search(xb, k) # sanity check: every query vector is itself stored in the index
print(I[:5])                   # rows of I for the first five queries
evaluate(I, df, k, True)       # True: output also reports "recall at label"

(132134, 128)
[[   52   395     0]
 [    1 54676 24118]
 [ 4904 12208     2]
 [  561   630     3]
 [    4   239   102]]
 recall at index:  0.5395,  missing rate: 0.0000
 recall at label:  0.9986


### 140 Buckets as Input (Header + Body)

In [5]:
# 140-bucket input: convert the full 'tlsh' string (no characters stripped) with to_hist,
# passing 140 as the extra argument, then stack the results into a float32 array.
# NOTE(review): to_hist is not defined here (presumably from tools) — confirm what the 140 buckets encode.
xb = df['tlsh'].apply(to_hist, args=(140,))
xb = np.array(xb.tolist()).astype('float32')   # list of per-row results -> float32 array
print(xb.shape)

(132134, 140)


In [13]:
# IndexLSH on the 140-bucket vectors with a code length of 2*d.
d = 140                         # dimension
n_bits = 2*d 

index = faiss.IndexLSH(d, n_bits)   # build the index
index.add(xb)                  # add vectors to the index
D, I = index.search(xb, k) # sanity check: every query vector is itself stored in the index
print(I[:5])                   # rows of I for the first five queries
evaluate(I, df, k, True)       # True: output also reports "recall at label"

(132134, 140)
[[     0 127168   2781]
 [     1  24118  67562]
 [     2  25683  53636]
 [     3 110150  11197]
 [     4  45416  19442]]
 recall at index:  0.9477,  missing rate: 0.0000
 recall at label:  1.0000


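**BEST SOLUTION**: `n_bits = 4*d` on the 140-bucket input (highest "recall at index" of the configurations tried in this notebook).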

In [6]:
# IndexLSH on the 140-bucket vectors with a code length of 4*d.
d = 140                         # dimension
n_bits = 4*d                    # author's rationale: each bucket value is between 0 and 3, so 4 bits are assigned per bucket/dimension (NOTE(review): confirm this reasoning)

index = faiss.IndexLSH(d, n_bits)   # build the index
index.add(xb)                  # add vectors to the index
D, I = index.search(xb, k) # sanity check: every query vector is itself stored in the index
print(I[:5])                   # rows of I for the first five queries
evaluate(I, df, k, True)       # True: output also reports "recall at label"

[[     0   2781   1556]
 [     1  24118  25610]
 [     2  70506  30084]
 [110150 115061      3]
 [ 19442  45416      4]]
 recall at index:  0.9541,  missing rate: 0.0000
 recall at label:  1.0000


## Vector/TLSH Comparison by Searching Each Other

Check whether two vectors/TLSH digests occur together, i.e. whether each appears in the other's result list when used as the query. (Top and bottom N)

In [7]:
# Widen the search to 200 results per query. This reuses the index left by the last
# cell that built one (it is not rebuilt here) and rebinds the global k.
k = 200
D, I = index.search(xb, k) # search every stored vector against the index
print(I[:2])               # result rows for the first two queries

[[     0   2781   1556  42881  97650  55732  11763 118391  55084  92715
   86953 116081  46712  99714  75815  16621  97065  14007 128790  85151
   83262  67487  55801  68448 122987  14947  78558   5733  90970  78986
   71740  40668  97105  91916 117504  15141 123632  97684  69675  80598
  120115 116935 102693 120990 115364  43809  46651  88107  78339  22511
  117562  32594  20455  27660  12636  68944   6546  51119  76204  81887
   85421  31013 108608 105494 131330  85029  98965  18117  50070  72027
   40376  31862  95600  19831 103480   1370  27123   6188  83854  78755
   44545  23909 116232  86901 109866  87562 104323   2775   6634  18084
  122300  72750   7965  17822 128484  85232  69554  30099  93598   6484
    8054 100341  64479 124927  38021 127168  75061  71267  78532  32556
   44959   1712  56139  16353  54738  29030  25067   9414 104290  45484
   86123  10967  45232  57445  41530  70987  19665  48491  77953 118303
    8038  56732  13188  40033  18749  67939 115268  29478  45877

In [8]:
# Query the index with four hand-picked rows of xb and compare their result lists.
# Row 2781 is the first non-self result for row 0 in the earlier k = 3 output.
# NOTE(review): the reason for choosing rows 110095 and 96159 is not shown in this notebook — document it.
query =  xb[[ 0, 2781, 110095 , 96159 ]]
D, I = index.search(query, k) # k is still 200 from the previous cell
print(I[:4])                  # one result row per query vector

[[     0   2781   1556  42881  97650  55732  11763 118391  55084  92715
   86953 116081  46712  99714  75815  16621  97065  14007 128790  85151
   83262  67487  55801  68448 122987  14947  78558   5733  90970  78986
   71740  40668  97105  91916 117504  15141 123632  97684  69675  80598
  120115 116935 102693 120990 115364  43809  46651  88107  78339  22511
  117562  32594  20455  27660  12636  68944   6546  51119  76204  81887
   85421  31013 108608 105494 131330  85029  98965  18117  50070  72027
   40376  31862  95600  19831 103480   1370  27123   6188  83854  78755
   44545  23909 116232  86901 109866  87562 104323   2775   6634  18084
  122300  72750   7965  17822 128484  85232  69554  30099  93598   6484
    8054 100341  64479 124927  38021 127168  75061  71267  78532  32556
   44959   1712  56139  16353  54738  29030  25067   9414 104290  45484
   86123  10967  45232  57445  41530  70987  19665  48491  77953 118303
    8038  56732  13188  40033  18749  67939 115268  29478  45877

# Potential Cluster Quality Measurement

In [16]:
# homogeneity
# https://github.trendmicro.com/muqeeta/Fuzzy_research/blob/master/tlsh_analysis_notebooks/tlsh_analysis_forest_clustering.ipynb

In [17]:
# purity
# https://github.trendmicro.com/muqeeta/Fuzzy_research/blob/master/tlsh_analysis_notebooks/tlsh_analysis_malware_family.ipynb

In [18]:
# silhouette_score
#https://github.trendmicro.com/muqeeta/Fuzzy_research/blob/master/KMeans/KMeans_MP.py