# ScaNNで探索を行うため、イラスト顔ベクトルの索引を作る

In [None]:
# Install the ScaNN package, which is imported below as `scann`.
! pip install scann

In [None]:
# Download vectors.sql (release 0.0 of kosuke1701/illust-search) into the working directory.
! wget https://github.com/kosuke1701/illust-search/releases/download/0.0/vectors.sql

In [None]:
import io
import sqlite3

import numpy as np
import scann

In [None]:
# The following code, which defines an `array` type for sqlite3, is copied from this Stack Overflow question:
# https://stackoverflow.com/questions/18621513
# question by:
# Joe Flip (https://stackoverflow.com/users/1715453/joe-flip)
# answered by:
# unutbu (https://stackoverflow.com/users/190597/unutbu)
def adapt_array(arr):
    """Serialize a NumPy array into a binary value that sqlite3 can store.

    The array is written in NumPy's ``.npy`` format (via ``np.save``) to an
    in-memory buffer, and the buffer's full contents are wrapped in
    ``sqlite3.Binary``.

    Source: http://stackoverflow.com/a/31312102/190597 (SoulNibbler)
    """
    buffer = io.BytesIO()
    np.save(buffer, arr)
    # getvalue() returns the whole buffer regardless of the current position.
    return sqlite3.Binary(buffer.getvalue())
def convert_array(text):
    """Rebuild a NumPy array from the ``.npy`` bytes stored in the database.

    ``text`` is the raw bytes value handed over by sqlite3; it is wrapped in
    an in-memory buffer and decoded with ``np.load``.
    """
    # A freshly created BytesIO is already positioned at offset 0.
    return np.load(io.BytesIO(text))
# When inserting, np.ndarray values are passed through adapt_array, which
# returns them as a binary value (sqlite3.Binary), not as text.
sqlite3.register_adapter(np.ndarray, adapt_array)
# When selecting, values of columns whose declared type is "array" are passed
# through convert_array to get an np.ndarray back. This only takes effect on
# connections opened with detect_types=sqlite3.PARSE_DECLTYPES.
sqlite3.register_converter("array", convert_array)

In [None]:
# Load every row of the `face` table into a single NumPy structured array.
dim = 500  # number of float32 components in the "vector" field below
# One record per row. NOTE(review): assumes the table's columns come back in
# exactly this order (id, face, xmin, xmax, ymin, ymax, vector) — confirm
# against the schema of vectors.sql.
data_dtype = [
    ("id", int), ("face", int), ("xmin", int), ("xmax", int),
    ("ymin", int), ("ymax", int), ("vector", np.float32, dim),
]

# PARSE_DECLTYPES makes sqlite3 run the converter registered for a column's
# declared type, which is how stored vectors come back as np.ndarray.
conn = sqlite3.connect("vectors.sql", detect_types=sqlite3.PARSE_DECLTYPES)
try:
    c = conn.cursor()

    c.execute("SELECT * FROM face ORDER BY id")
    _data = c.fetchall()
finally:
    # `with sqlite3.connect(...)` only commits or rolls back the transaction;
    # it never closes the connection, so close it explicitly here.
    conn.close()
data = np.array(_data, dtype=data_dtype)

In [None]:
# Build the index (takes roughly 1-2 hours).
# NOTE(review): the positional arguments (10, 2, 100) are passed through
# unchanged; see the ScaNN builder documentation for their meaning.
searcher = (
    scann.scann_ops_pybind.builder(data["vector"], 10, "dot_product")
    .tree(num_leaves=2000, num_leaves_to_search=100, training_sample_size=250000)
    .score_ah(2, anisotropic_quantization_threshold=0.2)
    .reorder(100)
    .build()
)

In [None]:
# Sanity check: search the index using one of the stored vectors as the query.
query = data["vector"][400]
neighbors, distances = searcher.search(query, final_num_neighbors=20)

# Print the query's dot product with itself, then its dot product with each
# vector returned by the search.
print(query.dot(query))
print((query[np.newaxis, :] * data["vector"][neighbors]).sum(axis=1))

## 保存

必要に応じて作成した索引を外部に保存する。

In [None]:
# Create the directory the index is serialized into. -p makes the command
# succeed when the directory already exists, so the cell can be re-run.
! mkdir -p scann_save_dir

In [None]:
# Save the index into the "scann_save_dir" directory, then load a second
# searcher back from that same directory.
# NOTE(review): serialize() presumably needs the directory to exist already — confirm.
searcher.serialize("scann_save_dir")
saved_searcher = scann.scann_ops_pybind.load_searcher("scann_save_dir")

In [None]:
# Check that the index was saved correctly: run a search with the searcher
# that was loaded back from disk.
query = data["vector"][400]
neighbors, distances = saved_searcher.search(query, final_num_neighbors=20)

# Print the query's dot product with itself, then its dot product with each
# vector returned by the reloaded searcher.
print(query.dot(query))
print((query[np.newaxis, :] * data["vector"][neighbors]).sum(axis=1))

In [None]:
# Pack the saved index directory recursively into scann_index.zip.
! zip -r scann_index.zip scann_save_dir

In [None]:
# Mount Google Drive at /content/drive (uses the google.colab package, so this
# cell is presumably meant to run inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Copy the saved index directory (not the zip archive) into "My Drive" under /content/drive.
! cp -r scann_save_dir "/content/drive/My Drive"