Specify your GCP Project ID and Bucket Name. 

You must create the GCP project before running this step.

In [0]:
# GCP project to work in — replace the placeholder with your own project ID.
project_id = 'Your_project_ID_here'
# Cloud Storage bucket used by the gsutil cells further down — replace the placeholder.
bucket_name = 'Your_bucket_name_here'

## Download & extract dataset
We use [COCO dataset](http://cocodataset.org/) in this example.

In [0]:
!wget http://images.cocodataset.org/zips/val2017.zip

In [0]:
# -n: never overwrite files that already exist; -q: quiet (no per-file listing).
!unzip -n -q val2017.zip

Count the extracted files. There should be 5000.

In [0]:
import os

# os.walk yields (dirpath, dirnames, filenames) tuples; the first one describes
# the top level of ./val2017, so its file list is what gets counted.
top_level = next(os.walk("./val2017"))
path, dirs, files = top_level
len(files)

Let's check that we can read an image by opening a randomly chosen one.

In [0]:
import random
# Draw a single file name at random from the extracted image folder.
candidates = os.listdir("val2017/")
fname = random.sample(candidates, k=1)[0]

In [0]:
from PIL import Image
# Open the sampled file, force three colour channels, then resize to 224x224.
sample_path = f'./val2017/{fname}'
im = Image.open(sample_path).convert('RGB').resize((224, 224))
# Last expression of the cell: the notebook renders the image inline.
im

## Download tflite

In [0]:
!wget https://github.com/google-coral/edgetpu/raw/master/test_data/mobilenet_v1_1.0_224_quant_embedding_extractor.tflite

## Install requirements

In [0]:
!pip3 install pybind11 numpy setuptools tqdm hnswlib==0.3.4

## Extract features from images

In [0]:
import tensorflow as tf
import numpy as np

# Build the TFLite interpreter from the downloaded model file and allocate
# its tensors before any inference call.
model_file = "mobilenet_v1_1.0_224_quant_embedding_extractor.tflite"
interpreter = tf.lite.Interpreter(model_path=model_file)
interpreter.allocate_tensors()

# Keep the tensor metadata around; extract_feature() uses the 'index' entries
# to address the input and output tensors.
output_details = interpreter.get_output_details()
input_details = interpreter.get_input_details()
print(input_details)

def extract_feature(img):
  """Run the embedding model on one image and return its feature vector.

  Uses the module-level `interpreter`, `input_details` and `output_details`.

  Args:
    img: Anything np.array() can convert (e.g. a PIL image). No resizing or
      dtype casting is done here, so it should already match the model's
      input tensor.

  Returns:
    The model's first output tensor flattened to shape (1, N), where N is the
    number of values the model emits (1024 for the extractor loaded above).
  """
  # Add the leading batch axis: (H, W, C) -> (1, H, W, C).
  input_data = np.expand_dims(np.array(img), axis=0)
  interpreter.set_tensor(input_details[0]['index'], input_data)

  interpreter.invoke()

  output_data = interpreter.get_tensor(output_details[0]['index'])
  # Flatten every non-batch axis; -1 avoids hard-coding the embedding width.
  return output_data.reshape((1, -1))

# Smoke test: run the extractor on the sample image `im` opened earlier and
# display the resulting array.
extract_feature(im)

#### Iterate for all images
Note that this takes several minutes.

In [0]:
from tqdm import tqdm

FEATURE_DIM = 1024  # width of the (1, 1024) array returned by extract_feature()
directory = "val2017/"

# List the folder once so the array size, the progress-bar total and the row
# order all come from the same list. Sorting makes the row <-> file-name
# mapping reproducible across runs (os.listdir order is arbitrary).
filenames = sorted(os.listdir(directory))
features = np.zeros((len(filenames), FEATURE_DIM), np.uint8)

# enumerate(tqdm(...)) rather than tqdm(enumerate(...)): tqdm can then read the
# list length and show a real progress bar with an ETA.
for index, fname in enumerate(tqdm(filenames)):
  im = Image.open(directory + fname).convert('RGB').resize((224, 224))
  feature = extract_feature(im)
  # Row `index` of `features` belongs to filenames[index].
  features[index] = feature

features.shape

#### Save filename list

In [0]:
import pickle

# Persist the file-name list; position i in the list matches row i of `features`.
with open('file_names.pkl', 'wb') as pkl_file:
  pickle.dump(filenames, pkl_file)

#### Generate index for fast approximate nearest neighbor search

In [0]:
import hnswlib

# Index geometry comes straight from the feature matrix: one row per image.
num_elements, dim = features.shape

# Integer ids 0..N-1; id i is stored with row i of `features`.
data_labels = np.arange(num_elements)

# Cosine-space index (the other options are 'l2' and 'ip').
p = hnswlib.Index(space='cosine', dim=dim)

# The capacity has to be given up front, before any item is inserted.
p.init_index(max_elements=num_elements, ef_construction=200, M=16)

# Insert every feature row together with its integer id.
p.add_items(features, data_labels)

# ef controls recall; it should always be larger than the k used in queries.
p.set_ef(50)

# Write the index to disk so it can be uploaded later.
p.save_index('index_5k.bin')

In [0]:
# List the working directory to check that the generated files
# (file_names.pkl, index_5k.bin) are present.
!ls -la

## Upload files to Google Cloud Storage

#### Authenticate GCP user

In [0]:
# google.colab is the Colab helper package; this cell is expected to run in Colab.
from google.colab import auth
# Authenticate the GCP user before the gcloud/gsutil cells below.
# NOTE(review): presumably this opens an interactive sign-in flow — confirm.
auth.authenticate_user()

#### Set Project ID

In [0]:
# Set the gcloud project to the value of the Python variable project_id
# (IPython substitutes the braces before running the shell command).
!gcloud config set project {project_id}

#### Create GCS Bucket
If you have already created the bucket, you can skip this step.

In [0]:
# Make the bucket (mb) in the asia-northeast1 location (-l).
# Skip this cell if the bucket already exists.
!gsutil mb -l asia-northeast1 gs://{bucket_name}

#### Copy images to GCS
These files are served directly from the bucket, which is why the next step makes them publicly readable.

In [0]:
# Synchronise the local val2017 folder into the bucket's images/ prefix
# (-m lets gsutil run the transfers in parallel).
!gsutil -m rsync val2017 gs://{bucket_name}/images

#### Make images public
This step makes all images under `/images` readable to everyone.

In [0]:
# Grant read (R) permission to AllUsers on the objects under images/,
# i.e. make the uploaded images publicly readable.
!gsutil -m acl -r ch -u AllUsers:R gs://{bucket_name}/images

#### Copy index file to GCS
This file will be used in Cloud Run.

In [0]:
# Upload the saved index and the pickled file-name list to the bucket root.
!gsutil cp index_5k.bin gs://{bucket_name}/
!gsutil cp file_names.pkl gs://{bucket_name}/