<a href="https://colab.research.google.com/github/pertvirt/hello_world/blob/master/data_downloader_image_retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import**

In [0]:

# For downloading dataset
from urllib.request import urlretrieve
import os

# For extracting dataset
import tarfile

# For reading images
import cv2

# Essentials :)
import numpy as np

# pretty printing python objects
import pprint

# for sorting dictionary by value
import operator

# for showing images inline
from matplotlib.pyplot import imshow 
%matplotlib inline 

# for making labels one-hot encoded
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# for splitting data into training and validation data
from sklearn.model_selection import train_test_split

# for CNN and NN models
from keras.models import Sequential, Model
from keras.layers import Conv2D, Input, Dropout, Activation, Dense, MaxPooling2D, Flatten, GlobalAveragePooling2D
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.models import load_model

# For transfer learning
from keras.applications.inception_v3 import InceptionV3

# to save models
import json

# for saving environment of notebook
import dill

# for printing size each variable is using
import sys


In [4]:
URL_CALTECH_101_DATA = 'http://www.vision.caltech.edu/Image_Datasets/Caltech101/101_ObjectCategories.tar.gz'

def download_dataset(url):
    current_directory = os.path.dirname(os.path.realpath('__file__'))
    dataset_file_path = current_directory+"/dataset.tgz"
    if os.path.exists(dataset_file_path):
        print("Already downloaded.")
    else:
        filename, headers = urlretrieve(url, dataset_file_path)        
    print("Done")

download_dataset(URL_CALTECH_101_DATA)

Done


In [5]:
def extract_dataset(dataset_file_path, extraction_directory):
    if (not os.path.exists(extraction_directory)):
        os.makedirs(extraction_directory)
    if (dataset_file_path.endswith("tar.gz") or dataset_file_path.endswith(".tgz")):
        tar = tarfile.open(dataset_file_path, "r:gz")
        tar.extractall(path=extraction_directory)
        tar.close()
    elif (dataset_file_path.endswith("tar")):
        tar = tarfile.open(dataset_file_path, "r:")
        tar.extractall(path=extraction_directory)
        tar.close()
    print("Done")

extract_dataset('./dataset.tgz','./data/')

Done


In [0]:
def return_images_per_category(data_directory):
    categories = os.listdir(data_directory+"/101_ObjectCategories/")
    object_images_count_dict = {}
    for category in categories:
        object_images_count_dict[category] = len(os.listdir(data_directory+"/101_ObjectCategories/"+category))
    object_images_count_dict = sorted(object_images_count_dict.items(), key=operator.itemgetter(1), reverse=True)
    return object_images_count_dict

return_images_per_category('./data')

In [7]:
total_count = 0
for category,count in return_images_per_category('./data'):
    total_count += count
print("Total number of images in training data : ",total_count)

Total number of images in training data :  9145


In [8]:
total_count

9145

In [10]:
!pip install annoy

Collecting annoy
[?25l  Downloading https://files.pythonhosted.org/packages/cc/66/eab272ae940d36d698994058e303fe7d1264d10ec120e0a508d0c8fb3ca5/annoy-1.16.2.tar.gz (636kB)
[K     |▌                               | 10kB 19.0MB/s eta 0:00:01[K     |█                               | 20kB 4.2MB/s eta 0:00:01[K     |█▌                              | 30kB 6.0MB/s eta 0:00:01[K     |██                              | 40kB 7.7MB/s eta 0:00:01[K     |██▋                             | 51kB 5.0MB/s eta 0:00:01[K     |███                             | 61kB 5.9MB/s eta 0:00:01[K     |███▋                            | 71kB 6.7MB/s eta 0:00:01[K     |████▏                           | 81kB 7.5MB/s eta 0:00:01[K     |████▋                           | 92kB 8.2MB/s eta 0:00:01[K     |█████▏                          | 102kB 6.6MB/s eta 0:00:01[K     |█████▋                          | 112kB 6.6MB/s eta 0:00:01[K     |██████▏                         | 122kB 6.6MB/s eta 0:00:01[K    

In [12]:
import numpy as np
import glob

from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input, VGG16
from keras.models import Model

# from google.colab import drive
# drive.mount('./gdrive')

# !pip install annoy
from annoy import AnnoyIndex

IMAGE_BASE_PATH = "data"
ANNOY_MODEL_PATH = "models/celtech.ann"
ANNOY_DIMENTION = 4096

# VGG19から中間層を抽出
base_model = VGG16(weights="imagenet")
model = Model(inputs=base_model.input, outputs=base_model.get_layer("fc2").output)

# Annoyのモデルを構築
annoy_model = AnnoyIndex(ANNOY_DIMENTION)

# 画像をベクトルに変換してAnnoyに登録

all_files = glob.glob(IMAGE_BASE_PATH + "/**/*.jpg", recursive=True)
# len(all_files)

for i, path in enumerate(all_files):
    # img_path = IMAGE_BASE_PATH + "ramen" + str(i) + ".jpg"
    img = image.load_img(path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)

    fc2_features = model.predict(x)

    annoy_model.add_item(i, fc2_features[0])
    print(i, path, "Done!")

annoy_model.build(len(all_files))
annoy_model.save(ANNOY_MODEL_PATH)



0 data/101_ObjectCategories/ketch/image_0088.jpg Done!
1 data/101_ObjectCategories/ketch/image_0080.jpg Done!
2 data/101_ObjectCategories/ketch/image_0044.jpg Done!
3 data/101_ObjectCategories/ketch/image_0076.jpg Done!
4 data/101_ObjectCategories/ketch/image_0009.jpg Done!
5 data/101_ObjectCategories/ketch/image_0109.jpg Done!
6 data/101_ObjectCategories/ketch/image_0040.jpg Done!
7 data/101_ObjectCategories/ketch/image_0097.jpg Done!
8 data/101_ObjectCategories/ketch/image_0091.jpg Done!
9 data/101_ObjectCategories/ketch/image_0099.jpg Done!
10 data/101_ObjectCategories/ketch/image_0013.jpg Done!
11 data/101_ObjectCategories/ketch/image_0061.jpg Done!
12 data/101_ObjectCategories/ketch/image_0043.jpg Done!
13 data/101_ObjectCategories/ketch/image_0028.jpg Done!
14 data/101_ObjectCategories/ketch/image_0055.jpg Done!
15 data/101_ObjectCategories/ketch/image_0045.jpg Done!
16 data/101_ObjectCategories/ketch/image_0038.jpg Done!
17 data/101_ObjectCategories/ketch/image_0063.jpg Done!
18

True

In [20]:
import os
import numpy as np
import glob

from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input, VGG16
from keras.models import Model

from annoy import AnnoyIndex

IMAGE_BASE_PATH = "data"
ANNOY_MODEL_PATH = "models/celtech.ann"
ANNOY_DIMENTION = 4096
SEARCH_IMAGE_PATH = "/content/hai-quan-viet-nam-huan-luyen-bang-thuyen-buom-loi-ich-kho-tin.jpg"

# [/28, 118, 181, 69, 273]

#
# VGG19から中間層を抽出
base_model = VGG16(weights="imagenet")
model = Model(inputs=base_model.input, outputs=base_model.get_layer("fc2").output)

all_files = glob.glob(IMAGE_BASE_PATH + "/**/*.jpg", recursive=True)

# Annoyのモデルを構築
loaded_model = AnnoyIndex(ANNOY_DIMENTION)
loaded_model.load(ANNOY_MODEL_PATH)

# 検索対象の画像をロードして、ベクトルに変換
img_path = SEARCH_IMAGE_PATH
img = image.load_img(img_path, target_size=(224, 224))

x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)

fc2_features = model.predict(x)

# Annoyで検索
items = loaded_model.get_nns_by_vector(fc2_features[0], 5, search_k=-1, include_distances=False)
print(items)
for i in range(0, len(items)):
  print(all_files[items[i]])



[92, 56, 5, 52, 77]
data/101_ObjectCategories/ketch/image_0041.jpg
data/101_ObjectCategories/ketch/image_0054.jpg
data/101_ObjectCategories/ketch/image_0109.jpg
data/101_ObjectCategories/ketch/image_0035.jpg
data/101_ObjectCategories/ketch/image_0004.jpg
