<a href="https://colab.research.google.com/github/kaantopcu/Reverse-Image-Search/blob/main/notebooks/Caltech101_Data_Preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import requests
import numpy as np
from numpy.linalg import norm
import pickle
import time
from tqdm import tqdm, tqdm_notebook
import torchvision.datasets as datasets
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from keras.models import Model
from keras.layers import GlobalAveragePooling2D
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import random
from sklearn.decomposition import PCA
import gdown
import shutil
from sklearn.manifold import TSNE

In [None]:
# Folder, relative to the notebook's working directory, that holds the datasets
relative_path = 'datasets'

# Anchor that folder to the current working directory to get its full path
working_dir = os.getcwd()
absolute_path = os.path.join(working_dir, relative_path)

# Dataset Download

## Google Colab

In [None]:
!mkdir -p {absolute_path}
!pip install gdown

# Download the dataset
!gdown https://drive.google.com/uc?id=137RyRjvTBkBiIfeYBNZBtViDHQ6_Ewsp --output {absolute_path}/caltech101.tar.gz

# Extract the dataset
!tar -xvzf {absolute_path}/caltech101.tar.gz -C {absolute_path}

# Rename the directory
!mv {absolute_path}/101_ObjectCategories {absolute_path}/caltech101

# Remove unwanted directory
!rm -rf {absolute_path}/caltech101/BACKGROUND_Google

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
101_ObjectCategories/chair/image_0005.jpg
101_ObjectCategories/chair/image_0006.jpg
101_ObjectCategories/chair/image_0007.jpg
101_ObjectCategories/chair/image_0008.jpg
101_ObjectCategories/chair/image_0010.jpg
101_ObjectCategories/chair/image_0011.jpg
101_ObjectCategories/chair/image_0012.jpg
101_ObjectCategories/chair/image_0013.jpg
101_ObjectCategories/chair/image_0014.jpg
101_ObjectCategories/chair/image_0016.jpg
101_ObjectCategories/chair/image_0017.jpg
101_ObjectCategories/chair/image_0018.jpg
101_ObjectCategories/chair/image_0019.jpg
101_ObjectCategories/chair/image_0020.jpg
101_ObjectCategories/chair/image_0022.jpg
101_ObjectCategories/chair/image_0023.jpg
101_ObjectCategories/chair/image_0024.jpg
101_ObjectCategories/chair/image_0025.jpg
101_ObjectCategories/chair/image_0026.jpg
101_ObjectCategories/chair/image_0028.jpg
101_ObjectCategories/chair/image_0029.jpg
101_ObjectCategories/chair/image_0030.jpg
101_ObjectC

In [None]:
# Mount Google Drive at /content/drive (the google.colab package is Colab-specific)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Drive folder that receives the dataset and the generated pickle files
destination_folder = "/content/drive/MyDrive/DataProjects/Reverse_Image_Search"

In [None]:
!mv {absolute_path}/caltech101 {destination_folder}

## Amazon Sagemaker Studio Lab

In [None]:
# Root directory handed to torchvision for the download. It is kept relative so
# that it lines up with the "./caltech_101/caltech101/101_ObjectCategories"
# path the clean-up cell below works on; "/content" is a Colab-specific location.
root_dir = "./caltech_101"

try:
    # Download the Caltech 101 dataset into root_dir
    caltech101_dataset = datasets.Caltech101(root=root_dir, download=True)
except FileExistsError:
    # Nothing to do when the files are already on disk
    print("Caltech 101 dataset already exists at", root_dir)

'\n\n# Specify the root directory where the dataset will be downloaded\nroot_dir = "/content/caltech_101"\n\ntry:\n    # Try to download the Caltech 101 dataset\n    caltech101_dataset = datasets.Caltech101(root=root_dir, download=True)\nexcept FileExistsError:\n    # If the dataset is already downloaded and verified, don\'t do anything\n    print("Caltech 101 dataset already exists at", root_dir)\n\n'

In [None]:
# Category folders of the torchvision download, and the clutter class to drop
root_dir = "./caltech_101/caltech101/101_ObjectCategories"
dir_to_remove = os.path.join(root_dir, "BACKGROUND_Google")

# Only attempt the removal when the folder is actually there
if not os.path.exists(dir_to_remove):
    print(f"Directory '{dir_to_remove}' does not exist.")
else:
    # rmtree deletes the folder together with everything inside it
    shutil.rmtree(dir_to_remove)
    print(f"Directory '{dir_to_remove}' and its contents have been successfully removed.")

# Feature Extraction

In [None]:
# The images are read from <absolute_path>/caltech101
root_dir = os.path.join(absolute_path, 'caltech101')

# Run ResNet50's preprocess_input on every image the generator yields
datagen = image.ImageDataGenerator(preprocessing_function=preprocess_input)

# class_mode=None: no labels are yielded; shuffle=False: keep a fixed file order
generator = datagen.flow_from_directory(
    directory=root_dir,
    shuffle=False,
    class_mode=None,
    target_size=(224, 224),
)

Found 8677 images belonging to 101 classes.


In [None]:
# ResNet50 with ImageNet weights and without the classification head
# (include_top=False); pooling="avg" requests average-pooled output features.
model = ResNet50(weights='imagenet',
                 include_top=False,
                 pooling = "avg",
                 input_shape=(224, 224, 3))

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
# Print the layer-by-layer architecture of the loaded network
model.summary()

Model: "resnet50"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 conv1_pad (ZeroPadding2D)   (None, 230, 230, 3)          0         ['input_1[0][0]']             
                                                                                                  
 conv1_conv (Conv2D)         (None, 112, 112, 64)         9472      ['conv1_pad[0][0]']           
                                                                                                  
 conv1_bn (BatchNormalizati  (None, 112, 112, 64)         256       ['conv1_conv[0][0]']          
 on)                                                                                       

In [None]:
def extract_features(img_path, model, target_size=(224, 224)):
    """Return the L2-normalised feature vector `model` produces for one image.

    Args:
        img_path: Path of the image file to load.
        model: Object exposing a Keras-style ``predict(batch, verbose=...)``.
        target_size: ``(height, width)`` the image is resized to on load.
            Defaults to the 224x224 input size used in this notebook.

    Returns:
        1-D array holding the flattened prediction divided by its Euclidean
        norm. An all-zero prediction is returned unscaled instead of as NaNs.
    """
    img = image.load_img(img_path, target_size=target_size)
    img_array = image.img_to_array(img)
    # predict works on batches, so add a leading axis of size one
    batch = np.expand_dims(img_array, axis=0)
    # verbose=0: this runs once per image, and the default per-call progress
    # output would otherwise flood the notebook when looping over a dataset
    features = model.predict(preprocess_input(batch), verbose=0)

    flattened_features = features.flatten()
    length = norm(flattened_features)
    if length == 0:
        # Dividing by a zero norm would turn every component into NaN
        return flattened_features
    return flattened_features / length

In [None]:
# Sanity check: extract the feature vector of one dataset image, then show
# its length and values
sample_image = os.path.join(absolute_path, 'caltech101/Faces/image_0001.jpg')
features = extract_features(sample_image,model)
print(len(features))
features

2048


array([0.00742651, 0.0040505 , 0.00421501, ..., 0.08466108, 0.01898376,
       0.03370329], dtype=float32)

In [None]:
# File extensions that get_file_list accepts when collecting image paths
extensions = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG']

In [None]:
def get_file_list(root_dir, allowed_extensions=None):
    """Recursively collect the paths of image files below `root_dir`.

    Args:
        root_dir: Directory to walk.
        allowed_extensions: Iterable of file suffixes (e.g. ``'.jpg'``) or a
            single suffix string. When omitted, the notebook-level
            ``extensions`` list is used.

    Returns:
        List of file paths in ``os.walk`` order (not sorted). A file is kept
        only when its name *ends* with one of the suffixes, compared
        case-insensitively. A missing `root_dir` gives an empty list.
    """
    if allowed_extensions is None:
        allowed_extensions = extensions
    if isinstance(allowed_extensions, str):
        allowed_extensions = [allowed_extensions]
    # str.endswith accepts a tuple, so all suffixes are checked in one call
    suffixes = tuple(ext.lower() for ext in allowed_extensions)

    return [
        os.path.join(dirpath, fname)
        for dirpath, _, fnames in os.walk(root_dir)
        for fname in fnames
        # Suffix test, not substring: "c.jpg.txt" must not count as an image
        if fname.lower().endswith(suffixes)
    ]

In [None]:
#root_dir = "./caltech_101"
# Collect every image path below root_dir; sorting gives a deterministic order
filenames = sorted(get_file_list(root_dir))
# Preview only the first few paths: echoing the full list floods the output
filenames[:5]

['/content/datasets/caltech101/Faces/image_0001.jpg',
 '/content/datasets/caltech101/Faces/image_0002.jpg',
 '/content/datasets/caltech101/Faces/image_0003.jpg',
 '/content/datasets/caltech101/Faces/image_0004.jpg',
 '/content/datasets/caltech101/Faces/image_0005.jpg',
 '/content/datasets/caltech101/Faces/image_0006.jpg',
 '/content/datasets/caltech101/Faces/image_0007.jpg',
 '/content/datasets/caltech101/Faces/image_0008.jpg',
 '/content/datasets/caltech101/Faces/image_0009.jpg',
 '/content/datasets/caltech101/Faces/image_0010.jpg',
 '/content/datasets/caltech101/Faces/image_0011.jpg',
 '/content/datasets/caltech101/Faces/image_0012.jpg',
 '/content/datasets/caltech101/Faces/image_0013.jpg',
 '/content/datasets/caltech101/Faces/image_0014.jpg',
 '/content/datasets/caltech101/Faces/image_0015.jpg',
 '/content/datasets/caltech101/Faces/image_0016.jpg',
 '/content/datasets/caltech101/Faces/image_0017.jpg',
 '/content/datasets/caltech101/Faces/image_0018.jpg',
 '/content/datasets/caltech1

In [None]:
# One feature vector per image, in the same order as `filenames`.
# Uses `tqdm` (already imported) instead of the deprecated `tqdm_notebook`.
feature_list = [
    extract_features(path, model)
    for path in tqdm(filenames, desc="Extracting features")
]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(range(len(filenames))):


  0%|          | 0/8677 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [None]:
# Sanity check: number of extracted vectors and the length of the first one
print("total images:",len(feature_list))
print("features in images:",len(feature_list[0]))

In [None]:
# Persist the extracted features and the generator's class ids.
# `with` closes (and thereby flushes) each file as soon as it is written,
# instead of leaving the handle from a bare open() to the garbage collector.
with open("./features-caltech101-resnet50.pickle", 'wb') as features_file:
    pickle.dump(feature_list, features_file)
#pickle.dump(filenames, open("./filenames-caltech101.pickle",'wb'))
with open('./class_ids-caltech101.pickle', 'wb') as class_ids_file:
    pickle.dump(generator.classes, class_ids_file)

In [None]:
# Push them to drive

# Only the pickles this notebook actually writes. The filenames pickle is not
# produced here, so a copy already on Drive is left alone instead of being
# deleted without a replacement.
files_to_push = [
    'features-caltech101-resnet50.pickle',
    'class_ids-caltech101.pickle',
]

for filename in files_to_push:
    # Check the local file first so a missing pickle cannot cost us the Drive copy
    if not os.path.exists(filename):
        raise FileNotFoundError(f"Nothing to upload: '{filename}' was not found")

    file_path = os.path.join(destination_folder, filename)
    # Remove the stale copy so the move below always lands on a free path
    if os.path.exists(file_path):
        os.remove(file_path)

    # Move the file to the destination folder
    shutil.move(filename, file_path)

'/content/drive/MyDrive/DataProjects/Reverse_Image_Search/class_ids-caltech101.pickle'