In [42]:
%matplotlib inline
import matplotlib.pyplot as plt

import sys
sys.path.append("./CLIP")
import os
import clip
import torch
from PIL import Image
from IPython.display import Image as ImageDisplay
from utils import load_pickle, save_obj_as_pickle
import numpy as np
from matplotlib.image import imread
from matplotlib.pyplot import figure, imshow, axis
from jupyter_utils import plot_time_jupyter, plot_scores_jupyter
import time

# Pick the compute device: use CUDA when torch reports it as available, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The cell below contains several datasets that I have prepared for you. These include:
- Smallest (for debugging purposes only, since this dataset is too small to retrieve anything meaningful)
- Medium (2740000 images; you may try your prompts with this dataset first, then switch to the Large one)
- Large (7850000 images, the one we used in the CLEAR paper)
- Even Larger (??, I am still downloading more images, stay tuned)

# I also extracted CLIP features using 3 models (ranked by model size; larger models may perform better):
- ResNet50 (RN50)
- ResNet101 (RN101)
- ResNet50x4 (RN50x4)

In [43]:
from prepare_dataset import get_knearest_models_func, load_bucket_dict
import faiss_utils
from faiss_utils import KNearestFaissFeatureChunks

# Smallest dataset (debugging only): images whose shorter edge is larger than 120px
# and whose max aspect ratio is smaller than 2
SMALLEST_DATASET = "/scratch/zhiqiu/yfcc100m_all_new/images_minbyte_10_valid_uploaded_date_minedge_120_maxratio_2.0/"
# Medium dataset: 2740000 images
MEDIUM_DATASET = "/scratch/zhiqiu/yfcc100m_all/images_minbyte_10_valid_uploaded_date_jan_31/" 
# Large dataset: 7850000 images (the one we used in the CLEAR paper)
LARGE_DATASET = "/scratch/zhiqiu/yfcc100m_all/images_minbyte_10_valid_uploaded_date_feb_18/" 

In [44]:
# Make a cell for a custom concept group. Then save this concept group locally? (User, date, concept lists)
# Prepare a script to collect the concept group.
# Prepare a script to save the concept group locally.
# Teach them about chmod for public access

# Modify the cell below to create your own dataset

In [47]:
# This cell defines one group of visual concepts, i.e. the configuration of one dataset to collect.
# Edit the values below (or uncomment one of the alternatives) to describe your own dataset.
concept_group_dict = {
    'USERNAME' : "zhiqiul", # Your username
    'DATE' : "2021-07-28", # Date, kept for reference
    'GROUPNAME' : "CLEAR10", # Name of this concept group; change it to your own
#     'GROUPNAME' : "CLEAR10-MEDIUM", # Change to your own name
    'PREFIX' : "", # Optional prefix added to all visual concepts, such as 'a photo of'
    'ALLOW_OVERLAP' : False, # If False, images that appear in multiple categories will be removed
    'FOLDER_PATH' : LARGE_DATASET, # The path to the dataset created by prepare_dataset.py
#     'FOLDER_PATH' : MEDIUM_DATASET, # The path to the dataset created by prepare_dataset.py
    'CLIP_MODEL' : 'RN50', # The pre-trained model used for extracting CLIP features
    'NUM_OF_BUCKETS' : 11, # The number of buckets (segments) in the dataset
    'NUM_OF_IMAGES_PER_CLASS_PER_BUCKET' : 600, # The number of images to retrieve per class per bucket
    'NUM_OF_IMAGES_PER_CLASS_PER_BUCKET_TO_QUERY' : 16000, # The number of images to query (must be larger than the number to retrieve above)
    'BACKGROUND' : True, # If True, add an additional negative class
    'NEGATIVE_RATIO' : 0.1, # The ratio of negative samples per class to keep
    'SAVE_PATH' : "/data3/zhiqiul/clear_datasets", # The images will be saved under this path
    'GROUP' : [ # The visual concepts in this group
        'laptop',
        'camera',
        'bus',
        'sweater',
        'dress',
        'racing',
        'hockey',
        'cosplay',
        'baseball',
        'soccer',
    ]
}

from prepare_concepts import get_dataset_name, get_save_path, get_concept_group_dict_path, prepare_dataset_folder

# Here I copied all the function definitions in prepare_concepts.py for reference

# def get_dataset_name(concept_group_dict):
#     return "-".join([concept_group_dict['GROUPNAME'], concept_group_dict["USERNAME"], concept_group_dict['DATE']])

# def get_save_path(concept_group_dict):
#     dataset_name = get_dataset_name(concept_group_dict)
#     save_path = os.path.join(concept_group_dict['SAVE_PATH'], dataset_name)
#     return save_path
    
# def get_concept_group_dict_path(concept_group_dict):
#     """Save concept_group_dict at this path
#     """
#     save_path = get_save_path(concept_group_dict)
#     return os.path.join(save_path, "concept_group_dict.pickle")

# def prepare_dataset_folder(concept_group_dict):
#     save_path = get_save_path(concept_group_dict)
#     concept_group_dict_path = get_concept_group_dict_path(concept_group_dict)
#     if os.path.exists(save_path):
#         if not os.path.exists(concept_group_dict_path):
#             print("Missing concept group dict")
#             os.rmdir(save_path)
#             return
#         else:
#             concept_group_dict_saved = load_pickle(concept_group_dict_path)
#             if concept_group_dict_saved == concept_group_dict:
#                 print('Dataset already exists')
#             else:
#                 print(f'Dataset already exists at {save_path} and has conflicting options. Please double check.')
#     else:
#         os.makedirs(save_path)
#         save_obj_as_pickle(concept_group_dict_path, concept_group_dict)
#         for bucket_idx in range(concept_group_dict['NUM_OF_BUCKETS']):
#             for concept in concept_group_dict['GROUP']:
#                 os.makedirs(os.path.join(save_path, str(bucket_idx), concept))
#             if concept_group_dict['BACKGROUND']:
#                 os.makedirs(os.path.join(save_path, str(bucket_idx), 'BACKGROUND'))
#         print(f"Save dataset folder at {save_path}")
            
    
# Report the dataset identifier and the location of its saved configuration,
# then hand the configuration to prepare_dataset_folder().
# Each message is a single f-string; the printed text is unchanged.
print(f"The identifier (name) of dataset is {get_dataset_name(concept_group_dict)}")
print(f"The dataset information will be saved at {get_concept_group_dict_path(concept_group_dict)}")
prepare_dataset_folder(concept_group_dict)

The identifier (name) of dataset is CLEAR10-zhiqiul-2021-07-28
The dataset information will be saved at /data3/zhiqiul/clear_datasets/CLEAR10-zhiqiul-2021-07-28/concept_group_dict.pickle
Dataset already exists


# The cell below will generate a script for you to run to collect the dataset.

In [48]:
def generate_python_script(concept_group_dict):
    """Build the shell command that runs prepare_concepts.py for a concept group.

    Args:
        concept_group_dict: Concept-group configuration. It is passed to
            get_concept_group_dict_path() to obtain the path of the saved pickle.

    Returns:
        str: A command of the form
        ``python prepare_concepts.py --concept_group_dict <path>``.
    """
    import shlex  # local import: only this helper needs it

    # Quote the path so that a value containing spaces or shell metacharacters is
    # still passed to the script as one argument. Paths made only of shell-safe
    # characters are emitted unchanged, so existing commands look the same.
    pickle_path = shlex.quote(str(get_concept_group_dict_path(concept_group_dict)))
    return f"python prepare_concepts.py --concept_group_dict {pickle_path}"

# Show the command to run for collecting the dataset described by concept_group_dict.
print(generate_python_script(concept_group_dict))

python prepare_concepts.py --concept_group_dict /data3/zhiqiul/clear_datasets/CLEAR10-zhiqiul-2021-07-28/concept_group_dict.pickle


# To download the images (after collection) to your local computer:

# To allow others to access your folder: