In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
import shutil
sys.path.append('../code/')
sys.path.append('../python/')

In [None]:
from pprint import pprint
from os import path
import scipy
import os
from matplotlib import pyplot as plt
from tqdm import tqdm
from argparse import Namespace
import pickle
import seaborn as sns

import torchvision
import torchvision.transforms as transforms

from sklearn.model_selection import train_test_split
# import seaborn as sns
import numpy as np
# import pandas as pd
import scipy
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
from metrics import ranking
# from sh import  sh
import data

In [None]:
def get_numpy_data(dataloader):
    """Drain a dataloader and return all of its batches as two numpy arrays.

    Args:
        dataloader: iterable yielding ``(batch_x, batch_y)`` pairs whose
            elements expose a ``.numpy()`` method.

    Returns:
        ``(x, y)`` where ``x`` is the batches of inputs stacked with
        ``np.vstack`` and ``y`` is the batches of labels joined with
        ``np.concatenate``.
    """
    x, y = [], []
    # Hand the dataloader itself to tqdm: wrapping it in iter() hides its
    # length, which leaves the progress bar without a total.
    for batch_x, batch_y in tqdm(dataloader):
        x.append(batch_x.numpy())
        y.append(batch_y.numpy())
    x = np.vstack(x)
    y = np.concatenate(y)

    return x, y

def create_hashgan_train_test(x, y, db_size, query_size, seed=None):
    """Split ``(x, y)`` into disjoint train, query and database subsets.

    The query set is carved out first, then the database set is carved out of
    what remains; both splits are stratified on the labels.

    Args:
        x: samples.
        y: labels aligned with ``x``.
        db_size: ``test_size`` used for the database split.
        query_size: ``test_size`` used for the query split.
        seed: optional ``random_state`` forwarded to both splits so the
            partition can be reproduced; ``None`` keeps the unseeded behavior.

    Returns:
        ``(train_x, train_y, query_x, query_y, db_x, db_y)``.
    """
    train_x, query_x, train_y, query_y = train_test_split(
        x, y, test_size=query_size, stratify=y, random_state=seed)
    train_x, db_x, train_y, db_y = train_test_split(
        train_x, train_y, test_size=db_size, stratify=train_y, random_state=seed)

    return train_x, train_y, query_x, query_y, db_x, db_y

def create_train_test(x, y, query_size, seed=None):
    """Split off a query set; the remainder is used as both train and database.

    Args:
        x: samples.
        y: labels aligned with ``x``.
        query_size: ``test_size`` used for the stratified query split.
        seed: optional ``random_state`` for a reproducible split; ``None``
            keeps the unseeded behavior.

    Returns:
        ``(train_x, train_y, query_x, query_y, db_x, db_y)`` where the database
        entries are the very same objects as the training entries.
    """
    train_x, query_x, train_y, query_y = train_test_split(
        x, y, test_size=query_size, stratify=y, random_state=seed)

    return train_x, train_y, query_x, query_y, train_x, train_y

def get_cifar10_data(image_size, batch_size, dataroot='../data/', workers=2, data_transforms=None):
    """Create non-shuffling DataLoaders over the CIFAR-10 train and test splits.

    Args:
        image_size: size handed to the resize transform of the default pipeline.
        batch_size: batch size for both loaders.
        dataroot: directory where CIFAR-10 is stored (downloaded if missing).
        workers: number of DataLoader worker processes.
        data_transforms: optional transform pipeline; when ``None`` a
            resize + ``ToTensor`` pipeline (no Normalize step) is used.

    Returns:
        ``(train_dataloader, test_dataloader)``.
    """
    if data_transforms is None:
        data_transforms = transforms.Compose([
            # Resize supersedes Scale, which torchvision deprecated and later removed.
            transforms.Resize(image_size),
            transforms.ToTensor(),
        ])
    train_dataset = dset.CIFAR10(root=dataroot, download=True, train=True, transform=data_transforms)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
                                                   shuffle=False, num_workers=workers)
    test_dataset = dset.CIFAR10(root=dataroot, download=True, train=False, transform=data_transforms)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size,
                                                  shuffle=False, num_workers=workers)

    return train_dataloader, test_dataloader

def get_places365_dataloaders(image_size, batch_size, dataroot, workers=2, data_transforms=None):
    """Create non-shuffling DataLoaders for the ``train`` and ``val`` folders under ``dataroot``.

    Args:
        image_size: size handed to ``transforms.Resize`` in the default pipeline.
        batch_size: batch size for both loaders.
        dataroot: directory containing ``train`` and ``val`` image folders.
        workers: number of DataLoader worker processes.
        data_transforms: optional transform pipeline; defaults to
            ``Resize`` followed by ``ToTensor``.

    Returns:
        ``(train_dataloader, valid_dataloader)``.
    """
    if data_transforms is None:
        data_transforms = transforms.Compose([
            transforms.Resize(image_size),
            transforms.ToTensor()
        ])

    def _loader_for(split):
        # One ImageFolder-backed loader per sub-directory, identical settings.
        folder = dset.ImageFolder(root=path.join(dataroot, split), transform=data_transforms)
        return torch.utils.data.DataLoader(
            folder, batch_size=batch_size, shuffle=False, num_workers=workers)

    train_dataloader = _loader_for('train')
    valid_dataloader = _loader_for('val')
    return train_dataloader, valid_dataloader

def get_mnist_data(image_size, batch_size, dataroot='../data/', workers=2, data_transforms=None):
    """Load the MNIST train and test splits and return them merged as numpy arrays.

    Args:
        image_size: size handed to the resize transform of the default pipeline.
        batch_size: batch size used while draining the datasets.
        dataroot: directory where MNIST is stored (downloaded if missing).
        workers: number of DataLoader worker processes.
        data_transforms: optional transform pipeline; defaults to resize,
            ``ToTensor`` and ``Normalize((0.5,), (0.5,))``.

    Returns:
        ``(x, y)`` with the train split first, followed by the test split.
    """
    if data_transforms is None:
        data_transforms = transforms.Compose([
            # Resize supersedes Scale, which torchvision deprecated and later removed.
            transforms.Resize(image_size),
            transforms.ToTensor(),
            transforms.Normalize((0.5, ), (0.5, )),
        ])
    train_dataset = dset.MNIST(root=dataroot, download=True, train=True, transform=data_transforms)
    train_x, train_y = get_numpy_data(torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=False, num_workers=workers))
    test_dataset = dset.MNIST(root=dataroot, download=True, train=False, transform=data_transforms)
    test_x, test_y = get_numpy_data(torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False, num_workers=workers))

    x = np.vstack([train_x, test_x])
    y = np.concatenate([train_y, test_y])
    return x, y

def get_mnist_3c_data(image_size, batch_size, dataroot='../data/', workers=2, data_transforms=None):
    """Load MNIST (train + test) as 3-channel images and return numpy arrays.

    Args:
        image_size: size handed to the resize transform of the default pipeline.
        batch_size: batch size used while draining the datasets.
        dataroot: directory where MNIST is stored (downloaded if missing).
        workers: number of DataLoader worker processes.
        data_transforms: optional transform pipeline; defaults to resize,
            ``Grayscale(3)``, ``ToTensor`` and a 0.5/0.5 per-channel Normalize.

    Returns:
        ``(x, y)`` with the train split first, followed by the test split.
    """
    if data_transforms is None:
        data_transforms = transforms.Compose([
            # Resize supersedes Scale, which torchvision deprecated and later removed.
            transforms.Resize(image_size),
            transforms.Grayscale(3),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ])
    train_dataset = dset.MNIST(root=dataroot, download=True, train=True, transform=data_transforms)
    train_x, train_y = get_numpy_data(torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=False, num_workers=workers))
    test_dataset = dset.MNIST(root=dataroot, download=True, train=False, transform=data_transforms)
    test_x, test_y = get_numpy_data(torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False, num_workers=workers))

    x = np.vstack([train_x, test_x])
    y = np.concatenate([train_y, test_y])
    return x, y

def get_flickr_data(image_size, dataroot='../data/Flickr25K', workers=2, data_transforms=None):
    """Load every image under ``dataroot`` (ImageFolder layout) as numpy arrays.

    Args:
        image_size: target size for the default resize / center-crop pipeline.
        dataroot: root directory read through ``torchvision.datasets.ImageFolder``.
        workers: number of DataLoader worker processes.
        data_transforms: optional transform pipeline; when ``None`` the default
            resize + center-crop + ``ToTensor`` + identity Normalize is used.

    Returns:
        ``(x, y)`` as produced by ``get_numpy_data`` over the whole folder.
    """
    # Only build the default pipeline when the caller did not supply one
    # (the argument used to be overwritten unconditionally).
    if data_transforms is None:
        data_transforms = transforms.Compose([
            # Resize supersedes Scale, which torchvision deprecated and later removed.
            transforms.Resize(image_size),
            # Crop to a square so images of different aspect ratios can be stacked
            # into one array; mirrors the Flickr loading cell of this notebook.
            transforms.CenterCrop(image_size),
            transforms.ToTensor(),
            transforms.Normalize((0.0, 0.0, 0.0), (1.0, 1.0, 1.0))])
    dataset = torchvision.datasets.ImageFolder(dataroot, transform=data_transforms)

    # Keep the on-disk order so that seeded splits made downstream are reproducible.
    loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, num_workers=workers)

    # There is a single folder and no separate train/test split to merge, so
    # the loaded arrays are returned directly instead of being combined with
    # undefined train_x / train_y names.
    x, y = get_numpy_data(loader)
    return x, y

In [None]:
def sample_files_from_list(basedir, file_list, n_per_class, seed, ignored_file_list=None):
    """Randomly pick exactly ``n_per_class`` files for every label in ``file_list``.

    Each entry of ``file_list`` must have the form ``<prefix>/<label>/<image name>``
    (exactly two ``/`` separators).

    Args:
        basedir: directory joined in front of each selected entry to build its path.
        file_list: sequence of relative file names.
        n_per_class: number of files to keep per label.
        seed: value passed to ``np.random.seed``; note this reseeds numpy's
            global random state.
        ignored_file_list: optional collection of entries that must never be
            selected.

    Returns:
        ``(sampled_files, selected_files)``: a dict mapping each label to a list of
        ``(image name, full path)`` tuples, and the flat list of chosen entries in
        the order they were picked.

    Raises:
        AssertionError: if some label ends up with fewer than ``n_per_class`` files.
    """
    # Build the exclusion set once so membership tests are O(1) even when the
    # caller passes a list.
    ignored = frozenset(ignored_file_list) if ignored_file_list is not None else frozenset()

    sampled_files = {}
    permuted_indices = np.arange(len(file_list))
    print('Setting seed {}'.format(seed))
    np.random.seed(seed)
    np.random.shuffle(permuted_indices)
    selected_files = []
    for idx in tqdm(permuted_indices):
        filename = file_list[idx]
        if filename in ignored:
            continue
        _, label, img_filename = filename.split('/')
        bucket = sampled_files.setdefault(label, [])
        if len(bucket) < n_per_class:
            bucket.append((img_filename, path.join(basedir, filename)))
            selected_files.append(filename)

    # Raised explicitly (instead of a bare assert) so the check survives
    # `python -O` and reports which label came up short.
    for label, img_list in sampled_files.items():
        if len(img_list) != n_per_class:
            raise AssertionError('Label {!r} has {} files, expected {}'.format(
                label, len(img_list), n_per_class))
    return sampled_files, selected_files

def sample_train_db_data_from_dataloader(dataloader, num_train, num_db, seed):
    """Drain ``dataloader`` and split its contents into train and database sets.

    Args:
        dataloader: source of ``(x, y)`` batches, consumed via ``get_numpy_data``.
        num_train: number of samples for the training set.
        num_db: number of samples for the database set; together with
            ``num_train`` it must account for every loaded sample.
        seed: ``random_state`` for the stratified split.

    Returns:
        ``(train_x, train_y, db_x, db_y)``.
    """
    features, labels = get_numpy_data(dataloader)
    total = features.shape[0]
    assert (num_train + num_db) == total

    print('Setting seed {}'.format(seed))
    parts = train_test_split(
        features, labels, train_size=num_train, random_state=seed, stratify=labels)
    train_x, db_x, train_y, db_y = parts

    return train_x, train_y, db_x, db_y

In [None]:
def make_dir_if_not_exist(folder):
    """Create ``folder`` (including missing parents) unless it already exists.

    Uses ``exist_ok=True`` instead of a separate existence check, so there is
    no window between checking and creating in which another process could
    create the directory and make ``makedirs`` fail.
    """
    os.makedirs(folder, exist_ok=True)
        
def create_dataset_from_files(basedir, sampled_files):
    """Copy sampled files into ``basedir``, one sub-directory per label.

    Args:
        basedir: destination root; must not exist yet.
        sampled_files: dict mapping a label to a list of
            ``(image file name, source path)`` tuples.

    Raises:
        Exception: if ``basedir`` already exists.
    """
    if path.exists(basedir):
        raise Exception('Directory already exists: {}'.format(basedir))

    progress = tqdm(sampled_files.items())
    copied = 0
    try:
        for label, entries in progress:
            target_dir = path.join(basedir, label)
            make_dir_if_not_exist(target_dir)

            for name, source_path in entries:
                copied += 1
                shutil.copyfile(source_path, path.join(target_dir, name))
                # Refresh the counter only every 500 files.
                if not copied % 500:
                    progress.set_postfix(file_cnt=copied)
        # Final refresh so the bar shows the exact total.
        progress.set_postfix(file_cnt=copied)
    finally:
        progress.close()
        
def check_evenly_sampling(a):
    """Assert that every column of the one-hot encoding of ``a`` has the same sum.

    Raises:
        AssertionError: if any column total differs from the first one.
    """
    per_class = np.sum(ranking.one_hot_label(a), axis=0)
    assert all(count == per_class[0] for count in per_class)

In [None]:
# Image size passed to the dataset loaders below and embedded in the output folder and file names.
IMAGE_SIZE = 64

# MNIST-3C

MNIST data with 3 channels (the single grayscale channel is replicated three times).

In [None]:
# Load the whole MNIST-3C pool (train + test merged) that the split cells below draw from.
dataset = 'mnist-3c'
all_x, all_y = get_mnist_3c_data(IMAGE_SIZE, 100, dataroot='../data/', workers=0)
NUM_IMAGES = len(all_x)
print('Dataset: {} images'.format(NUM_IMAGES))
print('Data range: [{}, {}]'.format(all_x.min(), all_x.max()))

In [None]:
# DCW-AE paper
# A query set is held out; the remaining images serve as both train and db.
for seed, num_query in [(s, 10000) for s in range(9, 50, 10)]:
    num_db = NUM_IMAGES - num_query
    num_train = num_db
    output_dir = '../data/{}_isize{}_seed{}'.format(dataset, IMAGE_SIZE, seed)

    print('Setting seed {}: {} train, {} query, {} db'.format(seed, num_train, num_query, num_db))
    # Always start from an empty output folder.
    if path.exists(output_dir):
        print('Deleting existing folder: {}'.format(output_dir))
        shutil.rmtree(output_dir)
    print('Will save in {}'.format(output_dir))
    os.makedirs(output_dir)

    # Seeded, stratified split; everything outside train becomes the query set.
    train_x, query_x, train_y, query_y = train_test_split(
        all_x, all_y, train_size=num_train, random_state=seed, stratify=all_y)
    db_x, db_y = train_x, train_y

    for split_name, split_x, split_y in (
            ('query', query_x, query_y),
            ('train', train_x, train_y),
            ('db', db_x, db_y)):
        npz_name = '{}_{}_manual_{}.npz'.format(dataset, IMAGE_SIZE, split_name)
        np.savez_compressed(path.join(output_dir, npz_name), x=split_x, y=split_y)

In [None]:
# This is used in DistillHash, SSDH papers
# Disjoint train / db / query sets: train is drawn first, then the rest is divided.
for seed, num_train, num_query in [(s, 5000, 10000) for s in range(109, 150, 10)]:
    num_db = NUM_IMAGES - num_train - num_query
    output_dir = '../data/{}_isize{}_seed{}'.format(dataset, IMAGE_SIZE, seed)

    print('Setting seed {}: {} train, {} query, {} db'.format(seed, num_train, num_query, num_db))
    # Always start from an empty output folder.
    if path.exists(output_dir):
        print('Deleting existing folder: {}'.format(output_dir))
        shutil.rmtree(output_dir)
    print('Will save in {}'.format(output_dir))
    os.makedirs(output_dir)

    train_x, rest_x, train_y, rest_y = train_test_split(
        all_x, all_y, train_size=num_train, random_state=seed, stratify=all_y)
    # The non-train remainder is divided into db (num_db samples) and query (what is left).
    db_x, query_x, db_y, query_y = train_test_split(
        rest_x, rest_y, train_size=num_db, random_state=seed, stratify=rest_y)

    for split_name, split_x, split_y in (
            ('query', query_x, query_y),
            ('train', train_x, train_y),
            ('db', db_x, db_y)):
        npz_name = '{}_{}_manual_{}.npz'.format(dataset, IMAGE_SIZE, split_name)
        np.savez_compressed(path.join(output_dir, npz_name), x=split_x, y=split_y)

# MNIST

In [None]:
# Load the whole single-channel MNIST pool (train + test merged) for the split cells below.
dataset = 'mnist'
all_x, all_y = get_mnist_data(IMAGE_SIZE, 100, dataroot='../data/', workers=0)
NUM_IMAGES = len(all_x)
print('Dataset: {} images'.format(NUM_IMAGES))
print('Data range: [{}, {}]'.format(all_x.min(), all_x.max()))

In [None]:
# DCW-AE paper
# A query set is held out; the remaining images serve as both train and db.
for seed, num_query in [(s, 10000) for s in range(9, 50, 10)]:
    num_db = NUM_IMAGES - num_query
    num_train = num_db
    output_dir = '../data/{}_isize{}_seed{}'.format(dataset, IMAGE_SIZE, seed)

    print('Setting seed {}: {} train, {} query, {} db'.format(seed, num_train, num_query, num_db))
    # Always start from an empty output folder.
    if path.exists(output_dir):
        print('Deleting existing folder: {}'.format(output_dir))
        shutil.rmtree(output_dir)
    print('Will save in {}'.format(output_dir))
    os.makedirs(output_dir)

    # Seeded, stratified split; everything outside train becomes the query set.
    train_x, query_x, train_y, query_y = train_test_split(
        all_x, all_y, train_size=num_train, random_state=seed, stratify=all_y)
    db_x, db_y = train_x, train_y

    for split_name, split_x, split_y in (
            ('query', query_x, query_y),
            ('train', train_x, train_y),
            ('db', db_x, db_y)):
        npz_name = '{}_{}_manual_{}.npz'.format(dataset, IMAGE_SIZE, split_name)
        np.savez_compressed(path.join(output_dir, npz_name), x=split_x, y=split_y)

In [None]:
# This is used in DistillHash, SSDH papers
# Disjoint train / db / query sets: train is drawn first, then the rest is divided.
for seed, num_train, num_query in [(s, 5000, 10000) for s in range(109, 150, 10)]:
    num_db = NUM_IMAGES - num_train - num_query
    output_dir = '../data/{}_isize{}_seed{}'.format(dataset, IMAGE_SIZE, seed)

    print('Setting seed {}: {} train, {} query, {} db'.format(seed, num_train, num_query, num_db))
    # Always start from an empty output folder.
    if path.exists(output_dir):
        print('Deleting existing folder: {}'.format(output_dir))
        shutil.rmtree(output_dir)
    print('Will save in {}'.format(output_dir))
    os.makedirs(output_dir)

    train_x, rest_x, train_y, rest_y = train_test_split(
        all_x, all_y, train_size=num_train, random_state=seed, stratify=all_y)
    # The non-train remainder is divided into db (num_db samples) and query (what is left).
    db_x, query_x, db_y, query_y = train_test_split(
        rest_x, rest_y, train_size=num_db, random_state=seed, stratify=rest_y)

    for split_name, split_x, split_y in (
            ('query', query_x, query_y),
            ('train', train_x, train_y),
            ('db', db_x, db_y)):
        npz_name = '{}_{}_manual_{}.npz'.format(dataset, IMAGE_SIZE, split_name)
        np.savez_compressed(path.join(output_dir, npz_name), x=split_x, y=split_y)

# Flickr25k

In [None]:
# Load every Flickr25K image (ImageFolder layout) into memory as numpy arrays.
dataset = 'flickr25k'
image_size=IMAGE_SIZE
dataroot='../data/Flickr25K/'
workers=0
# Resize then center-crop so every image has the same size and the batches can be stacked.
data_transforms = transforms.Compose([
                                transforms.Resize(image_size),
                                transforms.CenterCrop(image_size),
                                transforms.ToTensor(),
                                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
# shuffle=False: an unseeded shuffle would reorder all_x on every run, so the
# seeded train_test_split calls in the split cell would select different images each time.
loader = torch.utils.data.DataLoader(torchvision.datasets.ImageFolder(dataroot, transform=data_transforms),
                                     batch_size=100, shuffle=False, num_workers=workers)

all_x, all_y = get_numpy_data(loader)

In [None]:
# Report the size and value range of the loaded Flickr25K pool.
NUM_IMAGES = len(all_x)
print('Dataset: {} images'.format(NUM_IMAGES))
print('Data range: [{}, {}]'.format(all_x.min(), all_x.max()))

In [None]:
# DCW-AE paper
# A query set is held out; the remaining images serve as both train and db.
for seed, num_query in [(s, 5000) for s in range(9, 50, 10)]:
    num_db = NUM_IMAGES - num_query
    num_train = num_db
    output_dir = '../data/{}_isize{}_seed{}'.format(dataset, IMAGE_SIZE, seed)

    print('Setting seed {}: {} train, {} query, {} db'.format(seed, num_train, num_query, num_db))
    # Always start from an empty output folder.
    if path.exists(output_dir):
        print('Deleting existing folder: {}'.format(output_dir))
        shutil.rmtree(output_dir)
    print('Will save in {}'.format(output_dir))
    os.makedirs(output_dir)

    # Seeded, stratified split; everything outside train becomes the query set.
    train_x, query_x, train_y, query_y = train_test_split(
        all_x, all_y, train_size=num_train, random_state=seed, stratify=all_y)
    db_x, db_y = train_x, train_y

    for split_name, split_x, split_y in (
            ('query', query_x, query_y),
            ('train', train_x, train_y),
            ('db', db_x, db_y)):
        npz_name = '{}_{}_manual_{}.npz'.format(dataset, IMAGE_SIZE, split_name)
        np.savez_compressed(path.join(output_dir, npz_name), x=split_x, y=split_y)

# CIFAR-10

In [None]:
dataset = 'cifar10'

# Pool the official CIFAR-10 train and test splits; the cells below re-split this pool.
train_dataloader, query_dataloader = get_cifar10_data(IMAGE_SIZE, 100, dataroot='../data/', workers=0)
train_x, train_y = get_numpy_data(train_dataloader)
query_x, query_y = get_numpy_data(query_dataloader)
all_x = np.vstack((train_x, query_x))
all_y = np.concatenate((train_y, query_y))
NUM_IMAGES = len(all_x)
print('Dataset: {} images'.format(NUM_IMAGES))
print('Data range: [{}, {}]'.format(all_x.min(), all_x.max()))

In [None]:
# DCW-AE paper
# A query set is held out; the remaining images serve as both train and db.
for seed, num_query in [(s, 10000) for s in range(9, 50, 10)]:
    num_db = NUM_IMAGES - num_query
    num_train = num_db
    output_dir = '../data/{}_isize{}_seed{}'.format(dataset, IMAGE_SIZE, seed)

    print('Setting seed {}: {} train, {} query, {} db'.format(seed, num_train, num_query, num_db))
    # Always start from an empty output folder.
    if path.exists(output_dir):
        print('Deleting existing folder: {}'.format(output_dir))
        shutil.rmtree(output_dir)
    print('Will save in {}'.format(output_dir))
    os.makedirs(output_dir)

    # Seeded, stratified split; everything outside train becomes the query set.
    train_x, query_x, train_y, query_y = train_test_split(
        all_x, all_y, train_size=num_train, random_state=seed, stratify=all_y)
    db_x, db_y = train_x, train_y

    for split_name, split_x, split_y in (
            ('query', query_x, query_y),
            ('train', train_x, train_y),
            ('db', db_x, db_y)):
        npz_name = '{}_{}_manual_{}.npz'.format(dataset, IMAGE_SIZE, split_name)
        np.savez_compressed(path.join(output_dir, npz_name), x=split_x, y=split_y)

In [None]:
# This is used in DistillHash, SSDH papers
# Disjoint train / db / query sets: train is drawn first, then the rest is divided.
for seed, num_train, num_query in [(s, 5000, 10000) for s in range(109, 150, 10)]:
    num_db = NUM_IMAGES - num_train - num_query
    output_dir = '../data/{}_isize{}_seed{}'.format(dataset, IMAGE_SIZE, seed)

    print('Setting seed {}: {} train, {} query, {} db'.format(seed, num_train, num_query, num_db))
    # Always start from an empty output folder.
    if path.exists(output_dir):
        print('Deleting existing folder: {}'.format(output_dir))
        shutil.rmtree(output_dir)
    print('Will save in {}'.format(output_dir))
    os.makedirs(output_dir)

    train_x, rest_x, train_y, rest_y = train_test_split(
        all_x, all_y, train_size=num_train, random_state=seed, stratify=all_y)
    # The non-train remainder is divided into db (num_db samples) and query (what is left).
    db_x, query_x, db_y, query_y = train_test_split(
        rest_x, rest_y, train_size=num_db, random_state=seed, stratify=rest_y)

    for split_name, split_x, split_y in (
            ('query', query_x, query_y),
            ('train', train_x, train_y),
            ('db', db_x, db_y)):
        npz_name = '{}_{}_manual_{}.npz'.format(dataset, IMAGE_SIZE, split_name)
        np.savez_compressed(path.join(output_dir, npz_name), x=split_x, y=split_y)

# END