# Dataset sampling

Number of non near-duplicate image pairs approximately equal to near-duplicate image pairs
(2021, Sensors, Yi Zhang et al.)

In [1]:
import os
from itertools import combinations
import numpy as np
from scipy import sparse
from tqdm import tqdm
from torchvision import transforms
from torchvision.utils import save_image
from datasets import ImageDataset, MirFlickr1MDataset


# early transform
trsfm = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Resize(
            (224, 224), interpolation=transforms.InterpolationMode.BILINEAR
        ),
    ]
)

# California-ND Dataset Sampling

In [2]:
# Number of non near-duplicate image pairs approximately equal to near-duplicate image pairs
# (2021, Sensors, Yi Zhang et al.)
# near-duplicate pairs

root = '/datasets/'
sample_name = "sample-california-nd-2022-03-31"
corr_threshold = 0.5 # Yi Zhang used 0.5 threshold

# Create sample dataset directory
if not os.path.exists(root + sample_name):
    sample_dir = root + sample_name
    os.makedirs(sample_dir)

if not os.path.exists(root + sample_name + "/images/"):
    sample_images_dir = sample_dir + "/images/"
    os.makedirs(sample_images_dir)

# Load nd matrix with correlation threshold
nd_matrix = (
    np.load(root + "california-nd/Correlation_matrices/gt_all.npy")
    >= corr_threshold
).astype(np.int32)

# Sampling near-duplicate pairs
nd_pairs = nd_matrix.nonzero()
nd_pairs = [(nd_pairs[0][i], nd_pairs[1][i]) for i in range(len(nd_pairs[0])) if nd_pairs[0][i] < nd_pairs[1][i]]
num_nd_pairs = len(nd_pairs)

# Sampling non-near-duplicate pairs
i = 0
nnd_pairs = []
checked = nd_matrix.astype(bool)
len_images = checked.shape[0]

while i < num_nd_pairs:
    random_pair = np.random.randint(len_images, size=2)
    if random_pair[0] < random_pair[1]:
        if not checked[random_pair[0], random_pair[1]]:
            nnd_pairs.append((random_pair[0], random_pair[1]))
            checked[random_pair[0], random_pair[1]] = True
            i += 1
    else:
        if not checked[random_pair[1], random_pair[0]]:
            nnd_pairs.append((random_pair[1], random_pair[0]))
            checked[random_pair[1], random_pair[0]] = True
            i += 1

# Copy images and write pairs
image_dataset = ImageDataset(root + "california-nd/Photos/", transform=trsfm)
checked = np.zeros(len(image_dataset), dtype=bool)
with open(sample_dir + "/nd_pairs.txt", "w") as f:
    for i, j in tqdm(nd_pairs, desc="Save ND pairs"):
        f.write("{} {}\n".format(i, j))
        if not checked[i]:
            save_image(image_dataset[i][0], sample_images_dir + str(i) + ".jpg")
            checked[i] = True
        if not checked[j]:
            save_image(image_dataset[j][0], sample_images_dir + str(j) + ".jpg")
            checked[j] = True

with open(sample_dir + "/nnd_pairs.txt", "w") as f:
    for i, j in tqdm(nnd_pairs, desc="Save NND pairs"):
        f.write("{} {}\n".format(i, j))
        if not checked[i]:
            save_image(image_dataset[i][0], sample_images_dir + str(i) + ".jpg")
            checked[i] = True
        if not checked[j]:
            save_image(image_dataset[j][0], sample_images_dir + str(j) + ".jpg")
            checked[j] = True

Save ND pairs: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 2607/2607 [00:07<00:00, 365.64it/s]
Save NND pairs: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 2607/2607 [00:01<00:00, 1847.62it/s]


# MFND-IND Dataset Sampling

In [2]:
# Number of non near-duplicate image pairs approximately equal to near-duplicate image pairs
# (2021, Sensors, Yi Zhang et al.)
# near-duplicate pairs

root = '/datasets/'
sample_name = "sample-mfnd-ind-2022-03-31"

# Create sample dataset directory
if not os.path.exists(root + sample_name):
    sample_dir = root + sample_name
    os.makedirs(sample_dir)

if not os.path.exists(root + sample_name + "/images/"):
    sample_images_dir = sample_dir + "/images/"
    os.makedirs(sample_images_dir)

# Create empty near duplicate sparse matrix with (1000000, 1000000) shape
nd_matrix = sparse.lil_matrix((1000000, 1000000), dtype=np.int32)

# Duplicate pair : 1
with open(root + "mfnd/" + "duplicates.txt") as f:
    lines = f.readlines()

    for line in lines:
        for i, j in combinations(line.strip().split(" "), 2):
            if i < j:
                nd_matrix[int(i), int(j)] = 1
            else:
                nd_matrix[int(j), int(i)] = 1

# IND pair : 1
with open(root + "mfnd/" + "IND_clusters.txt") as f:
    lines = f.readlines()

    for line in lines:
        for i, j in combinations(line.strip().split(" "), 2):
            if i < j:
                nd_matrix[int(i), int(j)] = 1
            else:
                nd_matrix[int(j), int(i)] = 1

# Sampling non-near-duplicate pairs
nd_pairs = nd_matrix.nonzero()
nd_pairs = [(nd_pairs[0][i], nd_pairs[1][i]) for i in range(len(nd_pairs[0]))]
num_nd_pairs = len(nd_pairs)

# non near-duplicate pairs
i = 0
nnd_pairs = []
checked = sparse.lil_matrix(nd_matrix.shape, dtype=bool)

len_images = 1000000
while i < num_nd_pairs:
    random_pair = np.random.randint(len_images, size=2)
    if random_pair[0] < random_pair[1]:
        if not checked[random_pair[0], random_pair[1]]:
            nnd_pairs.append((random_pair[0], random_pair[1]))
            checked[random_pair[0], random_pair[1]] = True
            i += 1
    else:
        if not checked[random_pair[1], random_pair[0]]:
            nnd_pairs.append((random_pair[1], random_pair[0]))
            checked[random_pair[1], random_pair[0]] = True
            i += 1

# save nd pairs and sample images
mirflickrs = MirFlickr1MDataset(root=root, transform=trsfm)
checked = np.zeros(1000000, dtype=bool)
with open(root + sample_name + "/nd_pairs.txt", "w") as f:
    for i, j in tqdm(nd_pairs, desc="Save ND pairs"):
        f.write("{} {}\n".format(i, j))
        if not checked[i]:
            save_image(mirflickrs[i][0], sample_images_dir + str(i) + ".jpg")
            checked[i] = True
        if not checked[j]:
            save_image(mirflickrs[j][0], sample_images_dir + str(j) + ".jpg")
            checked[j] = True

# save nnd pairs
checked = np.zeros(1000000, dtype=bool)
with open(root + sample_name + "/nnd_pairs.txt", "w") as f:
    for i, j in tqdm(nnd_pairs, desc="Save NND pairs"):
        f.write("{} {}\n".format(i, j))
        if not checked[i]:
            save_image(mirflickrs[i][0], sample_images_dir + str(i) + ".jpg")
            checked[i] = True
        if not checked[j]:
            save_image(mirflickrs[j][0], sample_images_dir + str(j) + ".jpg")
            checked[j] = True

Save ND pairs: 100%|██████████| 5036/5036 [00:49<00:00, 101.43it/s]
Save NND pairs: 100%|██████████| 5036/5036 [03:35<00:00, 23.38it/s]


# MFND-ALL Dataset Sampling

In [4]:
# Number of non near-duplicate image pairs approximately equal to near-duplicate image pairs
# (2021, Sensors, Yi Zhang et al.)
# near-duplicate pairs

root = '/datasets/'
sample_name = "sample-mfnd-all-2022-03-31"

# Create sample dataset directory
if not os.path.exists(root + sample_name):
    sample_dir = root + sample_name
    os.makedirs(sample_dir)

if not os.path.exists(root + sample_name + "/images/"):
    sample_images_dir = sample_dir + "/images/"
    os.makedirs(sample_images_dir)

# Create empty near duplicate sparse matrix with (1000000, 1000000) shape
nd_matrix = sparse.lil_matrix((1000000, 1000000), dtype=np.int32)

# Duplicate pair : 1
with open(root + "mfnd/" + "duplicates.txt") as f:
    lines = f.readlines()

    for line in lines:
        for i, j in combinations(line.strip().split(" "), 2):
            if i < j:
                nd_matrix[int(i), int(j)] = 1
            else:
                nd_matrix[int(j), int(i)] = 1

# IND pair : 1
with open(root + "mfnd/" + "IND_clusters.txt") as f:
    lines = f.readlines()

    for line in lines:
        for i, j in combinations(line.strip().split(" "), 2):
            if i < j:
                nd_matrix[int(i), int(j)] = 1
            else:
                nd_matrix[int(j), int(i)] = 1

# NIND pair : 1
with open(root + "mfnd/" + "NIND_clusters.txt") as f:
    lines = f.readlines()

    for line in lines:
        for i, j in combinations(line.strip().split(" "), 2):
            if i < j:
                nd_matrix[int(i), int(j)] = 1
            else:
                nd_matrix[int(j), int(i)] = 1

# Sampling non-near-duplicate pairs
nd_pairs = nd_matrix.nonzero()
nd_pairs = [(nd_pairs[0][i], nd_pairs[1][i]) for i in range(len(nd_pairs[0]))]
num_nd_pairs = len(nd_pairs)

# non near-duplicate pairs
i = 0
nnd_pairs = []
checked = sparse.lil_matrix(nd_matrix.shape, dtype=bool)

len_images = 1000000
while i < num_nd_pairs:
    random_pair = np.random.randint(len_images, size=2)
    if random_pair[0] < random_pair[1]:
        if not checked[random_pair[0], random_pair[1]]:
            nnd_pairs.append((random_pair[0], random_pair[1]))
            checked[random_pair[0], random_pair[1]] = True
            i += 1
    else:
        if not checked[random_pair[1], random_pair[0]]:
            nnd_pairs.append((random_pair[1], random_pair[0]))
            checked[random_pair[1], random_pair[0]] = True
            i += 1

# save nd pairs and sample images
mirflickrs = MirFlickr1MDataset(root=root, transform=trsfm)
checked = np.zeros(1000000, dtype=bool)
with open(root + sample_name + "/nd_pairs.txt", "w") as f:
    for i, j in tqdm(nd_pairs, desc="Save ND pairs"):
        f.write("{} {}\n".format(i, j))
        if not checked[i]:
            save_image(mirflickrs[i][0], sample_images_dir + str(i) + ".jpg")
            checked[i] = True
        if not checked[j]:
            save_image(mirflickrs[j][0], sample_images_dir + str(j) + ".jpg")
            checked[j] = True

# save nnd pairs
checked = np.zeros(1000000, dtype=np.bool)
with open(root + sample_name + "/nnd_pairs.txt", "w") as f:
    for i, j in tqdm(nnd_pairs, desc="Save NND pairs"):
        f.write("{} {}\n".format(i, j))
        if not checked[i]:
            save_image(mirflickrs[i][0], sample_images_dir + str(i) + ".jpg")
            checked[i] = True
        if not checked[j]:
            save_image(mirflickrs[j][0], sample_images_dir + str(j) + ".jpg")
            checked[j] = True

Save ND pairs: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 23178/23178 [07:36<00:00, 50.76it/s]
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  checked = np.zeros(1000000, dtype=np.bool)
Save NND pairs: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 23178/23178 [16:26<00:00, 23.49it/s]
