### Script for Converting Datasets into LMDB

Use this notebook as an example of how to save a dataset in LMDB format.
We use this format to store and load large-scale datasets such as ImageNetMini.

In [None]:
import argparse
import pickle
import lmdb
import os
from io import BytesIO
from matplotlib import pyplot as plt
from PIL import Image
from glob import glob
import shutil
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image

import torchvision
import numpy as np
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from tqdm import tqdm

In [None]:
def loads_data(buf):
    """
    Rebuild a Python object from its pickled byte representation.

    Args:
        buf: bytes-like payload, e.g. the output of `dumps_data`.
    Returns:
        The object reconstructed by `pickle.loads`.
    """
    # NOTE: unpickling can run arbitrary code -- only pass in data you trust.
    restored = pickle.loads(buf)
    return restored

def dumps_data(obj):
    """
    Pickle an object into bytes using the default pickle protocol.

    Args:
        obj: any picklable Python object.
    Returns:
        The bytes produced by `pickle.dumps` (the exact layout depends on the
        interpreter's default protocol).
    """
    payload = pickle.dumps(obj)
    return payload

def datasetImageNet(root='./data', train=True, transform=None):
    """
    Build a torchvision ImageFolder over one ImageNet split.

    Args:
        root: directory containing the `ILSVRC2012_img_train` and
            `ILSVRC2012_img_val` sub-folders.
        train: truthy selects the train sub-folder, otherwise the val one.
        transform: passed through unchanged to `ImageFolder`.
    Returns:
        The `torchvision.datasets.ImageFolder` for the selected split.
    """
    split_dir = 'ILSVRC2012_img_train' if train else 'ILSVRC2012_img_val'
    return torchvision.datasets.ImageFolder(
        root=os.path.join(root, split_dir),
        transform=transform,
    )


def datasetImageNetMini(root='./data', train=True, transform=None, num_classes=100):
    """
    Build the imagenet-mini subset: samples whose class index is < num_classes.

    The full split is loaded through `datasetImageNet` and then filtered in
    place. With the default `num_classes=100` this keeps the first 100 class
    indices of ImageNet, in their original order.

    Args:
        root: directory holding the ImageNet split folders.
        train: truthy selects the train split, otherwise the validation split.
        transform: passed through unchanged to the underlying dataset.
        num_classes: exclusive upper bound on the class indices to keep.
    Returns:
        The dataset object with `samples` and `targets` restricted to the
        selected classes. `classes` / `class_to_idx` are left untouched, so
        they still describe every class found on disk.
    """
    dataset = datasetImageNet(root=root, train=train, transform=transform)

    # Positions (ascending) of every sample that belongs to a kept class.
    keep = [pos for pos, target in enumerate(dataset.targets) if target < num_classes]
    dataset.samples = [dataset.samples[pos] for pos in keep]
    dataset.targets = [dataset.targets[pos] for pos in keep]

    # NOTE(review): ImageFolder presumably exposes `imgs` as an alias of
    # `samples`; re-point it so it does not keep referencing the unfiltered
    # list -- confirm against the torchvision version in use.
    dataset.imgs = dataset.samples
    return dataset

In [None]:
# Prepare Dataset
# Side length (in pixels) handed to every resize / crop below.
base_size = 256

# Training pipeline: a single Resize to [base_size, base_size].
train_transform = transforms.Compose([
    transforms.Resize([base_size, base_size]),
])

# Test pipeline: Resize(base_size) followed by CenterCrop(base_size).
test_transform = transforms.Compose([
    transforms.Resize(base_size),
    transforms.CenterCrop(base_size),
])

# '/ImageNet_2012_PATH/' is a placeholder for the local ImageNet 2012 root.
train_dataset = datasetImageNetMini(
    root='/ImageNet_2012_PATH/', train=True, transform=train_transform)
test_dataset = datasetImageNetMini(
    root='/ImageNet_2012_PATH/', train=False, transform=test_transform)

In [None]:
# Materialize the whole training split in memory: `data` holds every image as
# int16 pixels (one row per sample) and `targets` the matching labels.
num_train = len(train_dataset)
if num_train == 0:
    raise ValueError("train_dataset is empty; nothing to convert")

data    = None
targets = []

for i in range(num_train):
    # Index the dataset once per sample: every __getitem__ call reloads and
    # re-transforms the image, so fetching image and label separately would
    # double the decoding work.
    image, label = train_dataset[i]
    pixels = np.asarray(image, dtype=np.int16)

    if data is None:
        # Size the buffer from the first decoded sample instead of relying on
        # an array defined elsewhere (the previous `noise` template is not
        # defined anywhere in this notebook).
        data = np.zeros((num_train,) + pixels.shape, dtype=np.int16)

    data[i] = pixels
    targets.append(label)

    if i % 5000 == 0:
        print(i)

In [None]:
# Write every (png_bytes, label) pair from `data` / `targets` into an LMDB
# database, followed by two bookkeeping records: `__keys__` and `__len__`.
name            = 'CLEAN'
write_frequency = 5000   # commit (and print progress) every this many samples

lmdb_path = os.path.join('./data/', "%s.lmdb" % name)
# If the target already exists as a directory, open it as a directory-style
# environment; otherwise LMDB is asked to use a single file at that path.
isdir     = os.path.isdir(lmdb_path)

print("Generate LMDB to %s" % lmdb_path)

db = lmdb.open(lmdb_path, subdir=isdir,
               map_size=1099511627776 * 2, readonly=False,
               meminit=False, map_async=True)

num_items = data.shape[0]

try:
    txn = db.begin(write=True)

    for idx in range(num_items):
        # Encode the sample as PNG bytes.
        # NOTE(review): PNG is lossless, so `quality` presumably has no effect
        # here -- confirm against the Pillow docs before removing it.
        image  = Image.fromarray(np.uint8(data[idx]))
        buffer = BytesIO()
        image.save(buffer, format="png", quality=100)

        # Each record is a pickled (image_bytes, label) tuple keyed by the
        # ASCII-encoded sample index.
        imglabel_tuple = (buffer.getvalue(), targets[idx])
        txn.put(str(idx).encode('ascii'), dumps_data(imglabel_tuple))

        # Commit periodically so a single write transaction does not grow
        # without bound, then start a fresh one.
        if idx % write_frequency == 0:
            print("[%d/%d]" % (idx, num_items))
            txn.commit()
            txn = db.begin(write=True)

    # finish iterating through dataset
    txn.commit()

    # Build the key list from the item count rather than the leaked loop
    # variable, which is undefined when `data` is empty.
    keys = [str(k).encode('ascii') for k in range(num_items)]
    with db.begin(write=True) as txn:
        txn.put(b'__keys__', dumps_data(keys))
        txn.put(b'__len__', dumps_data(len(keys)))

    print("Flushing database ...")
    db.sync()
finally:
    # Always release the environment, even if encoding or writing failed.
    db.close()