In [1]:
!nvidia-smi

Sun Nov  8 19:48:06 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.100      Driver Version: 440.100      CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:0A:00.0  On |                  N/A |
|  0%   54C    P2   194W / 300W |  10201MiB / 11016MiB |     77%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:0B:00.0 Off |                  N/A |
|  0%   52C    P2   197W / 260W |  10614MiB / 11019MiB |     85%      Default |
+-------------------------------+----------------------+----------------------+
                                                                            

### original datamgr

In [2]:
from data.datamgr import SetDataManager, DataManager

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Episode configuration for the Omniglot base split.
image_size = 28
base_file = 'filelists/omniglot/base.json'
few_shot_params = {'n_way': 3, 'n_support': 5}
n_query = 15

In [4]:
# Build the episodic loader for the base split (augmentation disabled).
base_datamgr = SetDataManager(image_size, n_query=n_query, **few_shot_params)
base_loader = base_datamgr.get_data_loader(base_file, aug=False)

  "please use transforms.Resize instead.")


In [5]:
# Inspect one raw dataset item, then the first batch produced by the loader.
first_data = base_loader.dataset[0]
print('first_data[0].type():', first_data[0].type())
print('first_data[1].type():', first_data[1].type())
# Bind the second batch element to `y` rather than `_`: the value is actually
# read below, and `_` is IPython's "last output" variable, which the original
# loop silently overwrote.
for x, y in base_loader:
    print(x.shape)
    print(y.shape)
    print(x.type())
    print(y.type())
    break  # only the first batch is needed

first_data[0].type(): torch.FloatTensor
first_data[1].type(): torch.LongTensor
torch.Size([3, 20, 3, 28, 28])
torch.Size([3, 20])
torch.FloatTensor
torch.LongTensor


### customized datamgr

In [6]:
import numpy as np
# import torch
# import torchvision.transforms as transforms
# from data.dataset import EpisodicBatchSampler
from data.datamgr import VirtualSetDataManager, load_npz_dataset
# from data.dataset import VirtualSetDataset
print(np.__version__)

1.18.4


In [7]:
# ---- source (base) domain ---------------------------------------------------
n_dims = 100
n_all_classes = 600       # base (max 400) + source val (100) + source novel (100)
n_samples_per_class = 40  # author's note: 40 is considered enough
n_classes = dict(
    base=400, val=100, novel=100,
    base50cl=50, base100cl=100, base200cl=200,
)
distrib_center = np.zeros(n_dims)
distrib_std = np.ones(n_dims) * 20
# Currently disabled settings: base_distrib_radius = 1, base_x_std = 1
cls_x_std = 1

# ---- target domain ----------------------------------------------------------
n_target_classes = 200
n_val_classes = 100
n_novel_classes = 100
target_distrib_center = np.ones(n_dims)
target_distrib_radius = 1
target_x_std = 1
target_cls_x_std = 0.1

In [8]:
# Number of informative features per domain (author's note: the leading
# 20 / 50 features respectively).
n_base_informative = 20
n_target_informative = 50
# Neither count may exceed the feature dimensionality.
assert max(n_base_informative, n_target_informative) <= n_dims

In [9]:
import os
'''
base dataset: 
    different n_classes
    (different n_samples_per_class?)
    
target dataset:
    different domain shift
'''

# def load_dataset(path):
#     assert '.npz' in path, 'load path should be .npz file'
#     data = np.load(path)
#     return data['X'], data['y']

class DatasetGenerator:
    '''Generate synthetic feature-vector datasets and save them as .npz files.

    Rows are laid out class by class, i.e. the label vector looks like
    [0 0 ... 1 1 ... 2 2 ...] with `n_samples_per_class` consecutive rows
    per class.
    '''

    def __init__(self, n_dims, n_all_classes, n_classes, n_samples_per_class):
        '''
        n_dims: number of features per sample.
        n_all_classes: total number of classes generated.
        n_classes: dict mapping split name -> number of classes in that split.
            Must contain 'base', 'val' and 'novel' (which must add up to
            n_all_classes); any other key containing 'base' (e.g. 'base50cl')
            is treated as a smaller base split.
        n_samples_per_class: number of samples drawn for every class.
        '''
        self.n_dims = n_dims
        self.n_all_classes = n_all_classes
        self.n_classes = n_classes # dictionary base50cl/base400cl/val/novel
        self.n_samples_per_class = n_samples_per_class
        self.n_samples = n_all_classes * n_samples_per_class
        assert n_all_classes == n_classes['base'] + n_classes['val'] + n_classes['novel'], \
            'base + val + novel classes should add up to n_all_classes'

    def _make_labels(self):
        ''' labels for all samples: [0 0 1 1 2 2 ...] '''
        return np.repeat(np.arange(self.n_all_classes), self.n_samples_per_class, axis=0)

    def gen_random_dataset(self, save_path=None):
        ''' generate a uniform-random dataset (features from np.random.random)

        save_path: optional .npz path; when given, X and y are saved there.
        returns: (X, y) with X of shape (n_samples, n_dims), y of shape (n_samples,)
        '''
        X = np.random.random((self.n_samples, self.n_dims))
        y = self._make_labels()
        if save_path is not None:
            assert '.npz' in save_path, 'save path should be .npz file'
            np.savez(save_path, X=X, y=y)
        return X, y

    def gen_Gaussian_datasets(self, datafolder, distrib_center, distrib_std, cls_x_std, informative_interval):
        ''' generate one Gaussian dataset and save every split in self.n_classes
        as <datafolder>/<split>.npz

        Informative features (indices informative_interval[0]..informative_interval[1],
        both inclusive): every class gets its own center drawn from
        N(distrib_center, distrib_std); its samples are drawn from N(center, cls_x_std).
        All other features are drawn from N(distrib_center, distrib_std)
        independently for every sample, regardless of class.

        Split layout (rows are ordered by class):
            any split whose name contains 'base' -> the first classes
            'novel' -> the last classes
            'val'   -> the classes right before the 'novel' ones

        datafolder: output directory, created if missing ('' means current directory).
        distrib_center, distrib_std: per-feature arrays of shape (n_dims,).
        cls_x_std: within-class std of the informative features.
        informative_interval: (first, last) informative feature index, inclusive.

        returns: (Xs, ys), dicts mapping split name -> array.
        raises: ValueError on an invalid interval, wrongly shaped center/std,
            or an unknown split name.
        '''
        first, last = informative_interval[0], informative_interval[1]
        if not (0 <= first <= last < self.n_dims):
            raise ValueError(
                'informative_interval %s should satisfy 0 <= first <= last < n_dims (%d)'
                % (tuple(informative_interval), self.n_dims))
        distrib_center = np.asarray(distrib_center)
        distrib_std = np.asarray(distrib_std)
        if distrib_center.shape != (self.n_dims,) or distrib_std.shape != (self.n_dims,):
            raise ValueError('distrib_center and distrib_std should have shape (n_dims,)')

        info_idx = np.arange(first, last + 1)
        # every dimension outside the informative interval, in ascending order
        noninfo_idx = np.concatenate((np.arange(0, first), np.arange(last + 1, self.n_dims)))
        n_informative = info_idx.size

        # one center per class for the informative features
        informative_x_centers = np.random.normal(
            loc = distrib_center[info_idx], scale = distrib_std[info_idx],
            size = (self.n_all_classes, n_informative)
        ) # shape: (n_all_classes, n_informative)
        X_info = np.concatenate([
            np.random.normal(
                loc = info_x_center, scale = cls_x_std,
                size = (self.n_samples_per_class, n_informative))
            for info_x_center in informative_x_centers
        ], axis=0)

        # Index center/std with the real non-informative dimensions instead of
        # the first (n_dims - n_informative) ones, which was only correct for
        # a uniform center/std.
        X_noninfo = np.random.normal(
            loc = distrib_center[noninfo_idx], scale = distrib_std[noninfo_idx],
            size = (self.n_samples, noninfo_idx.size)
        )

        X_all = np.empty((self.n_samples, self.n_dims), dtype=X_info.dtype)
        X_all[:, info_idx] = X_info
        X_all[:, noninfo_idx] = X_noninfo
        y_all = self._make_labels()

        if datafolder and not os.path.isdir(datafolder):
            print('Folder not exist: "%s"'%(datafolder))
            print('Making directory...')
            os.makedirs(datafolder, exist_ok=True)

        Xs = {}
        ys = {}
        for split, dataset_n_classes in self.n_classes.items():
            dataset_n_samples = dataset_n_classes * self.n_samples_per_class
            # Non-negative bounds: negative offsets misbehave when a split has
            # 0 samples (x[-0:] is everything, x[-k:-0] is empty).
            if 'base' in split:
                rows = slice(0, dataset_n_samples)
            elif split == 'novel':
                rows = slice(max(0, self.n_samples - dataset_n_samples), self.n_samples)
            elif split == 'val':
                novel_n_samples = self.n_classes['novel'] * self.n_samples_per_class
                novel_start = max(0, self.n_samples - novel_n_samples)
                rows = slice(max(0, novel_start - dataset_n_samples), novel_start)
            else:
                raise ValueError('Unknown split: %s'%(split))

            X = X_all[rows]
            y = y_all[rows]
            Xs[split] = X
            ys[split] = y
            out_path = os.path.join(datafolder, split + '.npz')
            print('Saving file: %s'%(out_path))
            np.savez(out_path, X=X, y=y)

        return Xs, ys
            
    
    

In [10]:
# filepath = 'filelists/virtual_20info/try_base200cl.npz'

datafolder = './'
# Generator shared by the cells below; all sizes come from the config cells.
dataset_generator = DatasetGenerator(
    n_dims=n_dims,
    n_all_classes=n_all_classes,
    n_classes=n_classes,
    n_samples_per_class=n_samples_per_class,
)

In [11]:
# Optional smoke test: write a uniform-random dataset, then read it back.
should_gen_npy = False
if should_gen_npy:
    filepath = '000.npz'
    # Use the variable that was just assigned; the original read an undefined
    # `file_path` here and raised NameError as soon as the flag was enabled.
    X, y = dataset_generator.gen_random_dataset(save_path=filepath)
    print(X.shape)
    print(y.shape)
    # Reload from disk (presumably returns the same (X, y) pair; confirm
    # against data.datamgr.load_npz_dataset).
    X, y = load_npz_dataset(filepath)
    print(X.shape)
    print(y.shape)

In [12]:
# Generate and save the Gaussian datasets; the output folder name encodes the
# (inclusive) informative feature interval, zero-padded to two digits.
should_gen_Gaussian = True
informative_interval = (30, 59)
info_int_s1, info_int_s2 = (str(bound).zfill(2) for bound in informative_interval)
datafolder = 'filelists/virtual_info%s%s/'%(info_int_s1, info_int_s2)
print('datafolder:', datafolder)
if should_gen_Gaussian:
    dataset_generator.gen_Gaussian_datasets(
        datafolder=datafolder,
        informative_interval=informative_interval,
        distrib_center=distrib_center,
        distrib_std=distrib_std,
        cls_x_std=cls_x_std,
    )

datafolder: filelists/virtual_info3059/
Saving file: filelists/virtual_info3059/base.npz
Folder:
Saving file: filelists/virtual_info3059/val.npz
Saving file: filelists/virtual_info3059/novel.npz
Saving file: filelists/virtual_info3059/base50cl.npz
Saving file: filelists/virtual_info3059/base100cl.npz
Saving file: filelists/virtual_info3059/base200cl.npz


In [13]:
aaa  # NOTE(review): undefined name, raises NameError; presumably a deliberate stop so "Run All" halts here. Confirm, then remove or replace with an explicit guard.

NameError: name 'aaa' is not defined

In [None]:
# `filepath` was otherwise only assigned inside the disabled `should_gen_npy`
# branch, so this cell raised NameError on a fresh Restart & Run All.
# NOTE(review): pointing it at the generated base split is an assumption;
# swap in 'base50cl.npz' / 'base100cl.npz' / 'base200cl.npz' as needed.
filepath = os.path.join(datafolder, 'base.npz')
# Reuse the configured n_query instead of repeating the literal 15.
base_datamgr = VirtualSetDataManager(in_dim = n_dims, n_query = n_query,  **few_shot_params)
base_loader = base_datamgr.get_data_loader( filepath = filepath, aug = False)

In [None]:
# Inspect one raw dataset item, then the first batch produced by the loader.
first_data = base_loader.dataset[0]
print('first_data[0].type():', first_data[0].type())
print('first_data[1].type():', first_data[1].type())

for i, (x, y) in enumerate(base_loader):
    # shapes first, then tensor types, one value per line
    print(x.shape, y.shape, sep='\n')
    print(x.type(), y.type(), sep='\n')
    break