In [1]:
from datatype_recovery.models.dataset import TypeSequenceDataset

# Values copied here for testing.
MAX_HOPS = 3

data_params = dict(
    experiment_runs=[
        '/home/cls0027/exp_builds/astera.exp/rundata/run1',
    ],
    copy_data=False,
)

# NOTE (performance hypothesis from an earlier, slow run):
# - A batch size of 4096 is not that much larger than 1024, which ran pretty fast.
#     >> The on-disk "batchsize" (# data points saved in one chunk file) is 1000,
#        i.e. just under 1024.
#     >> The dataset was also NOT shuffled in that earlier run.
# - So besides reading from files, after shuffling the loader was probably
#   thrashing back and forth across completely separate chunk files.
# - Since this is a not-in-memory dataset it cannot assume everything fits, so it
#   probably reads each chunk file in from scratch every time.
# - Increasing the batch size is another option.

dataset = TypeSequenceDataset(
    'trainset_astera',
    data_params,
    max_hops=MAX_HOPS,
)

input_params will be IGNORED in favor of saved .json file (trainset_astera/raw/input_params.json)


In [2]:
# Inspect the dataset's root folder and the file names it reports.
# Only the final expression is echoed as the cell output; the first two are
# evaluated but their values are not displayed.
dataset.root
dataset.raw_file_names
dataset.processed_file_names

['processing_finished']

In [3]:
import subprocess
import torch
from torch_geometric.data import InMemoryDataset, download_url

import tqdm
from tqdm.auto import trange
from pathlib import Path

# TODO: put this (dup) file IN the same folder...for now

class InMemTypeSequenceDataset(InMemoryDataset):
    '''
    In-memory copy of an already-processed, on-disk TypeSequenceDataset.

    For datasets that fit in RAM, serving samples from one collated file is a
    HUGE performance boost over reading chunk files on every access.

    The copy shares the source dataset's root folder, so IN_MEMORY_COPY.pt is
    written next to the source dataset's own processed files.
    '''
    def __init__(self, dataset:TypeSequenceDataset):
        '''
        dataset: the source TypeSequenceDataset to copy; its root, transform,
                 pre_transform and pre_filter are reused for this dataset.
        '''
        # Must be set before super().__init__(): the raw_file_names property
        # and process() below both read it.
        self.src_dataset = dataset

        super().__init__(dataset.root,
            transform=dataset.transform,
            pre_transform=dataset.pre_transform,
            pre_filter=dataset.pre_filter)

        self.load(self.processed_paths[0])
        # For PyG<2.4:
        # self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        '''Raw files of the source dataset (we need these to exist before copying).'''
        return self.src_dataset.raw_file_names

    @property
    def processed_file_names(self):
        '''Single file holding the collated in-memory copy.'''
        return ['IN_MEMORY_COPY.pt']

    def download(self):
        '''Nothing can be downloaded here; just report that the raw source data is missing.'''
        print(f'Looks like the raw source dataset is missing for: {self.src_dataset}')

    @staticmethod
    def _folder_size_str(folder) -> str:
        '''
        Human-readable disk usage of `folder` as reported by `du -sh`.

        Returns 'unknown' when `du` is unavailable or fails, since the value is
        only used for a progress message and must not abort processing.
        '''
        try:
            # Argument list (no shell), so the path is never parsed by a shell.
            du_output = subprocess.check_output(['du', '-sh', str(folder)])
            return du_output.decode('utf-8').split()[0]
        except (OSError, subprocess.CalledProcessError, IndexError):
            return 'unknown'

    def process(self):
        '''
        Read every sample of the source dataset into a list and save it as one
        collated file at processed_paths[0].

        Raises FileNotFoundError if any processed file of the source dataset
        is missing from <root>/processed.
        '''
        src = self.src_dataset

        processed_dir = Path(self.root)/'processed'
        missing = [fname for fname in src.processed_file_names
                   if not (processed_dir/fname).exists()]
        if missing:
            raise FileNotFoundError(
                f'Looks like the processed source dataset is missing for: {src} '
                f'(missing files: {missing})')

        print(f'Loading dataset into memory of size {self._folder_size_str(src.root)}')

        # Read through get() rather than src[i]: indexing the source dataset
        # applies its `transform`, and the same transform is applied again by
        # this dataset on access, so it would be baked into the saved copy and
        # then applied a second time. We assume everything fits in memory.
        indices = src.indices()
        data_list = [src.get(indices[i]) for i in trange(len(indices))]

        # NOTE(review): if the source dataset already applied pre_filter /
        # pre_transform when it wrote its processed files, these are applied a
        # second time here -- confirm against TypeSequenceDataset.process().
        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        self.save(data_list, self.processed_paths[0])
        # For PyG<2.4:
        # torch.save(self.collate(data_list), self.processed_paths[0])

In [4]:
# Wrap the on-disk dataset in its in-memory counterpart.
in_mem = InMemTypeSequenceDataset(dataset=dataset)

Loading dataset into memory of size 149M


Processing...


  0%|          | 0/37376 [00:00<?, ?it/s]

Done!


In [8]:
# Pull every sample out of the in-memory dataset, with a progress bar.
data_list = [
    in_mem[idx]
    for idx in trange(len(in_mem))
]

  0%|          | 0/37376 [00:00<?, ?it/s]

In [5]:
import subprocess
# Human-readable on-disk size of the dataset folder, as reported by `du`.
# The command is passed as an argument list (no shell), so the path is never
# parsed by a shell; `-s` prints a single summary line for the folder.
path = 'trainset_astera'
folder_size_str = subprocess.check_output(['du', '-sh', path]).decode('utf-8').split()[0]
folder_size_str

'269M'

In [6]:
# Per-directory disk usage of the dataset folder; -c appends a grand-total line.
!du -ch trainset_astera

12K	trainset_astera/raw
269M	trainset_astera/processed
269M	trainset_astera
269M	total


In [7]:
# Scratch lookup of the options accepted by `du`.
!du --help

Usage: du [OPTION]... [FILE]...
  or:  du [OPTION]... --files0-from=F
Summarize disk usage of the set of FILEs, recursively for directories.

Mandatory arguments to long options are mandatory for short options too.
  -0, --null            end each output line with NUL, not newline
  -a, --all             write counts for all files, not just directories
      --apparent-size   print apparent sizes, rather than disk usage; although
                          the apparent size is usually smaller, it may be
                          larger due to holes in ('sparse') files, internal
                          fragmentation, indirect blocks, and the like
  -B, --block-size=SIZE  scale sizes by SIZE before printing them; e.g.,
                           '-BM' prints sizes in units of 1,048,576 bytes;
                           see SIZE format below
  -b, --bytes           equivalent to '--apparent-size --block-size=1'
  -c, --total           produce a grand total
  -D, --dereference-args  deref