In [1]:

from pathlib import Path
import os
import glob
import h5py
import traceback
from tqdm.notebook import trange, tqdm
from torch.utils.data import Dataset, RandomSampler, DataLoader
import torch
import numpy as np

In [2]:
# Project root: the parent of the directory the notebook kernel was started in.
project_dir = Path.cwd().parent

In [3]:
# Both locations are plain strings living directly under the project root.
data_dir, output_dir = (
    os.path.join(project_dir, leaf) for leaf in ("data", "converted_data")
)
data_dir, output_dir


('/p/vast1/haridev/jobspec-database/data',
 '/p/vast1/haridev/jobspec-database/converted_data')

In [41]:
# Pack every non-empty regular file found under data_dir into HDF5 shards named
# job_scripts_<n>.h5 inside output_dir. Inside a shard, the file's directory
# (relative to data_dir) becomes the group path and the file's base name becomes
# a one-element dataset holding the raw bytes.
collected_size = 0      # payload bytes written to the currently open shard
collected_counter = 0   # index of the currently open shard
max_size = 1024*1024*32  # payload budget per shard: 32 MiB
# h5py cannot create missing parent directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
h5_fname = f"{output_dir}/job_scripts_{collected_counter}.h5"
h5_file = None
try:
    h5_file = h5py.File(h5_fname, 'w')
    files = glob.iglob(f"{data_dir}/**/*", recursive=True)
    for filename in tqdm(files):
        path = os.path.abspath(filename)
        if not os.path.isfile(path):
            # Directories are matched by the glob as well; only files carry data.
            continue
        dirname, filename = os.path.split(os.path.relpath(path, data_dir))
        with open(path, "rb") as in_file:
            data = in_file.read()
        size = len(data)
        if size == 0:
            continue
        # Roll over to a new shard once the budget would be exceeded. The
        # `collected_size > 0` guard keeps a single oversized file from closing
        # a shard that is still empty (which would leave an empty .h5 behind).
        if collected_size > 0 and collected_size + size > max_size:
            h5_file.close()
            collected_counter += 1
            h5_fname = f"{output_dir}/job_scripts_{collected_counter}.h5"
            collected_size = 0
            h5_file = h5py.File(h5_fname, 'w')
        collected_size += size
        # Files sitting directly in data_dir have an empty dirname, which is not
        # a usable group name, so they are stored in the shard's root group.
        h5_group = h5_file.require_group(dirname) if dirname else h5_file
        # One-element dataset whose dtype is a fixed-length string of `size` bytes.
        ds = h5_group.create_dataset(filename, shape=1, dtype=h5py.string_dtype(length=size), chunks=tuple([1]))
        ds[:] = data
except Exception as err:
    # Report the failure but keep the notebook running, as before.
    print(err)
    print(traceback.format_exc())
finally:
    # Always release the shard that is still open, on success and on failure.
    if h5_file:
        h5_file.close()

0it [00:00, ?it/s]

In [42]:
import itertools

# `Iterable` lives in collections.abc; the alias in `collections` was removed
# in Python 3.10.
from collections.abc import Iterable
def flatten(coll):
    """Lazily yield the leaf elements of an arbitrarily nested iterable.

    Strings are treated as leaves and yielded whole; every other iterable
    element (lists, tuples, generators, ...) is expanded recursively.

    Args:
        coll: Iterable whose elements may themselves be iterables.

    Yields:
        Each non-iterable element, and each string, in depth-first order.
    """
    for item in coll:
        if isinstance(item, Iterable) and not isinstance(item, str):
            yield from flatten(item)
        else:
            yield item
def get_recursive_path(obj, current):
    """Collect the paths of all leaf objects below a mapping-like container.

    An object counts as a container when it exposes a ``keys`` attribute;
    anything else is a leaf. Child paths are built as ``f"{current}/{key}"``.

    Args:
        obj: Container (anything with ``keys()`` and ``obj[key]`` access) or a
            leaf object.
        current: Path prefix accumulated so far for ``obj``.

    Returns:
        ``current`` unchanged when ``obj`` is a leaf; otherwise a list with the
        path of every leaf reachable from ``obj``. Empty containers contribute
        nothing, so the list is empty for an empty container and never
        contains ``None``.
    """
    if getattr(obj, "keys", None) is None:
        return current
    paths = []
    for key in obj.keys():
        child = obj[key]
        child_path = f"{current}/{key}"
        if getattr(child, "keys", None) is None:
            paths.append(child_path)
        else:
            # Nested container: splice its leaves into the flat result.
            paths.extend(get_recursive_path(child, child_path))
    return paths

# Build a flat index of (shard file, dataset path) pairs over every shard in
# output_dir. Sorting the shard list makes the index order reproducible, which
# the seeded sampling further down relies on.
files = sorted(glob.glob(f"{output_dir}/*.h5"))
job_spec_idx = []
for file in files:
    # Open each shard only for as long as its paths are being listed.
    with h5py.File(file, 'r') as h5_file:
        # get_recursive_path may produce None for empty files/groups; skip those.
        for value in get_recursive_path(h5_file, "") or ():
            if value is not None:
                job_spec_idx.append((file, value))
job_spec_idx[:10]

[('/p/vast1/haridev/jobspec-database/converted_data/job_scripts_0.h5',
  '/002311-A/csci-467-project/bert_baseline.job'),
 ('/p/vast1/haridev/jobspec-database/converted_data/job_scripts_0.h5',
  '/002311-A/csci-467-project/jobspec-cfg/bert_baseline_lime_args.py'),
 ('/p/vast1/haridev/jobspec-database/converted_data/job_scripts_0.h5',
  '/00dylan00/siganturizer3D_models/scripts/slurm.bottom.all.limited.sh'),
 ('/p/vast1/haridev/jobspec-database/converted_data/job_scripts_0.h5',
  '/00dylan00/siganturizer3D_models/scripts/slurm.bottom.all.sh'),
 ('/p/vast1/haridev/jobspec-database/converted_data/job_scripts_0.h5',
  '/00dylan00/siganturizer3D_models/scripts/slurm.top.all.sh'),
 ('/p/vast1/haridev/jobspec-database/converted_data/job_scripts_0.h5',
  '/01-vyom/melanoma-classification/src/BYOL/resnet.sh'),
 ('/p/vast1/haridev/jobspec-database/converted_data/job_scripts_0.h5',
  '/02zx/02zx.github.io/tutorial/free%20energy/water/Thermodynamics%20Integration/run.sh'),
 ('/p/vast1/haridev/jobs

In [43]:
def get_item(fname, dataset):
    """Read the first element of one dataset from an HDF5 file.

    The file is opened read-only for the duration of the call and is closed
    again even when the lookup fails.

    Args:
        fname: Path of the HDF5 file.
        dataset: Path of the dataset inside the file.

    Returns:
        Element 0 of the dataset.

    Raises:
        Whatever the file open or the dataset lookup raises (e.g. ``KeyError``
        for a missing dataset); nothing is caught here.
    """
    with h5py.File(fname, 'r') as h5_file:
        return h5_file[dataset][0]
    
class JobSpecDataset(Dataset):
    """Map-style dataset over an index of (HDF5 file, dataset path) pairs."""

    def __init__(self, sample_map):
        """Store the index; ``sample_map[i]`` must unpack to (file, dataset path)."""
        self.sample_map = sample_map

    def __len__(self):
        """Number of indexed samples."""
        return len(self.sample_map)

    def __getitem__(self, image_idx):
        """Load sample ``image_idx``.

        Returns:
            A tuple ``(image_idx, array)`` where ``array`` is a one-element
            NumPy array wrapping the value read by ``get_item``.
        """
        h5_path, dataset_path = self.sample_map[image_idx]
        payload = get_item(h5_path, dataset_path)
        return image_idx, np.array([payload])

In [44]:
# Wrap the (file, dataset path) index so it can be fed to a DataLoader.
dataset = JobSpecDataset(sample_map=job_spec_idx)

In [45]:
num_workers = 4
# prefetch_factor is only supplied when worker processes are used. Per the
# PyTorch DataLoader docs it is the number of batches loaded in advance by
# *each* worker, so the total look-ahead scales with num_workers.
kwargs = {'prefetch_factor': 16} if num_workers != 0 else {}

In [46]:
# Fix the global torch RNG, then draw from it the seed for a private generator.
torch.manual_seed(100)
seed = int(torch.empty((), dtype=torch.int64).random_().item())
# The sampler gets its own generator so its shuffling order is driven by `seed`.
torch_generator = torch.Generator().manual_seed(seed)
sampler = RandomSampler(data_source=dataset, generator=torch_generator)

In [47]:
# batch_size=None: each item produced by the loader is a single sample from
# `dataset`, visited in the order given by `sampler`.
my_dataset = DataLoader(
    dataset=dataset,
    batch_size=None,
    sampler=sampler,
    num_workers=num_workers,
    **kwargs,
)

In [48]:
# Smoke test: print the first items coming out of the loader, then stop.
# The counter is checked after printing, so up to 12 items are shown.
batches = 0
for batch in my_dataset:
    print(len(batch), batch)
    if batches >= 11:
        break
    batches += 1

2 [38715, array([b'#PBS -N rrnabl_mn_0_ans-1000_a\n#PBS -l walltime=5:00:00\n#Name of job\n#Dep name , project name\n#PBS -P cse\n##PBS -P darpa.ml.cse\n##PBS -P parags.p2.54\n##PBS -q high \n#PBS -j oe\n#PBS -m bea\n### Specify email address to use for notification.\n#PBS -M $USER@iitd.ac.in\n#PBS -l select=3:ngpus=2:ncpus=3:centos=skylake\n##PBS -l select=3:ngpus=2:ncpus=2:centos=skylake\n## SPECIFY JOB NOW\n\nCURTIME=$(date +%Y%m%d%H%M%S)\n##module load apps/pythonpackages/3.6.0/pytorch/0.4.1/gpu\n##module load apps/anaconda3/4.6.9\n##module load apps/anaconda/3\n##module load apps/pytorch/1.5.0/gpu/anaconda3\n## Change to dir from where script was launched\n\n\n\n\ncount=0\n\n\ndeclare -a var\ninit_count=$count \nwhile read p; do\n      echo $p\n      #script="source /usr/share/Modules/3.2.10/init/bash && CUDA_VISIBLE_DEVICES=0 nohup /home/yatin/phd/misc-scripts/hpcv2/hpc_jobs/rrn_equal_abl_rep/exp_${count}.sh > /home/yatin/phd/misc-scripts/hpcv2/hpc_jobs/rrn_equal_abl_rep/LOG_${co