In [1]:
from hangar import Repository

import numpy as np
import pickle
import gzip
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

- Time travel through the historical evolution of a dataset
- Zero-cost Branching to enable exploratory analysis and collaboration
- Cheap Merging to build datasets over time (with multiple collaborators)
- Completely abstracted organization and management of data files on disk
- Ability to only retrieve a small portion of the data (as needed) while still maintaining complete historical record
- Ability to push and pull changes directly to collaborators or a central server (ie a truly distributed version control system)


## Part 1: Branching & Merging

In [2]:
repo = Repository(path='/Users/rick/projects/tensorwerk/hangar/dev/mnist/')



In [3]:
repo.init(user_name='Rick Izzo', user_email='rick@tensorwerk.com', remove_old=True)

Hangar Repo initialized at: /Users/rick/projects/tensorwerk/hangar/dev/mnist/__hangar


'/Users/rick/projects/tensorwerk/hangar/dev/mnist/__hangar'

In [4]:
# Load the dataset
with gzip.open('/Users/rick/projects/tensorwerk/hangar/dev/data/mnist.pkl.gz', 'rb') as f:
    train_set, valid_set, test_set = pickle.load(f, encoding='bytes')

def rescale(array):
    array = array * 256
    rounded = np.round(array)
    return rounded.astype(np.uint8())

sample_trimg = rescale(train_set[0][0])
sample_trlabel = np.array([train_set[1][0]])
trimgs = rescale(train_set[0])
trlabels = train_set[1]

sample_vimg = rescale(valid_set[0][0])
sample_vlabel = np.array([valid_set[1][0]])
vimgs = rescale(valid_set[0])
vlabels = valid_set[1]

sample_teimg = rescale(test_set[0][0])
sample_telabel = np.array([test_set[1][0]])
teimgs = rescale(test_set[0])
telabels = test_set[1]


In [5]:
co = repo.checkout(write=True)

dset_trimgs = co.datasets.init_dataset(name='train_images', prototype=sample_trimg)
dset_trlabels = co.datasets.init_dataset(name='train_labels', prototype=sample_trlabel)

pbar = tqdm(total=trimgs.shape[0])
with dset_trimgs, dset_trlabels:
    for idx, img in enumerate(trimgs):
        if (idx % 500 == 0):
            pbar.update(500)
        name = str(idx)
        dset_trimgs[name] = img
        dset_trlabels[name] = np.array([trlabels[idx]])
pbar.close()

co.metadata['hello'] = 'world'

co.commit('first commit adding training images and labels')

repo.log()

repo.create_branch('add-validation')
repo.create_branch('add-test')
repo.create_branch('untouched-live-demo-branch')

co.close()

co = repo.checkout(write=True, branch_name='add-validation')

dset_vimgs = co.datasets.init_dataset(name='validation_images', prototype=sample_vimg)
dset_vlabels = co.datasets.init_dataset(name='validation_labels', prototype=sample_vlabel)

pbar = tqdm(total=vimgs.shape[0])
with dset_vimgs, dset_vlabels:
    for idx, img in enumerate(vimgs):
        if (idx % 500 == 0):
            pbar.update(500)
        name = str(idx)
        dset_vimgs[name] = img
        dset_vlabels[name] = np.array([vlabels[idx]])
pbar.close()

co.commit('commit adding validation images and labels')

co.close()

co = repo.checkout(write=True, branch_name='add-test')

dset_teimgs = co.datasets.init_dataset(name='test_images', prototype=sample_teimg)

pbar = tqdm(total=teimgs.shape[0])
with dset_teimgs:
    for idx, img in enumerate(teimgs):
        if (idx % 500 == 0):
            pbar.update(500)
        name = str(idx)
        dset_teimgs[name] = img
pbar.close()

co.commit('added testing images only')

dset_telabels = co.datasets.init_dataset(name='test_labels', prototype=sample_telabel)

pbar = tqdm(total=telabels.shape[0])
with dset_telabels:
    for idx, lab in enumerate(telabels):
        if (idx % 500 == 0):
            pbar.update(500)
        name = str(idx)
        dset_telabels[name] = np.array([lab])
pbar.close()

co.commit('added testing labels only')
co.close()

Dataset Specification:: Name: `train_images`, Initialization style: `prototype`, Shape: `(784,)`, DType: `uint8`, Samples Named: `True`, Variable Shape: `False`, Max Shape: `(784,)`
Dataset Initialized: `train_images`
Dataset Specification:: Name: `train_labels`, Initialization style: `prototype`, Shape: `(1,)`, DType: `int64`, Samples Named: `True`, Variable Shape: `False`, Max Shape: `(1,)`
Dataset Initialized: `train_labels`


HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))


Commit operation requested with message: first commit adding training images and labels
(6478291, 1570791, 65536)
removing all stage hash records
Commit completed. Commit hash: cd5593d4a87953cfc928597334a1b664c45557f4
* cd5593d4a87953cfc928597334a1b664c45557f4 ([1;31mmaster[m) : first commit adding training images and labels
writer checkout of master closed
Dataset Specification:: Name: `validation_images`, Initialization style: `prototype`, Shape: `(784,)`, DType: `uint8`, Samples Named: `True`, Variable Shape: `False`, Max Shape: `(784,)`
Dataset Initialized: `validation_images`
Dataset Specification:: Name: `validation_labels`, Initialization style: `prototype`, Shape: `(1,)`, DType: `int64`, Samples Named: `True`, Variable Shape: `False`, Max Shape: `(1,)`
Dataset Initialized: `validation_labels`


HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))


Commit operation requested with message: commit adding validation images and labels
(7856529, 1888176, 65536)
removing all stage hash records
Commit completed. Commit hash: c2e49c3ef8eff20173f689af803c5d623d72c4fc
writer checkout of add-validation closed
Dataset Specification:: Name: `test_images`, Initialization style: `prototype`, Shape: `(784,)`, DType: `uint8`, Samples Named: `True`, Variable Shape: `False`, Max Shape: `(784,)`
Dataset Initialized: `test_images`


HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))


Commit operation requested with message: added testing images only
(7107399, 1848732, 65536)
removing all stage hash records
Commit completed. Commit hash: 7b7f24e64689bda604ae48280cae3877c334dfe1
Dataset Specification:: Name: `test_labels`, Initialization style: `prototype`, Shape: `(1,)`, DType: `int64`, Samples Named: `True`, Variable Shape: `False`, Max Shape: `(1,)`
Dataset Initialized: `test_labels`


HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))


Commit operation requested with message: added testing labels only
(7736505, 1884188, 65536)
removing all stage hash records
Commit completed. Commit hash: 3115bf8611987cc5e661e55bf994423b60683f9c
writer checkout of add-test closed


In [6]:
repo.list_branch_names()

['add-test', 'add-validation', 'master', 'untouched-live-demo-branch']

In [7]:
repo.log(branch_name='add-test')

* 3115bf8611987cc5e661e55bf994423b60683f9c ([1;31madd-test[m) : added testing labels only
* 7b7f24e64689bda604ae48280cae3877c334dfe1 : added testing images only
* cd5593d4a87953cfc928597334a1b664c45557f4 ([1;31mmaster[m) ([1;31muntouched-live-demo-branch[m) : first commit adding training images and labels


In [8]:
repo.log(branch_name='add-validation')

* c2e49c3ef8eff20173f689af803c5d623d72c4fc ([1;31madd-validation[m) : commit adding validation images and labels
* cd5593d4a87953cfc928597334a1b664c45557f4 ([1;31mmaster[m) ([1;31muntouched-live-demo-branch[m) : first commit adding training images and labels


In [9]:
co = repo.checkout(write=True, branch_name='master')

In [10]:
repo.log()

* cd5593d4a87953cfc928597334a1b664c45557f4 ([1;31mmaster[m) ([1;31muntouched-live-demo-branch[m) : first commit adding training images and labels


In [11]:
co.merge('merging the changes to the training datasets with the new additions of the validation datasets', dev_branch='add-validation')

Selected Fast-Forward Merge Stratagy
removing all stage hash records


'c2e49c3ef8eff20173f689af803c5d623d72c4fc'

In [12]:
repo.log('master')

* c2e49c3ef8eff20173f689af803c5d623d72c4fc ([1;31madd-validation[m) ([1;31mmaster[m) : commit adding validation images and labels
* cd5593d4a87953cfc928597334a1b664c45557f4 ([1;31muntouched-live-demo-branch[m) : first commit adding training images and labels


In [13]:
repo.log('add-test')

* 3115bf8611987cc5e661e55bf994423b60683f9c ([1;31madd-test[m) : added testing labels only
* 7b7f24e64689bda604ae48280cae3877c334dfe1 : added testing images only
* cd5593d4a87953cfc928597334a1b664c45557f4 ([1;31muntouched-live-demo-branch[m) : first commit adding training images and labels


In [14]:
co.merge('adding in the new testing datasets', dev_branch='add-test')

Selected 3-Way Merge Strategy
(9114743, 2201473, 65536)
removing all stage hash records


'3fd5d1fc99f08d901700c0c852a1575d7fc24df7'

In [15]:
co.close()

writer checkout of master closed


In [16]:
repo.log()

*   3fd5d1fc99f08d901700c0c852a1575d7fc24df7 ([1;31mmaster[m) : adding in the new testing datasets
[1;31m|[m[1;32m\[m  
[1;31m|[m * 3115bf8611987cc5e661e55bf994423b60683f9c ([1;31madd-test[m) : added testing labels only
[1;31m|[m * 7b7f24e64689bda604ae48280cae3877c334dfe1 : added testing images only
* [1;32m|[m c2e49c3ef8eff20173f689af803c5d623d72c4fc ([1;31madd-validation[m) : commit adding validation images and labels
[1;32m|[m[1;32m/[m  
* cd5593d4a87953cfc928597334a1b664c45557f4 ([1;31muntouched-live-demo-branch[m) : first commit adding training images and labels


In [17]:
co = repo.checkout(write=True, branch_name='untouched-live-demo-branch')

In [18]:
repo.log()

* cd5593d4a87953cfc928597334a1b664c45557f4 ([1;31muntouched-live-demo-branch[m) : first commit adding training images and labels


In [19]:
del co.datasets['train_images']['0']
del co.datasets['train_labels']['0']
co.commit('removed index zero for train images and labels')

Commit operation requested with message: removed index zero for train images and labels
(6478169, 1570792, 65536)
removing all stage hash records
Commit completed. Commit hash: 395d988b638255ed75bd47e1d416fa75521e398e


'395d988b638255ed75bd47e1d416fa75521e398e'

In [20]:
co.close()

writer checkout of untouched-live-demo-branch closed


In [21]:
co = repo.checkout(write=True, branch_name='master')

In [22]:
co.merge('merge of removals with master', 'untouched-live-demo-branch')

Selected 3-Way Merge Strategy
(9114621, 2201461, 65536)
removing all stage hash records


'31d74893fb42562f26cdb23e818b436426589623'

In [23]:
repo.log()

*   31d74893fb42562f26cdb23e818b436426589623 ([1;31mmaster[m) : merge of removals with master
[1;31m|[m[1;32m\[m  
[1;31m|[m * 395d988b638255ed75bd47e1d416fa75521e398e ([1;31muntouched-live-demo-branch[m) : removed index zero for train images and labels
* [1;32m|[m   3fd5d1fc99f08d901700c0c852a1575d7fc24df7 : adding in the new testing datasets
[1;33m|[m[1;34m\[m [1;32m\[m  
[1;33m|[m * [1;32m|[m 3115bf8611987cc5e661e55bf994423b60683f9c ([1;31madd-test[m) : added testing labels only
[1;33m|[m * [1;32m|[m 7b7f24e64689bda604ae48280cae3877c334dfe1 : added testing images only
[1;33m|[m [1;32m|[m[1;32m/[m  
* [1;32m|[m c2e49c3ef8eff20173f689af803c5d623d72c4fc ([1;31madd-validation[m) : commit adding validation images and labels
[1;32m|[m[1;32m/[m  
* cd5593d4a87953cfc928597334a1b664c45557f4 : first commit adding training images and labels


In [24]:
co.datasets['train_images']


 Hangar DatasetDataWriter                 
    Dataset Name     : train_images                
    Schema UUID      : bf3e41927db611e98cc88c859047adef                
    Schema Hash      : 976ba57033bb                
    Variable Shape   : False                
    (max) Shape      : (784,)                
    Datatype         : <class 'numpy.uint8'>                
    Named Samples    : True                
    Access Mode      : a                
    Num Samples      : 49999


In [28]:
co.metadata['foo'] = 'bar'
co.commit('test for conflicts')

Commit operation requested with message: test for conflicts
(9114671, 2201495, 65536)
removing all stage hash records
Commit completed. Commit hash: 5131376482d70f40c3d1519b09b9cb74fde01379


'5131376482d70f40c3d1519b09b9cb74fde01379'

In [29]:
co.close()

writer checkout of master closed


In [30]:
repo.create_branch('conflictbranch', base_commit='3115bf8611987cc5e661e55bf994423b60683f9c')

'conflictbranch'

In [31]:
co = repo.checkout(write=True, branch_name='conflictbranch')

In [32]:
co.metadata['foo'] = 'world bar baz'

In [33]:
co.commit('hey there this should be a conflict')

Commit operation requested with message: hey there this should be a conflict
(7736555, 1884220, 65536)
removing all stage hash records
Commit completed. Commit hash: ec8691ca9b18cf8a730cbeb9a34157983c45e162


'ec8691ca9b18cf8a730cbeb9a34157983c45e162'

In [36]:
co.close()

writer checkout of conflictbranch closed


In [37]:
co = repo.checkout(write=True, branch_name='master')

In [40]:
repo._details()


Branch
File Size: 32.77 kB

b'branch:add-test' b'3115bf8611987cc5e661e55bf994423b60683f9c'
b'branch:add-validation' b'c2e49c3ef8eff20173f689af803c5d623d72c4fc'
b'branch:conflictbranch' b'ec8691ca9b18cf8a730cbeb9a34157983c45e162'
b'branch:master' b'5131376482d70f40c3d1519b09b9cb74fde01379'
b'branch:untouched-live-demo-branch' b'395d988b638255ed75bd47e1d416fa75521e398e'
b'head' b'branch:master'
b'writerlock:' b'9f07acf7-9f4d-4ead-a60a-b6e96992c16e'

Label
File Size: 28.67 kB

b'h:ace9b3802f7beb30d6cc569dea9a379102d5982e' b'bar'
b'h:d8fa6800caf496e637d965faac1a033e4636c2e6' b'world'
b'h:df4439ce38fccfb32d91537df433f3721ee596eb' b'world bar baz'

HASH
File Size: 22.32 MB

b'h:0000b8a4c5c44ed62bd74f5f434ca73af2b20ad5' b'976ba57033bb 000 $ 77 346 * 784'
b'h:0000daa96823de7a350454a4ee102e2a25db3ef9' b'976ba57033bb 003 $ 0 469 * 784'
b'h:0000ee5d2023878fcc25b727a76532dc5284c8fb' b'976ba57033bb 000 $ 10 83 * 784'
b'h:0003b87e9b75821f8a33216d7c2c944a148ed03a' b'976ba57033bb 000 $ 47 390 * 784'


In [39]:
co.diff.branch('conflictbranch')

({'metadata': {'master': {'additions': {'foo': 'ace9b3802f7beb30d6cc569dea9a379102d5982e'},
    'removals': {},
    'mutations': {},
    'unchanged': {'hello': 'd8fa6800caf496e637d965faac1a033e4636c2e6'}},
   'dev': {'additions': {'foo': 'df4439ce38fccfb32d91537df433f3721ee596eb'},
    'removals': {},
    'mutations': {},
    'unchanged': {'hello': 'd8fa6800caf496e637d965faac1a033e4636c2e6'}}},
  'datasets': {'master': {'additions': {'validation_images': RawDatasetSchemaVal(schema_uuid='ca33c0f47db611e98cc88c859047adef', schema_hash='976ba57033bb', schema_dtype=2, schema_is_var=False, schema_max_shape=(784,), schema_is_named=True),
     'validation_labels': RawDatasetSchemaVal(schema_uuid='ca3418ba7db611e98cc88c859047adef', schema_hash='631f0f57c469', schema_dtype=7, schema_is_var=False, schema_max_shape=(1,), schema_is_named=True)},
    'removals': {},
    'mutations': {},
    'unchanged': {'test_images': RawDatasetSchemaVal(schema_uuid='ce6a45307db611e98cc88c859047adef', schema_hash=

In [38]:
co.merge('error here', 'conflictbranch')

Selected 3-Way Merge Strategy
HANGAR VALUE ERROR:: Merge ABORTED with conflict: {'dset': ConflictRecords(t1=(), t21=(), t22=(), t3=(), conflict=False), 'meta': ConflictRecords(t1=('foo',), t21=(), t22=(), t3=(), conflict=True), 'sample': {'test_labels': ConflictRecords(t1=(), t21=(), t22=(), t3=(), conflict=False), 'test_images': ConflictRecords(t1=(), t21=(), t22=(), t3=(), conflict=False), 'train_images': ConflictRecords(t1=(), t21=(), t22=(), t3=(), conflict=False), 'train_labels': ConflictRecords(t1=(), t21=(), t22=(), t3=(), conflict=False), 'validation_images': ConflictRecords(t1=(), t21=(), t22=(), t3=(), conflict=False), 'validation_labels': ConflictRecords(t1=(), t21=(), t22=(), t3=(), conflict=False)}, 'conflict_found': True}


ValueError: HANGAR VALUE ERROR:: Merge ABORTED with conflict: {'dset': ConflictRecords(t1=(), t21=(), t22=(), t3=(), conflict=False), 'meta': ConflictRecords(t1=('foo',), t21=(), t22=(), t3=(), conflict=True), 'sample': {'test_labels': ConflictRecords(t1=(), t21=(), t22=(), t3=(), conflict=False), 'test_images': ConflictRecords(t1=(), t21=(), t22=(), t3=(), conflict=False), 'train_images': ConflictRecords(t1=(), t21=(), t22=(), t3=(), conflict=False), 'train_labels': ConflictRecords(t1=(), t21=(), t22=(), t3=(), conflict=False), 'validation_images': ConflictRecords(t1=(), t21=(), t22=(), t3=(), conflict=False), 'validation_labels': ConflictRecords(t1=(), t21=(), t22=(), t3=(), conflict=False)}, 'conflict_found': True}

In [26]:
repo.log()

*   31d74893fb42562f26cdb23e818b436426589623 ([1;31mmaster[m) : merge of removals with master
[1;31m|[m[1;32m\[m  
[1;31m|[m * 395d988b638255ed75bd47e1d416fa75521e398e ([1;31muntouched-live-demo-branch[m) : removed index zero for train images and labels
* [1;32m|[m   3fd5d1fc99f08d901700c0c852a1575d7fc24df7 : adding in the new testing datasets
[1;33m|[m[1;34m\[m [1;32m\[m  
[1;33m|[m * [1;32m|[m 3115bf8611987cc5e661e55bf994423b60683f9c ([1;31madd-test[m) : added testing labels only
[1;33m|[m * [1;32m|[m 7b7f24e64689bda604ae48280cae3877c334dfe1 : added testing images only
[1;33m|[m [1;32m|[m[1;32m/[m  
* [1;32m|[m c2e49c3ef8eff20173f689af803c5d623d72c4fc ([1;31madd-validation[m) : commit adding validation images and labels
[1;32m|[m[1;32m/[m  
* cd5593d4a87953cfc928597334a1b664c45557f4 : first commit adding training images and labels


In [25]:
repo.summary()

Summary of Contents Contained in Data Repository 
 
| Repository Info 
|----------------- 
|  Directory: /Users/rick/projects/tensorwerk/hangar/dev/mnist/__hangar 
|  Disk Usage: 83.19 MB 
 
| Commit Details 
------------------- 
|  Commit: 31d74893fb42562f26cdb23e818b436426589623 
|  Created: Fri May 24 00:03:50 2019 
|  By: Rick Izzo 
|  Email: rick@tensorwerk.com 
|  Message: merge of removals with master 
 
| DataSets 
|----------------- 
|  Number of Named Datasets: 6 
|
|  * Dataset Name: test_images 
|    Num Arrays: 10000 
|    Details: 
|    - schema_uuid: ce6a45307db611e98cc88c859047adef 
|    - schema_hash: 976ba57033bb 
|    - schema_dtype: 2 
|    - schema_is_var: False 
|    - schema_max_shape: (784,) 
|    - schema_is_named: True 
|
|  * Dataset Name: test_labels 
|    Num Arrays: 10000 
|    Details: 
|    - schema_uuid: d1614a907db611e98cc88c859047adef 
|    - schema_hash: 631f0f57c469 
|    - schema_dtype: 7 
|    - schema_is_var: False 
|    - schema_max_shape: (1,) 

In [None]:
rco = repo.checkout(branch_name='add-test')

In [None]:
rco.datasets

In [None]:
rco.metadata

In [None]:
co.close()
rco.close()

In [None]:
repo.log()