In [1]:
import hangar
from hangar import Repository

import numpy as np
import pickle
import gzip
import matplotlib.pyplot as plt

from tqdm import tqdm

In [2]:
hangar.__version__

'0.5.1'

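Download the MNIST dataset from [here](https://github.com/mnielsen/neural-networks-and-deep-learning/raw/master/data/mnist.pkl.gz) and save it as `mnist.pkl.gz` in the directory this notebook is run from (the loading cell below reads `./mnist.pkl.gz`).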

## Initialize the Repo

In [3]:
# NOTE(review): this clears `mnist/.hangar`, but the repository below is opened at './' — confirm the intended path.
!rm -rf mnist/.hangar

In [4]:
# Open a repository rooted at the current directory and initialise it.
# NOTE(review): `remove_old=True` presumably discards any repository already at this path — confirm before pointing it at real data.
repo = Repository('./')
repo.init(user_name='jjmachan', user_email='jjmachan@g.com', remove_old=True)
repo  # display the repository summary

Hangar Repo initialized at: /home/jjmachan/jjmachan/hangar_tutorial/.hangar




Hangar Repository               
    Repository Path  : /home/jjmachan/jjmachan/hangar_tutorial               
    Writer-Lock Free : True


In [5]:
repo.remote.list_all()

[]

In [6]:
repo

Hangar Repository               
    Repository Path  : /home/jjmachan/jjmachan/hangar_tutorial               
    Writer-Lock Free : True


In [7]:
# Check out the repository in write mode; the displayed summary reports the base branch and the column count.
co = repo.checkout(write=True)
co

Hangar WriterCheckout                
    Writer       : True                
    Base Branch  : master                
    Num Columns  : 0


In [8]:
co

Hangar WriterCheckout                
    Writer       : True                
    Base Branch  : master                
    Num Columns  : 0


In [9]:
list(co.items())

[]

In [10]:
list(co.keys())

[]

## Columns
Columns are the structures Hangar uses to store data. Both numeric data (as numpy arrays) and string data can be stored.

In [11]:
co.columns

Hangar Columns                
    Writeable         : True                
    Number of Columns : 0                
    Column Names / Partial Remote References:                
      - 

In [12]:

# Load the three MNIST splits from the gzip-compressed pickle.
# SECURITY: unpickling can execute arbitrary code, so only load an archive
# that was downloaded from a source you trust.
with gzip.open('./mnist.pkl.gz', 'rb') as f:
    train_set, valid_set, test_set = pickle.load(f, encoding='bytes')

# Training images and labels (index 0 holds the images, index 1 the labels).
trimgs, trlabels = train_set[0], train_set[1]

# One sample image and label, used as prototypes when the columns are created.
# The label is wrapped in a one-element array so that it is an ndarray too.
sample_trimg = trimgs[0]
sample_trlabel = np.array([trlabels[0]])

# All splits in a fixed order: training, validation, test.
data = [train_set, valid_set, test_set]

In [13]:
sample_trimg.shape, sample_trimg.dtype

((784,), dtype('float32'))

In [14]:
# Training split: the sample image and label are passed as prototypes for the two new columns.
training_images_col = co.add_ndarray_column(name='mnist_training_images', prototype=sample_trimg)
training_labels_col = co.add_ndarray_column(name='mnist_training_labels', prototype=sample_trlabel)
training_labels_col


Hangar FlatSampleWriter                 
    Column Name              : mnist_training_labels                
    Writeable                : True                
    Column Type              : ndarray                
    Column Layout            : flat                
    Schema Type              : fixed_shape                
    DType                    : int64                
    Shape                    : (1,)                
    Number of Samples        : 0                
    Partial Remote Data Refs : False


In [15]:
# Validation and test splits reuse the same prototypes as the training split.
for split_name in ('validation', 'test'):
    co.add_ndarray_column(name=f'mnist_{split_name}_images', prototype=sample_trimg)
    last_labels_col = co.add_ndarray_column(name=f'mnist_{split_name}_labels', prototype=sample_trlabel)

# Display the most recently created column (the test labels).
last_labels_col

Hangar FlatSampleWriter                 
    Column Name              : mnist_test_labels                
    Writeable                : True                
    Column Type              : ndarray                
    Column Layout            : flat                
    Schema Type              : fixed_shape                
    DType                    : int64                
    Shape                    : (1,)                
    Number of Samples        : 0                
    Partial Remote Data Refs : False


In [16]:
# (images column, labels column) name pairs, ordered training -> validation -> test.
column_list = [
    ('mnist_training_images', 'mnist_training_labels'),
    ('mnist_validation_images', 'mnist_validation_labels'),
    ('mnist_test_images', 'mnist_test_labels'),
]

# Print the repr of each pair of columns, followed by a blank separator line.
for images_name, labels_name in column_list:
    images_col = co.columns[images_name]
    labels_col = co.columns[labels_name]
    print(images_col, labels_col, '\n')

FlatSampleWriter(repo_pth=/home/jjmachan/jjmachan/hangar_tutorial/.hangar, aset_name=mnist_training_images, ['column_layout=flat, ', 'column_type=ndarray, ', 'schema_hasher_tcode=1, ', 'data_hasher_tcode=0, ', 'schema_type=fixed_shape, ', 'shape=(784,), ', 'dtype=float32, ', 'backend=00, ', "backend_options={'complib': 'blosc:lz4hc', 'complevel': 5, 'shuffle': 'byte'}, "], mode=a) FlatSampleWriter(repo_pth=/home/jjmachan/jjmachan/hangar_tutorial/.hangar, aset_name=mnist_training_labels, ['column_layout=flat, ', 'column_type=ndarray, ', 'schema_hasher_tcode=1, ', 'data_hasher_tcode=0, ', 'schema_type=fixed_shape, ', 'shape=(1,), ', 'dtype=int64, ', 'backend=10, ', 'backend_options={}, '], mode=a) 

FlatSampleWriter(repo_pth=/home/jjmachan/jjmachan/hangar_tutorial/.hangar, aset_name=mnist_validation_images, ['column_layout=flat, ', 'column_type=ndarray, ', 'schema_hasher_tcode=1, ', 'data_hasher_tcode=0, ', 'schema_type=fixed_shape, ', 'shape=(784,), ', 'dtype=float32, ', 'backend=00, ',

In [17]:

# Write every split into its pair of columns; `column_list` and `data` share the same ordering.
for split_idx, ((images_name, labels_name), split) in enumerate(zip(column_list, data)):
    print(split_idx)
    split_images, split_labels = split[0], split[1]
    images_col = co.columns[images_name]
    labels_col = co.columns[labels_name]
    # Both columns are entered as context managers for the duration of the bulk write.
    with images_col, labels_col:
        for sample_idx, image in enumerate(split_images):
            images_col[sample_idx] = image
            # Labels are stored as one-element arrays, matching the label prototype.
            labels_col[sample_idx] = np.array([split_labels[sample_idx]])

0
1
2


In [18]:
co.columns['mnist_training_images']

Hangar FlatSampleWriter                 
    Column Name              : mnist_training_images                
    Writeable                : True                
    Column Type              : ndarray                
    Column Layout            : flat                
    Schema Type              : fixed_shape                
    DType                    : float32                
    Shape                    : (784,)                
    Number of Samples        : 50000                
    Partial Remote Data Refs : False


In [19]:
# Record everything written so far as a commit; the returned commit identifier is shown as the cell output.
co.commit('added all the mnist datasets')

'a=cdef6d59df587623e4767d9fac20b7670f97358d'

In [20]:
repo.log()

* a=cdef6d59df587623e4767d9fac20b7670f97358d ([1;31mmaster[m) : added all the mnist datasets


In [21]:
co['mnist_training_images']

Hangar FlatSampleWriter                 
    Column Name              : mnist_training_images                
    Writeable                : True                
    Column Type              : ndarray                
    Column Layout            : flat                
    Schema Type              : fixed_shape                
    DType                    : float32                
    Shape                    : (784,)                
    Number of Samples        : 50000                
    Partial Remote Data Refs : False


In [22]:
repo.summary()

Summary of Contents Contained in Data Repository 
 
| Repository Info 
|----------------- 
|  Base Directory: /home/jjmachan/jjmachan/hangar_tutorial 
|  Disk Usage: 105.88 MB 
 
| Commit Details 
------------------- 
|  Commit: a=cdef6d59df587623e4767d9fac20b7670f97358d 
|  Created: Fri May  1 10:52:38 2020 
|  By: jjmachan 
|  Email: jjmachan@g.com 
|  Message: added all the mnist datasets 
 
| DataSets 
|----------------- 
|  Number of Named Columns: 6 
|
|  * Column Name: ColumnSchemaKey(column="mnist_test_images", layout="flat") 
|    Num Data Pieces: 10000 
|    Details: 
|    - column_layout: flat 
|    - column_type: ndarray 
|    - schema_hasher_tcode: 1 
|    - data_hasher_tcode: 0 
|    - schema_type: fixed_shape 
|    - shape: (784,) 
|    - dtype: float32 
|    - backend: 00 
|    - backend_options: {'complib': 'blosc:lz4hc', 'complevel': 5, 'shuffle': 'byte'} 
|
|  * Column Name: ColumnSchemaKey(column="mnist_test_labels", layout="flat") 
|    Num Data Pieces: 10000 
|   

In [23]:
# Close the write checkout now that the data is committed (presumably this frees the writer lock — confirm).
co.close()

## Explore Columns

In [56]:
# The checkout was closed at the end of the previous section, so open a new
# write checkout before inspecting or adding columns.
co = repo.checkout(write=True)
co.columns

Hangar Columns                
    Writeable         : True                
    Number of Columns : 6                
    Column Names / Partial Remote References:                
      - mnist_test_images / False
      - mnist_test_labels / False
      - mnist_training_images / False
      - mnist_training_labels / False
      - mnist_validation_images / False
      - mnist_validation_labels / False

In [59]:
# Schema shared by both ndarray columns: variable-shape integer arrays declared with shape (20, 20, 3).
ndarray_schema = dict(shape=(20, 20, 3), dtype=int, variable_shape=True)

# Same schema twice; the columns differ only in whether they contain subsamples.
co.add_ndarray_column('test_with_subsamples', contains_subsamples=True, **ndarray_schema)
co.add_ndarray_column('test', contains_subsamples=False, **ndarray_schema)

# A string column needs no shape or dtype.
co.add_str_column('test_str')

Hangar NestedSampleWriter                 
    Column Name              : test_with_subsamples                
    Writeable                : True                
    Column Type              : ndarray                
    Column Layout            : nested                
    Schema Type              : variable_shape                
    DType                    : int64                
    Shape                    : (20, 20, 3)                
    Number of Samples        : 0                
    Number of Subsamples     : 0                
    Partial Remote Data Refs : True


In [60]:
co['test']

Hangar FlatSampleWriter                 
    Column Name              : test                
    Writeable                : True                
    Column Type              : ndarray                
    Column Layout            : flat                
    Schema Type              : variable_shape                
    DType                    : int64                
    Shape                    : (20, 20, 3)                
    Number of Samples        : 0                
    Partial Remote Data Refs : False


In [61]:
co['test_with_subsamples']

Hangar NestedSampleWriter                 
    Column Name              : test_with_subsamples                
    Writeable                : True                
    Column Type              : ndarray                
    Column Layout            : nested                
    Schema Type              : variable_shape                
    DType                    : int64                
    Shape                    : (20, 20, 3)                
    Number of Samples        : 0                
    Number of Subsamples     : 0                
    Partial Remote Data Refs : True


In [65]:
co['test_str']

Hangar FlatSampleWriter                 
    Column Name              : test_str                
    Writeable                : True                
    Column Type              : str                
    Column Layout            : flat                
    Schema Type              : variable_shape                
    DType                    : <class 'str'>                
    Shape                    : None                
    Number of Samples        : 1                
    Partial Remote Data Refs : False


### Adding data

In [76]:
co['test_str'][1] = 'hai there'

In [79]:
# Sample keys are type-sensitive: the sample was stored under the integer key 1,
# so it must be read back with 1 rather than the string '1'.
co['test_str', 1]

'hai there'

In [68]:
# A column with subsamples takes a mapping of {subsample key: array} for each sample.
# The array reuses the dtype the column was declared with and fits within its (20, 20, 3) shape.
co['test_with_subsamples'][1] = {0: np.zeros((20, 20, 3), dtype=int)}

In [69]:
co.close()