In [1]:
import data_vault

In [2]:
# Remove the `data` directory left by any previous run, so the vault archives
# and their logs used below are created from scratch.
!rm -rf data  # just to have the examples clean

In [3]:
# Open the zip archive that all subsequent `%vault` magics operate on.
# NOTE(review): `--secure False` presumably permits a vault without an
# encryption key (none is configured here) - confirm in the data_vault docs.
%open_vault -p data/storage.zip --secure False

In [4]:
from pandas import DataFrame
from random import choice, randint, seed
# Fix the RNG state so every run produces the same synthetic table.
seed(0)

N_ROWS = 10000
cities = ['London', 'Delhi', 'Tokyo', 'Lagos', 'Warsaw', 'Chongqing']

# For each row the salary is drawn before the city; keeping that order keeps
# the seeded random sequence (and therefore the data) unchanged.
records = [
    dict(salary=randint(0, 100), city=choice(cities))
    for _ in range(N_ROWS)
]
salaries = DataFrame(records)

In [5]:
# Preview the first three generated rows.
salaries.head(3)

Unnamed: 0,salary,city
0,49,Lagos
1,5,Tokyo
2,65,Lagos


## Examples

### Store and import

In [6]:
# Save the `salaries` variable into the `datasets` module of the open vault
# (syntax: store <one_or_many_variables> in <module>).
%vault store salaries in datasets

'Stored salaries (None → 6DE33062) at Sunday, 08. Dec 2019 16:56'

In [7]:
# Load `salaries` back from `datasets` under a different variable name
# (syntax: import <valid_id> from <module> as <valid_id>).
%vault import salaries from datasets as salaries_dataset

Reduced memory usage by 87.27%, from 0.79 MB to 0.10 MB.


'Imported salaries_dataset (6DE33062) at Sunday, 08. Dec 2019 16:56'

In [8]:
# The imported copy should show the same rows as the `salaries.head(3)` preview.
salaries_dataset.head(3)

Unnamed: 0,salary,city
0,49,Lagos
1,5,Tokyo
2,65,Lagos


In [9]:
# Draw one random age (0-100, both ends included) per row label, then attach it
# as a new column; `assign` returns a new frame and leaves `salaries` untouched.
random_ages = salaries.index.map(lambda _label: randint(0, 100))
salaries_by_age = salaries.assign(age=random_ages)

In [10]:
# Preview the frame with the added `age` column.
salaries_by_age.head(3)

Unnamed: 0,salary,city,age
0,49,Lagos,62
1,5,Tokyo,55
2,65,Lagos,90


In [11]:
# Store a second variable in the same `datasets` module.
%vault store salaries_by_age in datasets

'Stored salaries_by_age (None → 80A8BBD8) at Sunday, 08. Dec 2019 16:56'

In [12]:
# Import the whole `datasets` module (syntax: import <module> [as <valid_id>]);
# the following cells read its stored variables as attributes of `datasets`.
%vault import datasets

'Imported  at Sunday, 08. Dec 2019 16:56'

In [13]:
# uncomment the line below and press <tab> with cursor after the dot (.)
# to see how the autocompletion works on imported folders:

# datasets.

In [14]:
# Read a stored variable as an attribute of the imported module.
# NOTE(review): the "Reduced memory usage" message is printed by this cell and
# not by `%vault import datasets`, which suggests members are loaded on
# access - confirm.
datasets.salaries.head(3)

Reduced memory usage by 87.27%, from 0.79 MB to 0.10 MB.


Unnamed: 0,salary,city
0,49,Lagos
1,5,Tokyo
2,65,Lagos


In [15]:
# Same attribute-style access for the second stored variable.
datasets.salaries_by_age.head(3)

Reduced memory usage by 87.29%, from 0.87 MB to 0.11 MB.


Unnamed: 0,salary,city,age
0,49,Lagos,62
1,5,Tokyo,55
2,65,Lagos,90


To display all members in code:

In [16]:
# List the names of the variables exposed by the imported module.
dir(datasets)

['salaries', 'salaries_by_age']

### Assert

In [17]:
# Verify that `salaries` in `datasets` still has the hash that was reported
# when it was stored (6DE33062, listed as "crc32" in the access log).
%vault assert salaries in datasets is 6DE33062

'Verified datasets/salaries (6DE33062) at Sunday, 08. Dec 2019 16:56'

### Delete

In [18]:
# Delete `salaries` from the `datasets` module of the vault
# (syntax: del <one_variable> from <module>).
%vault del salaries from datasets

'Deleted datasets/salaries (6DE33062) at Sunday, 08. Dec 2019 16:56'

In [19]:
# After the deletion only `salaries_by_age` should still be listed.
dir(datasets)

['salaries_by_age']

### Access logs

In [20]:
# Print the vault's gzip-compressed access log; each line is a JSON record
# describing one `%vault` command (action, hashes, timestamps).
!zcat data/storage.zip.vault.log.gz

{"action": "store", "result": [{"new_file": {"crc32": "6DE33062", "sha256": "12F163211180F5E362D82D3EDA98E4DE78ED5F95EEC49DCBE3A7CB79B6708F5C"}, "old_file": {"crc32": null, "sha256": null}, "subject": "salaries"}], "started": "2019-12-08T16:56:52.173012", "finished": "2019-12-08T16:56:52.483703", "finished_human_readable": "Sunday, 08. Dec 2019 16:56", "command": "store salaries in datasets"}
{"action": "import", "result": [{"new_file": {"crc32": "6DE33062", "sha256": "12F163211180F5E362D82D3EDA98E4DE78ED5F95EEC49DCBE3A7CB79B6708F5C"}, "subject": "salaries_dataset"}], "started": "2019-12-08T16:56:52.492993", "finished": "2019-12-08T16:56:52.561010", "finished_human_readable": "Sunday, 08. Dec 2019 16:56", "command": "import salaries from datasets as salaries_dataset"}
{"action": "store", "result": [{"new_file": {"crc32": "80A8BBD8", "sha256": "13AA1ECD7C14187B98D7ADC7DF1F64AA743036323A7879A07F0730D4A973CD98"}, "old_file": {"crc32": null, "sha256": null}, "subject": "salaries_by_age"}

### Encrypted storage

Normally you would want to keep the definition of your key in a secure location, not in the notebook itself, but just for the sake of demonstration:

In [21]:
# Demo only: define the key in an environment variable from inside the
# notebook. A real key should be kept in a secure location instead.
%env STORAGE_KEY=SECRET_PASSWORD

env: STORAGE_KEY=SECRET_PASSWORD


In [22]:
# Open a second vault and name the environment variable that holds its key;
# subsequent `%vault` commands now operate on this archive.
%open_vault -p data/encrypted_storage.zip --encryption_variable STORAGE_KEY

In [23]:
# Store the frame in the encrypted vault; the command syntax is the same as
# for the unencrypted one.
%vault store salaries_by_age in encrypted_datasets

'Stored salaries_by_age (None → 80A8BBD8) at Sunday, 08. Dec 2019 16:56'

In [24]:
# List the files created so far: each vault archive is accompanied by its own
# `.vault.log.gz` access log.
!ls data

encrypted_storage.zip		    storage.zip
encrypted_storage.zip.vault.log.gz  storage.zip.vault.log.gz


In [25]:
from zipfile import ZipFile

# Try to read the stored member with the plain standard-library ZipFile and no
# password: the attempt is expected to fail, showing that the data is encrypted.
with ZipFile('data/encrypted_storage.zip') as zip_archive:
    try:
        # Only opening the member matters here; its contents are never read.
        with zip_archive.open('encrypted_datasets/salaries_by_age'):
            pass
    except RuntimeError as error:
        # ZipFile.open raises RuntimeError for an encrypted member when no
        # password is supplied. Any other failure (e.g. KeyError for a missing
        # member) is left to propagate rather than being mistaken for proof
        # of encryption.
        print(error)

File 'encrypted_datasets/salaries_by_age' is encrypted, password required for extraction


## Reference

In [26]:
# Print the class-level documentation of the magics as plain text.
print(data_vault.VaultMagics.__doc__)

The `%vault` magic provides a reproducible caching mechanism for variables exchange between notebooks.

    To open the vault use `%open_vault` magic.
    


In [27]:
# Print the `%open_vault` reference: its arguments and their defaults.
print(data_vault.VaultMagics.open_vault.__doc__)

Open a zip archive for the vault. Once opened, all subsequent `%vault` magics operate on this archive.

Open vault arguments:

	 --path, -p, default storage.zip
	 --encryption_variable, -e, default None
	 --secure, -s, default True
	 --optimize_df, -o, default True
	 --timestamp, -t, default True
	 --metadata, -m, default True
	 --logs_path, -l, default {path}.vault.log.gz
	 --gzip_logs, -g, default True


In [28]:
# Print the `%vault` reference: the grammar of the store / import / del /
# assert commands used throughout this notebook.
print(data_vault.VaultMagics.vault.__doc__)

Perform one of the available actions, print the description and save metadata in the cell.

Vault commands:

# store
	 - store <one_or_many_variables> in <module> as <valid_id> [with <function>]
	 - store <one_or_many_variables> in <module> [with <function> as <valid_id>]
	 - store <one_variable> in <path> [with <function>]
# import
	 - import <valid_id> from <module> as <valid_id> [with <function>]
	 - import <one_or_many_valid_id> from <module> [with <function> as <valid_id>]
	 - import <path> as <valid_id> [with <function>]
	 - import <module> [as <valid_id>]
# del
	 - del <one_variable> from <module> []
	 - del <path> []
# assert
	 - assert <one_variable> in <module> is <hash> [with <hash_method>]
	 - assert <path> is <hash> [with <hash_method>]


Optional arguments are contained within square brackets [].

Parameters are highlighted with angled brackets <>.