In [1]:
import numpy as np

# Data Serialization: Converting Data Structures to a String of Bytes for Storage

- File Size Calculations, understanding dtype and the array format
- Thinking about the difference between memory formats and serialization formats


# Text

## Variable-Length Text

| Code | Description | 
| :-- | :-- |
| `x.tofile(fname, sep)` | |
| `x.tofile(fname, sep, fmt)` | |
| `np.fromfile(fname, sep, dtype)` | |

  - Format Codes

## 

In [3]:
%mkdir data2
x = np.arange(10)
x.tofile("data2/x5_1.dat", sep=',', format="%2d")
np.fromfile("data/x5_1.dat", sep=',', dtype=np.int32)


A subdirectory or file data2 already exists.
  np.fromfile("data/x5_1.dat", sep=',', dtype=np.int32)


array([0])

# Binary Flat Files

## Saving Single Arrays to Disk

(Copy to Memory and Memory Mapping)

| Code | Description | 
| :-- | :-- |
| `x.tobytes()` |  |
| `x.tofile(fname)` |  |
| `np.fromfile(fname, dtype)` |  |
| `np.fromfile(fname, dtype).reshape()` |  |
| `x.tofile(fname)` |  |
| `y = np.memmap(fname, dtype, shape)` |  |
| `del y; import gc; gc.collect()` |  |

- astype

In [17]:
x = np.arange(2)
x.tofile("data2/x5_1.dat")
np.fromfile("data2/x5_1.dat", dtype=np.int32)
np.fromfile("data2/x5_1.dat", dtype=np.uint8)

array([0, 0, 0, 0, 1, 0, 0, 0], dtype=uint8)

In [13]:
x.tobytes()

b'\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x05\x00\x00\x00\x06\x00\x00\x00\x07\x00\x00\x00\x08\x00\x00\x00\t\x00\x00\x00'

Multidimensional Arrays: need to store the shape

In [19]:
x = np.arange(12).reshape(3, 4)
x.tofile("data2/x6.dat")
np.fromfile("data2/x6.dat", dtype=np.int32).reshape(3, 4)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [None]:
x = np.arange(12).reshape(3, 4)
x.tofile("data2/xx3.npy")


In [None]:
y = np.memmap("data2/xx3.dat", dtype=np.int32, shape=(3, 4))
y


memmap([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])

In [83]:
del y

## Self-Describing Binary Files with NPY: Putting the File Format Metadata into the File itself

| Code | Description |
| :-- | :-- |
| `np.save()` | |
| `np.load()` | |

In [26]:
np.save( "data2/xx4.npy", np.arange(12).reshape(3, 4))
np.load("data2/xx4.npy")

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

## NPY and NPZ Files

| Code | Description |
| :-- | :-- |
| `np.save(fname, x)` | Save to NPY file |
| `np.savez(fname, **vars)` | Save to NPZ file |
| `np.load(fname`) | Load from either NPY or NPZ file |

## (extra) Self-Describing Binary Flat Files: Using the Filename to store Format Metadata

| Code | Description |
| :-- | :-- |
| `a, b, *c = [1, 2, 3, 4, 5]` |  |
| `str.split()` |  |
| `np.array(x).astype(dtype)` |   |
| `getattr(np, name)` |  |
| `x.reshape()` |  |
| `np.fromfile(fname, dtype).reshape(shape)`  |   |

In [39]:
x = np.arange(20, dtype=np.int32).reshape(4, 5)
fname = "name_int32_4_5.dat"
x.tofile(fname)

name, dtype, *shape = fname.split('.')[0].split('_')
shape = np.array(shape).astype(int)
np.fromfile(fname, dtype=getattr(np, dtype)).reshape(shape)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19]])

## Inserting Metadata into Numpy Files

  - Single-item arrays
  - Problems with Pickling
  - Pre-Serializing Data

In [None]:
mdata = {'author': 'Emma', 'age': 1.5}
np.savez('data2/ee.npz', mdata=mdata)
np.load('data2/ee.npz', allow_pickle=True)['mdata'].item()

In [None]:
np.savez('data2/ee.npz', **mdata)
dict(np.load('data2/ee.npz'))
np.load('data2/ee.npz')['author'].item()

'Emma'

In [None]:
import json
np.savez('data2/ee2.npz', mdata=json.dumps(mdata))
json.loads(np.load('data2/ee2.npz')['mdata'].item())

{'author': 'Emma', 'age': 1.5}