In [1]:
import numpy as np
import pandas as pd

#### Structured arrays and Record arrays

Structured arrays and record arrays provide efficient storage for compound, heterogeneous data.

In [2]:
# Parallel lists describing the same four people, one list per attribute
name, age, weight = (
    ['Alice', 'Bob', 'Cathy', 'Doug'],
    [25, 45, 37, 19],
    [55.0, 85.5, 68.0, 61.5],
)

In [4]:
# A plain array: every element shares one (integer) dtype
x = np.zeros(shape=4, dtype=int)

In [8]:
# A structured array is built from a compound dtype: each record has a
# 10-character unicode name, a 4-byte integer age and an 8-byte float weight
data = np.zeros(
    4,
    dtype=dict(names=('name', 'age', 'weight'),
               formats=('U10', 'i4', 'f8')),
)
print(data, end='\n')
print('\n', data.dtype)

[('', 0, 0.) ('', 0, 0.) ('', 0, 0.) ('', 0, 0.)]

 [('name', '<U10'), ('age', '<i4'), ('weight', '<f8')]


In [9]:
# Fill each field of the structured array from its matching Python list
data['name'], data['age'], data['weight'] = name, age, weight
print(data)

[('Alice', 25, 55. ) ('Bob', 45, 85.5) ('Cathy', 37, 68. )
 ('Doug', 19, 61.5)]


In [19]:
# Values can be pulled out by field name (a whole column)
# or by position (a whole record)
print(data['name'], data[0], sep='\n')

['Alice' 'Bob' 'Cathy' 'Doug']
('Alice', 25, 55.)


In [20]:
# Name stored in the last record
print(data['name'][-1])

Doug


In [22]:
# Boolean masks work too: names of everyone whose age is under 30
data['name'][data['age'] < 30]

array(['Alice', 'Doug'], dtype='<U10')

#### Multiple Ways to Define Compound Structures

In [23]:
# Dictionary form: parallel 'names' and 'formats' sequences
np.dtype({
    'names': ('name', 'age', 'weight'),
    'formats': ('U10', 'i4', 'f8'),
})

dtype([('name', '<U10'), ('age', '<i4'), ('weight', '<f8')])

In [24]:
# Formats may also be given as Python types or NumPy scalar types
np.dtype(dict(names=('name', 'age', 'weight'),
              formats=((np.str_, 10), int, np.float32)))

dtype([('name', '<U10'), ('age', '<i8'), ('weight', '<f4')])

In [26]:
# List-of-tuples form: one (field name, format) pair per field
np.dtype([
    ('name', 'S10'),
    ('age', 'i4'),
    ('weight', 'f8'),
])

dtype([('name', 'S10'), ('age', '<i4'), ('weight', '<f8')])

In [27]:
# Comma-separated format string: no field names are given, so the fields
# receive the default names f0, f1, f2
np.dtype('S10,i4,f8')

dtype([('f0', 'S10'), ('f1', '<i4'), ('f2', '<f8')])

#### Data Type Characters

- 'b' -- Byte
- 'i' -- Signed Int
- 'u' -- Unsigned Int
- 'f' -- Float
- 'c' -- Complex float
- 'S' -- String
- 'U' -- Unicode string
- 'V' -- Raw data

#### More Advanced Compound Types

In [28]:
# Each record pairs an 8-byte integer id with a 3x3 matrix of 8-byte floats
tp = np.dtype([('id', np.int64), ('mat', np.float64, (3, 3))])
X = np.zeros(shape=1, dtype=tp)
# Show the single record, then just its matrix field
print(X[0], X['mat'][0], sep='\n')

(0, [[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]])
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


#### Record Arrays

In [29]:
# On a plain structured array, a field is reached with a dictionary-style key
data['age']

array([25, 45, 37, 19], dtype=int32)

In [30]:
# Viewing the same data as a record array exposes each field as an attribute
data_rec = data.view(type=np.recarray)
data_rec.age

array([25, 45, 37, 19], dtype=int32)

In [31]:
# Record arrays carry extra overhead: field access on data_rec, whether by key
# or by attribute, is slower than key access on the plain structured array
%timeit data['age']
%timeit data_rec['age']
%timeit data_rec.age

99.9 ns ± 1.29 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
2.5 µs ± 29.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
3.15 µs ± 21.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
