# Advanced NumPy

In [1]:
%pylab inline

import numpy as np
from numpy.random import randn

Populating the interactive namespace from numpy and matplotlib


## ndarray Object Internals

NumPy ndarray 提供了一种将同质数据块解释为多维数组对象的方式

ndarray 内部有以下组成
- 一个指向数组的指针
- 数据类型 dtype
- 一个表示数组形状的元组 tuple
- 一个跨度元组 stride。例如 3x4x5 float64 数组，跨度为 (160, 40, 8)

In [2]:
np.empty((3,4,5), dtype='f8').strides

(160, 40, 8)

### NumPy dtype Hierarchy

In [3]:
ints = np.ones(10, dtype='i2')
floats = np.ones(10, dtype='f4')

In [4]:
np.issubdtype(ints.dtype, np.integer)

True

In [5]:
np.issubdtype(floats.dtype, np.floating)

True

In [6]:
# 调用 dtype 的 mro 方法查看所有父类
np.float64.mro()

[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

## Advanced Array Manipulation
### Reshaping Arrays

In [7]:
arr = np.arange(8)
arr

array([0, 1, 2, 3, 4, 5, 6, 7])

In [8]:
arr.reshape((4,2))

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [9]:
# 作为参数的形状其中一维可以是 -1，表示该维度大小由数据本身推断
arr.reshape((-1,2))

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [10]:
# 数组shape属性是一个 tuple, 可以当参数传入reshape
other_arr = np.ones((3,5))

arr = np.arange(15)
arr = arr.reshape(other_arr.shape)
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [11]:
#  ravel 为 reshape 反运算，将数组扁平化
arr1 = arr.ravel()
arr1

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [12]:
# flatten 类似 ravel，但返回数据副本
arr2 = arr.flatten()
arr2

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [13]:
arr1[1] = 111
arr2[2] = 222

# 修改arr2(副本)不会改变arr
arr, arr1, arr2

(array([[  0, 111,   2,   3,   4],
        [  5,   6,   7,   8,   9],
        [ 10,  11,  12,  13,  14]]),
 array([  0, 111,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14]),
 array([  0,   1, 222,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14]))

### C versus Fortran Order

In [14]:
arr = np.arange(12).reshape((3,4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [15]:
# C: row first
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [16]:
# Fortran: column first
arr.ravel('F')

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

### Concatenating and Splitting Arrays

In [17]:
arr1 = np.array([[1,2,3], [4,5,6]])
arr2 = np.array([[7,8,9], [10,11,12]])

# 预设沿着 axis=0 连接
np.concatenate([arr1, arr2])

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [18]:
# 沿着 axis=1 连接
np.concatenate([arr1, arr2], axis=1)

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [19]:
# 等同于 concatenate(axis=0)
np.vstack((arr1, arr2))

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [20]:
# 等同于 concatenate(axis=1)
np.hstack((arr1, arr2))

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [21]:
arr = np.arange(5).reshape((5,1))

# 沿着 axis=0，拆分数组
first, second, third = np.split(arr, [1,3])

In [22]:
first

array([[0]])

In [23]:
second

array([[1],
       [2]])

In [24]:
third

array([[3],
       [4]])

数组连接函数

函数 | 说明
---|---
`concatenate` | 一般化的连接
`vstack`, `row_stack` | 沿轴0进行堆叠
`hstack` | 沿轴1进行堆叠
`column_stack` | 类似 `hstack`, 但是会先将一维数组转换为二维列向量
`dstack` | 以面向“深度”的方式对数组进行堆叠（沿轴2)
`split` | 沿指定轴在指定的位置拆分数组
`hsplit`, `vsplit`, `dsplit` | split 的便捷化函数，分别沿轴1、轴0、轴2进行拆分

In [25]:
arr1 = np.array([1,2,3,4]).reshape((2,2))
arr2 = np.array([5,6,7,8]).reshape((2,2))

In [26]:
vs = np.vstack([arr1, arr2])
vs

array([[1, 2],
       [3, 4],
       [5, 6],
       [7, 8]])

In [27]:
hs = np.hstack([arr1, arr2])
hs

array([[1, 2, 5, 6],
       [3, 4, 7, 8]])

In [28]:
np.column_stack([arr1, arr2])

array([[1, 2, 5, 6],
       [3, 4, 7, 8]])

In [29]:
cube = np.dstack([arr1, arr2])
cube

array([[[1, 5],
        [2, 6]],

       [[3, 7],
        [4, 8]]])

In [30]:
# Split the vertically stacked array back into 2 equal pieces along axis 0.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
for arr in np.vsplit(vs, 2):
    print(arr)

[[1 2]
 [3 4]]
[[5 6]
 [7 8]]


In [31]:
# Split the horizontally stacked array back into 2 equal pieces along axis 1.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
for arr in np.hsplit(hs, 2):
    print(arr)

[[1 2]
 [3 4]]
[[5 6]
 [7 8]]


In [32]:
# Why this does not undo dstack: a list argument gives split *indices*, not a
# section count. cube has depth 2, so splitting at index 2 returns the whole
# cube followed by an empty array. Passing the integer 2 (np.dsplit(cube, 2))
# would instead yield two equal sections along axis 2.
for arr in np.dsplit(cube, [2]):
    print(arr)

[[[1 5]
  [2 6]]

 [[3 7]
  [4 8]]]
[]


#### Stacking helpers: `r_` and `c_`

使数组的堆叠操作更简洁

In [33]:
arr = np.arange(6)
arr1 = arr.reshape((3,2))
arr2 = randn(3,2)

In [34]:
np.r_[arr1, arr2]

array([[  0.00000000e+00,   1.00000000e+00],
       [  2.00000000e+00,   3.00000000e+00],
       [  4.00000000e+00,   5.00000000e+00],
       [  1.94039185e-01,  -1.76624982e+00],
       [  1.01961712e+00,   2.22813737e+00],
       [ -1.38609441e-03,   2.18065808e+00]])

In [35]:
np.c_[arr1, arr2]

array([[  0.00000000e+00,   1.00000000e+00,   1.94039185e-01,
         -1.76624982e+00],
       [  2.00000000e+00,   3.00000000e+00,   1.01961712e+00,
          2.22813737e+00],
       [  4.00000000e+00,   5.00000000e+00,  -1.38609441e-03,
          2.18065808e+00]])

In [36]:
np.c_[np.r_[arr1, arr2], arr]

array([[  0.00000000e+00,   1.00000000e+00,   0.00000000e+00],
       [  2.00000000e+00,   3.00000000e+00,   1.00000000e+00],
       [  4.00000000e+00,   5.00000000e+00,   2.00000000e+00],
       [  1.94039185e-01,  -1.76624982e+00,   3.00000000e+00],
       [  1.01961712e+00,   2.22813737e+00,   4.00000000e+00],
       [ -1.38609441e-03,   2.18065808e+00,   5.00000000e+00]])

In [37]:
# 将切片翻译成数组
np.c_[1:6, -10:-5]

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

### Repeating Elements: Tile and Repeat

因为广播（broadcasting），NumPy 很少需要对数组进行重复（replicate）

In [38]:
arr = np.arange(3)
arr.repeat(3)

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [39]:
# 传入数组，指定重复次数
arr.repeat([2,3,4])

array([0, 0, 1, 1, 1, 2, 2, 2, 2])

In [40]:
arr = np.arange(4).reshape((2,2))

# 沿指定轴重复
arr.repeat(2, axis=0)

array([[0, 1],
       [0, 1],
       [2, 3],
       [2, 3]])

In [41]:
arr.repeat(2, axis=1)

array([[0, 0, 1, 1],
       [2, 2, 3, 3]])

In [42]:
# 传入数组，指定重复次数
arr.repeat([2,3], axis=0)

array([[0, 1],
       [0, 1],
       [2, 3],
       [2, 3],
       [2, 3]])

In [43]:
# 沿指定轴，堆叠数组的副本数（像铺瓷砖）
np.tile(arr, 3)

array([[0, 1, 0, 1, 0, 1],
       [2, 3, 2, 3, 2, 3]])

In [44]:
# 第二个参数可以是表示布局的 tuple，
np.tile(arr, (3,2))

array([[0, 1, 0, 1],
       [2, 3, 2, 3],
       [0, 1, 0, 1],
       [2, 3, 2, 3],
       [0, 1, 0, 1],
       [2, 3, 2, 3]])

### Fancy Indexing Equivalents: Take and Put

In [45]:
arr = np.arange(10) * 100

# 透过整数数组使用花式索引
inds = [7,1,2,6]
arr[inds]

array([700, 100, 200, 600])

In [46]:
# 获取单个轴向的选区
arr.take(inds)

array([700, 100, 200, 600])

In [47]:
# 设置单个轴向的选区
arr.put(inds, 42)
arr

array([  0,  42,  42, 300, 400, 500,  42,  42, 800, 900])

In [48]:
arr.put(inds, [11,22,33,44])
arr

array([  0,  22,  33, 300, 400, 500,  44,  11, 800, 900])

In [49]:
inds = [2,0,2,1]
arr = np.arange(2*4).reshape((2,4))

# 沿着 axis=1 选取
arr.take(inds, axis=1)

array([[2, 0, 2, 1],
       [6, 4, 6, 5]])

## Broadcasting

In [50]:
arr = np.arange(5)

# 标量被广播到所有元素上
arr * 4

array([ 0,  4,  8, 12, 16])

In [51]:
arr = randn(4,3)

# 沿轴0，计算平均
means = arr.mean(axis=0)

# 沿轴0，广播
demeaned = arr - means

# 验证
demeaned.mean(0)

array([ -1.38777878e-17,   0.00000000e+00,  -5.55111512e-17])

In [52]:
# 沿轴1，计算平均
row_means = arr.mean(1)

# 沿轴1，广播
demeaned = arr - row_means.reshape((4,1))

# 验证
demeaned.mean(1)

array([  7.40148683e-17,   7.40148683e-17,   0.00000000e+00,
        -1.85037171e-17])

### Broadcasting Over Other Axes

In [53]:
# 以下会引发错误（维度不对）
# arr - arr.mean(1)

arr.shape, arr.mean(1).shape

((4, 3), (4,))

In [54]:
# 根据广播原则，较小数组的“广播维”必须是1。
# 上面的例子，要将平均值的形状由(4,)变成(4,1)
arr - arr.mean(1).reshape((4,1))

array([[-0.74313326,  0.10154811,  0.64158516],
       [ 0.79396576, -0.18512728, -0.60883849],
       [-0.70015315,  1.63267494, -0.93252179],
       [-1.12422722,  1.04394013,  0.08028709]])

#### 使用 np.newaxis 插入新轴

In [55]:
arr = np.arange(3)
arr

array([0, 1, 2])

In [56]:
arr.shape

(3,)

In [57]:
# 插入 axis=1 的新轴
arr_2d = arr[:, np.newaxis]
arr_2d

array([[0],
       [1],
       [2]])

In [58]:
arr_2d.shape

(3, 1)

In [59]:
# 插入 axis=0 的新轴
arr_2d = arr[np.newaxis, :]
arr_2d

array([[0, 1, 2]])

In [60]:
arr_2d.shape

(1, 3)

In [61]:
arr = np.arange(4).reshape((2,2))

# 插入 axis=1 的新轴（很难想像...）
arr_3d = arr[:, np.newaxis, :]
arr_3d.shape

(2, 1, 2)

In [62]:
arr = randn(3,4,5)

# 沿轴2，计算平均
means_axis2 = arr.mean(2)

# 沿轴2，广播平均值
demeaned = arr - means_axis2[:, :, np.newaxis]

# 验证
demeaned.mean(2)

array([[ -2.22044605e-17,   8.88178420e-17,  -2.22044605e-17,
         -4.44089210e-17],
       [  2.22044605e-17,   1.11022302e-17,  -5.55111512e-18,
         -1.11022302e-17],
       [ -4.44089210e-17,   8.88178420e-17,  -3.33066907e-17,
          0.00000000e+00]])

### Setting Array Values by Broadcasting

In [63]:
arr = np.zeros((4,3))

# 使用索引设置数组
# 使用 [:] 表示对数组内容赋值；不然 arr = 5 只会把变量名 arr 重新绑定到标量
arr[:] = 5
arr

array([[ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.]])

In [64]:
col = np.array([1,2,3,4])

# 沿 axis=1 广播
arr[:] = col[:, np.newaxis]
arr

array([[ 1.,  1.,  1.],
       [ 2.,  2.,  2.],
       [ 3.,  3.,  3.],
       [ 4.,  4.,  4.]])

In [65]:
row = np.array([1,2,3])

# 沿 axis=0 广播
arr[:] = row[np.newaxis, :]
arr

array([[ 1.,  2.,  3.],
       [ 1.,  2.,  3.],
       [ 1.,  2.,  3.],
       [ 1.,  2.,  3.]])

In [66]:
# row 0,1 赋值
arr[:2] = 5
arr

array([[ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 1.,  2.,  3.],
       [ 1.,  2.,  3.]])

In [67]:
# row 0,1 赋值
arr[:2] = [[6],[7]]
arr

array([[ 6.,  6.,  6.],
       [ 7.,  7.,  7.],
       [ 1.,  2.,  3.],
       [ 1.,  2.,  3.]])

In [68]:
# row 0,1 赋值
arr[:2] = [[2,3,4], [4,5,6]]
arr

array([[ 2.,  3.,  4.],
       [ 4.,  5.,  6.],
       [ 1.,  2.,  3.],
       [ 1.,  2.,  3.]])

## Advanced ufunc Usage
### ufunc Instance Methods

ufunc 的方法

方法 | 说明
---|---
`reduce(x)` | 通过连续执行原始运算的方式进行聚合
`accumulate(x)` | 聚合值，保留所有局部聚合结果
`reduceat(x, bins)` | “局部”约简（groupby），约简数据的各个切片以产生聚合型数组
`outer(x, y)` | 对x和y中的每对元素应用原始运算，结果数组的形状为 x.shape + y.shape。相当于计算两个数组的外积

In [75]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [72]:
np.add.reduce(arr)

45

In [73]:
np.add.accumulate(arr)

array([ 0,  1,  3,  6, 10, 15, 21, 28, 36, 45])

In [74]:
# Sum the slices arr[0:5], arr[5:8] and arr[8:] in one call.
np.add.reduceat(arr, [0,5,8])

array([10, 18, 17])

In [80]:
np.multiply.outer(np.arange(1,10), np.arange(1,10)) # 九九乘法

array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9],
       [ 2,  4,  6,  8, 10, 12, 14, 16, 18],
       [ 3,  6,  9, 12, 15, 18, 21, 24, 27],
       [ 4,  8, 12, 16, 20, 24, 28, 32, 36],
       [ 5, 10, 15, 20, 25, 30, 35, 40, 45],
       [ 6, 12, 18, 24, 30, 36, 42, 48, 54],
       [ 7, 14, 21, 28, 35, 42, 49, 56, 63],
       [ 8, 16, 24, 32, 40, 48, 56, 64, 72],
       [ 9, 18, 27, 36, 45, 54, 63, 72, 81]])

### Custom ufuncs

In [81]:
def add_element(x, y):
    """Return the sum of two scalars; wrapped below into an element-wise ufunc."""
    total = x + y
    return total

# np.frompyfunc takes a Python callable plus two integers, the number of
# inputs (2) and the number of outputs (1), and returns a ufunc-like object.
# The result array has dtype=object.
add_them = np.frompyfunc(add_element, 2, 1)

add_them(np.arange(8), np.arange(8))

array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)

In [83]:
arr = randn(1000)

In [84]:
%timeit add_them(arr, arr) # 纯python运算，效率不好

100 loops, best of 3: 1.87 ms per loop


In [85]:
%timeit np.add(arr, arr)

The slowest run took 1089.26 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 945 ns per loop


## Structured and Record Arrays

In [99]:
# 结构化数组是一种特殊的 ndarray (类似C语言给的struct)
dtype = [('x', np.float64), ('y', np.int32)]

sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr

array([(1.5, 6), (3.141592653589793, -2)], 
      dtype=[('x', '<f8'), ('y', '<i4')])

In [101]:
# 字段名保存在dtype.names属性中
sarr.dtype.names

('x', 'y')

### Nested dtypes and Multidimensional Fields

In [102]:
# x 字段是一个长度为3的数组
dtype = [('x', np.float64, 3), ('y', np.int32)]

arr = np.zeros(4, dtype=dtype)
arr

array([([0.0, 0.0, 0.0], 0), ([0.0, 0.0, 0.0], 0), ([0.0, 0.0, 0.0], 0),
       ([0.0, 0.0, 0.0], 0)], 
      dtype=[('x', '<f8', (3,)), ('y', '<i4')])

In [103]:
arr[0]['x']

array([ 0.,  0.,  0.])

In [104]:
arr['x']

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [107]:
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', 'i4')]

data = np.array([((1, 2), 5), ((3, 4), 6)], dtype = dtype)
data

array([((1.0, 2.0), 5), ((3.0, 4.0), 6)], 
      dtype=[('x', [('a', '<f8'), ('b', '<f4')]), ('y', '<i4')])

In [112]:
data['y']

array([5, 6], dtype=int32)

In [113]:
data['x']['a']

array([ 1.,  3.])

### Why Use Structured Arrays?

NumPy 结构化数组每个元素在内存中被表示为固定的字节数，可以非常快速高效读写

### Structured Array Manipulations: numpy.lib.recfunctions

NumPy 模块 numpy.lib.recfunctions 中有一些用于增删字段或执行基本连接运算的工具

## More About Sorting

In [114]:
arr = randn(6)

# 数组内容重新排列，不会产生新数组
arr.sort()
arr

array([-0.71861215, -0.6008828 , -0.38600662,  0.11908311,  0.29895309,
        1.03613883])

In [116]:
arr = randn(3,4)
arr

array([[-1.2229256 , -1.31766035,  1.76160977,  0.07291096],
       [-1.02111402, -0.0451849 ,  0.92260208, -0.32976512],
       [-1.8395554 ,  0.56494144,  0.471448  ,  0.58933423]])

In [117]:
arr[:, 0].sort() # sort first column values in-place
arr

array([[-1.8395554 , -1.31766035,  1.76160977,  0.07291096],
       [-1.2229256 , -0.0451849 ,  0.92260208, -0.32976512],
       [-1.02111402,  0.56494144,  0.471448  ,  0.58933423]])

In [124]:
arr = np.array([1,3,5,2,4])

# numpy.sort 会创建一个已排序副本
np.sort(arr), arr

(array([1, 2, 3, 4, 5]), array([1, 3, 5, 2, 4]))

In [126]:
arr = np.array([[6,4,2],[5,3,1]])

# 沿轴1排序
arr.sort(axis=1)
arr

array([[2, 4, 6],
       [1, 3, 5]])

In [127]:
# 沿轴0排序
arr.sort(axis=0)
arr

array([[1, 3, 5],
       [2, 4, 6]])

In [128]:
# sort 排序不可设置为降序。但数组切片会产生视图，values[::-1]可以返回反序列表
arr[:, ::-1]

array([[5, 3, 1],
       [6, 4, 2]])

### Indirect Sorts: argsort and lexsort

In [129]:
values = np.array([5,0,1,3,2])

# 取得排序数据的索引数组
indexer = values.argsort()
values[indexer]

array([0, 1, 2, 3, 5])

In [130]:
arr = randn(3,5)
arr[0] = values
arr

array([[ 5.        ,  0.        ,  1.        ,  3.        ,  2.        ],
       [ 1.03695498, -1.59280187, -0.77651149,  0.80749814,  1.48324287],
       [-1.72937855,  0.71768437, -0.27148062, -0.07127078,  0.27291848]])

In [132]:
# 根据 row 0 数组排序 columns
arr[:, arr[0].argsort()]

array([[ 0.        ,  1.        ,  2.        ,  3.        ,  5.        ],
       [-1.59280187, -0.77651149,  1.48324287,  0.80749814,  1.03695498],
       [ 0.71768437, -0.27148062,  0.27291848, -0.07127078, -1.72937855]])

In [134]:
first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])

# lexsort performs an indirect sort over several key arrays at once.
# Keys are applied starting from the LAST one passed, so last_name is the
# primary key and first_name breaks ties.
sorter = np.lexsort((first_name, last_name))

# zip() is lazy on Python 3; wrap it in list() so the pairs are displayed on
# both Python 2 and Python 3.
list(zip(last_name[sorter], first_name[sorter]))

[('Arnold', 'Jane'),
 ('Arnold', 'Steve'),
 ('Jones', 'Bill'),
 ('Jones', 'Bob'),
 ('Walters', 'Barbara')]

### Alternate Sort Algorithms

数组排序算法

kind | 速度 | 稳定性 | 工作空间 | 最坏情况
---|---|---|---|---
`quicksort` | 1 | 否 | 0 | O(n<sup>2</sup>)
`mergesort` | 2 | 是 | n/2 | O(n log n) 
`heapsort`  | 3 | 否 | 0 | O(n log n)

In [138]:
arr = np.array([2,2,1,1,1])

# mergesort 会保持等价元素的相对位置
arr.argsort(kind='mergesort')

array([2, 3, 4, 0, 1])

### numpy.searchsorted: Finding elements in a Sorted Array

In [140]:
arr = np.array([0,1,7,12,15])

# 返回插入位置，在该位置插入后数组依然保持有序
arr.searchsorted(9)

3

In [141]:
# 返回一组索引
arr.searchsorted([0,8,11,16])

array([0, 3, 3, 5])

In [142]:
arr = np.array([0,0,0,1,1,1,1])

# 返回等值组左侧索引
arr.searchsorted([0,1])

array([0, 3])

In [143]:
# 返回等值组右侧索引
arr.searchsorted([0,1], side='right')

array([3, 7])