# Advanced NumPy

In [2]:
%pylab inline

import numpy as np
from numpy.random import randn

Populating the interactive namespace from numpy and matplotlib


## ndarray Object Internals

NumPy ndarray 提供了一种将同质数据块解释为多维数组对象的方式

ndarray 内部有以下组成
- 一个指向数组的指针
- 数据类型 dtype
- 一个表示数组形状的元组 tuple
- 一个跨度元组 strides，表示沿各维度前进一个元素所需“跨过”的字节数。例如 3x4x5 float64 数组，跨度为 (160, 40, 8)

In [3]:
# Strides (in bytes) of a 3x4x5 array of 8-byte floats
np.empty((3,4,5), dtype='f8').strides

(160, 40, 8)

### NumPy dtype Hierarchy

In [4]:
# Two length-10 arrays of ones: 2-byte integers and 4-byte floats
ints = np.ones(10, dtype=np.int16)
floats = np.ones(10, dtype=np.float32)

In [6]:
# Check whether the dtype of ints falls under the generic integer type
np.issubdtype(ints.dtype, np.integer)

True

In [7]:
# Check whether the dtype of floats falls under the generic floating-point type
np.issubdtype(floats.dtype, np.floating)

True

In [8]:
# Call the type's mro() method to list all of its parent classes
np.float64.mro()

[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

## Advanced Array Manipulation
### Reshaping Arrays

In [10]:
# 1-D array holding the integers 0..7
arr = np.arange(8)
arr

array([0, 1, 2, 3, 4, 5, 6, 7])

In [11]:
# Reshape the 8 elements into 4 rows x 2 columns
arr.reshape((4,2))

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [14]:
# One dimension of the target shape may be -1; its size is then inferred from the data
arr.reshape((-1,2))

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [30]:
# An array's shape attribute is a tuple, so it can be handed straight to reshape
other_arr = np.ones((3, 5))

arr = np.arange(15).reshape(other_arr.shape)
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [31]:
# ravel is the reverse of reshape: it flattens the array to 1-D
arr1 = arr.ravel()
arr1

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [32]:
# flatten is like ravel, but it returns a copy of the data
arr2 = arr.flatten()
arr2

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [33]:
# Write one element through each flattened array
arr1[1] = 111
arr2[2] = 222

# Modifying arr2 (a copy) does not change arr
arr, arr1, arr2

(array([[  0, 111,   2,   3,   4],
        [  5,   6,   7,   8,   9],
        [ 10,  11,  12,  13,  14]]),
 array([  0, 111,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14]),
 array([  0,   1, 222,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14]))

### C versus Fortran Order

In [34]:
# 3x4 array holding the integers 0..11
arr = np.arange(12).reshape((3,4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [35]:
# C order (the default): row first, rows are laid out one after another
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [36]:
# Fortran order: column first, columns are laid out one after another
arr.ravel('F')

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

### Concatenating and Splitting Arrays

In [37]:
# Two 2x3 integer arrays to join
arr1 = np.array([[1,2,3], [4,5,6]])
arr2 = np.array([[7,8,9], [10,11,12]])

# By default the arrays are joined along axis=0
np.concatenate([arr1, arr2])

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [38]:
# Join along axis=1
np.concatenate([arr1, arr2], axis=1)

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [39]:
# Equivalent to concatenate(axis=0)
np.vstack((arr1, arr2))

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [40]:
# Equivalent to concatenate(axis=1)
np.hstack((arr1, arr2))

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [53]:
# Column vector (5x1) holding 0..4
arr = np.arange(5).reshape((5,1))

# Split along axis=0 at row indices 1 and 3, which yields three pieces
first, second, third = np.split(arr, [1,3])

In [50]:
# Piece before split index 1
first

array([[0]])

In [51]:
# Piece between split indices 1 and 3
second

array([[1],
       [2]])

In [52]:
# Piece from split index 3 to the end
third

array([[3],
       [4]])

数组连接函数

函数 | 说明
---|---
`concatenate` | 一般化的连接
`vstack`, `row_stack` | 沿轴0进行堆叠（`row_stack` 是 `vstack` 的别名，在 NumPy 2.0 中已弃用）
`hstack` | 沿轴1进行堆叠
`column_stack` | 类似 `hstack`, 但是会先将一维数组转换为二维列向量
`dstack` | 以面向“深度”的方式对数组进行堆叠（沿轴2)
`split` | 沿指定轴在指定的位置拆分数组
`hsplit`, `vsplit`, `dsplit` | split 的便捷化函数，分别沿轴1、0、2进行拆分

In [69]:
# Two 2x2 integer arrays: 1..4 and 5..8, filled row by row
arr1 = np.arange(1, 5).reshape(2, 2)
arr2 = np.arange(5, 9).reshape(2, 2)

In [72]:
# Stack the two 2x2 arrays vertically into a 4x2 array
vs = np.vstack([arr1, arr2])
vs

array([[1, 2],
       [3, 4],
       [5, 6],
       [7, 8]])

In [73]:
# Stack the two 2x2 arrays horizontally into a 2x4 array
hs = np.hstack([arr1, arr2])
hs

array([[1, 2, 5, 6],
       [3, 4, 7, 8]])

In [66]:
# For these 2-D inputs column_stack gives the same result as hstack
np.column_stack([arr1, arr2])

array([[1, 2, 5, 6],
       [3, 4, 7, 8]])

In [89]:
# Stack depth-wise: the two 2x2 arrays are paired element by element along a third axis
cube = np.dstack([arr1, arr2])
cube

array([[[1, 5],
        [2, 6]],

       [[3, 7],
        [4, 8]]])

In [84]:
# Split the vertically stacked array back into 2 equal blocks of rows.
# print() is called as a function so the cell runs on Python 3 as well as Python 2.
for arr in np.vsplit(vs, 2):
    print(arr)

[[1 2]
 [3 4]]
[[5 6]
 [7 8]]


In [83]:
# Split the horizontally stacked array back into 2 equal blocks of columns.
# print() is called as a function so the cell runs on Python 3 as well as Python 2.
for arr in np.hsplit(hs, 2):
    print(arr)

[[1 2]
 [3 4]]
[[5 6]
 [7 8]]


In [91]:
# Why does dsplit not look like the inverse of dstack here?
# A list argument is interpreted as split *indices*, not as a number of sections:
# [2] cuts at depth index 2, and because cube only has depth 2 the first piece is
# the whole array and the second piece is empty.
# Passing the integer 2 (np.dsplit(cube, 2)) asks for 2 equal sections instead.
# print() is called as a function so the cell runs on Python 3 as well as Python 2.
for arr in np.dsplit(cube, [2]):
    print(arr)

[[[1 5]
  [2 6]]

 [[3 7]
  [4 8]]]
[]


#### Stacking helpers: `r_` and `c_`

使数组的堆叠操作更简洁

In [92]:
# arr: 1-D integers 0..5; arr1: the same values as a 3x2 array; arr2: 3x2 random floats
arr = np.arange(6)
arr1 = arr.reshape((3,2))
arr2 = randn(3,2)

In [93]:
# r_ stacks the arrays on top of each other (along rows)
np.r_[arr1, arr2]

array([[ 0.        ,  1.        ],
       [ 2.        ,  3.        ],
       [ 4.        ,  5.        ],
       [-0.28911353,  0.08823438],
       [ 1.37626805, -1.54825152],
       [ 1.76923133, -0.30563579]])

In [94]:
# c_ stacks the arrays side by side (along columns)
np.c_[arr1, arr2]

array([[ 0.        ,  1.        , -0.28911353,  0.08823438],
       [ 2.        ,  3.        ,  1.37626805, -1.54825152],
       [ 4.        ,  5.        ,  1.76923133, -0.30563579]])

In [95]:
# The helpers nest: stack the rows first, then append the 1-D arr as an extra column
np.c_[np.r_[arr1, arr2], arr]

array([[ 0.        ,  1.        ,  0.        ],
       [ 2.        ,  3.        ,  1.        ],
       [ 4.        ,  5.        ,  2.        ],
       [-0.28911353,  0.08823438,  3.        ],
       [ 1.37626805, -1.54825152,  4.        ],
       [ 1.76923133, -0.30563579,  5.        ]])

In [96]:
# Slices are translated into arrays
np.c_[1:6, -10:-5]

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

### Repeating Elements: Tile and Repeat

因为广播（broadcasting），NumPy 很少需要对数组进行重复（replicate）

In [97]:
# Repeat every element 3 times
arr = np.arange(3)
arr.repeat(3)

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [99]:
# Pass a sequence to give each element its own repeat count
arr.repeat([2,3,4])

array([0, 0, 1, 1, 1, 2, 2, 2, 2])

In [100]:
# 2x2 array holding 0..3
arr = np.arange(4).reshape((2,2))

# Repeat along the given axis (here each row is repeated twice)
arr.repeat(2, axis=0)

array([[0, 1],
       [0, 1],
       [2, 3],
       [2, 3]])

In [101]:
# Repeat each column twice
arr.repeat(2, axis=1)

array([[0, 0, 1, 1],
       [2, 2, 3, 3]])

In [103]:
# Pass a sequence of repeat counts: first row twice, second row three times
arr.repeat([2,3], axis=0)

array([[0, 1],
       [0, 1],
       [2, 3],
       [2, 3],
       [2, 3]])

In [108]:
# tile stacks whole copies of the array along an axis (like laying tiles)
np.tile(arr, 3)

array([[0, 1, 0, 1, 0, 1],
       [2, 3, 2, 3, 2, 3]])

In [109]:
# The second argument can be a tuple describing the layout: 3 copies down, 2 across
np.tile(arr, (3,2))

array([[0, 1, 0, 1],
       [2, 3, 2, 3],
       [0, 1, 0, 1],
       [2, 3, 2, 3],
       [0, 1, 0, 1],
       [2, 3, 2, 3]])

### Fancy Indexing Equivalents: Take and Put

In [111]:
# Multiples of 100 from 0 to 900
arr = np.arange(10) * 100

# Fancy indexing with a list of integers
inds = [7,1,2,6]
arr[inds]

array([700, 100, 200, 600])

In [112]:
# take: get a selection along a single axis
arr.take(inds)

array([700, 100, 200, 600])

In [114]:
# put: set a selection along a single axis (the scalar is written to every selected position)
arr.put(inds, 42)
arr

array([  0,  42,  42, 300, 400, 500,  42,  42, 800, 900])

In [115]:
# put with one value per index: position inds[i] receives the i-th value
arr.put(inds, [11,22,33,44])
arr

array([  0,  22,  33, 300, 400, 500,  44,  11, 800, 900])

In [116]:
# Column indices to pick, and a 2x4 array holding 0..7
inds = [2,0,2,1]
arr = np.arange(2*4).reshape((2,4))

# Select along axis=1 (the listed columns are taken from every row)
arr.take(inds, axis=1)

array([[2, 0, 2, 1],
       [6, 4, 6, 5]])

## Broadcasting

In [160]:
arr = np.arange(5)

# The scalar is broadcast to every element
arr * 4

array([ 0,  4,  8, 12, 16])

In [161]:
# 4x3 array of random floats
arr = randn(4,3)

# Mean along axis 0 (one mean per column)
means = arr.mean(axis=0)

# Broadcast along axis 0: the per-column means are subtracted from every row
demeaned = arr - means

# Check: the column means are now zero up to floating-point error
demeaned.mean(0)

array([  5.55111512e-17,   5.55111512e-17,   0.00000000e+00])

In [162]:
# Mean along axis 1 (one mean per row)
row_means = arr.mean(1)

# Broadcast along axis 1: the means are reshaped to (4, 1) so they line up with the rows
demeaned = arr - row_means.reshape((4,1))

# Check: the row means are now zero up to floating-point error
demeaned.mean(1)

array([  3.70074342e-17,   3.70074342e-17,   5.55111512e-17,
        -2.77555756e-17])

### Broadcasting Over Other Axes

In [163]:
# The following would raise an error (the shapes do not match)
# arr - arr.mean(1)

# Compare the two shapes involved
arr.shape, arr.mean(1).shape

((4, 3), (4,))

In [164]:
# By the broadcasting rule, the "broadcast dimension" of the smaller array must be 1.
# In the example above, the shape of the means has to change from (4,) to (4, 1)
arr - arr.mean(1).reshape((4,1))

array([[-1.44819875,  0.53620032,  0.91199843],
       [-0.77936181,  0.55350459,  0.22585722],
       [ 0.42915195, -0.09571864, -0.33343331],
       [ 1.24883726, -1.10932064, -0.13951662]])

#### 使用 np.newaxis 插入新轴

In [165]:
# 1-D array to which new axes are added
arr = np.arange(3)
arr

array([0, 1, 2])

In [166]:
# Shape before any axis is inserted
arr.shape

(3,)

In [167]:
# Insert a new axis at axis=1 (turns the array into a column)
arr_2d = arr[:, np.newaxis]
arr_2d

array([[0],
       [1],
       [2]])

In [168]:
# Shape after inserting the new axis at position 1
arr_2d.shape

(3, 1)

In [169]:
# Insert a new axis at axis=0 (turns the array into a single row)
arr_2d = arr[np.newaxis, :]
arr_2d

array([[0, 1, 2]])

In [170]:
# Shape after inserting the new axis at position 0
arr_2d.shape

(1, 3)

In [172]:
arr = np.arange(4).reshape((2,2))

# Insert a new axis at axis=1 (hard to picture...)
arr_3d = arr[:, np.newaxis, :]
arr_3d.shape

(2, 1, 2)

In [174]:
# 3x4x5 array of random floats
arr = randn(3,4,5)

# Mean along axis 2
means_axis2 = arr.mean(2)

# Broadcast the means along axis 2 by adding a trailing axis of length 1
demeaned = arr - means_axis2[:, :, np.newaxis]

# Check: the means along axis 2 are now zero up to floating-point error
demeaned.mean(2)

array([[  4.44089210e-17,  -2.22044605e-17,   8.88178420e-17,
         -4.44089210e-17],
       [  2.22044605e-17,  -5.55111512e-17,   1.11022302e-17,
          2.22044605e-17],
       [  8.88178420e-17,   1.11022302e-17,   4.44089210e-17,
          0.00000000e+00]])

### Setting Array Values by Broadcasting

In [180]:
arr = np.zeros((4,3))

# Set array values through indexing
# [:] makes this an operation on the array; otherwise it would be a plain assignment to the name
arr[:] = 5
arr

array([[ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.]])

In [182]:
col = np.array([1,2,3,4])

# Broadcast along axis=1: each value of col fills one whole row
arr[:] = col[:, np.newaxis]
arr

array([[ 1.,  1.,  1.],
       [ 2.,  2.,  2.],
       [ 3.,  3.,  3.],
       [ 4.,  4.,  4.]])

In [184]:
row = np.array([1,2,3])

# Broadcast along axis=0: row is copied into every row of arr
arr[:] = row[np.newaxis, :]
arr

array([[ 1.,  2.,  3.],
       [ 1.,  2.,  3.],
       [ 1.,  2.,  3.],
       [ 1.,  2.,  3.]])

In [187]:
# Assign a scalar to rows 0 and 1
arr[:2] = 5
arr

array([[ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 1.,  2.,  3.],
       [ 1.,  2.,  3.]])

In [189]:
# Assign to rows 0 and 1: one value per row, broadcast across the columns
arr[:2] = [[6],[7]]
arr

array([[ 6.,  6.,  6.],
       [ 7.,  7.,  7.],
       [ 1.,  2.,  3.],
       [ 1.,  2.,  3.]])

In [190]:
# Assign to rows 0 and 1: a full 2x3 block of values
arr[:2] = [[2,3,4], [4,5,6]]
arr

array([[ 2.,  3.,  4.],
       [ 4.,  5.,  6.],
       [ 1.,  2.,  3.],
       [ 1.,  2.,  3.]])

## Advanced ufunc Usage
### ufunc Instance Methods

In [195]:
arr = np.arange(10)

# reduce takes an array and aggregates its values through a series of binary operations;
# for np.add the result matches arr.sum()
np.add.reduce(arr), arr.sum()

(45, 45)