# 第12章 NumPy高级应用

## ndarray对象的内部机理

In [1]:
%matplotlib inline

In [2]:
# Numerical and tabular libraries for the examples in this notebook.
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# Remember the pandas row-display limit that was in effect before overriding it
# (the visible cells never restore it), then cap printed output at 20 rows.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
# Fixed seed for NumPy's global RNG so the random arrays below are repeatable.
np.random.seed(12345)
import matplotlib.pyplot as plt
# Default matplotlib figure size (width, height).
plt.rc('figure', figsize=(10, 6))
# Print arrays with 4 digits of precision and without scientific notation
# for very small values.
np.set_printoptions(precision=4, suppress=True)

In [3]:
np.ones((10, 5)).shape

(10, 5)

In [4]:
np.ones((3, 4, 5), dtype=np.float64).strides

(160, 40, 8)

### NumPy数据类型体系

In [5]:
ints = np.ones(10, dtype=np.uint16)
floats = np.ones(10, dtype=np.float32)

In [6]:
np.issubdtype(ints.dtype, np.integer)

True

In [7]:
np.issubdtype(floats.dtype, np.floating)

True

In [8]:
np.float64.mro()

[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

## 高级数组操作

### 数组重塑

In [9]:
arr = np.arange(8)
arr

array([0, 1, 2, 3, 4, 5, 6, 7])

In [10]:
arr.reshape((4, 2))

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [11]:
arr.reshape((4, 2)).reshape((2, 4))

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

In [12]:
arr = np.arange(15)

In [13]:
arr.reshape((5, -1))

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [15]:
other_arr = np.ones((3, 5))
other_arr.shape

(3, 5)

In [16]:
arr.reshape(other_arr.shape)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [17]:
arr = np.arange(15).reshape((5, 3))
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [18]:
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [19]:
arr.flatten()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

### C和Fortran顺序

In [20]:
arr = np.arange(12).reshape((3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [21]:
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [22]:
arr.ravel('F')

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

### 数组的合并和拆分

In [23]:
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])

In [24]:
np.concatenate([arr1, arr2], axis=0)

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [25]:
np.concatenate([arr1, arr2], axis=1)

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [26]:
np.vstack((arr1, arr2))

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [27]:
np.hstack((arr1, arr2))

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [28]:
from numpy.random import randn

In [30]:
arr = randn(5, 2)
arr

array([[ 1.0072, -1.2962],
       [ 0.275 ,  0.2289],
       [ 1.3529,  0.8864],
       [-2.0016, -0.3718],
       [ 1.669 , -0.4386]])

In [31]:
first, second, third = np.split(arr, [1, 3])
first

array([[ 1.0072, -1.2962]])

In [32]:
second

array([[0.275 , 0.2289],
       [1.3529, 0.8864]])

In [33]:
third

array([[-2.0016, -0.3718],
       [ 1.669 , -0.4386]])

#### 堆叠辅助类：r_和c_

In [34]:
arr = np.arange(6)

In [35]:
arr1 = arr.reshape((3, 2))
arr2 = randn(3, 2)

In [36]:
np.r_[arr1, arr2]

array([[ 0.    ,  1.    ],
       [ 2.    ,  3.    ],
       [ 4.    ,  5.    ],
       [-0.5397,  0.477 ],
       [ 3.2489, -1.0212],
       [-0.5771,  0.1241]])

In [37]:
np.c_[np.r_[arr1, arr2], arr]

array([[ 0.    ,  1.    ,  0.    ],
       [ 2.    ,  3.    ,  1.    ],
       [ 4.    ,  5.    ,  2.    ],
       [-0.5397,  0.477 ,  3.    ],
       [ 3.2489, -1.0212,  4.    ],
       [-0.5771,  0.1241,  5.    ]])

In [38]:
np.c_[1:6, -10:-5]

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

### 元素的重复操作：tile和repeat

In [39]:
arr = np.arange(3)
arr.repeat(3)

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [40]:
arr.repeat([2, 3, 4])

array([0, 0, 1, 1, 1, 2, 2, 2, 2])

In [42]:
arr = randn(2, 2)
arr

array([[-0.7135, -0.8312],
       [-2.3702, -1.8608]])

In [43]:
arr.repeat(2, axis=0)

array([[-0.7135, -0.8312],
       [-0.7135, -0.8312],
       [-2.3702, -1.8608],
       [-2.3702, -1.8608]])

In [44]:
arr.repeat([2, 3], axis=0)

array([[-0.7135, -0.8312],
       [-0.7135, -0.8312],
       [-2.3702, -1.8608],
       [-2.3702, -1.8608],
       [-2.3702, -1.8608]])

In [45]:
arr.repeat([2, 3], axis=1)

array([[-0.7135, -0.7135, -0.8312, -0.8312, -0.8312],
       [-2.3702, -2.3702, -1.8608, -1.8608, -1.8608]])

In [46]:
arr

array([[-0.7135, -0.8312],
       [-2.3702, -1.8608]])

In [47]:
np.tile(arr, 2)

array([[-0.7135, -0.8312, -0.7135, -0.8312],
       [-2.3702, -1.8608, -2.3702, -1.8608]])

In [48]:
arr

array([[-0.7135, -0.8312],
       [-2.3702, -1.8608]])

In [49]:
np.tile(arr, (2, 1))

array([[-0.7135, -0.8312],
       [-2.3702, -1.8608],
       [-0.7135, -0.8312],
       [-2.3702, -1.8608]])

In [50]:
np.tile(arr, (3, 2))

array([[-0.7135, -0.8312, -0.7135, -0.8312],
       [-2.3702, -1.8608, -2.3702, -1.8608],
       [-0.7135, -0.8312, -0.7135, -0.8312],
       [-2.3702, -1.8608, -2.3702, -1.8608],
       [-0.7135, -0.8312, -0.7135, -0.8312],
       [-2.3702, -1.8608, -2.3702, -1.8608]])

### 花式索引的等价函数：take和put

In [51]:
arr = np.arange(10) * 100

In [52]:
inds = [7, 1, 2, 6]
arr[inds]

array([700, 100, 200, 600])

In [53]:
arr.take(inds)

array([700, 100, 200, 600])

In [54]:
arr.put(inds, 42)
arr

array([  0,  42,  42, 300, 400, 500,  42,  42, 800, 900])

In [56]:
arr.put(inds, [40, 41, 42, 43])
arr

array([  0,  41,  42, 300, 400, 500,  43,  40, 800, 900])

In [57]:
inds = [2, 0, 2, 1]
arr = randn(2, 4)
arr

array([[-0.8608,  0.5601, -1.2659,  0.1198],
       [-1.0635,  0.3329, -2.3594, -0.1995]])

In [58]:
arr.take(inds, axis=1)

array([[-1.2659, -0.8608, -1.2659,  0.5601],
       [-2.3594, -1.0635, -2.3594,  0.3329]])

In [59]:
arr = randn(1000, 50)
inds = np.random.permutation(1000)[:500]

In [60]:
%timeit arr[inds]

The slowest run took 8.69 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 28.5 µs per loop


In [61]:
%timeit arr.take(inds, axis=0)

The slowest run took 8.28 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 24.2 µs per loop


## 广播

In [62]:
arr = np.arange(5)
arr

array([0, 1, 2, 3, 4])

In [63]:
arr * 4

array([ 0,  4,  8, 12, 16])

In [65]:
arr = randn(4, 3)
arr.mean(0)

array([-0.3189,  0.876 , -0.2425])

In [66]:
demeaned = arr - arr.mean(0)
demeaned

array([[ 0.5113, -0.245 ,  1.2217],
       [-1.5325,  0.1133, -0.0899],
       [-0.7542, -0.7432, -0.6367],
       [ 1.7754,  0.875 , -0.4951]])

In [67]:
demeaned.mean(0)

array([ 0., -0.,  0.])

In [68]:
arr

array([[ 0.1924,  0.631 ,  0.9792],
       [-1.8514,  0.9893, -0.3324],
       [-1.0732,  0.1328, -0.8793],
       [ 1.4564,  1.751 , -0.7377]])

In [69]:
row_means = arr.mean(1)
row_means.reshape((4, 1))

array([[ 0.6009],
       [-0.3982],
       [-0.6066],
       [ 0.8233]])

In [70]:
demeaned = arr - row_means.reshape((4, 1))
demeaned.mean(1)

array([-0.,  0.,  0.,  0.])

### 沿其他轴向广播

In [71]:
arr - arr.mean(1)

ValueError: operands could not be broadcast together with shapes (4,3) (4,) 

In [72]:
arr - arr.mean(1).reshape((4, 1))

array([[-0.4085,  0.0302,  0.3783],
       [-1.4533,  1.3875,  0.0658],
       [-0.4666,  0.7393, -0.2727],
       [ 0.6332,  0.9277, -1.5609]])

In [73]:
arr = np.zeros((4, 4))
arr_3d = arr[:, np.newaxis, :]
arr_3d.shape

(4, 1, 4)

In [74]:
arr_1d = np.random.normal(size=3)
arr_1d[:, np.newaxis]

array([[-1.3006],
       [-0.8702],
       [-1.2481]])

In [75]:
arr_1d[np.newaxis, :]

array([[-1.3006, -0.8702, -1.2481]])

In [76]:
arr = randn(3, 4, 5)
depth_means = arr.mean(2)
depth_means

array([[ 0.5728, -0.2739, -0.0869, -0.3664],
       [-0.1327,  0.4137, -0.8679, -0.9513],
       [-0.5004, -0.2417,  0.0965,  0.0331]])

In [77]:
demeaned = arr - depth_means[:, :, np.newaxis]
demeaned.mean(2)

array([[-0.,  0.,  0.,  0.],
       [ 0., -0., -0.,  0.],
       [ 0., -0.,  0.,  0.]])

In [78]:
def demean_axis(arr, axis=0):
    """Subtract from ``arr`` its mean taken along ``axis``.

    Parameters
    ----------
    arr : ndarray
        Input array of any dimensionality.
    axis : int, default 0
        Axis along which the mean is computed and removed.

    Returns
    -------
    ndarray
        Array with the same shape as ``arr`` whose mean along ``axis``
        is zero (up to floating-point rounding).
    """
    means = arr.mean(axis)
    # This generalizes things like [:, :, np.newaxis] to N dimensions:
    # keep every axis as a full slice and re-insert the reduced axis with
    # length 1 so that ``means`` broadcasts against ``arr``.
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    # The index must be a tuple. A plain list of slices is interpreted by
    # current NumPy as a fancy (array) index and raises instead of slicing.
    return arr - means[tuple(indexer)]

### 通过广播设置数组的值

In [79]:
arr = np.zeros((4, 3))
arr[:] = 5
arr

array([[5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.]])

In [80]:
col = np.array([1.28, -0.42, 0.44, 1.6])
arr[:] = col[:, np.newaxis]
arr

array([[ 1.28,  1.28,  1.28],
       [-0.42, -0.42, -0.42],
       [ 0.44,  0.44,  0.44],
       [ 1.6 ,  1.6 ,  1.6 ]])

In [81]:
arr[:2] = [[-1.37], [0.509]]
arr

array([[-1.37 , -1.37 , -1.37 ],
       [ 0.509,  0.509,  0.509],
       [ 0.44 ,  0.44 ,  0.44 ],
       [ 1.6  ,  1.6  ,  1.6  ]])

## ufunc高级应用

### ufunc实例方法

In [82]:
arr = np.arange(10)
np.add.reduce(arr)

45

In [83]:
arr.sum()

45

In [84]:
arr = randn(5, 5)
arr[::2].sort(1) # sort a few rows
arr[:, :-1] < arr[:, 1:]

array([[ True,  True,  True,  True],
       [False, False, False, False],
       [ True,  True,  True,  True],
       [False,  True, False,  True],
       [ True,  True,  True,  True]])

In [85]:
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)

array([ True, False,  True, False,  True])

In [86]:
arr = np.arange(15).reshape((3, 5))
np.add.accumulate(arr, axis=1)

array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]])

In [88]:
arr = np.arange(3).repeat([1, 2, 2])
arr

array([0, 1, 1, 2, 2])

In [89]:
np.multiply.outer(arr, np.arange(5))

array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])

In [90]:
result = np.subtract.outer(randn(3, 4), randn(5))
result.shape

(3, 4, 5)

In [91]:
arr = np.arange(10)
np.add.reduceat(arr, [0, 5, 8])

array([10, 18, 17])

In [92]:
arr = np.multiply.outer(np.arange(4), np.arange(5))
arr

array([[ 0,  0,  0,  0,  0],
       [ 0,  1,  2,  3,  4],
       [ 0,  2,  4,  6,  8],
       [ 0,  3,  6,  9, 12]])

In [93]:
np.add.reduceat(arr, [0, 2, 4], axis=1)

array([[ 0,  0,  0],
       [ 1,  5,  4],
       [ 2, 10,  8],
       [ 3, 15, 12]])

### 自定义ufunc

In [94]:
def add_elements(x, y):
    """Return ``x + y`` for a single pair of operands.

    Plain-Python scalar function that the notebook wraps with
    ``np.frompyfunc`` and ``np.vectorize`` to build element-wise callables.
    """
    total = x + y
    return total

In [95]:
add_them = np.frompyfunc(add_elements, 2, 1)

In [96]:
add_them(np.arange(8), np.arange(8))

array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)

In [97]:
add_them = np.vectorize(add_elements, otypes=[np.float64])

In [98]:
add_them(np.arange(8), np.arange(8))

array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14.])

In [99]:
arr = randn(10000)

In [100]:
%timeit add_them(arr, arr)

100 loops, best of 3: 2.69 ms per loop


In [101]:
%timeit np.add(arr, arr)

The slowest run took 9.14 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 4.69 µs per loop


## 结构化和记录式数组

In [102]:
dtype = [('x', np.float64), ('y', np.int32)]

In [103]:
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr

array([(1.5   ,  6), (3.1416, -2)], dtype=[('x', '<f8'), ('y', '<i4')])

In [104]:
sarr[0]

(1.5, 6)

In [105]:
sarr['x']

array([1.5   , 3.1416])

### 嵌套dtype和多维字段

In [106]:
dtype = [('x', np.int64, 3), ('y', np.int32)]

In [107]:
arr = np.zeros(4, dtype=dtype)
arr

array([([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0)],
      dtype=[('x', '<i8', (3,)), ('y', '<i4')])

In [108]:
arr[0]['x']

array([0, 0, 0])

In [109]:
arr['x']

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [110]:
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]

In [111]:
data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)
data['x']

array([(1., 2.), (3., 4.)], dtype=[('a', '<f8'), ('b', '<f4')])

In [112]:
data['y']

array([5, 6], dtype=int32)

In [113]:
data['x']['a']

array([1., 3.])

### 为什么要用结构化数组

### 结构化数组操作：numpy.lib.recfunctions

## 更多有关排序的话题

In [114]:
arr = randn(6)
arr.sort()
arr

array([-1.2059, -0.9398, -0.4613, -0.1112,  1.3925,  1.559 ])

In [115]:
arr = randn(3, 5)
arr

array([[-0.0525, -1.573 ,  0.3441, -0.3125,  0.3886],
       [ 0.1179, -2.3202,  0.1361,  0.8549, -1.2729],
       [ 1.0333,  0.4769, -0.1995, -0.1466,  1.0717]])

In [116]:
arr[:, 0].sort() # Sort first column values in-place
arr

array([[-0.0525, -1.573 ,  0.3441, -0.3125,  0.3886],
       [ 0.1179, -2.3202,  0.1361,  0.8549, -1.2729],
       [ 1.0333,  0.4769, -0.1995, -0.1466,  1.0717]])

In [117]:
arr = randn(5)
arr

array([-0.4181,  0.8458, -1.7315,  1.2193,  0.4257])

In [118]:
np.sort(arr)

array([-1.7315, -0.4181,  0.4257,  0.8458,  1.2193])

In [119]:
arr

array([-0.4181,  0.8458, -1.7315,  1.2193,  0.4257])

In [120]:
arr = randn(3, 5)
arr

array([[-0.5942,  0.1719, -1.1502,  0.0596, -0.3428],
       [-0.3047,  0.3444,  0.5657, -1.726 ,  1.5161],
       [-0.0289, -1.6442,  1.2785, -1.7491,  0.4361]])

In [121]:
arr.sort(axis=1)
arr

array([[-1.1502, -0.5942, -0.3428,  0.0596,  0.1719],
       [-1.726 , -0.3047,  0.3444,  0.5657,  1.5161],
       [-1.7491, -1.6442, -0.0289,  0.4361,  1.2785]])

In [122]:
arr[:, ::-1]

array([[ 0.1719,  0.0596, -0.3428, -0.5942, -1.1502],
       [ 1.5161,  0.5657,  0.3444, -0.3047, -1.726 ],
       [ 1.2785,  0.4361, -0.0289, -1.6442, -1.7491]])

### 间接排序：argsort和lexsort

In [123]:
values = np.array([5, 0, 1, 3, 2])
indexer = values.argsort()
indexer

array([1, 2, 4, 3, 0])

In [124]:
values[indexer]

array([0, 1, 2, 3, 5])

In [125]:
arr = randn(3, 5)
arr[0] = values
arr

array([[ 5.    ,  0.    ,  1.    ,  3.    ,  2.    ],
       [-1.4193,  0.8068, -0.6691, -0.1278, -0.6888],
       [-1.6339, -0.6925,  1.5735,  1.6112, -0.1446]])

In [126]:
arr[:, arr[0].argsort()]

array([[ 0.    ,  1.    ,  2.    ,  3.    ,  5.    ],
       [ 0.8068, -0.6691, -0.6888, -0.1278, -1.4193],
       [-0.6925,  1.5735, -0.1446,  1.6112, -1.6339]])

In [127]:
first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])
sorter = np.lexsort((first_name, last_name))

In [128]:
# zip() is a lazy iterator on Python 3; materialize it so the cell displays
# the (last_name, first_name) pairs in lexsort order instead of a zip object.
list(zip(last_name[sorter], first_name[sorter]))

[('Arnold', 'Jane'),
 ('Arnold', 'Steve'),
 ('Jones', 'Bill'),
 ('Jones', 'Bob'),
 ('Walters', 'Barbara')]

### 其他排序算法

In [129]:
values = np.array(['2:first', '2:second', '1:first', '1:second', '1:third'])
key = np.array([2, 2, 1, 1, 1])

In [130]:
indexer = key.argsort(kind='mergesort')
indexer

array([2, 3, 4, 0, 1])

In [131]:
values.take(indexer)

array(['1:first', '1:second', '1:third', '2:first', '2:second'],
      dtype='|S8')

### numpy.searchsorted：在有序数组中查找元素

In [132]:
arr = np.array([0, 1, 7, 12, 15])
arr.searchsorted(9)

3

In [133]:
arr.searchsorted([0, 8, 11, 16])

array([0, 3, 3, 5])

In [135]:
arr = np.array([0, 0, 0, 1, 1, 1, 1])
arr.searchsorted([0, 1])

array([0, 3])

In [136]:
arr.searchsorted([0, 1], side='right')

array([3, 7])

In [137]:
data = np.floor(np.random.uniform(0, 10000, size=50))
bins = np.array([0, 100, 1000, 5000, 10000])
data

array([8558., 3982., 9928., 4203., 3824., 5624., 7851., 3748., 1211.,
       4168., 6658., 3211., 1317., 8184., 4455., 5184., 9236., 8889.,
       2287., 5736., 7113., 3716., 6988., 7142., 7262., 7527., 6601.,
       1800., 1593., 1959., 6272., 5857., 1784., 8775.,  510., 9653.,
       6687., 1927., 8836., 5936., 2570., 2704., 1579., 4562., 8765.,
       9615., 5588., 1593., 9239., 3736.])

In [138]:
labels = bins.searchsorted(data)
labels

array([4, 3, 4, 3, 3, 4, 4, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4, 3, 4, 4, 3,
       4, 4, 4, 4, 4, 3, 3, 3, 4, 4, 3, 4, 2, 4, 4, 3, 4, 4, 3, 3, 3, 3,
       4, 4, 4, 3, 4, 3])

In [139]:
Series(data).groupby(labels).mean()

2     510.000000
3    2814.954545
4    7544.592593
dtype: float64

In [140]:
np.digitize(data, bins)

array([4, 3, 4, 3, 3, 4, 4, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4, 3, 4, 4, 3,
       4, 4, 4, 4, 4, 3, 3, 3, 4, 4, 3, 4, 2, 4, 4, 3, 4, 4, 3, 3, 3, 3,
       4, 4, 4, 3, 4, 3])

## NumPy的Matrix类

In [144]:
# Symmetric 4x4 sample matrix used by the matrix examples that follow.
X = np.array([
    [ 8.82768214, 3.82222409, -1.14276475, 2.04411587],
    [ 3.82222409, 6.75272284,  0.83909108, 2.08293758],
    [-1.14276475, 0.83909108,  5.01690521, 0.79573241],
    [ 2.04411587, 2.08293758,  0.79573241, 6.24095859],
])
X[:, 0]  # integer column index -> one-dimensional result

array([ 8.8277,  3.8222, -1.1428,  2.0441])

In [146]:
y = X[:, :1] # two-dimensional by slicing
X

array([[ 8.8277,  3.8222, -1.1428,  2.0441],
       [ 3.8222,  6.7527,  0.8391,  2.0829],
       [-1.1428,  0.8391,  5.0169,  0.7957],
       [ 2.0441,  2.0829,  0.7957,  6.241 ]])

In [147]:
y

array([[ 8.8277],
       [ 3.8222],
       [-1.1428],
       [ 2.0441]])

In [148]:
np.dot(y.T, np.dot(X, y))

array([[1195.468]])

In [149]:
Xm = np.matrix(X)
ym = Xm[:, 0]
Xm

matrix([[ 8.8277,  3.8222, -1.1428,  2.0441],
        [ 3.8222,  6.7527,  0.8391,  2.0829],
        [-1.1428,  0.8391,  5.0169,  0.7957],
        [ 2.0441,  2.0829,  0.7957,  6.241 ]])

In [150]:
ym

matrix([[ 8.8277],
        [ 3.8222],
        [-1.1428],
        [ 2.0441]])

In [151]:
ym.T * Xm * ym

matrix([[1195.468]])

In [152]:
Xm.I * X

matrix([[ 1.,  0., -0.,  0.],
        [-0.,  1., -0.,  0.],
        [ 0., -0.,  1.,  0.],
        [-0.,  0.,  0.,  1.]])

## 高级数组输入输出

### 内存映像文件

In [153]:
# mode='w+' creates the file 'mymmap' (relative to the working directory) or
# overwrites it if it already exists.
# Size on disk: 10000 * 10000 float64 values = 800,000,000 bytes.
# NOTE(review): the variable name matches the stdlib module `mmap`; harmless
# here as long as that module is not imported in this notebook.
mmap = np.memmap('mymmap', dtype='float64', mode='w+', shape=(10000, 10000))
mmap

memmap([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [154]:
section = mmap[:5]

In [155]:
section[:] = np.random.randn(5, 10000)

In [156]:
mmap.flush()
mmap

memmap([[ 0.1643, -0.1085, -0.1769, ..., -1.045 , -0.0592, -0.5496],
        [-0.387 , -1.3732, -0.6648, ...,  1.4576,  1.4388, -0.2837],
        [ 0.7773,  0.3055,  1.3582, ...,  0.9066, -0.5264, -1.4924],
        ...,
        [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
        [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
        [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ]])

In [157]:
del mmap

In [158]:
mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))
mmap

memmap([[ 0.1643, -0.1085, -0.1769, ..., -1.045 , -0.0592, -0.5496],
        [-0.387 , -1.3732, -0.6648, ...,  1.4576,  1.4388, -0.2837],
        [ 0.7773,  0.3055,  1.3582, ...,  0.9066, -0.5264, -1.4924],
        ...,
        [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
        [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
        [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ]])

### HDF5及其他数组存储方式

## 性能建议

### 连续内存的重要性

In [159]:
arr_c = np.ones((1000, 1000), order='C')
arr_f = np.ones((1000, 1000), order='F')

In [160]:
arr_c.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [161]:
arr_f.flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [162]:
arr_f.flags.f_contiguous

True

In [163]:
%timeit arr_c.sum(1)

1000 loops, best of 3: 617 µs per loop


In [164]:
%timeit arr_f.sum(1)

1000 loops, best of 3: 718 µs per loop


In [165]:
arr_f.copy('C').flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [166]:
arr_c[:50].flags.contiguous

True

In [167]:
arr_c[:, :50].flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

### 其他加速手段：Cython、f2py、C

In [173]:
# Cython example, kept inside a string literal because it is not valid Python
# and cannot be executed in a plain notebook cell.
'''
from numpy cimport ndarray, float64_t
def sum_elements(ndarray[float64_t] arr): 
    cdef Py_ssize_t i, n = len(arr)
    cdef float64_t result = 0
    for i in range(n): 
        result += arr[i]
    return result
'''

'\nfrom numpy cimport ndarray, float64_t\ndef sum_elements(ndarray[float64_t] arr): \n    cdef Py_ssize_t i, n = len(arr)\n    cdef float64_t result = 0\n    for i in range(n): \n        result += arr[i]\n    return result\n'