## ndarray

In [1]:
# ndarray在数据分析中的作用:
# index可以取出我想要的数据
# 加减 exp运算有map的功能
# broadcast有filter功能
# 所有基本统计方法有aggregation的功能
# 遍历尽量少用, filter+map可以取子矩阵计算, 子矩阵之后自然要合并

In [19]:
import numpy as np
# ndarray, n维数组对象, 对于维数的理解 - 图像数据为什么三维 ? 
# 图像本身在几何空间上是二维, 而所有的颜色都是由三原色组成, 所以图像的三维分别是 色*平面空间的二维
# shape的深入理解: 每个维数的个数组成的tuple
# 一维矩阵和二维矩阵的本质理解: 一维矩阵就是向量, 二维矩阵是向量的集合, 虽然集合里可能只有一个向量, 比如 1*3矩阵
# ndarray的作用: 快速的数学计算

# Build ndarrays from (nested) Python lists and inspect their rank (ndim)
# and shape.

# Rank 1: a plain vector built from a flat list.
x = np.array([1, 2, 3])
for template, value in (
    ('ndarray type is : {}', type(x)),
    ('One dimension ndarray dimension is : {}', x.ndim),
    ('One dimension ndarray shape is : {}', x.shape),
):
    print(template.format(value))

# Vector vs. single-row matrix: wrapping the same values in one more pair of
# brackets gives a 2-D array holding exactly one row.
y = np.array([[1, 2, 3]])
for template, value in (
    ('Two dimension single vector ndarray is : {}', y),
    ('Two dimension single vector ndarray dimension is : {}', y.ndim),
    ('Two dimension single vector ndarray shape is : {}', y.shape),
):
    print(template.format(value))

# Rank 2: two rows of three elements.
x = np.array([[1, 2, 3], [4, 5, 6]])
for template, value in (
    ('Two dimension ndarray dimension is : {}', x.ndim),
    ('Two dimension ndarray shape is : {}', x.shape),
):
    print(template.format(value))

# Rank 3: two identical 3x3 planes stacked together (think of a cube of
# numbers, or the image example).
plane = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
x = np.array([plane, plane])
for template, value in (
    ('Three dimension ndarray dimension is : {}', x.ndim),
    ('Three dimension ndarray shape is : {}', x.shape),
    ('Shape of a ndarray is: {}', type(x.shape)),
):
    print(template.format(value))
print(x.dtype)

# Create ndarrays with np.arange, called as (stop), (start, stop) and
# (start, stop, step); the stop value itself is never included.
arange_cases = (
    ('One dimension arranged ndarray dimension is : {}', (10,)),
    ('One dimension arranged start from 5 ndarray dimension is : {}', (5, 10)),
    ('One dimension arranged step 2 ndarray dimension is : {}', (0, 10, 2)),
)
for template, args in arange_cases:
    x = np.arange(*args)
    print(template.format(x))

# Create an ndarray with np.linspace: 5 evenly spaced samples from 1.0 to 3.0,
# both endpoints included.
x = np.linspace(1.0, 3.0, num=5)
print('One dimension arranged step 0.5 ndarray dimension is : {}'.format(x))

# Generate a matrix of random integers.
# A locally seeded Generator keeps this cell reproducible under
# "Restart Kernel -> Run All" and does not touch NumPy's global random state.
rng = np.random.default_rng(42)
x = rng.integers(0, 10, size=(4, 3))  # 4 rows x 3 columns, values drawn from [0, 10)
print('A random 4 row and 3 column array is : {}'.format(x))

# Special matrices: all-zeros and all-ones.
# (An all-ones matrix is not the identity matrix; np.eye builds that one.)
for template, factory in (
    ('A all-zero array is : {}', np.zeros),
    ('A all-one array is : {}', np.ones),
):
    x = factory((4, 3))
    print(template.format(x))

# Two ways to grow a vector by repetition:
#   list * 3   repeats the whole sequence  -> 1 2 3 1 2 3 1 2 3
#   np.repeat  repeats each element        -> 1 1 1 2 2 2 3 3 3
base = [1, 2, 3]
print('The first type repeated ndarray is : {}'.format(np.array(base * 3)))
print('The second type repeated ndarray is : {}'.format(np.repeat(base, 3)))

# Reshaping: either give both the row and the column count, or give only the
# column count and pass -1 so NumPy infers the number of rows (handy for
# turning a vector into a single column).
values = np.arange(0, 100, 5)  # 20 values: 0, 5, ..., 95
x = values.reshape((5, 4))
print('Reshpae with number of row and colunms : {}'.format(x))
x = np.reshape(values, (-1, 5))  # rows inferred: 20 / 5 = 4
print('Reshpae with number of only colunms : {}'.format(x))


ndarray type is : <class 'numpy.ndarray'>
One dimension ndarray dimension is : 1
One dimension ndarray shape is : (3,)
Two dimension single vector ndarray is : [[1 2 3]]
Two dimension single vector ndarray dimension is : 2
Two dimension single vector ndarray shape is : (1, 3)
Two dimension ndarray dimension is : 2
Two dimension ndarray shape is : (2, 3)
Three dimension ndarray dimension is : 3
Three dimension ndarray shape is : (2, 3, 3)
Shape of a ndarray is: <class 'tuple'>
int32
One dimension arranged ndarray dimension is : [0 1 2 3 4 5 6 7 8 9]
One dimension arranged start from 5 ndarray dimension is : [5 6 7 8 9]
One dimension arranged step 2 ndarray dimension is : [0 2 4 6 8]
One dimension arranged step 0.5 ndarray dimension is : [1.  1.5 2.  2.5 3. ]
A random 4 row and 3 column array is : [[7 7 8]
 [1 6 0]
 [3 0 1]
 [4 2 9]]
A all-zero array is : [[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
A all-one array is : [[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]
The first type 

In [3]:
# ndarray index 数据索引和切片
# 数组index的多维版本, numpy行优先, 如何取得一个维度上的所有数据
# boolean数组作为mask, 格式为X1[X2], 称X2为X1的遮罩
# 向量比较标量, ndarray的广播 r > 30, 生成和r相同维的Boolean矩阵

# Exercise 1: build a matrix, then pick out a row, a single element,
# a column and part of a row.
x = np.arange(0, 100, 5).reshape(-1, 5)  # 4 rows x 5 columns
selections = (
    ('First row of Matrix : {}', x[0, :]),
    ('First element of Matrix : {}', x[0, 0]),
    ('First column of Matrix : {}', x[:, 0]),
    ('Part of first row of Matrix : {}', x[0, 2:5]),
)
for template, picked in selections:
    print(template.format(picked))

# Exercise 2: filter a 6x6 matrix, keeping only the elements greater than
# `limit`. Comparing the array with a scalar broadcasts the scalar and yields
# a boolean array of the same shape, which is then used as a mask.
limit = 20
x = np.arange(1, 37).reshape(6, 6)
mask = x > limit
selected = x[mask]  # masking a 2-D array returns the matches as a 1-D ndarray
print('Prove the Mask Matrix is still ndarray {}'.format(type(selected)))
print('Element with Mask to filter : {}'.format(selected))
print('Broadcast Matrix is : {} '.format(mask))

# Exercise 3: cap every element greater than `limit` at `limit`, in place.
x[mask] = limit
print('Updated Broadcast Matrix is : {} '.format(x))

# Exercise 4: how the shape of a boolean mask changes what gets selected.
x = np.arange(1, 7).reshape(2, 3)
y = np.array([[True, False, True],
              [False, True, False]])
for shown in (x, y):
    print(shown)
# A mask with the same shape as the matrix picks individual elements and
# returns them as a flat 1-D array.
print('If the Mask is same shape as Matrix, apply to all elements {}'.format(x[y]))

# A 1-D mask whose length equals the number of rows is applied along axis 0:
# it keeps or drops whole rows (here only the second row survives).
z = np.array([False, True])
print(z)
print('If the Mask is only one row, apply this row to each column of Matrix {}'.format(x[z]))


First row of Matrix : [ 0  5 10 15 20]
First element of Matrix : 0
First column of Matrix : [ 0 25 50 75]
Part of first row of Matrix : [10 15 20]
Prove the Mask Matrix is still ndarray <class 'numpy.ndarray'>
Element with Mask to filter : [21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36]
Broadcast Matrix is : [[False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False  True  True  True  True]
 [ True  True  True  True  True  True]
 [ True  True  True  True  True  True]] 
Updated Broadcast Matrix is : [[ 1  2  3  4  5  6]
 [ 7  8  9 10 11 12]
 [13 14 15 16 17 18]
 [19 20 20 20 20 20]
 [20 20 20 20 20 20]
 [20 20 20 20 20 20]] 
[[1 2 3]
 [4 5 6]]
[[ True False  True]
 [False  True False]]
If the Mask is same shape as Matrix, apply to all elements [1 3 5]
[False  True]
If the Mask is only one row, apply this row to each column of Matrix [[4 5 6]]


In [23]:
# 两个矩阵横向或者纵向叠加
# 矩阵的基本操作: 加减乘, 取得数据类型
# 矩阵的常用统计方法: 平均值, 中位数, 最大最小值, 标准差, argmax, argmin;
# 注意: 矩阵的mean方法(x.mean)和np.mean等价, 都可以通过axis参数求出矩阵以行或列为单位的平均值
# 注意axis: 默认是None(对所有元素求平均); axis=0沿纵向计算(得到每一列的平均值), axis=1沿横向计算(得到每一行的平均值)
# 理解矩阵的切片是取得真实对象, 进而研究如何copy
# 一维矩阵(向量)的转置是它本身, .T不会改变其shape; 需要列向量时使用reshape(-1, 1)

# Exercise 1: stack two 3x3 matrices vertically (6x3) and horizontally (3x6).
x = np.arange(1, 10).reshape(3, 3)
y = np.arange(11, 20).reshape(3, 3)
pair = (x, y)
for stacked in (np.vstack(pair), np.hstack(pair)):
    print(stacked)

# Exercise 2: arithmetic on two 3x3 matrices.
# +, -, * and ** all work element by element; only .dot is the true
# matrix product.
for result in (
    x + y,
    x - y,
    x * y,
    x.dot(y),
    x ** x,
):
    print(result)
# Exercise 3: get the data type of the two 3x3 matrices (no code is provided
# for this exercise).
# Exercise 4: basic statistics methods - sum, min, max, mean, std, argmax,
# argmin.
a = np.array([1, 2, 3, 4, 5])
statistics = (
    ('sum: ', a.sum),
    ('min: ', a.min),
    ('max: ', a.max),
    ('mean: ', a.mean),
    ('std: ', a.std),        # standard deviation
    ('argmax: ', a.argmax),  # index of the largest element
    ('argmin: ', a.argmin),  # index of the smallest element
)
for label, compute in statistics:
    print(label, compute())

# Means along each axis of a 3x2 matrix: axis=0 collapses the rows (one mean
# per column), axis=1 collapses the columns (one mean per row).
x = np.arange(6).reshape(3, 2)
print('orginal matrix', x)
column_means = x.mean(axis=0)
row_means = x.mean(axis=1)
print('mean vector of each column and its shape', column_means, column_means.shape)
print('mean vector of each row and its shape', row_means, row_means.shape)

# Exercise 5: a slice is a view onto the original data, so writing through it
# changes the original matrix; compare that with writing to a copy.
z = x[:2, :2]
print(z)
z.fill(0)  # zeroes the first two rows of x as well
print(x)

x2 = x.copy()
x2.fill(0)  # only the copy is zeroed; x keeps its last row
print(x)

# Exercise 6: transposing a matrix swaps its axes, so (3, 5) becomes (5, 3).
x = np.arange(1, 16).reshape(3, 5)
transposed = x.T
for shown in (x.shape, transposed.shape, transposed):
    print(shown)

# A 1-D array has a single axis, so its transpose is the array itself.
x = np.array([1, 2, 3])
for shown in (x, x.T):
    print(shown)

# any / all reduce a boolean array to a single truth value;
# np.unique returns the sorted distinct values of an array.
x = np.array([[True, False, True],
              [False, True, False]])
for outcome in (
    x.all(),
    x.any(),
    np.unique(x),
    np.unique(np.arange(1, 16).reshape(3, 5)),
):
    print(outcome)

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [11 12 13]
 [14 15 16]
 [17 18 19]]
[[ 1  2  3 11 12 13]
 [ 4  5  6 14 15 16]
 [ 7  8  9 17 18 19]]
[[12 14 16]
 [18 20 22]
 [24 26 28]]
[[-10 -10 -10]
 [-10 -10 -10]
 [-10 -10 -10]]
[[ 11  24  39]
 [ 56  75  96]
 [119 144 171]]
[[ 90  96 102]
 [216 231 246]
 [342 366 390]]
[[        1         4        27]
 [      256      3125     46656]
 [   823543  16777216 387420489]]
sum:  15
min:  1
max:  5
mean:  3.0
std:  1.4142135623730951
argmax:  4
argmin:  0
orginal matrix [[0 1]
 [2 3]
 [4 5]]
mean vector of each column and its shape [2. 3.] (2,)
mean vector of each row and its shape [0.5 2.5 4.5] (3,)
[[0 1]
 [2 3]]
[[0 0]
 [0 0]
 [4 5]]
[[0 0]
 [0 0]
 [4 5]]
(3, 5)
(5, 3)
[[ 1  6 11]
 [ 2  7 12]
 [ 3  8 13]
 [ 4  9 14]
 [ 5 10 15]]
[1 2 3]
[1 2 3]
False
True
[False  True]
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]


## 尽量直接操作大对象: ndarray或者list或者dict等等, 不要使用iterator
基本思路：“一次”在一个复杂对象上进行操作，或者向其应用某个函数，而不是通过在对象的单个元素上循环来进行;

在Python级别上，函数式编程工具map，filter和reduce提供了向量化的手段;

在NumPy级别上，在ndarray对象上进行的循环由经过高度优化的代码负责，大部分代码用C语言编写，远快于纯Python;

In [None]:
# Iterating over a 2-D array walks along axis 0, i.e. one row at a time.
x = np.arange(1, 10).reshape(3, 3)
print(x)
for row in x:
    print(row)

# Row-wise iteration that also exposes the row index.
for i in range(len(x)):
    row = x[i]
    print(i, row)

# Vectorised operation: square every element by operating on the whole
# matrix at once (exercise).
print(np.square(x))

# Walk two ndarrays in lock-step with zip (exercise).
y = np.arange(11, 20).reshape(3, 3)
for i, j in zip(x, y):
    row_sum = i + j
    print(i, j, row_sum)

In [None]:
# 练习题5: 为什么不选B, 明确向量和单行矩阵或者单列矩阵的区别; 一维向量转置后仍是它本身(shape不变)