# Numpy实用示例

## 1. 导入numpy包，并查看相关版本

In [1]:
import numpy as np
print(np.__version__)

1.16.2


## 2. 如何创建1维向量？

In [2]:
arr = np.arange(10)
print(arr)

[0 1 2 3 4 5 6 7 8 9]


## 3. 如何创建一个布尔矩阵？

In [4]:
arr = np.full([3,4], True, dtype=bool)
print(arr)

[[ True  True  True  True]
 [ True  True  True  True]
 [ True  True  True  True]]


## 4. 如何从1维整数向量查找奇数？

In [6]:
# Build 0..9 and use a boolean mask to pick out the odd entries.
arr = np.arange(10)
is_odd = (arr % 2) == 1
print(arr)
# The mask itself: True wherever the element is odd.
print(is_odd)
# Boolean indexing keeps only the positions where the mask is True.
print(arr[is_odd])

[0 1 2 3 4 5 6 7 8 9]
[False  True False  True False  True False  True False  True]
[1 3 5 7 9]


## 5. 如何将array中的奇数都替换为-1？

In [7]:
arr = np.arange(10)
arr[arr % 2 == 1] = -1
print(arr)

[ 0 -1  2 -1  4 -1  6 -1  8 -1]


## 6. 如何在不改变原array的前提下，将奇数都替换为-1？

In [9]:
# np.where builds a new array, so ``arr`` itself is left untouched.
arr = np.arange(10)
odd_positions = (arr % 2) == 1
out = np.where(odd_positions, -1, arr)
print(arr)
print(out)

[0 1 2 3 4 5 6 7 8 9]
[ 0 -1  2 -1  4 -1  6 -1  8 -1]


## 7. 如何reshape array?

In [10]:
arr = np.arange(10)
print(arr.reshape([2, 5]))

[[0 1 2 3 4]
 [5 6 7 8 9]]


## 8. 如何将两个array垂直叠加？

In [16]:
# Two 2x5 operands: ``a`` holds 0..9 row by row, ``b`` is filled with ones.
a = np.arange(10).reshape(2, 5)
b = np.repeat(1, 10).reshape(2, 5)
print(b)
# Three equivalent ways of stacking ``b`` underneath ``a`` (along axis 0).
for label, stacked in (
    ('method 1: vstack', np.vstack([a, b])),
    ('method 2: concatenate', np.concatenate([a, b], axis=0)),
    ('method 3: r_', np.r_[a, b]),
):
    print(label)
    print(stacked)

[[1 1 1 1 1]
 [1 1 1 1 1]]
method 1: vstack
[[0 1 2 3 4]
 [5 6 7 8 9]
 [1 1 1 1 1]
 [1 1 1 1 1]]
method 2: concatenate
[[0 1 2 3 4]
 [5 6 7 8 9]
 [1 1 1 1 1]
 [1 1 1 1 1]]
method 3: r_
[[0 1 2 3 4]
 [5 6 7 8 9]
 [1 1 1 1 1]
 [1 1 1 1 1]]


In [21]:
a = np.arange(10).reshape(-1, 2)
a

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

## 9. 如何将两个array水平叠加？

In [22]:
a = np.arange(10).reshape(2, -1)
b = np.repeat(1, 10).reshape(2, -1)
print('method 1: hstack')
print(np.hstack([a, b]))
print('method 2: concatenate')
print(np.concatenate([a, b], axis=1))
print('method 3: c_')
print(np.c_[a, b])

method 1: hstack
[[0 1 2 3 4 1 1 1 1 1]
 [5 6 7 8 9 1 1 1 1 1]]
method 2: concatenate
[[0 1 2 3 4 1 1 1 1 1]
 [5 6 7 8 9 1 1 1 1 1]]
method 3: c_
[[0 1 2 3 4 1 1 1 1 1]
 [5 6 7 8 9 1 1 1 1 1]]


## 10. 如何创建非硬编码的自定义序列？

如array: ```[1, 2, 3]```, 期望输出：```[1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]```

In [24]:
arr = np.arange(1, 4)
print(arr)

[1 2 3]


In [26]:
np.repeat(arr, 3)

array([1, 1, 1, 2, 2, 2, 3, 3, 3])

In [27]:
np.tile(arr, 3)

array([1, 2, 3, 1, 2, 3, 1, 2, 3])

In [28]:
print(np.hstack([np.repeat(arr, 3), np.tile(arr, 3)]))

[1 1 1 2 2 2 3 3 3 1 2 3 1 2 3 1 2 3]


In [29]:
print(np.r_[np.repeat(arr, 3), np.tile(arr, 3)])

[1 1 1 2 2 2 3 3 3 1 2 3 1 2 3 1 2 3]


## 11. 如何从两个array获得共同items？

In [30]:
a = np.array([1, 2, 3, 2, 3, 4, 3, 4, 5, 6])
b = np.array([7, 2, 10, 2, 7, 4, 9, 4, 9, 8])

In [33]:
print(np.intersect1d(a, b))

[2 4]


## 12. 如何从array a移除与array b的共同项？

In [35]:
a = np.array([1, 2, 3, 2, 3, 4, 3, 4, 5, 6])
b = np.array([7, 2, 10, 2, 7, 4, 9, 4, 9, 8])

In [37]:
print(np.setdiff1d(a, b))

[1 3 5 6]


In [38]:
print(np.setdiff1d(b, a))

[ 7  8  9 10]


## 13. 如何获得两个array共有项的位置索引？

In [53]:
a = np.array([1,2,3,2,3,4,3,4,5,6,10])
b = np.array([7,2,10,2,7,4,9,4,6,8,1])

In [54]:
print(np.where(a == b))

(array([1, 3, 5, 7], dtype=int64),)


## 14. 如何从一个array获得给定范围数字的项？

In [55]:
a = np.arange(15)

In [56]:
# 期望获得5 <= items <= 10
# 方法1
print(a[(a >=5) & (a <= 10)])
# 方法2
index = np.where((a >=5) & (a <= 10))
print(a[index])
# 方法3
index = np.logical_and(a >= 5, a <= 10)
print(a[index])

[ 5  6  7  8  9 10]
[ 5  6  7  8  9 10]
[ 5  6  7  8  9 10]


## 15. 如何使得作用于标量的函数，可以用于numpy的array对象？

In [59]:
def maxx(x, y):
    """Return the larger of two scalars; ``x`` is returned when they compare equal."""
    return x if x >= y else y

maxx(1, 5)

5

In [63]:
pair_max = np.vectorize(maxx, otypes=[float])

a = np.array([5, 7, 9, 8, 6, 4, 9])
b = np.array([6, 3, 4, 8, 9, 7, 1])

pair_max(a, b)

array([6., 7., 9., 8., 9., 7., 9.])

In [61]:
maxx(a, b)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

## 16. 如何对换矩阵中的两列？

In [65]:
arr = np.arange(9).reshape(3, 3)
arr

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [71]:
arr[:, [1, 0, 2]]

array([[1, 0, 2],
       [4, 3, 5],
       [7, 6, 8]])

## 17. 如何对换矩阵中的两行？

In [72]:
arr[[1, 0 ,2], :]

array([[3, 4, 5],
       [0, 1, 2],
       [6, 7, 8]])

## 18. 如何翻转矩阵中的行？

In [73]:
arr[::-1]

array([[6, 7, 8],
       [3, 4, 5],
       [0, 1, 2]])

In [100]:
# 或者
arr[::-1,:]

array([[6, 7, 8],
       [3, 4, 5],
       [0, 1, 2]])

## 19. 如何翻转矩阵中的列？

In [105]:
arr

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [104]:
arr[:, ::-1]

array([[2, 1, 0],
       [5, 4, 3],
       [8, 7, 6]])

## 20. 如何生成5行3列的大小5~10的浮点数矩阵？

In [106]:
# 方法1
randarr = np.random.randint(low=5, high=10, size=(5, 3)) + np.random.random(size=(5, 3))
randarr

array([[7.7011429 , 7.93068464, 8.11849224],
       [9.60217948, 9.29352205, 5.27212148],
       [6.40555075, 5.97269465, 5.405912  ],
       [5.94734237, 5.67408459, 7.61719107],
       [9.96166872, 9.92678792, 5.8857366 ]])

In [107]:
# 方法2
randarr = np.random.uniform(low=5, high=10, size=(5, 3))
randarr

array([[7.18452475, 8.27389016, 8.5127937 ],
       [5.36889208, 5.82206409, 7.43290589],
       [6.53318654, 6.5431223 , 6.09156143],
       [8.81242927, 5.21313256, 9.28885935],
       [8.52213076, 5.52807631, 7.18314198]])

## 21. 如何设置浮点型矩阵小数精度为3位？

In [109]:
np.set_printoptions(precision=3)
randarr

array([[7.185, 8.274, 8.513],
       [5.369, 5.822, 7.433],
       [6.533, 6.543, 6.092],
       [8.812, 5.213, 9.289],
       [8.522, 5.528, 7.183]])

## 22. 如何使用小数形式替代科学计数法？

In [111]:
# Create the random array
np.random.seed(100)
rand_arr = np.random.random([3,3])/1e3
rand_arr

array([[5.434e-04, 2.784e-04, 4.245e-04],
       [8.448e-04, 4.719e-06, 1.216e-04],
       [6.707e-04, 8.259e-04, 1.367e-04]])

In [112]:
# Reset printoptions to default
np.set_printoptions(suppress=False)
np.random.seed(100)
rand_arr = np.random.random([3,3])/1e3
rand_arr

array([[5.434e-04, 2.784e-04, 4.245e-04],
       [8.448e-04, 4.719e-06, 1.216e-04],
       [6.707e-04, 8.259e-04, 1.367e-04]])

In [115]:
np.set_printoptions(suppress=True, precision=6)
rand_arr

array([[0.000543, 0.000278, 0.000425],
       [0.000845, 0.000005, 0.000122],
       [0.000671, 0.000826, 0.000137]])

## 23. 如何限制array显示数目？

In [127]:
import sys
a = np.arange(15)
# 新版本的numpy的用法
np.set_printoptions(threshold=sys.maxsize)
# 旧版本的用法
# np.set_printoptions(threshold=np.nan)
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [128]:
np.set_printoptions(threshold=7)
a

array([ 0,  1,  2, ..., 12, 13, 14])

## 24. 如何完整显示array数目？

In [40]:
np.set_printoptions(threshold=6)
a

array([ 0,  1,  2, ..., 12, 13, 14])

In [41]:
# threshold=np.nan was only accepted by old NumPy releases; newer ones reject a
# NaN threshold. sys.maxsize disables summarisation ("...") on every version.
import sys
np.set_printoptions(threshold=sys.maxsize)
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

## 25. 如何导入既含有数字又含有字符串的数据集？

In [137]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')
print(iris[:3])
print(iris.shape)

[[b'5.1' b'3.5' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.9' b'3.0' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.7' b'3.2' b'1.3' b'0.2' b'Iris-setosa']]
(150, 5)


## 26. 如何从1维元组向量获得特定列？（此例为鸢尾花名称）

In [140]:
# Input:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_1d = np.genfromtxt(url, delimiter=',', dtype=None, encoding='utf-8')
print(iris_1d[:2])
print(iris_1d.shape)

# Solution:
species = np.array([row[4] for row in iris_1d])
species[:5]

[(5.1, 3.5, 1.4, 0.2, 'Iris-setosa') (4.9, 3. , 1.4, 0.2, 'Iris-setosa')]
(150,)


array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa'], dtype='<U15')

## 27. 如何将1维元组向量转为2维矩阵？

In [141]:
# Method 1: Convert each row to a list and get the first 4 items
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_1d = np.genfromtxt(url, delimiter=',', dtype=None, encoding='utf-8')
# 将索引为0~3的列转为矩阵
iris_2dfrom1dtuple = np.array([row.tolist()[:4] for row in iris_1d])
print(len(iris_2dfrom1dtuple))
print(iris_2dfrom1dtuple[:4])

print('直接作为矩阵导入')
# Alt Method 2: Import only the first 4 columns from source url
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3], encoding='utf-8')
print(iris_2d[:4])

150
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]]
直接作为矩阵导入
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]]


## 28. 如何计算均值，中位数以及标准差？

In [142]:
print(iris[:4])

[[b'5.1' b'3.5' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.9' b'3.0' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.7' b'3.2' b'1.3' b'0.2' b'Iris-setosa']
 [b'4.6' b'3.1' b'1.5' b'0.2' b'Iris-setosa']]


In [143]:
# 直接通过已经初始化的矩阵获取
sepallength = iris[:, 0].astype(float)
print(sepallength[:10])

# 重新获取数据，且只获取需要的列
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])
print(sepallength[:10])

[5.1 4.9 4.7 ... 5.  4.4 4.9]
[5.1 4.9 4.7 ... 5.  4.4 4.9]


In [144]:
mean, med, sd = np.mean(sepallength), np.median(sepallength), np.std(sepallength)
print(mean, med, sd)

5.843333333333334 5.8 0.8253012917851409


## 29. 如何归一化矩阵中的数据？

归一化: (X - min) / (max - min)

In [146]:
np.set_printoptions(threshold=sys.maxsize)
Smax, Smin = sepallength.max(), sepallength.min()
print('最大值: {0}'.format(Smax))
print('最小值: {0}'.format(Smin))
print('原数据: ')
print(sepallength)
# 方法1
S = (sepallength - Smin)/(Smax - Smin)
print('归一化后数据: ')
print(S)

最大值: 7.9
最小值: 4.3
原数据: 
[5.1 4.9 4.7 4.6 5.  5.4 4.6 5.  4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4 5.1
 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.  5.  5.2 5.2 4.7 4.8 5.4 5.2 5.5 4.9 5.
 5.5 4.9 4.4 5.1 5.  4.5 4.4 5.  5.1 4.8 5.1 4.6 5.3 5.  7.  6.4 6.9 5.5
 6.5 5.7 6.3 4.9 6.6 5.2 5.  5.9 6.  6.1 5.6 6.7 5.6 5.8 6.2 5.6 5.9 6.1
 6.3 6.1 6.4 6.6 6.8 6.7 6.  5.7 5.5 5.5 5.8 6.  5.4 6.  6.7 6.3 5.6 5.5
 5.5 6.1 5.8 5.  5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8 7.1 6.3 6.5 7.6 4.9 7.3
 6.7 7.2 6.5 6.4 6.8 5.7 5.8 6.4 6.5 7.7 7.7 6.  6.9 5.6 7.7 6.3 6.7 7.2
 6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7 6.3 6.4 6.  6.9 6.7 6.9 5.8 6.8
 6.7 6.7 6.3 6.5 6.2 5.9]
归一化后数据: 
[0.222222 0.166667 0.111111 0.083333 0.194444 0.305556 0.083333 0.194444
 0.027778 0.166667 0.305556 0.138889 0.138889 0.       0.416667 0.388889
 0.305556 0.222222 0.388889 0.222222 0.305556 0.222222 0.083333 0.222222
 0.138889 0.194444 0.194444 0.25     0.25     0.111111 0.138889 0.305556
 0.25     0.333333 0.166667 0.194444 0.333333 0.166667 0.027778 0.

In [147]:
# Method 2: divide by the peak-to-peak range (max - min).
# np.ptp is called as a function because the ndarray.ptp() method was removed
# in NumPy 2.0; the function form works on both old and new releases.
S = (sepallength - Smin) / np.ptp(sepallength)
print(S)

[0.222222 0.166667 0.111111 0.083333 0.194444 0.305556 0.083333 0.194444
 0.027778 0.166667 0.305556 0.138889 0.138889 0.       0.416667 0.388889
 0.305556 0.222222 0.388889 0.222222 0.305556 0.222222 0.083333 0.222222
 0.138889 0.194444 0.194444 0.25     0.25     0.111111 0.138889 0.305556
 0.25     0.333333 0.166667 0.194444 0.333333 0.166667 0.027778 0.222222
 0.194444 0.055556 0.027778 0.194444 0.222222 0.138889 0.222222 0.083333
 0.277778 0.194444 0.75     0.583333 0.722222 0.333333 0.611111 0.388889
 0.555556 0.166667 0.638889 0.25     0.194444 0.444444 0.472222 0.5
 0.361111 0.666667 0.361111 0.416667 0.527778 0.361111 0.444444 0.5
 0.555556 0.5      0.583333 0.638889 0.694444 0.666667 0.472222 0.388889
 0.333333 0.333333 0.416667 0.472222 0.305556 0.472222 0.666667 0.555556
 0.361111 0.333333 0.333333 0.5      0.416667 0.194444 0.361111 0.388889
 0.388889 0.527778 0.222222 0.388889 0.555556 0.416667 0.777778 0.555556
 0.611111 0.916667 0.166667 0.833333 0.666667 0.805556 0.6111

In [74]:
# threshold=np.nan was only accepted by old NumPy releases; newer ones reject a
# NaN threshold. sys.maxsize disables summarisation ("...") on every version.
import sys
np.set_printoptions(threshold=sys.maxsize)

## 30. 如何计算softmax的值？

Softmax 在机器学习和深度学习中有着非常广泛的应用。尤其在处理多分类（C > 2）问题，分类器最后的输出单元需要Softmax 函数进行数值处理。

<img src='./image/softmaxf.jpg' />

下图通过softmax进行3分类：

<img src='./image/softmax.jpg' />

实际应用中，使用 Softmax 需要注意数值溢出的问题。因为有指数运算，如果 V 数值很大，经过指数运算后的数值往往可能有溢出的可能。所以，需要对 V 进行一些数值处理：即 V 中的每个元素减去 V 中的最大值。

In [148]:
def softmax(x):
    """Return the softmax of the scores in ``x``.

    The global maximum of ``x`` is subtracted before exponentiation so that
    large scores cannot overflow ``np.exp``; the shift cancels out in the
    ratio. Normalisation is done along axis 0, so a 1-D input sums to 1 and
    a 2-D input is normalised column by column.

    Reference:
    https://stackoverflow.com/questions/34968722/how-to-implement-the-softmax-function-in-python
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=0)

In [151]:
print('原数据: ')
print(sepallength)
print('softmax数据: ')
softmax_array = softmax(sepallength)
print(softmax_array)
print(softmax_array[np.argmax(softmax_array)])

原数据: 
[5.1 4.9 4.7 4.6 5.  5.4 4.6 5.  4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4 5.1
 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.  5.  5.2 5.2 4.7 4.8 5.4 5.2 5.5 4.9 5.
 5.5 4.9 4.4 5.1 5.  4.5 4.4 5.  5.1 4.8 5.1 4.6 5.3 5.  7.  6.4 6.9 5.5
 6.5 5.7 6.3 4.9 6.6 5.2 5.  5.9 6.  6.1 5.6 6.7 5.6 5.8 6.2 5.6 5.9 6.1
 6.3 6.1 6.4 6.6 6.8 6.7 6.  5.7 5.5 5.5 5.8 6.  5.4 6.  6.7 6.3 5.6 5.5
 5.5 6.1 5.8 5.  5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8 7.1 6.3 6.5 7.6 4.9 7.3
 6.7 7.2 6.5 6.4 6.8 5.7 5.8 6.4 6.5 7.7 7.7 6.  6.9 5.6 7.7 6.3 6.7 7.2
 6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7 6.3 6.4 6.  6.9 6.7 6.9 5.8 6.8
 6.7 6.7 6.3 6.5 6.2 5.9]
softmax数据: 
[0.00222  0.001817 0.001488 0.001346 0.002008 0.002996 0.001346 0.002008
 0.001102 0.001817 0.002996 0.001644 0.001644 0.000997 0.00447  0.004044
 0.002996 0.00222  0.004044 0.00222  0.002996 0.00222  0.001346 0.00222
 0.001644 0.002008 0.002008 0.002453 0.002453 0.001488 0.001644 0.002996
 0.002453 0.003311 0.001817 0.002008 0.003311 0.001817 0.001102 0.00222
 0.002008 

## 31. 如何计算array中的百分位数(Percentile)的值？

Percentile：第 q 百分位数是统计中使用的度量，表示数据中有 q% 的观察值小于或等于该值。

In [152]:
print(sepallength)

[5.1 4.9 4.7 4.6 5.  5.4 4.6 5.  4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4 5.1
 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.  5.  5.2 5.2 4.7 4.8 5.4 5.2 5.5 4.9 5.
 5.5 4.9 4.4 5.1 5.  4.5 4.4 5.  5.1 4.8 5.1 4.6 5.3 5.  7.  6.4 6.9 5.5
 6.5 5.7 6.3 4.9 6.6 5.2 5.  5.9 6.  6.1 5.6 6.7 5.6 5.8 6.2 5.6 5.9 6.1
 6.3 6.1 6.4 6.6 6.8 6.7 6.  5.7 5.5 5.5 5.8 6.  5.4 6.  6.7 6.3 5.6 5.5
 5.5 6.1 5.8 5.  5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8 7.1 6.3 6.5 7.6 4.9 7.3
 6.7 7.2 6.5 6.4 6.8 5.7 5.8 6.4 6.5 7.7 7.7 6.  6.9 5.6 7.7 6.3 6.7 7.2
 6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7 6.3 6.4 6.  6.9 6.7 6.9 5.8 6.8
 6.7 6.7 6.3 6.5 6.2 5.9]


补充说明（sigmoid）：当输入小于 -4 时，sigmoid 的输出无限接近 0；当输入大于 4 时，输出无限接近 1。

In [153]:
print(np.percentile(sepallength, q=[5, 96]))

[4.6   7.408]


In [155]:
# help(np.percentile)

## 32. 如何向array随机位置插入值？

如向Iris数据中，插入20个空值

In [157]:
# Input: load every column as Python objects so that np.nan can be stored
# next to the byte strings.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='object')
print(iris_2d.shape[0], iris_2d.shape[1])
# Method 1: enumerate the row/column index of every truthy cell with np.where,
# then sample 20 row indices and 20 column indices from those lists.
iris_2d_1 = iris_2d.copy()
i, j = np.where(iris_2d_1)
print('i是行的索引，每个索引重复列的数量构成')
print(i[:10])
print(i.shape)
print('j是列的索引: 01234重复150次')
print(j[:10])
print(j.shape)
# i and j hold the row and column index of each selected cell; the recorded
# output shows shape (750,), i.e. all 150 x 5 cells.
print('method1')
np.random.seed(100)
# np.random.choice samples with replacement by default, so the same (row, col)
# pair can be drawn twice and fewer than 20 distinct cells may be overwritten.
iris_2d_1[np.random.choice((i), 20), np.random.choice((j), 20)] = np.nan
print(iris_2d_1[:15])

# Method 2: draw the 20 row and 20 column indices directly from the array's
# shape instead of materialising the full index lists first.
print('method2，刚才的那个方法太繁琐了，')
np.random.seed(100)
iris_2d_2 = iris_2d.copy()
iris_2d_2[np.random.randint(iris_2d_2.shape[0], size=20), 
          np.random.randint(iris_2d_2.shape[1], size=20)] = np.nan
print(iris_2d_2[:15])

150 5
i是行的索引，每个索引重复列的数量构成
[0 0 0 0 0 1 1 1 1 1]
(750,)
j是列的索引: 01234重复150次
[0 1 2 3 4 0 1 2 3 4]
(750,)
method1
[[b'5.1' b'3.5' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.9' b'3.0' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.7' b'3.2' b'1.3' b'0.2' b'Iris-setosa']
 [b'4.6' b'3.1' b'1.5' b'0.2' b'Iris-setosa']
 [b'5.0' b'3.6' b'1.4' b'0.2' b'Iris-setosa']
 [b'5.4' b'3.9' b'1.7' b'0.4' b'Iris-setosa']
 [b'4.6' b'3.4' b'1.4' b'0.3' b'Iris-setosa']
 [b'5.0' b'3.4' b'1.5' b'0.2' b'Iris-setosa']
 [b'4.4' b'2.9' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.9' b'3.1' b'1.5' b'0.1' b'Iris-setosa']
 [b'5.4' b'3.7' b'1.5' nan b'Iris-setosa']
 [b'4.8' b'3.4' b'1.6' b'0.2' b'Iris-setosa']
 [b'4.8' b'3.0' b'1.4' b'0.1' b'Iris-setosa']
 [b'4.3' b'3.0' b'1.1' b'0.1' b'Iris-setosa']
 [b'5.8' b'4.0' b'1.2' b'0.2' b'Iris-setosa']]
method2，刚才的那个方法太繁琐了，
[[b'5.1' b'3.5' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.9' b'3.0' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.7' b'3.2' b'1.3' b'0.2' b'Iris-setosa']
 [b'4.6' b'3.1' b'1.5' b'0.2' b'Iris-seto

In [158]:
print(iris_2d_2.shape[0])
print(iris_2d_2.shape[1])
print(np.random.randint(iris_2d_2.shape[1], size=20))

150
5
[3 0 1 0 4 2 0 0 2 2 1 0 2 1 4 2 0 3 3 3]


## 33. 如何在array中找到空值的位置？

In [166]:
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan
print(iris_2d[:, 0])
# Solution
# np.nan
# cell != cell True
print("Number of missing values: \n", np.isnan(iris_2d[:, 0]).sum())
nan_position = np.where(np.isnan(iris_2d[:, 0]))
print("Position of missing values: \n", nan_position)
iris_2d[:, 0][nan_position[0]]

[5.1 4.9 4.7 4.6 5.  5.4 4.6 5.  4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4 5.1
 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.  nan 5.2 5.2 4.7 4.8 5.4 5.2 5.5 4.9 nan
 5.5 4.9 4.4 5.1 5.  4.5 4.4 5.  5.1 4.8 5.1 4.6 5.3 5.  7.  6.4 6.9 5.5
 6.5 5.7 6.3 4.9 6.6 5.2 5.  5.9 6.  6.1 5.6 nan 5.6 5.8 nan 5.6 5.9 6.1
 6.3 6.1 6.4 6.6 6.8 6.7 6.  5.7 5.5 5.5 5.8 6.  5.4 6.  6.7 nan 5.6 5.5
 5.5 6.1 5.8 5.  5.6 5.7 5.7 6.2 5.1 nan 6.3 5.8 7.1 6.3 6.5 7.6 4.9 7.3
 6.7 7.2 6.5 6.4 6.8 5.7 5.8 6.4 6.5 7.7 7.7 6.  6.9 5.6 7.7 6.3 6.7 7.2
 6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7 6.3 6.4 6.  6.9 6.7 6.9 5.8 6.8
 6.7 6.7 6.3 6.5 6.2 5.9]
Number of missing values: 
 6
Position of missing values: 
 (array([26, 35, 65, 68, 87, 99], dtype=int64),)


array([nan, nan, nan, nan, nan, nan])

## 34. 如何基于2个或更多的条件查找数据？

In [91]:
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])

# Filter the rows of iris_2d that has petallength (3rd column) > 1.5 and sepallength (1st column) < 5.0
# Solution
condition = (iris_2d[:, 2] > 1.5) & (iris_2d[:, 0] < 5.0)
iris_2d[condition]

array([[4.8, 3.4, 1.6, 0.2],
       [4.8, 3.4, 1.9, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [4.9, 2.4, 3.3, 1. ],
       [4.9, 2.5, 4.5, 1.7]])

## 35. 如何移除含有空值的行？

In [96]:
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

# Solution
# No direct numpy function for this.
# Method 1:
any_nan_in_row = np.array([~np.any(np.isnan(row)) for row in iris_2d])
print(iris_2d[any_nan_in_row][:5])
print()
# Method 2: 
withnonan = iris_2d[np.sum(np.isnan(iris_2d), axis = 1) == 0]
print(withnonan.shape)
print(withnonan[:5])

withnan = iris_2d[np.sum(np.isnan(iris_2d), axis = 1) > 0]
print(withnan.shape)
print(withnan[:5])

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]

(130, 4)
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
(20, 4)
[[4.4 nan 1.4 0.2]
 [nan 3.7 1.5 0.2]
 [5.  3.  1.6 nan]
 [nan 3.4 1.6 0.4]
 [nan 3.2 1.2 0.2]]


## 36. 如何得到两列的相关系数？

Correlation coef indicates the degree of linear relationship between two numeric variables. <br>
It can range between -1 to +1. <br>
The p-value roughly indicates the probability of an uncorrelated system producing datasets that have a correlation at least as extreme as the one computed.<br>
The lower the p-value (e.g. < 0.01), the stronger the statistical significance of the relationship.<br>
Note that the p-value is not an indicator of the strength of the correlation.

相关系数是最早由统计学家卡尔·皮尔逊设计的统计指标，是研究变量之间线性相关程度的量，一般用字母 r 表示。由于研究对象的不同，相关系数有多种定义方式，较为常用的是皮尔逊相关系数。

相关表和相关图可反映两个变量之间的相互关系及其相关方向，但无法确切地表明两个变量之间相关的程度。相关系数是用以反映变量之间相关关系密切程度的统计指标。相关系数是按积差方法计算，同样以两变量与各自平均值的离差为基础，通过两个离差相乘来反映两变量之间相关程度；着重研究线性的单相关系数。
需要说明的是，皮尔逊相关系数并不是唯一的相关系数，但是最常见的相关系数，以下解释都是针对皮尔逊相关系数。

依据相关现象之间的不同特征，其统计指标的名称有所不同。如将反映两变量间线性相关关系的统计指标称为相关系数（相关系数的平方称为判定系数）；将反映两变量间曲线相关关系的统计指标称为非线性相关系数、非线性判定系数；将反映多元线性相关关系的统计指标称为复相关系数、复判定系数等。

相关关系是一种非确定性的关系，相关系数是研究变量之间线性相关程度的量。由于研究对象的不同，相关系数有如下几种定义方式。<br>
简单相关系数：又叫相关系数或线性相关系数，一般用字母r 表示，用来度量两个变量间的线性关系。

<img src='./image/corelation.jpg' />

其中，Cov(X,Y)为X与Y的协方差，Var[X]为X的方差，Var[Y]为Y的方差

In [167]:
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])

# Find the correlation between 花萼长度：SepalLength(1st column) and 花瓣长度：PetalLength(3rd column) in iris_2d
# Solution 1
print(np.corrcoef(iris[:, 0], iris[:, 2])[0, 1])

# Solution 2: Pearson's r together with its p-value, computed by SciPy.
# Import from the public ``scipy.stats`` namespace; ``scipy.stats.stats`` is a
# deprecated private module.
from scipy.stats import pearsonr
corr, p_value = pearsonr(iris[:, 0], iris[:, 2])
print(corr)

0.8717541573048718
0.8717541573048712


这个结果说明花萼与花瓣的长度还是强相关的。

## 37. 如何判定array含有nan值？

In [103]:
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
# iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan
np.isnan(iris_2d).any()

False

## 38. 如何将array中所有缺失值替换为0？

In [105]:
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

# print(np.isnan(iris_2d))
# Solution
iris_2d[np.isnan(iris_2d)] = 0
iris_2d[:30]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 0. , 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 0. , 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2]])

## 39. 如何从array找出唯一值的数目？

In [41]:
# Import iris keeping the text column intact
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

# Solution
# Extract the species column as an array
species = np.array([row.tolist()[4] for row in iris])

# Get the unique values and the counts
print(np.unique(species, return_counts=True))

(array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'],
      dtype='|S15'), array([50, 50, 50], dtype=int64))


## 40. 如何将作为枚举的整型数字转为类别文字？

问题：
将花瓣长度（第三列）转为文本分类，比如花瓣长度:
<3 : small
3~5 : medium
\>=5 : large

In [45]:
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
print(iris[:, 2][:4])

# Bin petallength 
petal_length_bin = np.digitize(iris[:, 2].astype('float'), [0, 3, 5, 10])
print(petal_length_bin)
# Map it to respective category
label_map = {1: 'small', 2: 'medium', 3: 'large', 4: np.nan}
petal_length_cat = [label_map[x] for x in petal_length_bin]

# View
petal_length_cat[:4]

[b'1.4' b'1.4' b'1.3' b'1.5']
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 3 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 2 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 2 3 2 3 3 2 2 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3
 3 3]


['small', 'small', 'small', 'small']

## 41. 如何基于已经存在的列创建一个新列？

问题：根据下述公式 (pi x petallength x sepal_length^2)/3创建一个新列

In [49]:
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='object')

# Solution
# Compute volume
sepallength = iris_2d[:, 0].astype('float')
petallength = iris_2d[:, 2].astype('float')
volume = (np.pi * petallength * (sepallength**2))/3
print(volume.shape)
print(volume[:4])
# Introduce new dimension to match iris_2d's
volume = volume[:, np.newaxis]
print(volume.shape)
print(volume[:4])
# Add the new column
out = np.hstack([iris_2d, volume])

# View
out[:4]

(150,)
[38.13265163 35.20049849 30.07237208 33.23805027]
(150, 1)
[[38.13265163]
 [35.20049849]
 [30.07237208]
 [33.23805027]]


array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa',
        38.13265162927291],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa',
        35.200498485922445],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa', 30.0723720777127],
       [b'4.6', b'3.1', b'1.5', b'0.2', b'Iris-setosa',
        33.238050274980004]], dtype=object)

## 42. 如何根据概率随机取样？

In [62]:
# Import iris keeping the text column intact
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

# Solution
# Get the species column
species = iris[:, 4]

# 方法 1: Generate Probablistically
np.random.seed(100)
a = np.array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])
species_out = np.random.choice(a, 150, p=[0.5, 0.25, 0.25])
print(np.unique(species_out, return_counts=True))
print()

# 方法 2: Probablistic Sampling (preferred)
np.random.seed(100)
randomarray = np.random.random(150)
probs = np.r_[np.linspace(0, 0.500, num=50), np.linspace(0.501, .750, num=50), np.linspace(.751, 1.0, num=50)]
print(probs[:10])
# 将probs中的数，逐个与np.random.random(150)的成员对比，找到最适合的值的索引，从而构成排序索引矩阵
index = np.searchsorted(probs, randomarray)
print()
print(randomarray[:10])
print()
print(index[:10])
species_out = species[index]
print(np.unique(species_out, return_counts=True))

(array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype='<U15'), array([77, 37, 36], dtype=int64))

[0.         0.01020408 0.02040816 0.03061224 0.04081633 0.05102041
 0.06122449 0.07142857 0.08163265 0.09183673]

[0.54340494 0.27836939 0.42451759 0.84477613 0.00471886 0.12156912
 0.67074908 0.82585276 0.13670659 0.57509333]

[ 59  28  42 119   1  12  84 115  14  65]
(array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'],
      dtype=object), array([77, 37, 36], dtype=int64))


## 43. 以某一列分组之后，如何找到第二大的数？

In [63]:
# Import iris keeping the text column intact
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

# Solution
# Select the petal-length column (index 2) of the rows whose species is Iris-setosa.
petal_len_setosa = iris[iris[:, 4] == b'Iris-setosa', [2]].astype('float')
print(petal_len_setosa)
# np.unique returns the distinct values in ascending order, so [-2] is the
# second-largest distinct petal length.
np.unique(np.sort(petal_len_setosa))[-2]

[1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 1.5 1.6 1.4 1.1 1.2 1.5 1.3 1.4
 1.7 1.5 1.7 1.5 1.  1.7 1.9 1.6 1.6 1.5 1.4 1.6 1.6 1.5 1.5 1.4 1.5 1.2
 1.3 1.5 1.3 1.5 1.3 1.3 1.3 1.6 1.9 1.4 1.6 1.4 1.5 1.4]


1.7

## 44. 如何根据某一列对矩阵排序？

In [64]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
# Sort by column position 0: SepalLength
print(iris[iris[:,0].argsort()][:20])

[[b'4.3' b'3.0' b'1.1' b'0.1' b'Iris-setosa']
 [b'4.4' b'3.2' b'1.3' b'0.2' b'Iris-setosa']
 [b'4.4' b'3.0' b'1.3' b'0.2' b'Iris-setosa']
 [b'4.4' b'2.9' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.5' b'2.3' b'1.3' b'0.3' b'Iris-setosa']
 [b'4.6' b'3.6' b'1.0' b'0.2' b'Iris-setosa']
 [b'4.6' b'3.1' b'1.5' b'0.2' b'Iris-setosa']
 [b'4.6' b'3.4' b'1.4' b'0.3' b'Iris-setosa']
 [b'4.6' b'3.2' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.7' b'3.2' b'1.3' b'0.2' b'Iris-setosa']
 [b'4.7' b'3.2' b'1.6' b'0.2' b'Iris-setosa']
 [b'4.8' b'3.0' b'1.4' b'0.1' b'Iris-setosa']
 [b'4.8' b'3.0' b'1.4' b'0.3' b'Iris-setosa']
 [b'4.8' b'3.4' b'1.9' b'0.2' b'Iris-setosa']
 [b'4.8' b'3.4' b'1.6' b'0.2' b'Iris-setosa']
 [b'4.8' b'3.1' b'1.6' b'0.2' b'Iris-setosa']
 [b'4.9' b'2.4' b'3.3' b'1.0' b'Iris-versicolor']
 [b'4.9' b'2.5' b'4.5' b'1.7' b'Iris-virginica']
 [b'4.9' b'3.1' b'1.5' b'0.1' b'Iris-setosa']
 [b'4.9' b'3.1' b'1.5' b'0.1' b'Iris-setosa']]


## 45. 如何找到矩阵中某列出现频率最多的数值？

In [65]:
# Input:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

# Solution:
vals, counts = np.unique(iris[:, 2], return_counts=True)
print(vals, counts)
print(vals[np.argmax(counts)])

[b'1.0' b'1.1' b'1.2' b'1.3' b'1.4' b'1.5' b'1.6' b'1.7' b'1.9' b'3.0'
 b'3.3' b'3.5' b'3.6' b'3.7' b'3.8' b'3.9' b'4.0' b'4.1' b'4.2' b'4.3'
 b'4.4' b'4.5' b'4.6' b'4.7' b'4.8' b'4.9' b'5.0' b'5.1' b'5.2' b'5.3'
 b'5.4' b'5.5' b'5.6' b'5.7' b'5.8' b'5.9' b'6.0' b'6.1' b'6.3' b'6.4'
 b'6.6' b'6.7' b'6.9'] [ 1  1  2  7 12 14  7  4  2  1  2  2  1  1  1  3  5  3  4  2  4  8  3  5
  4  5  4  8  2  2  2  3  6  3  3  2  2  3  1  1  1  2  1]
b'1.5'


## 46. 如何找到比给定的值大的首次出现的位置?

In [66]:
# Input:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

# Solution: 第四列花瓣宽度 >1.0的第一个位置
np.argwhere(iris[:, 3].astype(float) > 1.0)[0]

array([50], dtype=int64)

## 47. 如何将矩阵中大于或小于某些数值的值，替换为固定值？

In [69]:
# Input: 20 samples drawn uniformly from [1, 50) with a fixed seed; arrays are
# printed with two decimals.
np.set_printoptions(precision=2)
np.random.seed(100)
a = np.random.uniform(1, 50, 20)
print(a)
print()

# Solution 1: np.clip applies the lower and upper bound in a single call.
clipped = np.clip(a, a_min=10, a_max=30)
print(clipped)
print()

# Solution 2: np.where, one bound at a time -- cap at 30 first ...
handle30 = np.where(a > 30, 30, a)
print(handle30)
print()
# ... then raise everything below 10 up to 10.
both_bounds = np.where(a < 10, 10, handle30)
print(both_bounds)
print()

# Compact form: the same two np.where calls, nested.
nested = np.where(a < 10, 10, np.where(a > 30, 30, a))
print(nested)

[27.63 14.64 21.8  42.39  1.23  6.96 33.87 41.47  7.7  29.18 44.67 11.25
 10.08  6.31 11.77 48.95 40.77  9.43 41.   14.43]

[27.63 14.64 21.8  30.   10.   10.   30.   30.   10.   29.18 30.   11.25
 10.08 10.   11.77 30.   30.   10.   30.   14.43]

[27.63 14.64 21.8  30.    1.23  6.96 30.   30.    7.7  29.18 30.   11.25
 10.08  6.31 11.77 30.   30.    9.43 30.   14.43]

[27.63 14.64 21.8  30.   10.   10.   30.   30.   10.   29.18 30.   11.25
 10.08 10.   11.77 30.   30.   10.   30.   14.43]

[27.63 14.64 21.8  30.   10.   10.   30.   30.   10.   29.18 30.   11.25
 10.08 10.   11.77 30.   30.   10.   30.   14.43]


## 48. 如何从给定的array中获取最大的前5个数？

In [93]:
# Input: 20 samples drawn uniformly from [1, 50) with a fixed seed.
np.random.seed(100)
a = np.random.uniform(1, 50, 20)
print(a)
print()

# --- Indices of the 5 largest values, largest first ---
# Solution 1: sort all indices by value, reverse, keep the first five.
top5_idx = a.argsort()[::-1][:5]
print(top5_idx)

# Solution 2: argpartition only guarantees that the first five indices belong
# to the five largest values -- it says nothing about their order -- so the
# five candidates are ordered explicitly afterwards.
candidate_idx = np.argpartition(-a, 5)[:5]
top5_idx_partition = candidate_idx[np.argsort(-a[candidate_idx])]
print(top5_idx_partition)

# --- The 5 largest values themselves ---
# Method 1: full descending sort via argsort.
print(a[a.argsort()[::-1]][:5])
print()
print('升序排序最大的5个数')
# Method 2: full ascending sort, keep the tail.
print(np.sort(a)[-5:])

# Method 3: partition isolates the five largest values in the tail without
# ordering them; sorting just those five makes the "ascending" label true.
print(np.sort(np.partition(a, kth=-5)[-5:]))
print()
print('降序排序最大的5个数')
# Method 4: reuse the explicitly ordered argpartition indices.
print(a[top5_idx_partition])

[27.63 14.64 21.8  42.39  1.23  6.96 33.87 41.47  7.7  29.18 44.67 11.25
 10.08  6.31 11.77 48.95 40.77  9.43 41.   14.43]

[15 10  3  7 18]
[15 10  3  7 18]
[48.95 44.67 42.39 41.47 41.  ]

升序排序最大的5个数
[41.   41.47 42.39 44.67 48.95]
[41.   41.47 42.39 44.67 48.95]

降序排序最大的5个数
[48.95 44.67 42.39 41.47 41.  ]


## 49. 如何以行级别，获取1至10个数出现频次的计数？

In [98]:
# Input: 6 rows of 10 random integers drawn from 1..10.
np.random.seed(100)
arr = np.random.randint(1,11,size=(6, 10))
print(arr)
print()
# Solution
def counts_of_all_values_rowwise(arr2d):
    """Count, row by row, how often each distinct value of the array occurs.

    The columns of the result follow ``np.unique(arr2d)``, i.e. the sorted
    distinct values of the whole array; a value that never occurs anywhere
    in ``arr2d`` gets no column.

    Args:
        arr2d: 2-D array (or any iterable of 1-D rows).

    Returns:
        A list with one inner list per row; entry ``j`` of a row is the
        number of times the ``j``-th distinct value appears in that row
        (0 when it is absent).

    Side effects:
        Prints the per-row ``(values, counts)`` pairs before returning.
    """
    # Unique values and their counts, one (values, counts) pair per row.
    num_counts_array = [np.unique(row, return_counts=True) for row in arr2d]
    print(num_counts_array)
    # Compute the column layout once instead of once per row.
    all_values = np.unique(arr2d).tolist()
    result = []
    for values, counts in num_counts_array:
        # A dict lookup avoids int() on a 1-element array, which NumPy has
        # deprecated, and yields plain Python ints.
        count_of = dict(zip(values.tolist(), counts.tolist()))
        result.append([count_of.get(value, 0) for value in all_values])
    return result

# Print
print(np.arange(1,11))
counts_of_all_values_rowwise(arr)

[[ 9  9  4  8  8  1  5  3  6  3]
 [ 3  3  2  1  9  5  1 10  7  3]
 [ 5  2  6  4  5  5  4  8  2  2]
 [ 8  8  1  3 10 10  4  3  6  9]
 [ 2  1  8  7  3  1  9  3  6  2]
 [ 9  2  6  5  3  9  4  6  1 10]]

[ 1  2  3  4  5  6  7  8  9 10]
[(array([1, 3, 4, 5, 6, 8, 9]), array([1, 2, 1, 1, 1, 2, 2], dtype=int64)), (array([ 1,  2,  3,  5,  7,  9, 10]), array([2, 1, 3, 1, 1, 1, 1], dtype=int64)), (array([2, 4, 5, 6, 8]), array([3, 2, 3, 1, 1], dtype=int64)), (array([ 1,  3,  4,  6,  8,  9, 10]), array([1, 2, 1, 1, 2, 1, 2], dtype=int64)), (array([1, 2, 3, 6, 7, 8, 9]), array([2, 2, 2, 1, 1, 1, 1], dtype=int64)), (array([ 1,  2,  3,  4,  5,  6,  9, 10]), array([1, 1, 1, 1, 1, 2, 2, 1], dtype=int64))]


[[1, 0, 2, 1, 1, 1, 0, 2, 2, 0],
 [2, 1, 3, 0, 1, 0, 1, 0, 1, 1],
 [0, 3, 0, 2, 3, 1, 0, 1, 0, 0],
 [1, 0, 2, 1, 0, 1, 0, 2, 1, 2],
 [2, 2, 2, 0, 0, 1, 1, 1, 1, 0],
 [1, 1, 1, 1, 1, 2, 0, 0, 2, 1]]

## 50. 如何将多维array转换为1维？

In [101]:
# Input: three 1-D arrays of different lengths.
arr1 = np.arange(3)
arr2 = np.arange(3,7)
arr3 = np.arange(7,10)

# The rows are ragged, so ask for an object array explicitly: NumPy >= 1.24
# raises ValueError when it has to guess that from unequal lengths.
array_of_arrays = np.array([arr1, arr2, arr3], dtype=object)
print('array_of_arrays: ', array_of_arrays)
print()
# Solution 1: walk every inner array and collect its elements.
arr_2d = np.array([a for arr in array_of_arrays for a in arr])
print(arr_2d)
print()
# Solution 2: let NumPy join the inner arrays end to end.
arr_2d = np.concatenate(array_of_arrays)
print(arr_2d)

array_of_arrays:  [array([0, 1, 2]) array([3, 4, 5, 6]) array([7, 8, 9])]

[0 1 2 3 4 5 6 7 8 9]

[0 1 2 3 4 5 6 7 8 9]


## 51. 如何将数据转换为one-hot编码？

one-hot编码
分类变量（定性特征）与连续变量（定量特征）。我们训练模型的变量，一般分为这两种形式。以年收入增长率为例，如果取值为0-1之间任意数，则此时变量为连续变量。如果把增长率进行分段处理，表示成如下形式：[0,0.3],(0.3,0.6],(0.6,1]，那么此时变量为分类变量。

特征转换。对于分类变量，建模时要进行转换，通常直接转换为数字。比如将[0,0.3],(0.3,0.6],(0.6,1]表示为0,1,2。原因主要有两点：

1,转换后可以提高模型运算效率。

2,对于一些模型，比如逻辑回归或计算距离时，无法对分类值直接进行计算。

直接转换为数字，也会带来一些问题：

1,转换为数字后，默认为连续变量，违背最初设计，影响效率。

2,转换后的值会影响同一特征在样本中的权重。比如转换为1000和转换为1对模型影响明显不同。

因此，需要更好的编码方式对特征进行转换。

one-hot编码。one-hot编码的定义是用N位状态寄存器来对N个状态进行编码。比如上面的例子[0,0.3],(0.3,0.6],(0.6,1]，有3个分类值，因此N为3，对应的one-hot编码可以表示为100,010,001。

使用步骤。比如用LR算法做模型，在数据处理过程中，可以先对连续变量进行离散化处理，然后对离散化后数据进行one-hot编码，最后放入LR模型中。这样可以增强模型的非线性能力。

In [104]:
# Input:
np.random.seed(101)
arr = np.random.randint(1,4, size=6)
print(arr)
print()

# Solution:
# For m input items with n distinct values the one-hot matrix is m x n.
# Row i holds a single 1, placed in the column given by the position of
# arr[i] among the sorted distinct values; e.g. with distinct values
# [1 2 3] the item 2 sits at position 1 and is encoded as [0 1 0].
def one_hot_encodings(arr):
    """Return the one-hot encoding of a 1-D sequence of labels.

    Args:
        arr: 1-D array-like of labels. The labels need not be the
            integers 1..n; any values accepted by ``np.unique`` work.

    Returns:
        Float array of shape ``(len(arr), n_distinct)`` whose row ``i``
        has a 1 in the column of ``arr[i]``'s position within the sorted
        distinct labels and 0 elsewhere.
    """
    arr = np.asarray(arr)
    # return_inverse maps each item to its column, so the encoding does not
    # depend on the labels being exactly 1..n (the old ``k-1`` indexing did).
    uniqs, columns = np.unique(arr, return_inverse=True)
    out = np.zeros((arr.shape[0], uniqs.shape[0]))
    out[np.arange(arr.shape[0]), columns] = 1
    return out

print(one_hot_encodings(arr))

[2 3 2 2 2 1]

[[0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


## 52. 如何创建基于分类的分组行号？

如基于鸢尾花分类名称，作分组行号

In [105]:
# Input: the species labels of the iris data set, i.e. the column at
# zero-based index 4 of the comma-separated file, read as text.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
species = np.genfromtxt(url, delimiter=',', dtype='str', usecols=4)
# Seed the RNG before sampling so the 20 drawn labels are reproducible.
np.random.seed(100)
# Sort the sample so that equal labels end up next to each other.
species_small = np.sort(np.random.choice(species, size=20))
print(species_small)

['Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica']


In [112]:
# Distinct labels (np.unique returns them sorted) and the size of each group,
# obtained in one pass instead of re-masking the array per label.
# tolist() yields plain Python str/int so the printed lists stay free of
# NumPy scalar wrappers on every NumPy version.
group_labels, group_sizes = np.unique(species_small, return_counts=True)
print(group_labels.tolist())
# Row numbers restart at 1 for each group, groups taken in label order.
print([row_no for size in group_sizes.tolist() for row_no in range(1, size + 1)])

['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
[1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6]


## 53. 如何基于给定的分类创建分组编号？

In [113]:
# Input: the species labels of the iris data set, i.e. the column at
# zero-based index 4 of the comma-separated file, read as text.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
species = np.genfromtxt(url, delimiter=',', dtype='str', usecols=4)
# Seed the RNG before sampling so the 20 drawn labels are reproducible.
np.random.seed(100)
# Sort the sample so that equal labels end up next to each other.
species_small = np.sort(np.random.choice(species, size=20))
print(species_small)

['Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica']


In [133]:
# Solution: np.unique(return_inverse=True) returns, for every element, the
# position of its label among the sorted distinct labels - exactly the group
# id. The ids follow the element order of species_small, so the result is
# also right when the input is not pre-sorted.
uniqs, group_ids = np.unique(species_small, return_inverse=True)
output = group_ids.tolist()
print(output)
print()
# Solution: For Loop version
# Build the label -> id table once rather than searching per element.
label_to_id = {label: idx for idx, label in enumerate(uniqs.tolist())}
print(uniqs)
print()
output = []
for s in species_small.tolist():  # each element, in its original position
    output.append(label_to_id[s])  # groupid

print(output)

[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2]

['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']

[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2]


## 54. 如何对向量作从小到大顺序排名？

In [135]:
# Input: 10 random integers in [0, 20).
np.random.seed(10)
a = np.random.randint(20, size=10)
print('Array: ', a)

# Solution: the argsort of an argsort gives each element's rank (0 = smallest).
order = np.argsort(a)
print(np.argsort(order))

Array:  [ 9  4 15  0 17 16 17  8  9  0]
[4 2 6 0 8 7 9 3 5 1]


## 55. 如何对多维度的array中的成员作从小到大的顺序排名？

In [125]:
# Input: 2x5 matrix of random integers in [0, 20).
np.random.seed(10)
a = np.random.randint(0, 20, size=(2, 5))
print(a)

[[ 9  4 15  0 17]
 [16 17  8  9  0]]


In [132]:
# Solution: rank every element of the matrix, printing each intermediate step.
flat = a.ravel()          # matrix flattened to one dimension
order = flat.argsort()    # indices that would sort the flattened data
ranks = order.argsort()   # argsort of the argsort = rank of each element
print('原矩阵')
print(a)
print('矩阵降为1维向量')
print(flat)
print('给出从小到大数据成员的索引向量')
print(order)
print('对索引向量再从小到大排序，并给出对应的向量，这个就是排名的动作')
print(ranks)
print('reshape为原矩阵的行与列数目')
print(ranks.reshape(a.shape))

原矩阵
[[ 9  4 15  0 17]
 [16 17  8  9  0]]
矩阵降为1维向量
[ 9  4 15  0 17 16 17  8  9  0]
给出从小到大数据成员的索引向量
[3 9 1 7 0 8 2 5 4 6]
对索引向量再从小到大排序，并给出对应的向量，这个就是排名的动作
[4 2 6 0 8 7 9 3 5 1]
reshape为原矩阵的行与列数目
[[4 2 6 0 8]
 [7 9 3 5 1]]


## 56. 如何在一个二维矩阵找到每行的最大值？

In [3]:
# Input: 5x3 matrix of random integers in 1..9.
np.random.seed(100)
a = np.random.randint(1, 10, size=(5, 3))
print(a)

# Solution 1: reduce along axis 1 with the array's own max method.
print(a.max(axis=1))

# Solution 2: call np.max on every row through apply_along_axis.
print(np.apply_along_axis(np.max, 1, a))

[[9 9 4]
 [8 8 1]
 [5 3 6]
 [3 3 3]
 [2 1 9]]
[9 8 6 3 9]
[9 8 6 3 9]


## 57. 如何在一个二维矩阵对每行做到min-by-max？

即对每行求min/max的值

In [5]:
# Input: 5x3 matrix of random integers in 1..9.
np.random.seed(100)
a = np.random.randint(1, 10, size=(5, 3))
print(a)

# Solution: for each row, divide its smallest entry by its largest one.
np.apply_along_axis(lambda row: row.min() / row.max(), 1, a)

[[9 9 4]
 [8 8 1]
 [5 3 6]
 [3 3 3]
 [2 1 9]]


array([0.44444444, 0.125     , 0.5       , 1.        , 0.11111111])

## 58. 如何在array发现重复的记录？

In [8]:
# Input: 10 random integers in [0, 5).
np.random.seed(100)
a = np.random.randint(0, 5, size=10)
print(a)
print()
## Solution
# Start by flagging every position as a duplicate.
out = np.ones(a.shape[0], dtype=bool)
print(out)
# Index of the first occurrence of each distinct value.
unique_positions = np.unique(a, return_index=True)[1]

print()
print(unique_positions)
print()
# First occurrences are not duplicates: clear their flags.
out[unique_positions] = False

print(out)

[0 0 3 0 2 4 2 2 2 2]

[ True  True  True  True  True  True  True  True  True  True]
(array([0, 2, 3, 4]), array([0, 4, 2, 5], dtype=int64))

[0 4 2 5]

[False  True False  True False False  True  True  True  True]


## 59. 如何求分组后的均值？

In [20]:
# Input: the iris data set loaded with dtype='object'; the fields print as
# byte strings in the preview below.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')


# Solution
# No direct way to implement this. Just a version of a workaround.
# Column whose mean is computed for each group.
numeric_column = iris[:, 1].astype('float')  # sepalwidth
# Column that defines the groups; it must contain no missing values and
# have as many rows as the numeric column.
grouping_column = iris[:, 4]  # species

# Preview of the input: the first 10 (species, sepalwidth) pairs.
pairvalue = []
for specie, value in zip(iris[:, 4], iris[:, 1]):
    pairvalue.append([specie, value])
print(np.array(pairvalue)[:10])

# List comprehension version: one [group, mean] pair per distinct species,
# selecting that group's rows with a boolean mask.
print([[group_val, numeric_column[grouping_column==group_val].mean()] for group_val in np.unique(grouping_column)])

# For Loop version
output = []
for group_val in np.unique(grouping_column):
    output.append([group_val, numeric_column[grouping_column==group_val].mean()])

print(output)

[[b'Iris-setosa' b'3.5']
 [b'Iris-setosa' b'3.0']
 [b'Iris-setosa' b'3.2']
 [b'Iris-setosa' b'3.1']
 [b'Iris-setosa' b'3.6']
 [b'Iris-setosa' b'3.9']
 [b'Iris-setosa' b'3.4']
 [b'Iris-setosa' b'3.4']
 [b'Iris-setosa' b'2.9']
 [b'Iris-setosa' b'3.1']]
[[b'Iris-setosa', 3.418], [b'Iris-versicolor', 2.7700000000000005], [b'Iris-virginica', 2.974]]
[[b'Iris-setosa', 3.418], [b'Iris-versicolor', 2.7700000000000005], [b'Iris-virginica', 2.974]]


## 60. 如何将图片转为numpy array?

<img src='https://upload.wikimedia.org/wikipedia/commons/8/8b/Denali_Mt_McKinley.jpg' />

In [24]:
from io import BytesIO
from PIL import Image
import PIL, requests

# Import image from URL (downloads the file over the network)
URL = 'https://upload.wikimedia.org/wikipedia/commons/8/8b/Denali_Mt_McKinley.jpg'
response = requests.get(URL)

# Read it as Image: wrap the downloaded bytes in an in-memory file object
I = Image.open(BytesIO(response.content))

# Optionally resize
I = I.resize([150,150])

# Convert to numpy array
arr = np.asarray(I)
# print(arr[:2])
# Optionally convert it back to an image and show
im = PIL.Image.fromarray(np.uint8(arr))
Image.Image.show(im)

## 61. 如何移除所有array中的空值？

In [25]:
# Keep only the entries that are not NaN.
a = np.array([1, 2, 3, np.nan, 5, 6, 7, np.nan])
a[np.logical_not(np.isnan(a))]

array([1., 2., 3., 5., 6., 7.])

## 62. 如何计算两个array的欧式距离？

欧几里得度量（euclidean metric）（也称欧氏距离）是一个通常采用的距离定义，指在m维空间中两个点之间的真实距离，或者向量的自然长度（即该点到原点的距离）。在二维和三维空间中的欧氏距离就是两点之间的实际距离。

<img src='./image/eudistance.jpg' />

In [26]:
# Input: two points given as coordinate vectors of equal length.
a = np.array([1, 2, 3, 4, 5])
b = np.array([4, 5, 6, 7, 8])

# Solution: the Euclidean distance is the norm of the difference vector.
delta = a - b
dist = np.linalg.norm(delta)
dist

6.708203932499369

## 63. 如何求得一维向量中的局部极大值（峰值）的位置？

In [32]:
a = np.array([1, 3, 7, 1, 2, 6, 0, 1])
# Differences between neighbours: positive while rising, negative while falling.
slopes = np.diff(a)
print(slopes)
# A sign change from +1 to -1 shows up as -2 and marks a peak.
doublediff = np.diff(np.sign(slopes))
print(doublediff)
# doublediff[i] describes element i + 1 of a, hence the shift by one.
peak_locations = np.flatnonzero(doublediff == -2) + 1
peak_locations

[ 2  4 -6  1  4 -6  1]
[ 0 -2  2  0 -2  2]


array([2, 5], dtype=int64)

## 64. 已知一维向量，如何用二维矩阵的每一行分别减去一维向量对应的各个数值？

In [36]:
# Input
a_2d = np.array([[3, 3, 3], [4, 4, 4], [5, 5, 5]])
# Row i of a_2d is reduced by b_1d[i]: row 0 by 1, row 1 by 2, row 2 by 3.
b_1d = np.array([1, 2, 3])

# Reshape b_1d into a column so that it broadcasts across each row.
b_col = b_1d.reshape(-1, 1)
print(b_col)
# Solution
print(a_2d - b_col)

[[1]
 [2]
 [3]]
[[2 2 2]
 [2 2 2]
 [2 2 2]]


## 65. 如何从一个array中找到某个数第N次出现的索引？

In [38]:
# Find the index at which the number 1 occurs for the fifth time.
x = np.array([1, 2, 1, 1, 3, 4, 3, 1, 1, 2, 1, 1, 2])
n = 5

# Solution 1: List comprehension - collect every matching index, take the n-th.
hits = [pos for pos, val in enumerate(x) if val == 1]
print(hits[n - 1])

# Solution 2: Numpy version
print(np.flatnonzero(x == 1)[n - 1])

8
8


## 66. 如何将numpy的日期时间对象转换为python的日期时间对象？

In [39]:
from datetime import datetime

# Input: a numpy datetime64 object
dt64 = np.datetime64('2018-02-25 22:10:10')

# Solution: item() returns the equivalent standard-library datetime
print(dt64.item())

# or cast the scalar to the datetime type
print(dt64.astype(datetime))

2018-02-25 22:10:10
2018-02-25 22:10:10


## 67. 如何计算array的移动平均值？

<img src='./image/moveaverage.jpg' />

In [47]:
# Solution
# Source: https://stackoverflow.com/questions/14313510/how-to-calculate-moving-average-using-numpy
def moving_average(a, n=3):
    """Return the moving average of ``a`` over windows of ``n`` elements.

    Uses the cumulative-sum trick: the sum of a window equals the running
    total at its last element minus the running total just before its first
    element. Every intermediate step is printed so it can be followed.

    Args:
        a: 1-D array-like of numbers.
        n: window length.

    Returns:
        Float array with one mean per full window (``len(a) - n + 1`` values).
    """
    running = np.cumsum(a, dtype=float)
    print('cumsum is')
    print(running)
    # Views onto the running totals: everything from index n on, and
    # everything except the last n entries.
    tail, head = running[n:], running[:-n]
    print('ret[3:] is ret[3:]: {0} - ret[:-n]:{1}'.format(tail, head))
    # The difference is evaluated into a fresh array before being written
    # back, so from index n on each entry becomes a window sum.
    running[n:] = tail - head
    print('ret is {0}'.format(running))
    print('ret[n:] is {0}'.format(tail))
    window_sums = running[n - 1:]
    print('ret[n-1:] is {0}'.format(window_sums))
    return window_sums / n

np.random.seed(100)
Z = np.random.randint(10, size=10)
print('array: ', Z)
# Method 1
print(np.round(moving_average(Z, n=3), 2))

np.set_printoptions(precision=2)
# Method 2:
# np.ones(3)/3 gives equal weights. Use np.ones(4)/4 for window size 4.
kernel = np.ones(3) / 3
print(np.convolve(Z, kernel, mode='valid'))

array:  [8 8 3 7 7 0 4 2 5 2]
cumsum is
[ 8. 16. 19. 26. 33. 33. 37. 39. 44. 46.]
ret[3:] is ret[3:]: [26. 33. 33. 37. 39. 44. 46.] - ret[:-n]:[ 8. 16. 19. 26. 33. 33. 37.]
ret is [ 8. 16. 19. 18. 17. 14. 11.  6. 11.  9.]
ret[n:] is [18. 17. 14. 11.  6. 11.  9.]
ret[n-1:] is [19. 18. 17. 14. 11.  6. 11.  9.]
[6.33 6.   5.67 4.67 3.67 2.   3.67 3.  ]
[6.33 6.   5.67 4.67 3.67 2.   3.67 3.  ]


## 68. 如何创建一个给定开始值，给定等差，给定数量的array？

In [48]:
length = 10
start = 5
step = 3

def seq(start, length, step):
    """Return an arithmetic sequence with exactly ``length`` elements.

    Args:
        start: first value of the sequence.
        length: number of elements to produce (a non-negative integer).
        step: difference between consecutive elements; may be negative,
            zero or fractional.

    Returns:
        1-D array ``[start, start + step, ..., start + (length - 1) * step]``.
    """
    # Scale an integer index instead of calling np.arange(start, end, step):
    # with a fractional step rounding can make arange return one element
    # too many, and a zero step is rejected outright.
    return start + step * np.arange(length)

seq(start, length, step)

array([ 5,  8, 11, 14, 17, 20, 23, 26, 29, 32])

## 69. 如何填充不规则序列的numpy的日期对象？

In [49]:
# Input: every second day from 2018-02-01 up to (not including) 2018-02-25.
dates = np.arange(np.datetime64('2018-02-01'), np.datetime64('2018-02-25'), 2)
print(dates)

# Solution ---------------
# For each date, generate the run of days up to (not including) the next
# known date. The runs are joined with np.concatenate so that gaps of
# different lengths are handled too; stacking them with np.array only works
# when every run happens to have the same length.
filled_in = np.concatenate([np.arange(date, (date+d)) for date, d in zip(dates, np.diff(dates))])

# add the last day
output = np.hstack([filled_in, dates[-1]])
print(output)

# For loop version -------
out = []
for date, d in zip(dates, np.diff(dates)):
    out.append(np.arange(date, (date+d)))

filled_in = np.concatenate(out)

# add the last day
output = np.hstack([filled_in, dates[-1]])
print(output)

['2018-02-01' '2018-02-03' '2018-02-05' '2018-02-07' '2018-02-09'
 '2018-02-11' '2018-02-13' '2018-02-15' '2018-02-17' '2018-02-19'
 '2018-02-21' '2018-02-23']
['2018-02-01' '2018-02-02' '2018-02-03' '2018-02-04' '2018-02-05'
 '2018-02-06' '2018-02-07' '2018-02-08' '2018-02-09' '2018-02-10'
 '2018-02-11' '2018-02-12' '2018-02-13' '2018-02-14' '2018-02-15'
 '2018-02-16' '2018-02-17' '2018-02-18' '2018-02-19' '2018-02-20'
 '2018-02-21' '2018-02-22' '2018-02-23']
['2018-02-01' '2018-02-02' '2018-02-03' '2018-02-04' '2018-02-05'
 '2018-02-06' '2018-02-07' '2018-02-08' '2018-02-09' '2018-02-10'
 '2018-02-11' '2018-02-12' '2018-02-13' '2018-02-14' '2018-02-15'
 '2018-02-16' '2018-02-17' '2018-02-18' '2018-02-19' '2018-02-20'
 '2018-02-21' '2018-02-22' '2018-02-23']


## 70. 如何创建列步长都为1，行步长不定长(strides)的矩阵？

In [58]:
def gen_strides(a, stride_len=5, window_len=5):
    """Stack equally spaced windows of a 1-D array into the rows of a matrix.

    Args:
        a: 1-D array to take the windows from.
        stride_len: distance between the starts of consecutive windows.
        window_len: number of elements in each window.

    Returns:
        Array whose row ``i`` is ``a[i*stride_len : i*stride_len + window_len]``;
        only windows that fit completely inside ``a`` are produced.
    """
    # Number of full windows that fit when stepping stride_len at a time.
    window_count = (a.size - window_len) // stride_len + 1
    starts = np.arange(window_count) * stride_len
    rows = []
    for begin in starts:
        rows.append(a[begin:begin + window_len])
    return np.array(rows)

demo = np.arange(15)
print(gen_strides(demo, stride_len=2, window_len=4))
print()
print(gen_strides(demo, stride_len=4))
print()
print(gen_strides(demo))

[[ 0  1  2  3]
 [ 2  3  4  5]
 [ 4  5  6  7]
 [ 6  7  8  9]
 [ 8  9 10 11]
 [10 11 12 13]]

[[ 0  1  2  3  4]
 [ 4  5  6  7  8]
 [ 8  9 10 11 12]]

[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]]
