# Lecture 04: Introduction to Numpy

In [2]:
import numpy as np
# import pandas as pd

## Tổng quan

- NumPy (Numerical Python): là một thư viện toán học phổ biến và mạnh mẽ của Python, cho phép làm việc hiệu quả với ma trận và mảng nhiều chiều
- Các phần tử của array phải cùng kiểu dtype (ép kiểu sẽ xảy ra nếu cần thiết)
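- Numpy array là partly mutable: có thể thay đổi giá trị các phần tử đang có, nhưng không thể thay đổi kích thước của array tại chỗ (không có phương thức append/remove như list; các hàm `np.append`, `np.delete` trả về một array mới chứ không sửa array gốc)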

### Tạo numpy array

In [6]:
# 1. Build an array from a Python list
l = [1, 2, 3, 4]
a = np.array(l)

# Show the values, the container type and the element dtype, one per line.
print(a, type(a), a.dtype, sep="\n")

[1 2 3 4]
<class 'numpy.ndarray'>
int64


In [7]:
# 2. Build an array from a tuple (any sequence works the same way as a list)
t = (1, 2, 3, 4)
a = np.array(t)

# Values, container type and element dtype, one per line.
print(a, type(a), a.dtype, sep="\n")

[1 2 3 4]
<class 'numpy.ndarray'>
int64


In [7]:
# Same construction from a shorter list.
t = [1,2,3]
# (scratch idea left disabled: a second list holding a string, t2 = ['a'])
a = np.array(t)

# A bare name on the last line makes the notebook display the array's repr.
a

array([1, 2, 3])

In [8]:
# 3. Build an array with numpy.arange(): the integers 0..9 (the stop value 10 is excluded)
a = np.arange(0, 10)

# Values, container type and element dtype, one per line.
print(a, type(a), a.dtype, sep="\n")

[0 1 2 3 4 5 6 7 8 9]
<class 'numpy.ndarray'>
int64


In [9]:
# 4. Multi-dimensional array: a list of equal-length rows becomes a 2-D array
a = np.array(
    [
        [1, 2, 3],
        [3, 4, 5],
    ]
)

# Values, container type and element dtype, one per line.
print(a, type(a), a.dtype, sep="\n")

[[1 2 3]
 [3 4 5]]
<class 'numpy.ndarray'>
int64


In [10]:
# Shape: a tuple holding the length of each axis.
a.shape

(2, 3)

In [11]:
# Overwriting an existing element works: ndarray contents are mutable in place.
a = np.arange(0, 9)
print(a)

a[0] = 99  # replace the first element
print(a)

[0 1 2 3 4 5 6 7 8]
[99  1  2  3  4  5  6  7  8]


In [3]:
# Try to modify an array's size (not possible in place)
a = np.arange(9)
print(a)

# `del a` would only unbind the name `a`; it raises no error and shows nothing
# about resizing. What actually holds: an ndarray has a fixed size and, unlike
# a list, offers no append/remove methods.
assert not hasattr(a, "append")
assert not hasattr(a, "remove")

# np.delete / np.append exist as functions, but they build a NEW array and
# leave the original untouched.
assert np.delete(a, 0).size == a.size - 1
assert a.size == 9

[0 1 2 3 4 5 6 7 8]


In [13]:
# Mixed bool/int input: the booleans are coerced to integers (True -> 1, False -> 0)
# so that every element shares one dtype.
a = np.array([True, False, 1, 2])
print(a, a.dtype, sep="\n")

[1 0 1 2]
int64


In [16]:
# Mixed int/str input: the integers are coerced to strings so that every
# element shares one (unicode string) dtype.
a = np.array([1, 2, 'A'])
print(a, a.dtype, sep="\n")

['1' '2' 'A']
<U21


### Thao tác với numpy array

#### Lấy thông tin cơ bản

In [16]:
# Sample arrays for the attribute demos: `a` is 1-D, `b` is 2-D (2 rows x 3 columns).
a = np.array([1, 2, 3])
b = np.array(
    [
        [1, 2, 3],
        [3, 4, 5],
    ]
)

In [17]:
# Display both sample arrays, one after the other.
print(a, b, sep="\n")

[1 2 3]
[[1 2 3]
 [3 4 5]]


In [20]:
# Size: total number of elements in each array
print(a.size, b.size, sep="\n")

3
6


In [18]:
# dtype: the single element type shared by all entries of each array
print(a.dtype, b.dtype, sep="\n")

int64
int64


In [22]:
# Display both arrays again before inspecting shape and dimension.
print(a, b, sep="\n")

[1 2 3]
[[1 2 3]
 [3 4 5]]


In [23]:
# Shape
print(a.shape)
print(b.shape)

(3,)
(2, 3)


In [24]:
# Dimension
print(a.ndim)
print(b.ndim)

1
2


#### Indexing & Slicing

##### 1-D array

In [19]:
# 1-D array holding 0..11, used by the indexing examples.
a = np.arange(0, 12)
print(a)

[ 0  1  2  3  4  5  6  7  8  9 10 11]


In [20]:
# First element (index 0)
a[0]

0

In [21]:
# Last element (negative indices count from the end)
a[-1]

11

In [22]:
# First three elements: indices 0, 1, 2 (the stop index 3 is excluded)
a[:3]

array([0, 1, 2])

In [23]:
# From the third element (index 2) through the end
a[2:]

array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [4]:
# 3x4 array, then the sub-block made of rows 0-1 and columns 1-2
# (slice stop indices are exclusive).
a = np.array(
    [
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
    ]
)
print(a, a[:2, 1:3], sep="\n")

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]
[[2 3]
 [6 7]]


In [27]:
# Rows 0-1 and columns 1-2 (slice stop indices are exclusive)
print(a[:2,1:3])

[[2 3]
 [6 7]]


In [28]:
# Number of axes, then the length of each axis.
print(a.ndim, a.shape, sep="\n")

2
(3, 4)


##### n-D array

In [4]:
# Lay the numbers 0..8 out as a 3x3 grid (filled row by row by default).
# Note: try reshape(..., order='F') as well and compare the layout.
a = np.arange(9).reshape(3, 3)
print(a)

[[0 1 2]
 [3 4 5]
 [6 7 8]]


In [33]:
# Element at row 0, column 0
a[0,0]

0

In [34]:
# First row: row 0, every column
a[0,:]

array([0, 1, 2])

In [35]:
# Last row: row -1, every column
a[-1,:]

array([6, 7, 8])

In [36]:
# First column: every row, column 0
a[:,0]

array([0, 3, 6])

In [37]:
# Last column: every row, column -1
a[:,-1]

array([2, 5, 8])

In [38]:
# First and last columns, picked with a list of column indices
a[:,[0,-1]]

array([[0, 2],
       [3, 5],
       [6, 8]])

In [39]:
# Everything: every row and every column
a[:,:]

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

## Array Math

In [40]:
import numpy as np

# Two 2x2 float64 matrices used as operands in the elementwise examples.
x = np.array([[1, 2],
              [3, 4]], dtype=np.float64)
y = np.array([[5, 6],
              [7, 8]], dtype=np.float64)
print(x, y, sep="\n")

[[1. 2.]
 [3. 4.]]
[[5. 6.]
 [7. 8.]]


In [56]:
# Elementwise sum: the `+` operator and np.add give the same array
# [[ 6.0  8.0]
#  [10.0 12.0]]
print(x + y, np.add(x, y), sep="\n")

[[ 6.  8.]
 [10. 12.]]
[[ 6.  8.]
 [10. 12.]]


In [57]:
# Elementwise difference: the `-` operator and np.subtract give the same array
# [[-4.0 -4.0]
#  [-4.0 -4.0]]
print(x - y, np.subtract(x, y), sep="\n")

[[-4. -4.]
 [-4. -4.]]
[[-4. -4.]
 [-4. -4.]]


In [58]:
# Elementwise product (not a matrix product): `*` and np.multiply give the same array
# [[ 5.0 12.0]
#  [21.0 32.0]]
print(x * y, np.multiply(x, y), sep="\n")

[[ 5. 12.]
 [21. 32.]]
[[ 5. 12.]
 [21. 32.]]


In [59]:
# Elementwise division: the `/` operator and np.divide give the same array
# [[ 0.2         0.33333333]
#  [ 0.42857143  0.5       ]]
print(x / y, np.divide(x, y), sep="\n")

[[0.2        0.33333333]
 [0.42857143 0.5       ]]
[[0.2        0.33333333]
 [0.42857143 0.5       ]]


In [63]:
# Division by zero with NumPy arrays does not raise an exception:
# the result is `inf`, and each of the two statements below emits a warning
# (the stray source lines echoed after the output come from those warnings).
a = np.array([0, 0])
b = np.array([1, 1])
print(np.divide(b,a))
print(b/a)

[inf inf]
[inf inf]


  print(np.divide(b,a))
  print(b/a)


In [60]:
# Elementwise square root: applied to each entry of x independently; produces the array
# [[ 1.          1.41421356]
#  [ 1.73205081  2.        ]]
np.sqrt(x)

array([[1.        , 1.41421356],
       [1.73205081, 2.        ]])

In [4]:
import numpy as np

# Integer operands for the dot-product examples: two 2x2 matrices ...
x = np.array([[1, 2],
              [3, 4]])
y = np.array([[5, 6],
              [7, 8]])

# ... and two length-2 vectors.
v = np.array([9, 10])
w = np.array([11, 12])

In [11]:
# Display the two vectors, then the two matrices.
# (By hand, the inner product of v and w is 9*11 + 10*12.)
print(v, w, x, y, sep="\n")

[ 9 10]
[11 12]
[[1 2]
 [3 4]]
[[5 6]
 [7 8]]


![img](https://algebra1course.files.wordpress.com/2013/02/slide10.jpg?w=640)

In [8]:
# Inner product of two vectors: the method form and the function form both give 219
print(v.dot(w), np.dot(v, w), sep="\n")

219
219


In [13]:
# Display the matrix and the vector that are multiplied next.
print(x, v, sep="\n")

[[1 2]
 [3 4]]
[ 9 10]


In [14]:
# Matrix-vector product: both forms give the 1-D (rank 1) array [29 67]
print(x.dot(v), np.dot(x, v), sep="\n")

[29 67]
[29 67]


In [12]:
# Matrix-matrix product: both forms give the 2-D (rank 2) array
# [[19 22]
#  [43 50]]
print(x.dot(y), np.dot(x, y), sep="\n")

[[19 22]
 [43 50]]
[[19 22]
 [43 50]]


In [None]:
# Summing a 2x2 matrix along an axis.
x = np.array([[1, 2],
              [3, 4]])
print(x)

# Variants left disabled:
#   print(np.sum(x))          -> "10"     sum of all elements
#   print(np.sum(x, axis=0))  -> "[4 6]"  sum of each column
print(x.sum(axis=1))  # sum of each row; prints "[3 7]"

In [None]:
# Transposing a 2-D array swaps its rows and columns.
x = np.array([[1, 2],
              [3, 4]])
print(x, x.T, sep="\n")
# Prints:
# [[1 2]
#  [3 4]]
# [[1 3]
#  [2 4]]

In [None]:
# A rank 1 array has a single axis, so transposing it changes nothing:
# both lines below print "[1 2 3]".
v = np.array([1, 2, 3])
print(v, v.T, sep="\n")

In [None]:
# A row vector: "1 2 3" laid out on one line. To behave as a row it must be
# rank 2 with shape (1, 3); only then does transposing change its shape.
row = np.array([[1, 2, 3]])
print(row)

In [None]:
# A column vector: the values 1, 2, 3 stacked one per row, i.e. a rank 2
# array of shape (3, 1) -- the transpose of the row vector [[1, 2, 3]].
col = np.array([[1], [2], [3]])
print(col)

## Vectorization (broadcasting)

In [43]:
import timeit

In [41]:
# 1-D array holding 0..9, used by the vectorization examples.
a = np.arange(0, 10)
print(a)

[0 1 2 3 4 5 6 7 8 9]


In [44]:
# Add a scalar to every element in one vectorized expression

print(a + 10)
# Optional timing of the vectorized form (disabled):
# print(timeit.timeit(setup='import numpy as np',stmt='np.arange(10) + 10'))

# Manual timing alternative (disabled; it would also need `import time`):
# start_time = time.time()
# print("--- %s seconds ---" % (time.time() - start_time))

[10 11 12 13 14 15 16 17 18 19]


In [45]:
# The same operation written in pure Python
print([x + 10 for x in a])
# Optional timing of the pure-Python form (disabled):
# print(timeit.timeit(setup='import numpy as np',stmt='[x + 10 for x in np.arange(10)]'))

# Drawbacks:
# - Longer to write
# - (Much) slower

[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


In [46]:
# Subtract a scalar from every element
print(a - 10)

[-10  -9  -8  -7  -6  -5  -4  -3  -2  -1]


In [47]:
# Multiply every element by a scalar
print(a * 10)

[ 0 10 20 30 40 50 60 70 80 90]


In [48]:
# Power: raise every element to the 10th power (this is a ** 10, not exp(a)).
# Gotcha: the largest result, 9**10 = 3486784401, does not fit in a 32-bit
# integer, so the values shown rely on a 64-bit integer dtype.
print(a ** 10)

[         0          1       1024      59049    1048576    9765625
   60466176  282475249 1073741824 3486784401]


## Masking

In [49]:
# Note: no seed is set, so the elements differ from run to run.
# Draw 10 random integers from the half-open range [-10, 10).
a = np.random.randint(low=-10, high=10, size=10)
print(a)

[ -6   6 -10   6   5  -5   2  -4   3   6]


In [50]:
# Boolean mask: True wherever the element is strictly positive
print(a > 0)

[False  True False  True  True False  True False  True  True]


In [51]:
# Use the mask to filter: keep only the elements where a > 0
a[a > 0]

array([6, 6, 5, 2, 3, 6])

In [52]:
# Use a mask to filter (2): keep only the strictly negative elements
print(a[a < 0])

[ -6 -10  -5  -4]


In [53]:
# Get the even numbers.
# NOTE(review): this cell's code is missing although an output is saved below;
# the expression a[a % 2 == 0] reproduces that saved output.


array([ -6,   6, -10,   6,   2,  -4,   6])

In [54]:
# Get the non-negative numbers.
# NOTE(review): this cell's code is missing although an output is saved below;
# the expression a[a >= 0] reproduces that saved output.


array([6, 6, 5, 2, 3, 6])

In [55]:
# Store a mask in a variable: True wherever the element is greater than 5
mask = a > 5
print(mask)

[False  True False  True False False False False False  True]


In [56]:
# Universal check (all): True only if every element satisfies a > 0
np.all(a > 0)

False

In [57]:
# Existence check (any): True if at least one element satisfies a > 0
np.any(a > 0)

True

# Thống kê mô tả

## 1-D array

In [58]:
# 1-D array holding 0..10, used by the descriptive-statistics examples.
a = np.arange(0, 11)
print(a)

[ 0  1  2  3  4  5  6  7  8  9 10]


In [59]:
# Minimum: the function form and the method form give the same value
print(np.min(a), a.min(), sep="\n")

0
0


In [60]:
# Maximum: the function form and the method form give the same value
print(np.max(a), a.max(), sep="\n")

10
10


In [61]:
# Mean (average): the function form and the method form give the same value
print(np.mean(a), a.mean(), sep="\n")

5.0
5.0


In [62]:
# Median: the middle value of the sorted data
print(np.median(a))

# (topic note: normal distribution)

5.0


In [63]:
# Mean vs. median with and without an outlier: `b` is `b1` plus the value 100.
b1 = np.array([1, 2, 3, 4, 5])
b = np.array([1, 2, 3, 4, 5, 100])

# Means first (with the outlier, then without) ...
print(b.mean(), b1.mean(), sep="\n")

# ... then medians in the same order.
print(np.median(b), np.median(b1), sep="\n")

19.166666666666668
3.0
3.5
3.0


In [None]:
# Standard deviation (population form: ddof=0 is NumPy's default).
# np.std needs the data as its first argument -- a bare np.std() raises
# TypeError -- so pass the array, as the Min/Max/Mean cells do.
print(np.std(a))
print(a.std())

## 2-D array

In [64]:
# The numbers 0..99 laid out as a 10x10 grid (filled row by row).
a = np.arange(100).reshape(10, 10)
print(a)

[[ 0  1  2  3  4  5  6  7  8  9]
 [10 11 12 13 14 15 16 17 18 19]
 [20 21 22 23 24 25 26 27 28 29]
 [30 31 32 33 34 35 36 37 38 39]
 [40 41 42 43 44 45 46 47 48 49]
 [50 51 52 53 54 55 56 57 58 59]
 [60 61 62 63 64 65 66 67 68 69]
 [70 71 72 73 74 75 76 77 78 79]
 [80 81 82 83 84 85 86 87 88 89]
 [90 91 92 93 94 95 96 97 98 99]]


In [65]:
# Mean along axis 0: averages down the rows, giving one value per column
print(np.mean(a, axis=0), a.mean(axis=0), sep="\n")

[45. 46. 47. 48. 49. 50. 51. 52. 53. 54.]
[45. 46. 47. 48. 49. 50. 51. 52. 53. 54.]


In [66]:
# Mean along axis 1: averages across the columns, giving one value per row
print(np.mean(a, axis=1), a.mean(axis=1), sep="\n")

[ 4.5 14.5 24.5 34.5 44.5 54.5 64.5 74.5 84.5 94.5]
[ 4.5 14.5 24.5 34.5 44.5 54.5 64.5 74.5 84.5 94.5]


In [67]:
import numpy as np

# Replace every odd entry of 0..9 with -1 through boolean-mask assignment (in place).
a = np.arange(0, 10)
print(a)
a[a % 2 != 0] = -1
a

[0 1 2 3 4 5 6 7 8 9]


array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

# Practising

In [None]:
'''
Create a 1D array of numbers from 0 to 9
'''

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [None]:
'''
Extract all odd numbers from arr
Replace all odd numbers in arr with -1
'''

[1 3 5 7 9]
[ 0 -1  2 -1  4 -1  6 -1  8 -1]


In [None]:
'''
Convert a 1D array to a 2D array with 2 rows
'''

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

In [None]:
'''
Get the common items between a and b using intersect1d
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])
'''

array([2, 4])

In [None]:
'''
From array a remove all items present in array b using setdiff1d
a = np.array([1,2,3,4,5])
b = np.array([5,6,7,8,9])
'''

array([1, 2, 3, 4])

In [None]:
'''
Get all items between 5 and 10 from a
a = np.array([2, 6, 1, 9, 10, 3, 27])
'''

array([ 6,  9, 10])

In [None]:
'''
Swap columns 1 and 2 in the array arr
arr = np.arange(9).reshape(3,3)
'''

[[0 1 2]
 [3 4 5]
 [6 7 8]]


array([[1, 0, 2],
       [4, 3, 5],
       [7, 6, 8]])

In [None]:
'''
Find the mean, median, standard deviation of iris’s sepallength (1st column)
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])
'''

In [19]:
# NOTE(review): in this notebook `sepallength` is only assigned inside the task
# string above and in commented-out lines below, so on a fresh kernel this cell
# raises NameError -- the loading code needs to be run (uncommented) first.
sepallength

array([5.1, 4.9, 4.7, 4.6, 5. , 5.4, 4.6, 5. , 4.4, 4.9, 5.4, 4.8, 4.8,
       4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1, 5.4, 5.1, 4.6, 5.1, 4.8, 5. ,
       5. , 5.2, 5.2, 4.7, 4.8, 5.4, 5.2, 5.5, 4.9, 5. , 5.5, 4.9, 4.4,
       5.1, 5. , 4.5, 4.4, 5. , 5.1, 4.8, 5.1, 4.6, 5.3, 5. , 7. , 6.4,
       6.9, 5.5, 6.5, 5.7, 6.3, 4.9, 6.6, 5.2, 5. , 5.9, 6. , 6.1, 5.6,
       6.7, 5.6, 5.8, 6.2, 5.6, 5.9, 6.1, 6.3, 6.1, 6.4, 6.6, 6.8, 6.7,
       6. , 5.7, 5.5, 5.5, 5.8, 6. , 5.4, 6. , 6.7, 6.3, 5.6, 5.5, 5.5,
       6.1, 5.8, 5. , 5.6, 5.7, 5.7, 6.2, 5.1, 5.7, 6.3, 5.8, 7.1, 6.3,
       6.5, 7.6, 4.9, 7.3, 6.7, 7.2, 6.5, 6.4, 6.8, 5.7, 5.8, 6.4, 6.5,
       7.7, 7.7, 6. , 6.9, 5.6, 7.7, 6.3, 6.7, 7.2, 6.2, 6.1, 6.4, 7.2,
       7.4, 7.9, 6.4, 6.3, 6.1, 7.7, 6.3, 6.4, 6. , 6.9, 6.7, 6.9, 5.8,
       6.8, 6.7, 6.7, 6.3, 6.5, 6.2, 5.9])

In [21]:
# url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# iris = np.genfromtxt(url, delimiter=',', dtype='object')
# sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])


5.843333333333334
5.8
0.8253012917851409


In [None]:
'''
Compute the maximum for row in the given array.
a = np.random.randint(1,10, [5,3])
'''
# NOTE(review): the task reads as "maximum of each row", which for a 5x3 array
# has 5 values, but the saved output below has 3 values and matches the
# per-column maxima (axis=0) of the printed array -- confirm which is intended.

[[5 5 1]
 [1 1 7]
 [1 9 4]
 [5 5 8]
 [1 8 8]]


array([5, 9, 8])