# Notes for STATS102, Session 1, Sep 1st, Lecture
Parts of this material are adapted from the textbook
Python Data Science Handbook: Essential Tools for Working with Data, by Jake VanderPlas, O'Reilly Media; 1st edition (December 10, 2016), ISBN-13: 978-1491912058. The full text and code are freely available at https://jakevdp.github.io/PythonDataScienceHandbook/.

# Subarrays as no-copy views

In [1]:
import numpy as np

In [2]:
x2 = np.random.randint(10, size=(3, 4))  # Two-dimensional array
print(x2)


[[7 0 7 7]
 [4 1 6 5]
 [7 9 9 3]]


In [3]:
x2_sub = x2[:2, 1:3]
print(x2_sub)

[[0 7]
 [1 6]]


In [4]:
x2_sub[0, 0] = 99
print(x2_sub)

[[99  7]
 [ 1  6]]


In [5]:
print(x2)

[[ 7 99  7  7]
 [ 4  1  6  5]
 [ 7  9  9  3]]


# Creating copies of arrays

In [6]:
x2_sub_copy = x2[:2, :2].copy()
print(x2_sub_copy)

[[ 7 99]
 [ 4  1]]


In [7]:
x2_sub_copy[0, 0] = 42
print(x2_sub_copy)

[[42 99]
 [ 4  1]]


In [8]:
print(x2)

[[ 7 99  7  7]
 [ 4  1  6  5]
 [ 7  9  9  3]]


# Reshaping

In [9]:
grid = np.arange(1, 13)
print(grid)
grid.ndim

[ 1  2  3  4  5  6  7  8  9 10 11 12]


1

In [11]:
grid.reshape(12,1)

array([[ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10],
       [11],
       [12]])

In [13]:
grid.reshape(1,12)

array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12]])

In [10]:
grid.reshape(12,1).ndim

2

In [14]:
grid.reshape(1,12).ndim

2

# Array Concatenation and Splitting

In [15]:
x = np.array([1, 2, 3])
y = np.array([3, 2, 1])
np.concatenate([x, y])

array([1, 2, 3, 3, 2, 1])

In [16]:
x+y

array([4, 4, 4])

In [17]:
z = [99, 99, 99]
print(np.concatenate([x, y, z]))


[ 1  2  3  3  2  1 99 99 99]


In [18]:
grid = np.array([[1, 2, 3],[4, 5, 6]])
print(grid)

[[1 2 3]
 [4 5 6]]


In [19]:
np.concatenate([grid, grid])

array([[1, 2, 3],
       [4, 5, 6],
       [1, 2, 3],
       [4, 5, 6]])

In [20]:
np.vstack([grid,grid])

array([[1, 2, 3],
       [4, 5, 6],
       [1, 2, 3],
       [4, 5, 6]])

In [21]:
np.concatenate([grid, grid], axis=1)

array([[1, 2, 3, 1, 2, 3],
       [4, 5, 6, 4, 5, 6]])

In [22]:
x = np.array([1, 2, 3])
grid = np.array([[9, 8, 7],
                 [6, 5, 4]])

# vertically stack the arrays
np.vstack([x, grid])

array([[1, 2, 3],
       [9, 8, 7],
       [6, 5, 4]])

In [23]:
# horizontally stack the arrays
y = np.array([[99],
              [99]])
np.hstack([grid, y])

array([[ 9,  8,  7, 99],
       [ 6,  5,  4, 99]])

In [24]:
x = [1, 2, 3, 99, 99, 3, 2, 1]
x1, x2, x3 = np.split(x, [3, 5])
print(x1, x2, x3)

[1 2 3] [99 99] [3 2 1]


In [25]:
x1

array([1, 2, 3])

In [26]:
grid = np.arange(16).reshape((4, 4))
grid

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [28]:
upper, lower = np.vsplit(grid, [2])
print('upper:\n',upper)
print('lower:\n',lower)

upper:
 [[0 1 2 3]
 [4 5 6 7]]
lower:
 [[ 8  9 10 11]
 [12 13 14 15]]


In [29]:
left, right = np.hsplit(grid, [3])
print('left:\n',left)
print('right:\n',right)

left:
 [[ 0  1  2]
 [ 4  5  6]
 [ 8  9 10]
 [12 13 14]]
right:
 [[ 3]
 [ 7]
 [11]
 [15]]


In [30]:
np.hstack([left,right])

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

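# Optimized computation on arrays of data
Vectorized operations and universal functions (ufuncs): apply an operation to a whole array at once instead of looping over its elements in Python.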

In [32]:
np.random.seed(0)

def compute_reciprocals(values):
    """Return a new float array whose i-th entry is ``1.0 / values[i]``.

    The result is filled one element at a time by an explicit Python loop.

    Parameters
    ----------
    values : indexable sequence supporting ``len()``
        The numbers to invert.

    Returns
    -------
    numpy.ndarray
        Array of length ``len(values)`` with NumPy's default float dtype.
    """
    count = len(values)
    # np.empty leaves the buffer uninitialised; every slot is overwritten below.
    reciprocals = np.empty(count)
    for idx in range(count):
        reciprocals[idx] = 1.0 / values[idx]
    return reciprocals
        
values = np.random.randint(1, 10, size=5)
compute_reciprocals(values)

array([0.16666667, 1.        , 0.25      , 0.25      , 0.125     ])

In [33]:
big_array = np.random.randint(1, 100, size=1000000)
%timeit compute_reciprocals(big_array)

4.25 s ± 962 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [34]:
print(1.0 / big_array)

[0.1        0.01190476 0.04545455 ... 0.01428571 0.01098901 0.01149425]


In [35]:
%timeit (1.0 / big_array)

8.96 ms ± 2.98 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [37]:
x = np.arange(9).reshape((3, 3))
print(x)
2 ** x

[[0 1 2]
 [3 4 5]
 [6 7 8]]


array([[  1,   2,   4],
       [  8,  16,  32],
       [ 64, 128, 256]], dtype=int32)

In [38]:
x = np.arange(4)
print("x     =", x)
print("x + 5 =", x + 5)
print("x - 5 =", x - 5)
print("x * 2 =", x * 2)
print("x / 2 =", x / 2)
print("x // 2 =", x // 2)  # int(division)
print("-x     = ", -x)
print("x ** 2 = ", x ** 2)
print("x % 2  = ", x % 2)

x     = [0 1 2 3]
x + 5 = [5 6 7 8]
x - 5 = [-5 -4 -3 -2]
x * 2 = [0 2 4 6]
x / 2 = [0.  0.5 1.  1.5]
x // 2 = [0 0 1 1]
-x     =  [ 0 -1 -2 -3]
x ** 2 =  [0 1 4 9]
x % 2  =  [0 1 0 1]


In [None]:
x+2

In [39]:
x = np.array([-2, -1, 0, 1, 2])
abs(x)

array([2, 1, 0, 1, 2])

In [40]:
x = np.array([3 - 4j, 4 - 3j, 2 + 0j, 0 + 1j])
np.abs(x)

array([5., 5., 2., 1.])

# Trigonometric functions

In [43]:
theta = np.linspace(0, np.pi, 3)
theta

array([0.        , 1.57079633, 3.14159265])

In [41]:
np.linspace?

In [44]:
print("theta      = ", theta)
print("sin(theta) = ", np.sin(theta))
print("cos(theta) = ", np.cos(theta))
print("tan(theta) = ", np.tan(theta))

theta      =  [0.         1.57079633 3.14159265]
sin(theta) =  [0.0000000e+00 1.0000000e+00 1.2246468e-16]
cos(theta) =  [ 1.000000e+00  6.123234e-17 -1.000000e+00]
tan(theta) =  [ 0.00000000e+00  1.63312394e+16 -1.22464680e-16]


In [None]:
x = [-1, 0, 1]
print("x         = ", x)
print("arcsin(x) = ", np.arcsin(x))
print("arccos(x) = ", np.arccos(x))
print("arctan(x) = ", np.arctan(x))

In [45]:
x = [1, 2, 3]
print("x     =", x)
print("e^x   =", np.exp(x))
print("2^x   =", np.exp2(x))
print("3^x   =", np.power(3, x))

x     = [1, 2, 3]
e^x   = [ 2.71828183  7.3890561  20.08553692]
2^x   = [2. 4. 8.]
3^x   = [ 3  9 27]


In [None]:
x = [1, 2, 4, 10]
print("x        =", x)
print("ln(x)    =", np.log(x))
print("log2(x)  =", np.log2(x))
print("log10(x) =", np.log10(x))

In [34]:
x = np.arange(5)
print(x)
y = np.empty(5)
print(y)
#y = np.empty(5,dtype=int)
np.multiply(x, 11, out=y)
print(y)
z=np.multiply(x, 11)
print(z)

[0 1 2 3 4]
[ 0. 11. 22. 33. 44.]
[ 0 11 22 33 44]
[ 0 11 22 33 44]


In [35]:
x

array([0, 1, 2, 3, 4])

In [36]:
x = np.arange(1, 6)
print(x)
np.multiply.reduce(x)

[1 2 3 4 5]


120

In [37]:
np.add.reduce(x)

15

In [38]:
N=5
n=np.arange(1,N+1)
np.multiply.reduce(n)


120

In [39]:
np.add.reduce(x)

15

In [40]:
x.sum()

15

In [41]:
x

array([1, 2, 3, 4, 5])

In [42]:
np.multiply.accumulate(x)


array([  1,   2,   6,  24, 120], dtype=int32)

In [43]:
x = np.arange(1, 6)
np.multiply.accumulate(x)
# entry i holds the running product 1*2*...*(i+1), i.e. (i+1)!

array([  1,   2,   6,  24, 120], dtype=int32)

In [53]:
x = np.arange(1, 6)
print(x)
z=np.multiply.outer(x, x)
print(z)


[1 2 3 4 5]
[[ 1  2  3  4  5]
 [ 2  4  6  8 10]
 [ 3  6  9 12 15]
 [ 4  8 12 16 20]
 [ 5 10 15 20 25]]


# Summing the Values in an Array, Minimum and Maximum

In [44]:
L = np.random.random(10000)
sum(L)/len(L)


0.4956820292713477

In [45]:
np.random.random(100)

array([0.00271255, 0.87779441, 0.57395988, 0.29094769, 0.1236817 ,
       0.43395555, 0.98715109, 0.2006742 , 0.80967959, 0.7731717 ,
       0.40583617, 0.59243481, 0.8227229 , 0.27078305, 0.76452158,
       0.14643055, 0.03728   , 0.85619935, 0.07704888, 0.9992665 ,
       0.75108109, 0.29972499, 0.78877915, 0.62663557, 0.97940721,
       0.15483318, 0.27440339, 0.49781792, 0.54728203, 0.41122882,
       0.98206403, 0.70512247, 0.77906407, 0.72950852, 0.31332817,
       0.9646116 , 0.07874534, 0.36149224, 0.24959854, 0.29480116,
       0.6761102 , 0.90687565, 0.32430658, 0.28764998, 0.20262678,
       0.02493125, 0.23496332, 0.21541711, 0.56361281, 0.01402005,
       0.44123681, 0.6182407 , 0.63579684, 0.69439923, 0.92463139,
       0.37391577, 0.07761841, 0.94264377, 0.32763096, 0.7442008 ,
       0.24746723, 0.42832028, 0.60974124, 0.31064725, 0.90543289,
       0.00749058, 0.43053254, 0.10416977, 0.46554156, 0.23248306,
       0.79348931, 0.54823164, 0.71866044, 0.71571695, 0.53408

In [46]:
np.sum(L)

4956.820292713481

In [47]:
big_array = np.random.rand(1000000)
sum(big_array)

500166.91703737207

In [48]:
big_array = np.random.rand(1000000)
%timeit sum(big_array)


154 ms ± 4.33 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [49]:
%timeit np.sum(big_array)

1.21 ms ± 42.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
%timeit min(big_array)


In [None]:
%timeit np.min(big_array)

In [None]:
%timeit [big_array.min(), big_array.max(), big_array.sum()]

In [51]:
M = np.random.random((3, 4))
print(M)

[[0.33543209 0.67279492 0.30320283 0.72828165]
 [0.21676906 0.10977485 0.81790623 0.3228101 ]
 [0.35036929 0.73649026 0.84302586 0.19203221]]


In [52]:
np.sum(M)

5.628889351502403

In [53]:
M.sum()

5.628889351502403

In [54]:
M.sum(axis=0)

array([0.90257045, 1.51906002, 1.96413492, 1.24312396])

In [55]:
M.sum(axis=1)

array([2.03971149, 1.46726024, 2.12191762])

In [56]:
M.mean(axis=0)

array([0.30085682, 0.50635334, 0.65471164, 0.41437465])

# Broadcasting

In [57]:
a = np.array([0, 1, 2])
a + np.array([5,5])

ValueError: operands could not be broadcast together with shapes (3,) (2,) 

In [58]:
M = np.ones((3, 3))
M

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [59]:
a

array([0, 1, 2])

In [60]:
M + a

array([[1., 2., 3.],
       [1., 2., 3.],
       [1., 2., 3.]])

In [61]:
a = np.arange(4)
b = np.arange(6).reshape(3,2)

print(a)
print(b)

[0 1 2 3]
[[0 1]
 [2 3]
 [4 5]]


In [62]:
a + b

ValueError: operands could not be broadcast together with shapes (4,) (3,2) 

In [63]:
M = np.ones((2, 3))
M


array([[1., 1., 1.],
       [1., 1., 1.]])

In [64]:
a = np.arange(3)

a

array([0, 1, 2])

In [65]:
M+a

array([[1., 2., 3.],
       [1., 2., 3.]])

example application of broadcasting

In [66]:
M = np.ones((2, 3))
a = np.arange(3)
M + a

array([[1., 2., 3.],
       [1., 2., 3.]])

In [67]:
a = np.arange(3).reshape((3, 1))
b = np.arange(3)
a + b

array([[0, 1, 2],
       [1, 2, 3],
       [2, 3, 4]])

In [None]:
a

In [None]:
b

In [None]:
M = np.ones((2, 3))
a = np.arange(3)
M + a

In [68]:
M = np.ones((2, 3),dtype=int)
M

array([[1, 1, 1],
       [1, 1, 1]])

In [69]:
a

array([[0],
       [1],
       [2]])

In [70]:
a.reshape(1,3)

array([[0, 1, 2]])

In [71]:
M

array([[1, 1, 1],
       [1, 1, 1]])

In [72]:
X = np.random.random((10, 3))
Xmean = X.mean(axis=0)
Xmean

array([0.51269544, 0.51818748, 0.49792282])

In [75]:
X

array([[0.565991  , 0.97430803, 0.64088105],
       [0.41732328, 0.95960926, 0.63491114],
       [0.42424131, 0.03168592, 0.41201771],
       [0.51928529, 0.47272461, 0.18317717],
       [0.85401803, 0.92774505, 0.92775868],
       [0.97609956, 0.45962474, 0.32467649],
       [0.24194972, 0.91720064, 0.65022195],
       [0.67436093, 0.35832047, 0.25073414],
       [0.23261626, 0.02980692, 0.12396944],
       [0.22106905, 0.05084914, 0.83088038]])

In [73]:
Xmean = X.mean(axis=1)
Xmean

array([0.72706003, 0.67061456, 0.28931498, 0.39172903, 0.90317392,
       0.58680026, 0.60312411, 0.42780518, 0.12879754, 0.36759952])

In [74]:
a

array([[0],
       [1],
       [2]])

In [76]:
X.mean(0)

array([0.51269544, 0.51818748, 0.49792282])

In [77]:
np.sum(X[:,0]-0.6745)

-1.6180455775624147

In [78]:
X_centered = X - X.mean(0)
X_centered.mean(0)

array([-1.11022302e-16,  6.66133815e-17,  6.66133815e-17])

# Comparisons, Masks, and Boolean Logic

In [93]:
x = np.array([1, 2, 3, 4, 5])

In [81]:
(x < 3)

array([ True,  True, False, False, False])

In [82]:
x > 3  # greater than

array([False, False, False,  True,  True])

In [83]:
x <= 3  # less than or equal

array([ True,  True,  True, False, False])

In [84]:
x >= 3  # greater than or equal

array([False, False,  True,  True,  True])

In [85]:
x != 3  # not equal

array([ True,  True, False,  True,  True])

In [86]:
x == 3  # equal

array([False, False,  True, False, False])

In [87]:
x = np.random.randint(10, size=(3, 4))
x

array([[0, 5, 3, 9],
       [2, 4, 5, 0],
       [6, 5, 2, 3]])

In [81]:
x<6

array([[False,  True,  True,  True],
       [ True,  True,  True,  True],
       [ True, False, False,  True]])

In [82]:
np.sum(x<6)

9

In [83]:
# how many values less than 6?
np.count_nonzero(x < 6)

9

In [84]:
np.sum(x < 6, axis=0)

array([2, 2, 2, 3])

In [85]:
np.sum(x < 6, axis=1)

array([3, 4, 2])

In [92]:
x<8

array([ True,  True,  True,  True,  True])

In [93]:
# are there any values greater than or equal to 8?
np.any(x >= 8)

False

In [94]:
# are all values greater than 8?
np.all(x > 8)

False

In [None]:
np.all(x < 8, axis=0)

# Boolean operators

In [86]:
x

array([[6, 0, 3, 5],
       [4, 4, 2, 0],
       [0, 9, 8, 0]])

In [88]:
x>=3

array([[False,  True,  True,  True],
       [False,  True,  True, False],
       [ True,  True, False,  True]])

In [89]:
x<=5

array([[ True,  True,  True, False],
       [ True,  True,  True,  True],
       [False,  True,  True,  True]])

In [94]:
(x >= 3) & (x <= 5)

array([False, False,  True,  True,  True])

In [95]:
y=4
(y>=4) and (y<=5)

True

In [96]:
(x >= 3) and (x <= 5)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [97]:
((x >= 3) | (x <= 5))

array([ True,  True,  True,  True,  True])

In [98]:
((x >= 3) or (x <= 5))

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [99]:
x

array([1, 2, 3, 4, 5])

In [100]:
x<5

array([ True,  True,  True,  True, False])

In [101]:
x[x < 5]

array([1, 2, 3, 4])

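"and" and "or" evaluate the truth or falsehood of an entire object, while & and | operate on the bits within each object. For a NumPy Boolean array this means & and | act element by element, whereas "and"/"or" raise a ValueError because the truth value of a multi-element array is ambiguous.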

In [102]:
x = np.arange(10)
(x > 4) & (x < 8)

array([False, False, False, False, False,  True,  True,  True, False,
       False])

# Fancy Indexing
Fancy indexing means passing an array of indices to access multiple array elements at once.

In [103]:
x = np.random.randint(100, size=10)
print(x)

[ 9 52 52 37 81 38 66 26 60  8]


In [104]:
[x[0], x[2], x[4]]

[9, 52, 81]

In [105]:
x[[0,2,4]]

array([ 9, 52, 81])

In [106]:
x

array([ 9, 52, 52, 37, 81, 38, 66, 26, 60,  8])

In [107]:
ind = np.array([[0, 2],# the shape of the result reflects the shape of the index arrays 
                [1, 3]])
x[ind]

array([[ 9, 52],
       [52, 37]])

In [108]:
X = np.arange(12).reshape((3, 4))
X

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [109]:
row = np.array([0, 1, 2])
col = np.array([2, 1, 3])
X[row, col]

array([ 2,  5, 11])

In [110]:
x = np.arange(10)
i = np.array([0, 1, 2, 3])
x[i] = 99
print(x)

[99 99 99 99  4  5  6  7  8  9]


# sort

In [116]:
x = np.array([2, 1, 4, 3, 5])
y=np.sort(x)
y

array([1, 2, 3, 4, 5])

In [115]:
x

array([2, 1, 4, 3, 5])

In [24]:
np.sort?

In [117]:
y[::-1]

array([5, 4, 3, 2, 1])

In [118]:
np.argsort(x)

array([1, 0, 3, 2, 4], dtype=int64)

In [119]:
x

array([2, 1, 4, 3, 5])

In [120]:
x.sort()
x

array([1, 2, 3, 4, 5])

In [121]:
x[::-1]

array([5, 4, 3, 2, 1])

In [122]:
X = np.random.randint(0, 10, (4, 6))
print(X)

[[4 7 8 4 9 1]
 [7 2 8 3 5 8]
 [0 4 5 6 1 1]
 [5 5 8 4 0 9]]


In [123]:
np.sort(X, axis=0)

array([[0, 2, 5, 3, 0, 1],
       [4, 4, 8, 4, 1, 1],
       [5, 5, 8, 4, 5, 8],
       [7, 7, 8, 6, 9, 9]])

In [124]:
np.sort(X, axis=1)

array([[1, 4, 4, 7, 8, 9],
       [2, 3, 5, 7, 8, 8],
       [0, 1, 1, 4, 5, 6],
       [0, 4, 5, 5, 8, 9]])

In [125]:
A=np.arange(25).reshape(5,5)
A

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24]])

In [126]:
B=A[[1,0,2,3,4]]
B

array([[ 5,  6,  7,  8,  9],
       [ 0,  1,  2,  3,  4],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24]])

In [127]:
B=A[:,[1,0,2,3,4]]
B

array([[ 1,  0,  2,  3,  4],
       [ 6,  5,  7,  8,  9],
       [11, 10, 12, 13, 14],
       [16, 15, 17, 18, 19],
       [21, 20, 22, 23, 24]])