# Introduction to NumPy

NumPy (Numerical Python) is a powerful open-source library for numerical computing in Python. It provides support for arrays, matrices, and a large collection of mathematical functions to operate on these data structures efficiently, including mathematical, logical, shape manipulation, sorting, selecting, I/O, discrete Fourier transforms, basic linear algebra, basic statistical operations, random simulation and much more.

The core of the NumPy package is the `ndarray` object. It encapsulates n-dimensional arrays of homogeneous data types, with many operations performed in compiled code for performance.

## Learning objectives

1. Attributes of an array
2. Creating numpy arrays
3. Reshaping
4. Indexing and slicing 
5. Joining and splitting 
6. Saving and loading

7. Universal functions
8. Broadcasting
9. Masking
10. Sorting
11. Vectorization and matrix multiplication

In [2]:
import numpy as np

## Creating `ndarray`s
Arrays can be created from lists, from scratch, from special constructors, from random variables, ...

In [3]:
np.array([1,2,3]) # vector

array([1, 2, 3])

In [4]:
x = np.array([1,2,3])
type(x)

numpy.ndarray

In [5]:
x.dtype

dtype('int64')

In [6]:
x.ndim # number of dimensions

1

In [7]:
x.shape # the size of each dimension

(3,)

In [8]:
x.size

3

In [9]:
x = np.array([[1,2], [4,2], [3,5]]) # matrix
x

array([[1, 2],
       [4, 2],
       [3, 5]])

In [10]:
x.shape # ( rows, columns )

(3, 2)

In [11]:
x = np.array([[3.14, 9.2, 1], [2,4,1]]) # one float will make all the numbers floats
x

array([[3.14, 9.2 , 1.  ],
       [2.  , 4.  , 1.  ]])

In [12]:
x.dtype

dtype('float64')

In [13]:
# use range to create arrays
np.array(range(5))

array([0, 1, 2, 3, 4])

In [14]:
np.arange(1, 10, 1) # ([start, end), step)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [15]:
np.arange(10,1,-1)

array([10,  9,  8,  7,  6,  5,  4,  3,  2])

In [16]:
np.arange(10, 1, -1).reshape(3,3)

array([[10,  9,  8],
       [ 7,  6,  5],
       [ 4,  3,  2]])

In [17]:
np.linspace(1, 10, 4) # linspace = linear space
# the third argument is the number of evenly spaced samples; both endpoints are included

array([ 1.,  4.,  7., 10.])

In [18]:
np.linspace([-1,1], [10,20], 5) # works across multiple dimensions

array([[-1.  ,  1.  ],
       [ 1.75,  5.75],
       [ 4.5 , 10.5 ],
       [ 7.25, 15.25],
       [10.  , 20.  ]])

In [19]:
np.zeros(5)

array([0., 0., 0., 0., 0.])

In [20]:
np.zeros((5,3), dtype=int)

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [21]:
np.ones((5,3))

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [22]:
# np.random = module, np.random.random = random function
np.random.random((4,3)) # np.random.random returns values in [0.0, 1.0).

array([[0.30114102, 0.97688324, 0.58006208],
       [0.74579815, 0.09527538, 0.77589962],
       [0.11255172, 0.74910402, 0.09600823],
       [0.20457707, 0.73817332, 0.32315741]])

In [23]:
np.random.rand(4,3)

array([[0.75726157, 0.26658002, 0.01408795],
       [0.19421757, 0.31880997, 0.84159177],
       [0.98423028, 0.94556291, 0.67353435],
       [0.40334668, 0.30125003, 0.21724478]])

In [24]:
np.random.randint(0,10,(4,3))

array([[8, 2, 1],
       [8, 9, 2],
       [3, 6, 5],
       [5, 2, 3]])

In [25]:
np.random.normal(0, 0.5, (4,3)) # specify mean, std, and size

array([[-0.32502924, -0.22736479, -0.05444273],
       [-0.22180949, -0.19904573, -0.12139116],
       [ 0.33687219, -0.70675315, -0.17800753],
       [ 0.50858754,  0.03236274, -0.34011927]])

In [26]:
np.random.uniform(0, 1, (4,3))

array([[0.97830062, 0.86093618, 0.68828441],
       [0.18306945, 0.33979989, 0.89781768],
       [0.89946932, 0.25476353, 0.48952666],
       [0.18738882, 0.17385405, 0.18106726]])

In [27]:
np.random.seed(1) # <- set seed like in R
np.random.multivariate_normal(mean=[10,20,30], cov=np.eye(3), size=4)
# cov=np.eye(3) is the 3x3 identity covariance matrix; its size must match the length of the mean vector

array([[11.62434536, 19.38824359, 29.47182825],
       [ 8.92703138, 20.86540763, 27.6984613 ],
       [11.74481176, 19.2387931 , 30.3190391 ],
       [ 9.75062962, 21.46210794, 27.93985929]])

In [28]:
a = np.eye(3)
a[0][2], a[2][0] = 1, 1
np.random.multivariate_normal(mean=[10,20,30], cov=a, size=4)
# columns 0 and 2 share the same decimals because their covariance equals their variance (they are perfectly correlated)

array([[10.32241721, 19.61594565, 30.3224172 ],
       [11.09989126, 19.82757179, 31.09989127],
       [ 9.95778625, 20.58281521, 29.95778626],
       [ 8.85527629, 20.90159072, 28.85527629]])

## Reshaping
Use `reshape()` to change the shape of an array without changing its data. This is helpful when an operation expects the data in a particular shape.

In [29]:
x = np.random.randint(0, 10, 12)
x

array([3, 9, 2, 0, 4, 9, 2, 7, 7, 9, 8, 6])

In [30]:
x.shape

(12,)

In [31]:
# reshape to a row vector 
x.reshape(1,12)

array([[3, 9, 2, 0, 4, 9, 2, 7, 7, 9, 8, 6]])

In [32]:
# reshape to a column vector
x.reshape(12, 1)

array([[3],
       [9],
       [2],
       [0],
       [4],
       [9],
       [2],
       [7],
       [7],
       [9],
       [8],
       [6]])

In [33]:
# use -1
x.reshape(3,-1)
# -1 -> auto calc number of rows / columns

array([[3, 9, 2, 0],
       [4, 9, 2, 7],
       [7, 9, 8, 6]])

In [34]:
# x is still 1-D here (reshape returns a new array and was not assigned), so x[0] is a scalar
x[0]

3

In [35]:
# flatten
x.flatten()

array([3, 9, 2, 0, 4, 9, 2, 7, 7, 9, 8, 6])

In [36]:
x[-1]

6

## Indexing and slicing

In [37]:
x.shape

(12,)

In [38]:
x[2]

2

In [39]:
x[-1]

6

In [40]:
x = x.reshape(3,4)
x

array([[3, 9, 2, 0],
       [4, 9, 2, 7],
       [7, 9, 8, 6]])

In [41]:
# extract a vector
x[0]

array([3, 9, 2, 0])

In [42]:
x[-1]

array([7, 9, 8, 6])

In [43]:
# extracting a scalar
x[0,3]

0

In [44]:
# replace values 
x[0,3] = 13
x

array([[ 3,  9,  2, 13],
       [ 4,  9,  2,  7],
       [ 7,  9,  8,  6]])

In [45]:
x[1] = np.array([-4,-5,-6,-7])
x

array([[ 3,  9,  2, 13],
       [-4, -5, -6, -7],
       [ 7,  9,  8,  6]])

In [46]:
x[0,0] = 0.0001 # the array keeps its integer dtype, so the float is truncated to 0
x

array([[ 0,  9,  2, 13],
       [-4, -5, -6, -7],
       [ 7,  9,  8,  6]])

### Slicing
Accessing subarrays
Syntax: `x[start:stop:step]`

In [47]:
x = np.random.randint(0,10,8)
x

array([9, 3, 7, 7, 4, 5, 9, 3])

In [48]:
x[1:]

array([3, 7, 7, 4, 5, 9, 3])

In [49]:
x[1:7:2]

array([3, 7, 5])

In [50]:
x[-1:-5:-3]

array([3, 4])

In [51]:
print(x[::-1]) #-1 step flips it
print(x)

[3 9 5 4 7 7 3 9]
[9 3 7 7 4 5 9 3]


In [52]:
x[::-2]

array([3, 5, 7, 3])

In [53]:
x = np.random.randint(0,20,(4,3))
x

array([[ 6,  8,  0],
       [ 2, 10, 15],
       [15,  7, 19],
       [10, 14,  0]])

In [54]:
x[:2,:2]

array([[ 6,  8],
       [ 2, 10]])

In [55]:
x[:2,:]

array([[ 6,  8,  0],
       [ 2, 10, 15]])

In [56]:
x[:,:2]

array([[ 6,  8],
       [ 2, 10],
       [15,  7],
       [10, 14]])

In [57]:
x[::-1, ::-1]

array([[ 0, 14, 10],
       [19,  7, 15],
       [15, 10,  2],
       [ 0,  8,  6]])

### Note: A slice is a view, not a copy

In [58]:
x = np.random.randint(0, 20, (5,4))
x

array([[ 1, 17, 13,  3],
       [ 0, 13,  6,  6],
       [ 2, 12, 11,  7],
       [13,  8, 11, 12],
       [11,  4,  7,  7]])

In [59]:
b = x[:3, 1:]
b

array([[17, 13,  3],
       [13,  6,  6],
       [12, 11,  7]])

In [60]:
b[0,0] = -15
b

array([[-15,  13,   3],
       [ 13,   6,   6],
       [ 12,  11,   7]])

In [61]:
x # change in b changes x too!!!!!

array([[  1, -15,  13,   3],
       [  0,  13,   6,   6],
       [  2,  12,  11,   7],
       [ 13,   8,  11,  12],
       [ 11,   4,   7,   7]])

Use the `copy` method to convert a view to a copy

In [62]:
x = np.random.randint(0, 20, (5,4))
x

array([[13,  4, 16, 18],
       [ 0, 13, 10, 17],
       [ 7, 10,  0, 12],
       [ 1,  9, 18, 19],
       [ 1,  2, 12,  7]])

In [63]:
b = x[:3, 1:].copy()
b

array([[ 4, 16, 18],
       [13, 10, 17],
       [10,  0, 12]])

In [64]:
b[0,0] *=100
b

array([[400,  16,  18],
       [ 13,  10,  17],
       [ 10,   0,  12]])

In [65]:
x

array([[13,  4, 16, 18],
       [ 0, 13, 10, 17],
       [ 7, 10,  0, 12],
       [ 1,  9, 18, 19],
       [ 1,  2, 12,  7]])

### Fancy indexing

In fancy or vectorized indexing, we pass arrays of indices in place of single scalars. This allows us to very quickly access and modify complicated subsets of an array’s values.

In [66]:
x = np.random.randint(0,10, 10)
x

array([2, 6, 0, 9, 2, 6, 6, 2, 7, 7])

In [67]:
[x[0], x[3], x[5]]

[2, 9, 6]

In [68]:
index = [0, 3, 5]
x[index]

array([2, 9, 6])

In [69]:
index = np.array([0,3,5])
x[index]

array([2, 9, 6])

In [70]:
x

array([2, 6, 0, 9, 2, 6, 6, 2, 7, 7])

When using arrays of indices, the shape of the result array reflects the shape of the index arrays rather than the shape of the original array. 

In [71]:
index = np.array([[0,3],[5,6]])
x[index]

array([[2, 9],
       [6, 6]])

In [72]:
x = np.random.randint(0,10,(3,4))
x

array([[0, 6, 5, 1],
       [4, 6, 0, 6],
       [5, 1, 2, 1]])

In [73]:
# the index lists are paired element-wise: this selects x[0,1] and x[2,3]
row_idx = [0,2]
column_idx = [1,3]
x[row_idx, column_idx] 

array([6, 1])

### Boolean indexing

In [74]:
x = np.random.randint(0,10,(3,4))
x

array([[5, 4, 0, 7],
       [8, 9, 5, 7],
       [0, 9, 3, 9]])

In [75]:
x[x % 2 == 0]

array([4, 0, 8, 0])

In [76]:
x[x > 5]

array([7, 8, 9, 7, 9, 9])

In [77]:
x > 5

array([[False, False, False,  True],
       [ True,  True, False,  True],
       [False,  True, False,  True]])

## Combining arrays

Use `np.concatenate`, `np.vstack`, and `np.hstack` to concatenate or join two arrays in NumPy. 

In [78]:
x = np.array([1,2,3])
y = np.array([4,5,6])
np.concatenate((x,y))

array([1, 2, 3, 4, 5, 6])

In [79]:
z = np.array([7,8,9])
np.concatenate((x,y,z))

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [80]:
x = x.reshape(1,-1)
y = y.reshape(1,-1)
x,y

(array([[1, 2, 3]]), array([[4, 5, 6]]))

In [81]:
np.concatenate((x,y)) # default axis=0, concatenating based on rows

array([[1, 2, 3],
       [4, 5, 6]])

In [82]:
np.concatenate((x,y), axis=1)

array([[1, 2, 3, 4, 5, 6]])

`np.hstack` stacks arrays in sequence horizontally (column wise). axis=1

In [83]:
np.hstack((x,y))

array([[1, 2, 3, 4, 5, 6]])

In [84]:
a = np.random.randint(0,10,(2,3))
b = np.random.randint(0,10,(2,3))
a,b

(array([[1, 4, 4],
        [6, 8, 8]]),
 array([[9, 2, 7],
        [5, 5, 4]]))

In [85]:
np.hstack((a, b)) # the first 3 columns come from a, the rest from b

array([[1, 4, 4, 9, 2, 7],
       [6, 8, 8, 5, 5, 4]])

In [86]:
a = np.random.randint(0,10,(2,3))
b = np.random.randint(0,10,(2,2))
a,b

(array([[5, 8, 5],
        [8, 1, 1]]),
 array([[8, 7],
        [0, 3]]))

In [87]:
np.hstack((a,b))

array([[5, 8, 5, 8, 7],
       [8, 1, 1, 0, 3]])

In [88]:
a = np.random.randint(0,10,(2,3))
b = np.random.randint(0,10,(3,2))
np.hstack((a,b)) # fails: hstack needs the same number of rows in both arrays

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 2 and the array at index 1 has size 3

`np.vstack` stacks arrays in sequence vertically (row wise). axis=0

In [None]:
np.vstack((x,y))

array([[1, 2, 3],
       [4, 5, 6]])

In [None]:
a = np.random.randint(0,10,(2,3))
b = np.random.randint(0,10,(2,3))
print(a)
print(b)
np.vstack((a,b))

[[0 6 3]
 [0 6 5]]
[[9 6 4]
 [6 6 2]]


array([[0, 6, 3],
       [0, 6, 5],
       [9, 6, 4],
       [6, 6, 2]])

In [None]:
a = np.random.randint(0,10,(2,3))
b = np.random.randint(0,10,(1,3))
print(a)
print(b)
np.vstack((a,b))

[[2 4 1]
 [2 3 9]]
[[3 6 7]]


array([[2, 4, 1],
       [2, 3, 9],
       [3, 6, 7]])

In [None]:
a = np.random.randint(0,10,(2,3))
b = np.random.randint(0,10,(1,4))
print(a)
print(b)
np.vstack((a,b))

[[0 3 3]
 [6 8 6]]
[[5 1 3 2]]


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 3 and the array at index 1 has size 4

**Note**: `np.hstack` and `np.vstack` can be used to concatenate two arrays whose sizes differ along the concatenation axis. However, all the other input array dimensions must match exactly.

We can also use `np.r_` and `np.c_` for concatenation

In [None]:
np.r_[np.array([1,2,3]), np.array([4,5,6])] # r_ = ROW: concatenates along the first axis; scalars are accepted too

array([1, 2, 3, 4, 5, 6])

In [None]:
np.r_[np.array([1,2,3]), 0, 0, np.array([4,5,6])]

array([1, 2, 3, 0, 0, 4, 5, 6])

In [None]:
np.r_[np.array([[1,2,3],[2,3,1]]), np.array([[4,5,6]])]

array([[1, 2, 3],
       [2, 3, 1],
       [4, 5, 6]])

In [None]:
np.c_[np.array([1,2,3]), np.array([4,5,6])] # c_ = COLUMN

array([[1, 4],
       [2, 5],
       [3, 6]])

In [None]:
np.c_[np.array([[1,2,3]]), np.array([[4,5,6]])]

array([[1, 2, 3, 4, 5, 6]])

In [None]:
np.c_[np.array([[1,2,3]]), 0, 0, np.array([[4,5,6]])]

array([[1, 2, 3, 0, 0, 4, 5, 6]])

## Splitting

Use `np.split`, `np.hsplit`, and `np.vsplit` to split an array. 

In [None]:
x = np.arange(10,20)
x

array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

In [None]:
np.split(x, [3,5])

[array([10, 11, 12]), array([13, 14]), array([15, 16, 17, 18, 19])]

In [None]:
x = np.arange(10, 26).reshape(4,4)
x

array([[10, 11, 12, 13],
       [14, 15, 16, 17],
       [18, 19, 20, 21],
       [22, 23, 24, 25]])

In [None]:
np.hsplit(x, [2,3])

[array([[10, 11],
        [14, 15],
        [18, 19],
        [22, 23]]),
 array([[12],
        [16],
        [20],
        [24]]),
 array([[13],
        [17],
        [21],
        [25]])]

In [None]:
np.vsplit(x, [2,3])

[array([[10, 11, 12, 13],
        [14, 15, 16, 17]]),
 array([[18, 19, 20, 21]]),
 array([[22, 23, 24, 25]])]

## Saving and loading

- Use `np.save` to store a single array in a `.npy` file.
- Use `np.savez` to store several arrays in a single `.npz` file.
- Use `np.load` to load arrays from either kind of file.

In [None]:
x = np.arange(16).reshape(4,4)
x

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [None]:
np.save("x.npy", x)

In [None]:
np.load("x.npy")

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [None]:
np.savez("xy.npz", x=x, y=y)

In [None]:
arr = np.load("xy.npz")
arr

NpzFile 'xy.npz' with keys: x, y

In [None]:
arr['x']

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [None]:
arr['y']

array([[4, 5, 6]])

## Universal functions

NumPy provides an easy and flexible interface to optimize computation with arrays of data. The key to making the computation fast is to use **vectorized operations**, generally implemented through NumPy's **universal functions (ufuncs)**.

In [None]:
%%timeit
x = list(np.random.random(100))
reciprocals = [1/i for i in x]

7.89 µs ± 8.74 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [None]:
%%timeit
x = np.random.random(100)
reciprocals = 1/x

1.55 µs ± 3.42 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


### Array Arithmetic

NumPy’s ufuncs feel very natural to use because they make use of Python’s native arithmetic operators. `+`, `-`, `*`, `/`, `//`, `**`, `%`, etc. 

In [None]:
x = np.arange(20).reshape(5,4)
x

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

In [None]:
x + 10

array([[10, 11, 12, 13],
       [14, 15, 16, 17],
       [18, 19, 20, 21],
       [22, 23, 24, 25],
       [26, 27, 28, 29]])

In [None]:
x - 100

array([[-100,  -99,  -98,  -97],
       [ -96,  -95,  -94,  -93],
       [ -92,  -91,  -90,  -89],
       [ -88,  -87,  -86,  -85],
       [ -84,  -83,  -82,  -81]])

In [None]:
x * 10

array([[  0,  10,  20,  30],
       [ 40,  50,  60,  70],
       [ 80,  90, 100, 110],
       [120, 130, 140, 150],
       [160, 170, 180, 190]])

In [None]:
x / 10

array([[0. , 0.1, 0.2, 0.3],
       [0.4, 0.5, 0.6, 0.7],
       [0.8, 0.9, 1. , 1.1],
       [1.2, 1.3, 1.4, 1.5],
       [1.6, 1.7, 1.8, 1.9]])

In [None]:
x**2

array([[  0,   1,   4,   9],
       [ 16,  25,  36,  49],
       [ 64,  81, 100, 121],
       [144, 169, 196, 225],
       [256, 289, 324, 361]])

In [None]:
x % 3

array([[0, 1, 2, 0],
       [1, 2, 0, 1],
       [2, 0, 1, 2],
       [0, 1, 2, 0],
       [1, 2, 0, 1]])

In [None]:
-(0.5*x + 1) ** 2

array([[  -1.  ,   -2.25,   -4.  ,   -6.25],
       [  -9.  ,  -12.25,  -16.  ,  -20.25],
       [ -25.  ,  -30.25,  -36.  ,  -42.25],
       [ -49.  ,  -56.25,  -64.  ,  -72.25],
       [ -81.  ,  -90.25, -100.  , -110.25]])

In [None]:
x = np.random.uniform(-1,1,5)
x

array([-0.85337503, -0.67732924,  0.24391908, -0.32489785, -0.63503652])

In [None]:
np.abs(x) # absolute value

array([0.85337503, 0.67732924, 0.24391908, 0.32489785, 0.63503652])

In [None]:
x = np.random.uniform(0,1,5)
x

array([0.55542431, 0.43617176, 0.34518724, 0.20170065, 0.46109732])

In [None]:
np.exp(x) # exponential

array([1.74268026, 1.54677444, 1.41225433, 1.22348171, 1.58581318])

In [None]:
np.log2(x) # base-2 logarithm

In [None]:
np.log10(x)

In [None]:
np.log(x) # ln

In [None]:
# Trigonometric Functions
x = np.linspace(0, np.pi, 3)
x

array([0.        , 1.57079633, 3.14159265])

In [None]:
np.sin(x)

array([0.0000000e+00, 1.0000000e+00, 1.2246468e-16])

In [None]:
np.cos(x)

array([ 1.000000e+00,  6.123234e-17, -1.000000e+00])

In [None]:
np.tan(x)

array([ 0.00000000e+00,  1.63312394e+16, -1.22464680e-16])

In [None]:
# Ufuncs aggregations
x = np.random.random(10000)

In [None]:
np.sum(x)

5044.466874793909

In [None]:
%%timeit
np.sum(x)

2.94 µs ± 18.6 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [None]:
%%timeit
sum(x)

403 µs ± 11.1 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
np.min(x)

3.553069204764725e-05

In [None]:
np.max(x)

0.9999661326871546

In [None]:
np.mean(x)

0.5044466874793908

In [None]:
np.std(x)

0.28663415018096927

In [None]:
x.sum(), x.min(), x.max()

(5044.466874793909, 3.553069204764725e-05, 0.9999661326871546)

In [None]:
x.mean(), x.std()

(0.5044466874793908, 0.28663415018096927)

In [None]:
x = np.random.random(12).reshape(4,3)
x

array([[0.40075632, 0.53566772, 0.38071393],
       [0.25139224, 0.2031948 , 0.27542708],
       [0.45225989, 0.7374217 , 0.37287895],
       [0.34791373, 0.70684715, 0.82993006]])

In [None]:
np.sum(x)

5.494403554211621

In [None]:
%%timeit
np.sum(1 + 1)

1.86 µs ± 13.4 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [None]:
%%timeit
1 + 1

3.66 ns ± 0.0187 ns per loop (mean ± std. dev. of 7 runs, 100,000,000 loops each)


In [None]:
np.sum(x, axis=0)

array([1.45232217, 2.18313137, 1.85895002])

In [None]:
np.sum(x, axis=1)

array([1.31713797, 0.73001412, 1.56256053, 1.88469093])

In [None]:
np.mean(x, axis=0) #axis0 = col

0.5044466874793908

In [None]:
np.mean(x, axis=1) #axis1 = row

array([0.43904599, 0.24333804, 0.52085351, 0.62823031])

**Note: Whenever possible, make sure that you are using the NumPy version of these aggregates when operating on NumPy arrays!**

In [89]:
# More ufuncs
x = np.random.randint(0, 10, 10)
x

array([7, 2, 8, 3, 0, 8, 4, 2, 9, 0])

In [90]:
np.cumsum(x)

array([ 7,  9, 17, 20, 20, 28, 32, 34, 43, 43])

In [91]:
np.argmax(x)

8

## Broadcasting

Broadcasting is a set of rules by which NumPy lets you apply binary operations (e.g., addition, subtraction, multiplication, etc.) between arrays of different sizes and shapes.

In [94]:
a = np.array([0, 1, 2])
b = np.array([5, 5, 5])
a + b

array([5, 6, 7])

In [98]:
# broadcast one array
# example of adding 1-d array and 2-d array
c = np.ones((3,3))
c

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [97]:
a = np.array([0, 1, 2])
c + a # a will be replicated to be added to c 

array([[1., 2., 3.],
       [1., 2., 3.],
       [1., 2., 3.]])

Here the one-dimensional array `a` is stretched, or broadcast, along the first dimension (replicated across the rows) in order to match the shape of `c`.

In [99]:
# broadcast two arrays
a = np.array([0,1,2])
b = np.array([0,1,2]).reshape(3,1)
a, b

(array([0, 1, 2]),
 array([[0],
        [1],
        [2]]))

In [100]:
a+b

array([[0, 1, 2],
       [1, 2, 3],
       [2, 3, 4]])

![NumPy_broadcasting.png](attachment:NumPy_broadcasting.png)

### Rules of broadcasting
- Rule 1: If the two arrays differ in their number of dimensions, the shape of the one with fewer dimensions is padded with ones on its leading (left) side.
- Rule 2: If the shape of the two arrays does not match in any dimension, the array with shape equal to 1 in that dimension is stretched to match the other shape.
- Rule 3: If in any dimension the sizes disagree and neither is equal to 1, an error is raised.

In [101]:
a = np.ones((2,3))
b = np.array([0,1,2])
print(a)
print(b)

[[1. 1. 1.]
 [1. 1. 1.]]
[0 1 2]


In [102]:
# using rule 1 and 2
a+b

array([[1., 2., 3.],
       [1., 2., 3.]])

In [103]:
a = np.arange(3).reshape(3,1)
b = np.arange(3)
print(a)
print(b)

[[0]
 [1]
 [2]]
[0 1 2]


In [104]:
a+b

array([[0, 1, 2],
       [1, 2, 3],
       [2, 3, 4]])

In [116]:
a = np.ones((3,2))
b = np.array([0,1,2])
print("a: " + str(a))
print("b: " + str(b))

a: [[1. 1.]
 [1. 1.]
 [1. 1.]]
b: [0 1 2]


In [117]:
a+b

ValueError: operands could not be broadcast together with shapes (3,2) (3,) 

In [118]:
b + a

ValueError: operands could not be broadcast together with shapes (3,) (3,2) 

In [119]:
a = np.ones((3,2))
b = np.array([0,1,2]).reshape(-1,1)
a+b

array([[1., 1.],
       [2., 2.],
       [3., 3.]])

## In-class activity: calculate the standard deviation from scratch

In [139]:
x = np.random.random(10)
x

array([0.3700842 , 0.62971751, 0.21017401, 0.75275555, 0.06653648,
       0.2603151 , 0.80475456, 0.19343428, 0.63946088, 0.52467031])

In [140]:
np.std(x)

0.24572413304001287

In [141]:
np.sqrt(np.sum((x - x.mean())**2) / len(x))

0.24572413304001287

## Masking

Masking comes up when you want to extract, modify, count, or manipulate values in an array based on some criterion: for example, you might wish to count all values greater than a certain value, or remove all outliers that are above some threshold. In NumPy, Boolean masking is often the most efficient way to accomplish these types of tasks.

In [142]:
x = np.random.randint(0,10,12).reshape(3,4)
x

array([[9, 0, 3, 4],
       [7, 5, 3, 8],
       [8, 0, 6, 7]])

### Comparison operators

In [143]:
x > 5

array([[ True, False, False, False],
       [ True, False, False,  True],
       [ True, False,  True,  True]])

In [144]:
# how many values are >5?
np.count_nonzero(x>5)

6

In [145]:
np.sum(x > 5)

6

In [146]:
np.sum(x > 5, axis=0)

array([3, 0, 1, 2])

In [147]:
np.sum(x > 5, axis=1)

array([1, 2, 3])

If we’re interested in quickly checking whether any or all the values are True, we can use `np.any` or `np.all`. 

In [148]:
np.any(x > 5)

True

In [149]:
np.all(x > 5)

False

In [150]:
np.all(x > -1)

True

### Boolean operators

In [151]:
# how many values are > 5 and <= 8
np.sum((x > 5) & (x <= 8)) # & = and

5

In [152]:
# how many values are >5 or < 8
np.sum((x > 5) | (x < 8)) # | = or

12

### Use Boolean arrays as masks

In [153]:
x[x>5]

array([9, 7, 8, 8, 6, 7])

In [154]:
x[(x>5) & (x<=7)]

array([7, 6, 7])

## Sorting 

In [155]:
x = np.random.randint(0,10,20)
x

array([9, 5, 4, 9, 5, 2, 5, 6, 6, 8, 7, 7, 7, 2, 6, 0, 5, 2, 1, 8])

In [156]:
sorted(x)

[0, 1, 2, 2, 2, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9]

In [157]:
np.sort(x)

array([0, 1, 2, 2, 2, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9])

In [158]:
# argsort returns the indices that would sort the array
np.argsort(x)

array([15, 18, 17,  5, 13,  2, 16,  1,  4,  6, 14,  8,  7, 12, 10, 11,  9,
       19,  3,  0])

In [159]:
x[3]

9

In [160]:
x[9]

8

## Matrix multiplication using vectorization

In [161]:
x = np.array([0,1,2])
y = np.array([2,3,4])
x*y

array([0, 3, 8])

### Dot product

The dot product of two vectors $\mathbf{x} = [x_1,x_2,x_3,..., x_n]$ and $\mathbf{y}=[y_1, y_2, y_3, ..., y_n]$, is defined as 
$$\mathbf{x} \cdot \mathbf{y} = \sum_{i=1}^n x_iy_i = x_1y_1 + x_2y_2 + ...+ x_ny_n$$

The dot product can be written as a matrix product. 

In [162]:
np.dot(x, y)

11

In [163]:
x @ y

11

In [166]:
x = np.array([0,1,2]).reshape(1,3)
x

array([[0, 1, 2]])

In [167]:
y = np.array([2,3,4]).reshape(1,3)

In [168]:
x @ y.T

array([[11]])

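Consider a linear model with intercept $a$ and coefficients $b_1, b_2$, evaluated for three observations:

\begin{align*}
y_0 &= a + b_1x_{01} + b_2 x_{02}\\
y_1 &= a + b_1x_{11} + b_2 x_{12}\\
y_2 &= a + b_1x_{21} + b_2 x_{22}
\end{align*}

In matrix form this is $\mathbf{y} = a + X\mathbf{b}$.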

In [173]:
n = 3
p = 2
X = np.random.random((n,p))
b = np.random.random(p)
a = 0.5

In [174]:
X

array([[0.02388409, 0.9657268 ],
       [0.4299679 , 0.34628852],
       [0.57706763, 0.12652616]])

In [175]:
b

array([0.95004331, 0.31361083])

In [176]:
a

0.5

How to find y? 

Option 1: use nested loops

In [178]:
%%timeit
y = np.zeros(n)
for i in range(n):
    y[i] = a
    for j in range(p):
        y[i] += b[j]*X[i,j]
y

1.79 µs ± 32.6 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


Option 2: use vectorized dot products instead of explicit loops

In [179]:
# remove the inner loop 
y = np.zeros(n)
for i in range(n):
    y[i] = a + X[i,] @ b.T
y

array([0.82555331, 1.01708795, 1.08791922])

In [180]:
%%timeit
# remove the outer loop
y = a + X @ b.T
y

1.12 µs ± 11.6 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
