In [1]:
from IPython.display import display
import numpy as np

# Introduction

* Datasets often comprise diverse data, including but not limited to numeric, text, or image data. In order to make sense of the data, however, representing it as a **series of numbers** is very helpful.
* No matter the type of data, the very first step that we need to do is **converting such data into numbers**.
* This is where Python's _numpy_ and _Pandas_ become useful.

# Python List vs. Numpy Array

* A Python list is heterogeneous (dynamically typed), which means the list in memory stores references to its elements' objects. It does not store the numbers themselves.
* Numpy array is a typed array, which means that it can't store heterogeneous data
* When accessing a specific element in a Python list, the list has to dereference a pointer. On the other hand, a numpy array can be processed directly by numpy vector operations. This makes numpy operations much faster and more efficient.

### Python List

In [7]:
#build a plain Python list holding the integers 0 through 9
list1 = [*range(10)]
list1

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [8]:
#dynamic typing is possible in a Python list: bool, int, str and float sit side by side
dynamic_list = [True, 1, '2', 3.0]
display(dynamic_list)
#list the type of every element to confirm that they differ
display([type(element) for element in dynamic_list])

[True, 1, '2', 3.0]

[bool, int, str, float]

### Numpy Array

In [10]:
#create a one-dimensional numpy array from a Python list
array1 = np.array([1,2,3,4,5])
array1

array([1, 2, 3, 4, 5])

In [11]:
#list the type of each element: unlike the Python list, every element has the same type
[type(element) for element in array1]

[numpy.int64, numpy.int64, numpy.int64, numpy.int64, numpy.int64]

In [12]:
#dynamic typing not supported: the integers are converted to strings so that all elements share one dtype
str_arr = np.array(['hello', 1, 2, 3])
str_arr

array(['hello', '1', '2', '3'], dtype='<U5')

In [20]:
#create a two-dimensional (3 by 3) array from a list of row lists
np.array([[2, 3, 4], [4, 5, 6], [6, 7, 8]])

array([[2, 3, 4],
       [4, 5, 6],
       [6, 7, 8]])

In [30]:
#create a one-dimensional array of ten zeros with integer dtype
np.zeros(10, dtype=int)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [31]:
#create a 3 by 5 array of ones with float dtype
np.ones((3, 5), dtype=float)

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [32]:
#create a 3 by 5 array filled with 3.14
np.full((3, 5), 3.14)

array([[3.14, 3.14, 3.14, 3.14, 3.14],
       [3.14, 3.14, 3.14, 3.14, 3.14],
       [3.14, 3.14, 3.14, 3.14, 3.14]])

In [33]:
#start from 0 and step by 2, stopping before 20 (20 itself is excluded)
np.arange(0, 20, 2)

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

In [34]:
#create five evenly spaced values from 0 to 1, both endpoints included (i.e. four intervals)
np.linspace(0, 1, 5)

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [35]:
#3 by 3 array of samples drawn from the uniform distribution over [0, 1)
np.random.random((3, 3))

array([[0.58108854, 0.51809251, 0.46712401],
       [0.64670427, 0.03148641, 0.71851141],
       [0.28848554, 0.41184509, 0.47027861]])

In [36]:
#3 by 3 array drawn from a normal distribution with mean = 0 and standard deviation = 1
np.random.normal(0, 1, (3, 3))

array([[ 1.22109069,  0.96394624, -1.28254243],
       [ 0.62211072,  0.10374409, -0.18838011],
       [-0.75287097, -0.75503518, -1.31658867]])

In [37]:
#3 by 3 array of random integers from 0 to 9 (the upper bound 10 is excluded)
np.random.randint(0, 10, (3, 3))

array([[3, 2, 8],
       [4, 6, 9],
       [0, 0, 1]])

In [38]:
#create 3 by 3 identity matrix: ones on the diagonal, zeros elsewhere
np.eye(3)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

# Numpy Basics

Essential basics of understanding numpy objects

### <span style="color:blue">**1. Numpy Array Characteristics**</span>

In [39]:
import numpy as np

np.random.seed(10) #fix the seed so the random arrays below are reproducible

x1 = np.random.randint(10, size=4) # 1-dimensional array of 4 random integers below 10
x1

array([9, 4, 0, 1])

In order to create the same random array again, you need to re-run both np.random.seed and np.random.randint.

In [49]:
#create a two-dimensional (4 by 5) array of random integers below 10
x2 = np.random.randint(10, size=(4,5))
x2

array([[0, 6, 9, 1, 8],
       [9, 1, 2, 8, 9],
       [9, 5, 0, 2, 7],
       [3, 0, 4, 2, 0]])

In [50]:
#create a three-dimensional (2 by 3 by 4) array of random integers below 10
x3 = np.random.randint(10, size=(2,3,4))
x3

array([[[3, 3, 1, 2],
        [5, 9, 0, 1],
        [0, 1, 9, 0]],

       [[9, 2, 1, 1],
        [0, 0, 5, 9],
        [0, 4, 6, 6]]])

In [53]:
#inspect the basic attributes of the array
print("dimension of x3 :", x3.ndim) #number of axes
print("shape of x3 :", x3.shape) #length along each axis
print("size of x3 : ", x3.size) #total number of elements (2 * 3 * 4 = 24)
print("data type of x3 elements :", x3.dtype)
print("bytes of elements in x3 :", x3.itemsize, "bytes") #bytes used by one element
print("bytes of total x3 array :", x3.nbytes, "bytes") #itemsize * size

dimension of x3 : 3
shape of x3 : (2, 3, 4)
size of x3 :  24
data type of x3 elements : int64
bytes of elements in x3 : 8 bytes
bytes of total x3 array : 192 bytes


### <span style="color:blue">**2. Accessing Elements**</span>

In [55]:
#One-dimensional array
print(x1)
print(x1[3]) #element at index 3 (indexing starts at 0)
print(x1[-1]) #a negative index counts from the end; -1 is the last element

[9 4 0 1]
1
1


In [56]:
#two-dimensional array: index with [row, column]
print(x2)
print(x2[0, 0]) #first row, first column
print(x2[2, -1]) #third row, last column

[[0 6 9 1 8]
 [9 1 2 8 9]
 [9 5 0 2 7]
 [3 0 4 2 0]]
0
7


In [57]:
#changing array elements: assign to row 0, column 0 (this modifies x2 in place)
x2[0, 0] = 100
x2

array([[100,   6,   9,   1,   8],
       [  9,   1,   2,   8,   9],
       [  9,   5,   0,   2,   7],
       [  3,   0,   4,   2,   0]])

In [59]:
#NumPy array is homogeneous. x2 holds integers, so an assigned float is truncated (1.2345 is stored as 1)
x2[0, 1] = 1.2345
x2

array([[100,   1,   9,   1,   8],
       [  9,   1,   2,   8,   9],
       [  9,   5,   0,   2,   7],
       [  3,   0,   4,   2,   0]])

### <span style="color:blue">**3. Accessing sub-array**</span>

In [60]:
#one-dimensional array holding 0 through 9, used for the slicing examples
x = np.arange(10)
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [64]:
#first 5 elements (indices 0 through 4)
x[:5]

array([0, 1, 2, 3, 4])

In [65]:
#elements from index 5 to the end (index 5 itself is included)
x[5:]

array([5, 6, 7, 8, 9])

In [66]:
#middle sub-array: indices 4, 5 and 6 (the stop index 7 is excluded)
x[4:7]

array([4, 5, 6])

slicing format => [start index : stop index (exclusive, so the last element taken is at stop - 1) : step size]

In [71]:
#step by 2: every second element, starting from index 0
x[::2]

array([0, 2, 4, 6, 8])

In [72]:
#return the elements at odd indices (every second element, starting from index 1)
x[1::2]

array([1, 3, 5, 7, 9])

In [73]:
#a negative step walks the array backwards, so a step of -1 reverses it
x[::-1]

array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

In [74]:
#reverses the array and step by 2; x has 10 elements, so start index -1 and start index 9 are the same position
print(x[-1::-2])
print(x[9::-2])

[9 7 5 3 1]
[9 7 5 3 1]


In [75]:
#multi-dimensional array: show x2 before slicing it
x2

array([[100,   1,   9,   1,   8],
       [  9,   1,   2,   8,   9],
       [  9,   5,   0,   2,   7],
       [  3,   0,   4,   2,   0]])

In [77]:
#first two rows, first three columns
x2[:2, :3]

array([[100,   1,   9],
       [  9,   1,   2]])

In [78]:
#all rows (x2 has 4 rows, so :4 selects every one), every second column
x2[:4, ::2]

array([[100,   9,   8],
       [  9,   2,   9],
       [  9,   0,   7],
       [  3,   4,   0]])

In [79]:
#completely reverse the array: both the row order and the column order are flipped
x2[::-1, ::-1]

array([[  0,   2,   4,   0,   3],
       [  7,   2,   0,   5,   9],
       [  9,   8,   2,   1,   9],
       [  8,   1,   9,   1, 100]])

In [81]:
#first column of x2 (every row, column index 0)
x2[:, 0]

array([100,   9,   9,   3])

In [83]:
#second row of x2 (row index 1, every column)
x2[1, :] # or x2[1]

array([9, 1, 2, 8, 9])

### <span style="color:blue">**4. Reshaping Array**</span>

The size of the reshaped array must match that of the original array in order to reshape properly

In [89]:
#create a one-dimensional array holding 1 through 9
grid_before = np.arange(1, 10)
display(grid_before)
display(grid_before.shape) #(9,) means a single axis of length 9

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

(9,)

In [90]:
#change the array above to two dimensional (9, 1) array, i.e. a single column of 9 rows
np.reshape(grid_before, (9, 1))

array([[1],
       [2],
       [3],
       [4],
       [5],
       [6],
       [7],
       [8],
       [9]])

In [91]:
#change to two-dimensional (3, 3) array, this time with the array's own reshape method
grid_before.reshape(3, 3)

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [92]:
#create a one-dimensional array
x = np.array([1, 2, 3, 4, 5])
display(x)

#use newaxis to add a leading axis: the result is a single row of shape (1, 5)
display(x[np.newaxis, :])

#use newaxis to add a trailing axis: the result is a single column of shape (5, 1)
display(x[:, np.newaxis])

array([1, 2, 3, 4, 5])

array([[1, 2, 3, 4, 5]])

array([[1],
       [2],
       [3],
       [4],
       [5]])

### <span style="color:blue">**5. Concatenate Arrays**</span>

In [94]:
#join two one-dimensional arrays end to end
x = np.array([1, 2, 3])
y = np.array([3, 2, 1])

display(x)
display(y)
np.concatenate([x, y])

array([1, 2, 3])

array([3, 2, 1])

array([1, 2, 3, 3, 2, 1])

In [95]:
#create a two-dimensional (2 by 3) array
grid = np.array([
        [1, 2, 3],
        [4, 5, 6]
    ])
grid

array([[1, 2, 3],
       [4, 5, 6]])

In [98]:
#concatenate along the 0 axis: the rows are stacked, giving a 4 by 3 array
np.concatenate([grid, grid], axis = 0)

array([[1, 2, 3],
       [4, 5, 6],
       [1, 2, 3],
       [4, 5, 6]])

In [99]:
#concatenate along the 1 axis: the columns are extended, giving a 2 by 6 array
np.concatenate([grid, grid], axis = 1)

array([[1, 2, 3, 1, 2, 3],
       [4, 5, 6, 4, 5, 6]])

In [100]:
#use vstack to concatenate arrays vertically: the one-dimensional x becomes the top row above the 2 by 3 array y
x = np.array([1, 2, 3])
y = np.array([
        [9, 8, 7],
        [6, 5, 4]
    ])

display(x)
display(y)

np.vstack([x, y])

array([1, 2, 3])

array([[9, 8, 7],
       [6, 5, 4]])

array([[1, 2, 3],
       [9, 8, 7],
       [6, 5, 4]])

In [102]:
#use hstack to concatenate arrays horizontally: the 2 by 1 column z is placed to the left of grid
z = np.array([
        [99],
        [99]
    ])

display(grid)
display(z)

np.hstack([z, grid])

array([[1, 2, 3],
       [4, 5, 6]])

array([[99],
       [99]])

array([[99,  1,  2,  3],
       [99,  4,  5,  6]])

### <span style="color:blue">**6. Splitting Arrays**</span>

In [103]:
#splitting one-dimensional array
#NOTE(review): this rebinds x1, x2 and x3, which held the random arrays created in the "Numpy Array Characteristics" section
x = [1, 2, 3, 99, 99, 3, 2, 1]
x1, x2, x3 = np.split(x, [3, 5]) #split before index 3 and before index 5 -> x[:3], x[3:5], x[5:]

display(x1)
display(x2)
display(x3)

array([1, 2, 3])

array([99, 99])

array([3, 2, 1])

In [105]:
#splitting multi-dimensional array
grid = np.arange(25).reshape((5, 5))

upper, middle, lower = np.vsplit(grid, [1, 3]) #split before row 1 and before row 3 -> row 0, rows 1-2, rows 3-4

display(grid)
display(upper)
display(middle)
display(lower)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24]])

array([[0, 1, 2, 3, 4]])

array([[ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

array([[15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24]])

# Numpy Array Operations

Numpy array operations use universal functions (ufuncs) to conduct vectorized operations

### <span style="color:blue">**1. Vectorized Operation**</span>

In [106]:
#compute each reciprocal one element at a time with plain Python iteration over a list
x = [1, 2, 3, 4, 5]
length = len(x)

reciprocals = (1.0 / x[position] for position in range(length))
for reciprocal in reciprocals:
    print(reciprocal)

1.0
0.5
0.3333333333333333
0.25
0.2


In [107]:
#this can easily be conducted using numpy's scalar and vector operations: one expression, no explicit loop
1.0 / np.array(x)

array([1.        , 0.5       , 0.33333333, 0.25      , 0.2       ])

In [108]:
#this is not possible in a typical python list: dividing by a list raises TypeError.
#The error is caught and printed so that "Restart & Run All" does not stop at this cell.
try:
    1 / [1, 2, 3, 4, 5]
except TypeError as error:
    list_division_error = f"{type(error).__name__}: {error}"
    print(list_division_error)

TypeError: unsupported operand type(s) for /: 'int' and 'list'

In [109]:
#vector to vector operation - same sized vectors are divided element by element

display(np.arange(5))
display(np.arange(1.0, 6.0))

display(np.arange(5) / np.arange(1.0, 6.0))

array([0, 1, 2, 3, 4])

array([1., 2., 3., 4., 5.])

array([0.        , 0.5       , 0.66666667, 0.75      , 0.8       ])

In [112]:
#multi-dimensional array: the scalar multiplication is applied to every element
x = np.arange(9).reshape((3, 3))

display(x)
display(x*2)

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

array([[ 0,  2,  4],
       [ 6,  8, 10],
       [12, 14, 16]])

### <span style="color:blue">**2. Array Arithmetic Operations**</span>

In [113]:
#arithmetic operators act element by element; each has an equivalent numpy function
x = np.arange(4)

display(x)
display(x + 1) # same as np.add(x, 1); the + operator is a wrapper around that function
display(x * 2) # np.multiply(x, 2)
display(x % 2) # np.mod(x, 2)

array([0, 1, 2, 3])

array([1, 2, 3, 4])

array([0, 2, 4, 6])

array([0, 1, 0, 1])

### <span style="color:blue">**3. Absolute Value Functions**</span>

In [114]:
x = np.array([-2, -1, 0, 1, 2])

#python's built-in function also works on a numpy array and returns an array
abs(x)

array([2, 1, 0, 1, 2])

In [115]:
#numpy's vectorized absolute-value function
np.absolute(x)

array([2, 1, 0, 1, 2])

In [116]:
#alternative: np.abs is a shorthand alias for np.absolute
np.abs(x)

array([2, 1, 0, 1, 2])

### <span style="color:blue">**4. Trigonometric Functions**</span>

In [117]:
#evaluate sin, cos and tan at three evenly spaced angles between 0 and pi (radians)
theta = np.linspace(0, np.pi, 3) # 0, 90, 180 degrees

print("theta =", theta)
for label, trig_fn in (
    ("sin(theta) =", np.sin),
    ("cos(theta) =", np.cos),
    ("tan(theta) =", np.tan),
):
    print(label, trig_fn(theta))

theta = [0.         1.57079633 3.14159265]
sin(theta) = [0.0000000e+00 1.0000000e+00 1.2246468e-16]
cos(theta) = [ 1.000000e+00  6.123234e-17 -1.000000e+00]
tan(theta) = [ 0.00000000e+00  1.63312394e+16 -1.22464680e-16]


### <span style="color:blue">**5. Inverse Trigonometric Functions**</span>

In [118]:
#inverse trigonometric functions applied element by element; the results are angles in radians
x = [-1, 0, 1]
print("x =", x)
print("arcsin(x) =", np.arcsin(x)) 
print("arccos(x) =", np.arccos(x)) 
print("arctan(x) =", np.arctan(x)) 

x = [-1, 0, 1]
arcsin(x) = [-1.57079633  0.          1.57079633]
arccos(x) = [3.14159265 1.57079633 0.        ]
arctan(x) = [-0.78539816  0.          0.78539816]


### <span style="color:blue">**6. Exponential Functions**</span>

In [119]:
#exponentials with base e, 2 and 3, evaluated element by element
x = [1, 2, 3]

print("x =", x) 
print("e^x =", np.exp(x)) 
print("2^x =", np.exp2(x)) 
print("3^x =", np.power(3, x)) 

x = [1, 2, 3]
e^x = [ 2.71828183  7.3890561  20.08553692]
2^x = [2. 4. 8.]
3^x = [ 3  9 27]


### <span style="color:blue">**7. Log Functions**</span>

In [120]:
import numpy as np
#natural, base-2 and base-10 logarithms, evaluated element by element
x = [1, 2, 4, 10]

print("x =", x)
for label, log_fn in (
    ("ln(x) =", np.log),
    ("log2(x) =", np.log2),
    ("log10(x) =", np.log10),
):
    print(label, log_fn(x))

x = [1, 2, 4, 10]
ln(x) = [0.         0.69314718 1.38629436 2.30258509]
log2(x) = [0.         1.         2.         3.32192809]
log10(x) = [0.         0.30103    0.60205999 1.        ]


### <span style="color:blue">**8. Advanced Functions**</span>

1. Use of "out" parameter
    * Specifically designating where the array operation results will be allocated.
    * This prevents numpy from unnecessarily allocating new memory.
    * This becomes very memory-efficient in handling big vectorized operations
<br>
<br>
2. Use of Reduce
<br>
<br>
3. Use of Accumulate
<br>
<br>
4. Outer Products

In [122]:
x = np.arange(5)
y = np.empty(5) #uninitialized array (arbitrary contents) that will receive the result

display(x)
display(y)

np.multiply(x, 10, out = y) #write x * 10 directly into y through the out parameter

display(y)

array([0, 1, 2, 3, 4])

array([0.0e+000, 4.9e-324, 9.9e-324, 1.5e-323, 2.0e-323])

array([ 0., 10., 20., 30., 40.])

In [124]:
#create a new array of ten zeros
y = np.zeros(10)
display(x)
display(y)

#write 2**x into every second slot of y (indices 0, 2, 4, 6, 8); the other slots stay 0
np.power(2, x, out = y[::2])

display(y)

array([0, 1, 2, 3, 4])

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

array([ 1.,  0.,  2.,  0.,  4.,  0.,  8.,  0., 16.,  0.])

In [125]:
x = np.arange(1, 6)
display(x)

#reduce with add: sum of all elements in array
print(np.add.reduce(x))

#reduce with multiply: product of all elements in array
print(np.multiply.reduce(x)) 

array([1, 2, 3, 4, 5])

15
120


In [126]:
#accumulate with add: running (cumulative) sum
a = np.add.accumulate(x)
display(a)

#accumulate with multiply: running (cumulative) product
m = np.multiply.accumulate(x)
display(m)

array([ 1,  3,  6, 10, 15])

array([  1,   2,   6,  24, 120])

In [127]:
x = np.arange(1, 6)
display(x)

#outer product: entry [i, j] is x[i] * x[j], i.e. a multiplication table
outer = np.multiply.outer(x, x)
display(outer)

array([1, 2, 3, 4, 5])

array([[ 1,  2,  3,  4,  5],
       [ 2,  4,  6,  8, 10],
       [ 3,  6,  9, 12, 15],
       [ 4,  8, 12, 16, 20],
       [ 5, 10, 15, 20, 25]])

# Aggregation: Min, Max, and values in between

Very useful in basic EDA where a general summary of the data is necessary (mean, standard deviation, sum, product, median, min, max, etc)

### <span style="color:blue">**1. Sum of Array Elements**</span>

In [130]:
#array of 100 random values
L = np.random.random(100)
display(L)

#Python's built-in function
print(sum(L))

#Use numpy(much faster); the two printed sums differ only in the last digits (floating-point rounding)
print(np.sum(L))

array([0.57361827, 0.60220959, 0.07127345, 0.30465275, 0.4713651 ,
       0.42877537, 0.76396001, 0.23942578, 0.59326935, 0.21598771,
       0.32855583, 0.06827275, 0.72565378, 0.61223991, 0.11700494,
       0.6873665 , 0.14501862, 0.96252832, 0.77727822, 0.90335003,
       0.78437923, 0.50868687, 0.13197066, 0.22981003, 0.61674587,
       0.76894367, 0.53740504, 0.36771649, 0.66968552, 0.40675131,
       0.35695028, 0.70995113, 0.55727194, 0.99698976, 0.37384512,
       0.7009304 , 0.54398876, 0.44438346, 0.30071029, 0.61008044,
       0.64833149, 0.87061565, 0.59427772, 0.15171965, 0.81872157,
       0.16034803, 0.66069534, 0.17663505, 0.48419866, 0.71355456,
       0.48120129, 0.14075845, 0.73078739, 0.8122959 , 0.72382052,
       0.6717323 , 0.26411314, 0.27607332, 0.84169919, 0.51575165,
       0.82808948, 0.62973071, 0.54188732, 0.82561156, 0.6778163 ,
       0.69427663, 0.2191785 , 0.9003574 , 0.13161621, 0.12625124,
       0.15319693, 0.34380989, 0.00846691, 0.13208193, 0.72554

48.313848558976204
48.31384855897622


### <span style="color:blue">**2. Min and Max Values**</span>

In [131]:
#array of 10 million random values used to compare the timings
big_array = np.random.rand(10000000)

#python built-in function
%timeit min(big_array), max(big_array)

print(min(big_array), max(big_array))

1.73 s ± 118 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
3.223387275985701e-07 0.9999999166642625


In [132]:
#use numpy (much faster): same values as the built-ins, timed on the same array
%timeit np.min(big_array), np.max(big_array)

print(np.min(big_array), np.max(big_array))

10 ms ± 311 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
3.223387275985701e-07 0.9999999166642625


### <span style="color:blue">**3. Multi-dimensional Array**</span>

In [133]:
#two-dimensional (3 by 4) array of random values
Mat = np.random.random((3, 4))
display(Mat)

array([[0.08201706, 0.04819127, 0.14971277, 0.67552308],
       [0.99795557, 0.39816019, 0.4768481 , 0.58845301],
       [0.98863586, 0.85041383, 0.07240134, 0.7841354 ]])

In [138]:
#sum of all elements in the array (no axis given, so the result is a single number)
Mat.sum()

6.112447485034832

In [139]:
#this finds the smallest element in the whole array (no axis given)
Mat.min()

0.04819127229235731

In [140]:
#find the min value of each column (axis = 0 gives one value per column, 4 in total)
display(Mat.min(axis = 0))

#find the min value of each row (axis = 1 gives one value per row, 3 in total)
display(Mat.min(axis = 1))

array([0.08201706, 0.04819127, 0.07240134, 0.58845301])

array([0.04819127, 0.39816019, 0.07240134])

In [142]:
#mean of each column (one value per column)
display(Mat.mean(axis = 0))

#standard deviation of each row (one value per row)
display(Mat.std(axis = 1))

array([0.68953616, 0.4322551 , 0.23298741, 0.68270383])

array([0.25474308, 0.23101118, 0.35502404])