# Introduction to Python Data Analytics
# Part 1: Numpy

Author: Kang P. Lee <br>
References:
- NumPy official website (http://www.numpy.org/) 
- Python Data Science Handbook by Jake VanderPlas (http://shop.oreilly.com/product/0636920034919.do)

## ▪ Importing the NumPy Library

In [1]:
import numpy as np

## ▪ Creating NumPy Arrays from Python Lists

In [2]:
x = np.array([1, 2, 3, 4, 5])
x

array([1, 2, 3, 4, 5])

In [3]:
x = np.array([1, 2, 3, "4", "5"])
x

array(['1', '2', '3', '4', '5'], 
      dtype='<U11')

Unlike Python lists, NumPy arrays require all elements to have the same type. If the types do not match, NumPy will upcast where possible (here, the integers are upcast to strings).

In [5]:
x = np.array([1, 2, 3, "4", "5"], dtype=int)
x

array([1, 2, 3, 4, 5])

In [6]:
x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
x

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

## ▪ Creating NumPy Arrays from Scratch

In [7]:
x = np.zeros(10, dtype=int)         # Create a length-10 integer array filled with zeros
x

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [8]:
x = np.ones((5, 5), dtype=float)    # Create a 5x5 floating-point array filled with ones
x

array([[ 1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.]])

In [9]:
x = np.full((5, 5), 3.14)           # Create a 5x5 array filled with 3.14
x

array([[ 3.14,  3.14,  3.14,  3.14,  3.14],
       [ 3.14,  3.14,  3.14,  3.14,  3.14],
       [ 3.14,  3.14,  3.14,  3.14,  3.14],
       [ 3.14,  3.14,  3.14,  3.14,  3.14],
       [ 3.14,  3.14,  3.14,  3.14,  3.14]])

In [18]:
x = np.arange(0, 10, 2)             # Create an array filled with a linear sequence from 0 up to (but not including) 10, stepping by 2
x

array([0, 2, 4, 6, 8])

In [15]:
x = np.linspace(0, 1, 101)            # Create an array of 101 values evenly spaced between 0 and 1 (both endpoints included)
x

array([ 0.  ,  0.01,  0.02,  0.03,  0.04,  0.05,  0.06,  0.07,  0.08,
        0.09,  0.1 ,  0.11,  0.12,  0.13,  0.14,  0.15,  0.16,  0.17,
        0.18,  0.19,  0.2 ,  0.21,  0.22,  0.23,  0.24,  0.25,  0.26,
        0.27,  0.28,  0.29,  0.3 ,  0.31,  0.32,  0.33,  0.34,  0.35,
        0.36,  0.37,  0.38,  0.39,  0.4 ,  0.41,  0.42,  0.43,  0.44,
        0.45,  0.46,  0.47,  0.48,  0.49,  0.5 ,  0.51,  0.52,  0.53,
        0.54,  0.55,  0.56,  0.57,  0.58,  0.59,  0.6 ,  0.61,  0.62,
        0.63,  0.64,  0.65,  0.66,  0.67,  0.68,  0.69,  0.7 ,  0.71,
        0.72,  0.73,  0.74,  0.75,  0.76,  0.77,  0.78,  0.79,  0.8 ,
        0.81,  0.82,  0.83,  0.84,  0.85,  0.86,  0.87,  0.88,  0.89,
        0.9 ,  0.91,  0.92,  0.93,  0.94,  0.95,  0.96,  0.97,  0.98,
        0.99,  1.  ])

In [47]:
x = np.random.random((3, 3))        # Create a 3x3 array of uniformly distributed random values between 0 and 1
x

array([[ 0.53439838,  0.88530088,  0.97544028],
       [ 0.43390544,  0.85547022,  0.11137654],
       [ 0.12824434,  0.90388234,  0.13597806]])

In [52]:
x = np.random.normal(0, 1, (3, 3))  # Create a 3x3 array of normally distributed random values with mean 0 and std 1
x

array([[-1.15411191,  0.97274585, -1.13182889],
       [ 0.79056889, -0.04849152,  1.71248007],
       [-1.15830708, -0.95361537,  0.44162563]])

In [49]:
x = np.random.randint(0, 10, (3, 3)) # Create a 3x3 array of random integers from 0 (inclusive) to 10 (exclusive)
x

array([[5, 7, 6],
       [8, 7, 8],
       [0, 1, 3]])

These NumPy functions are very useful when you need to quickly generate an array of values that follow some rule. 

## ▪ NumPy Standard Data Types

Refer to https://numpy.org/doc/stable/user/basics.types.html

## ▪ NumPy Array Attributes

In [54]:
x = np.random.randint(0, 100, (3, 4))
print(x)
print(x.ndim, x.shape, x.size, x.dtype)

[[99  3 61 16]
 [18 46 77 83]
 [84 73 40 12]]
2 (3, 4) 12 int32


## ▪ Array Indexing & Slicing

In [55]:
x = np.random.randint(0, 100, 10)
x

array([48, 88, 62, 70, 71, 73, 94, 23, 65, 15])

In [56]:
x[0]

48

In [57]:
x[-1]

15

In [58]:
x[3:-3]

array([70, 71, 73, 94])

In [59]:
x[3:-3:2]

array([70, 73])

In [60]:
x[::-1]

array([15, 65, 23, 94, 73, 71, 70, 62, 88, 48])

In [61]:
x = np.random.randint(0, 100, (5, 5))
x

array([[40, 76, 83, 83, 71],
       [82,  3, 21, 55, 45],
       [86, 67, 28, 47, 97],
       [19, 91, 61, 84, 49],
       [ 3, 52, 59, 71, 67]])

In [62]:
x[1, 2]

21

In [66]:
x[:2, :4]

array([[40, 76, 83, 83],
       [82,  3, 21, 55]])

## ▪ Array Concatenation and Splitting

In [67]:
x = np.array([1, 2, 3])
y = np.array([4, 5, 6])
np.concatenate([x, y])                   # Concatenate one-dimensional arrays.

array([1, 2, 3, 4, 5, 6])

In [68]:
x = np.array([[1, 2, 3], [4, 5, 6]])
np.concatenate([x, x])                   # Concatenate two-dimensional arrays along the first axis (axis 0).

array([[1, 2, 3],
       [4, 5, 6],
       [1, 2, 3],
       [4, 5, 6]])

In [69]:
x = np.array([[1, 2, 3], [4, 5, 6]])     
y = np.concatenate([x, x], axis=1)       # Concatenate along the second axis (axis 1).
y

array([[1, 2, 3, 1, 2, 3],
       [4, 5, 6, 4, 5, 6]])

In [None]:
# Cut a ten-element sequence at indices 3 and 5, which yields the three
# pieces x[:3], x[3:5], and x[5:] (returned as NumPy arrays).
x = list(range(1, 11))
x1, x2, x3 = np.split(x, indices_or_sections=[3, 5])
print(x1, x2, x3)

Splitting is the opposite of concatenation.

## ▪ Computation on NumPy Arrays

In [None]:
x = np.array([1, 2, 3, 4, 5])
x + 5

In [None]:
y = [1, 2, 3, 4, 5]
y + 5

Note that plain Python lists do not support element-wise arithmetic: `y + 5` raises a `TypeError`, because `+` on a list means concatenation with another list.

In [92]:
x = np.array([1, 2, 3, 4, 5])
x ** 2         # x to the power of 2

array([ 1,  4,  9, 16, 25])

In [96]:
x = np.array([1, 2, 3, 4, 5])
x = -x         # unary minus negates every element; rebind x to the result
x

array([-1, -2, -3, -4, -5])

In [97]:
x = np.array([-1, 2, -3, 4, -5])
np.abs(x)     # absolute value

array([1, 2, 3, 4, 5])

In [98]:
x = [1, 2, 3]
np.exp(x)      # exponential (= e^x)

array([  2.71828183,   7.3890561 ,  20.08553692])

In [None]:
x = [1, 2, 3]
np.power(3, x) # power (= 3^x)

In [100]:
x = [1, 2, 4, 10]
np.log(x)      # ln(x)

array([ 0.        ,  0.69314718,  1.38629436,  2.30258509])

In [99]:
x = [1, 2, 4, 10]
np.log2(x)     # log2(x)

array([ 0.        ,  1.        ,  2.        ,  3.32192809])

In [None]:
x = [1, 2, 4, 10]
np.log10(x)    # log10(x)

In [102]:
x = np.array([1, 2, 3])
y = np.array([1, 3, 5])
x + y

array([2, 5, 8])

In [103]:
x * y

array([ 1,  6, 15])

In [111]:
# Matrix product (not element-wise) of two 2x2 arrays; the result replaces x.
x = np.array([[1, 2],
              [3, 4]])
y = np.array([[5, 6],
              [7, 8]])
x = x.dot(y)
x

array([[19, 22],
       [43, 50]])

In [112]:
x

array([[19, 22],
       [43, 50]])

In [113]:
x

array([[19, 22],
       [43, 50]])

## ▪ Aggregations

In [None]:
x = np.random.rand(100)
print(x)
print(x.sum(), x.mean(), x.var(), x.std(), x.min(), x.max(), x.argmin(), x.argmax())

## ▪ Comparisons

In [70]:
x = np.array([1, 2, 3, 4, 5])
x

array([1, 2, 3, 4, 5])

In [71]:
x < 3          # Return an array of answers.

array([ True,  True, False, False, False], dtype=bool)

In [72]:
x == 3

array([False, False,  True, False, False], dtype=bool)

## ▪ Working with Boolean Arrays

In [None]:
x = np.random.randint(1, 10, [3, 3])
x

In [None]:
np.count_nonzero(x < 5)     # Count the number of True values, i.e., less than 5.

In [None]:
np.sum(x < 5)               # Interpret True as 1 and False as 0 and sum all. 

In [None]:
np.sum((x > 3) & (x < 8))   # Boolean operators can be used.

## ▪ Boolean Arrays as Masks 

Boolean arrays can be used as masks to select particular subsets of the data themselves.

In [None]:
x = np.random.randint(1, 10, [3, 3])
x

In [None]:
x < 5

In [None]:
x[x < 5]     # Select the subset of x that meets the condition.

## ▪ Fancy Indexing

Fancy indexing in NumPy means passing an array of indices to access multiple array elements at once.

In [114]:
x = np.random.randint(100, size=10)
x

array([54,  8, 84, 39, 92, 31, 59, 33, 40, 28])

In [115]:
[x[1], x[3], x[5]]

[8, 39, 31]

In [116]:
ind = [1, 3, 5]
x[ind]

array([ 8, 39, 31])

## ▪ Sorting NumPy Arrays

In [75]:
x = np.random.choice(10, 5, replace=False)
x

array([9, 0, 1, 3, 4])

In [76]:
np.sort(x)

array([0, 1, 3, 4, 9])

In [77]:
x               # x hasn't changed.

array([9, 0, 1, 3, 4])

In [78]:
x.sort()
x               # x has changed.

array([0, 1, 3, 4, 9])

In [79]:
x = np.random.choice(10, 5, replace=False)
x

array([9, 0, 7, 2, 5])

In [80]:
np.sort(x)

array([0, 2, 5, 7, 9])

In [82]:
np.argsort(x)   # Return the indices of the sorted elements, instead of the elements.

array([1, 3, 4, 2, 0], dtype=int64)

## ▪ NumPy's Structured Arrays

In [83]:
name = ['Alice', 'Bob', 'Cathy', 'Mike']
age = [25, 45, 37, 19]
weight = [55, 85, 49, 80]

There's nothing here that tells us that the three lists are related; it would be more natural if we could use a single structure to store all of this data. NumPy can handle this through structured arrays, which are arrays with compound data types.

In [84]:
data = np.zeros(4,                                        # Create a NumPy array with four entries, all zeros,
                dtype={'names':('name', 'age', 'weight'), # such that the fields (columns) of an entry are name, age, and weight
                       'formats':('U10', 'i4', 'f8')})    # and their formats are U10, i4, and f8, respectively.
data

array([('', 0, 0.0), ('', 0, 0.0), ('', 0, 0.0), ('', 0, 0.0)], 
      dtype=[('name', '<U10'), ('age', '<i4'), ('weight', '<f8')])

Here 'U10' translates to "Unicode string of maximum length 10," 'i4' translates to "4-byte (i.e., 32 bit) integer," and 'f8' translates to "8-byte (i.e., 64 bit) float."

In [85]:
data.dtype

dtype([('name', '<U10'), ('age', '<i4'), ('weight', '<f8')])

In [86]:
data['name'] = name          # Fill the 'name' column with the list 'name' above.
data['age'] = age            # Fill the 'age' column with the list 'age' above.
data['weight'] = weight      # Fill the 'weight' column with the list 'weight' above.

# Print a tab-separated table: a header, a rule, then one line per record.
print("\t", "name", "\t", "age", "\t", "weight")
print("---------------------------------")
for i, record in enumerate(data):    # enumerate yields the row index together with the record
    print(i, "\t", record["name"], "\t", record["age"], "\t", record["weight"])

	 name 	 age 	 weight
---------------------------------
0 	 Alice 	 25 	 55.0
1 	 Bob 	 45 	 85.0
2 	 Cathy 	 37 	 49.0
3 	 Mike 	 19 	 80.0


In [87]:
data["name"]

array(['Alice', 'Bob', 'Cathy', 'Mike'], 
      dtype='<U10')

In [120]:
data[2][age]       # Raises IndexError: the unquoted `age` is the Python list defined earlier, not a field name; use data[2]["age"] instead.

IndexError: invalid index

In [89]:
data[0]["name"]

'Alice'

In [90]:
data["name"][0]

'Alice'

In [91]:
data[data["age"] < 30]["name"]     # Use Boolean masking for filtering.

array(['Alice', 'Mike'], 
      dtype='<U10')