# import numpy as np

## Workshop: NumPy and Data Representation

NumPy provides:
  1. An array object of arbitrary homogeneous items
  2. Fast mathematical operations over arrays
  3. Linear Algebra, Fourier Transforms, Random Number Generation

### Cheatsheet

https://s3.amazonaws.com/assets.datacamp.com/blog_assets/Numpy_Python_Cheat_Sheet.pdf

More cheatsheets:
https://www.datacamp.com/community/data-science-cheatsheets?page=3

### References

1. https://docs.scipy.org/doc/numpy-dev/user/basics.types.html
2. Python Data Science Handbook by Jake VanderPlas
3. Python for Data Analysis: Data Wrangling with Pandas, NumPy, and IPython by Wes McKinney

### Dataset

Source: data.gov.sg

Dataset: Exchange Rates, SGD per unit of USD

In [153]:
from IPython.display import IFrame

IFrame('https://data.gov.sg/dataset/exchange-rates-sgd-per-unit-of-usd-average-for-period-annual/resource/f927c39b-3b44-492e-8b54-174e775e0d98/view/43207b9f-1554-4afb-98fe-80dfdd6bb4f6', width=600, height=400)

1. Go to https://data.gov.sg/dataset/exchange-rates-sgd-per-unit-of-usd-average-for-period-annual
2. Click on the `Download` button
3. Unzip and extract the `.csv` file. Note the path for use below.

### Import the package

In [None]:
import numpy as np

In [None]:
np?

### Basic Data Structures

![scalar vector matrix tensor](assets/numpy/scalar-vector-matrix-tensor.png)

(image: https://hadrienj.github.io/posts/Deep-Learning-Book-Series-2.1-Scalars-Vectors-Matrices-and-Tensors/)

### Scalar

In [None]:
x = np.array([1, 2, 3, 4])  # 1-D array built from a list of four integers
x[0]  # indexing one position returns a single element (a scalar)

### Vector

In [None]:
x

In [None]:
x.shape

In [None]:
len(x)

### Matrix

In [None]:
A = np.zeros((3, 4))  # 3 rows x 4 columns, every entry 0
A

In [None]:
A.shape

### Tensor (dimensions $\geq$ 3)

In [None]:
X = np.ones((2, 2, 6))  # three axes of lengths 2, 2 and 6, every entry 1
X

In [None]:
X.shape

### Rank

In [None]:
# The rank of an array is its number of axes, which NumPy exposes as `ndim`.

# 0 axes: a single value wrapped in an array
s = np.array(4)
print(f'A scalar is of rank {s.ndim}')

print()
# 1 axis: 10 equally-spaced values between 0 and 1
x = np.linspace(0, 1, num=10)
print(f'A vector is of rank {x.ndim}')
print(x)

print()
# 2 axes: random numbers arranged as 3 rows x 1 column
A = np.random.random(size=(3, 1))
print(f'A matrix is of rank {A.ndim}')
print(A)

print()
# 3 axes: random numbers
X = np.random.random(size=(3, 1, 4))
print(f'Tensor X is of rank {X.ndim}')
print(X)

print()
# 4 axes: uninitialized memory (different from random) -- contents are arbitrary
Y = np.empty(shape=(3, 1, 4, 2))
print(f'Tensor Y is of rank {Y.ndim}')
print(Y)

### Data Structure Manipulation

In [None]:
# Array documentation
from numpy import doc

# Array types and conversions, scalars
doc.basics?

In [None]:
# Array indexing and slicing
doc.indexing?

### Indexing

In [None]:
A = np.random.random((3, 2))
A

In [None]:
A[0] # 1st row

In [None]:
A[1][0] # 2nd row, 1st column

In [None]:
A[-2][-1] # second-last row, last column

In [None]:
A[-4] # out of bounds access: A has only 3 rows, so this raises IndexError

In [None]:
A[A>0.5] # boolean indexing

Exercise: Try the above indexing operations with a vector and with a tensor.

### Subsetting

A[index, ...]

`index` can be `:` to select every element along that axis (e.g. `A[1, :]` selects the whole 2nd row).

In [132]:
A

array([[ 0.1819358 ,  0.07922454],
       [ 0.95426202,  0.34311652],
       [ 0.83740228,  0.24108659]])

In [None]:
A[1, 0] # 2nd row, 1st column

In [None]:
A[:,1] # 2nd column

In [None]:
A[0,:] # 1st row

### Slicing

data[start : stop : stepsize]

In [133]:
R = np.linspace(1, 10, 24).reshape(4, 3, 2)
R

array([[[  1.        ,   1.39130435],
        [  1.7826087 ,   2.17391304],
        [  2.56521739,   2.95652174]],

       [[  3.34782609,   3.73913043],
        [  4.13043478,   4.52173913],
        [  4.91304348,   5.30434783]],

       [[  5.69565217,   6.08695652],
        [  6.47826087,   6.86956522],
        [  7.26086957,   7.65217391]],

       [[  8.04347826,   8.43478261],
        [  8.82608696,   9.2173913 ],
        [  9.60869565,  10.        ]]])

In [None]:
R[1:2:1] # index 1 only along axis=0; the axis is kept, so the result has shape (1, 3, 2)

In [None]:
R[:, 1:2:1,] # index 1 only along axis=1; the axis is kept, so the result has shape (4, 1, 2)

In [None]:
R[:, :, 1:2:1] # index 1 only along axis=2; the axis is kept, so the result has shape (4, 3, 1)

In [134]:
R[::2,] # every other row along axis=0

array([[[ 1.        ,  1.39130435],
        [ 1.7826087 ,  2.17391304],
        [ 2.56521739,  2.95652174]],

       [[ 5.69565217,  6.08695652],
        [ 6.47826087,  6.86956522],
        [ 7.26086957,  7.65217391]]])

### Slices are views, not copies

Modifying a slice changes the original array, because the slice shares its data with that array.

In [147]:
R2 = R[0][1:2:1] # same as R[0, 1:2:1]
R2

array([[ 1.7826087 ,  2.17391304]])

In [150]:
R2[0] = 0  # R2 is a view into R, so this also zeroes R[0, 1, :]
R2

array([[ 0.,  0.]])

In [151]:
R

array([[[  1.        ,   1.39130435],
        [  0.        ,   0.        ],
        [  2.56521739,   2.95652174]],

       [[  3.34782609,   3.73913043],
        [  4.13043478,   4.52173913],
        [  4.91304348,   5.30434783]],

       [[  5.69565217,   6.08695652],
        [  6.47826087,   6.86956522],
        [  7.26086957,   7.65217391]],

       [[  8.04347826,   8.43478261],
        [  8.82608696,   9.2173913 ],
        [  9.60869565,  10.        ]]])

### Transposing

In [None]:
R.T

In [None]:
R.T.shape

### Sorting

# import pandas as pd

## Workshop: Pandas and Data Transformation

# import matplotlib.pyplot as plt

## Workshop: Matplotlib and Data Visualization

# Putting everything together

## Workshop: Data Workflow

## Assessment 1: Data Workflow