# Introduction to NumPy

Lots of these notes are from Chapter 2 of VanderPlas -- you should definitely go through his chapter carefully.  I will use the book notes and these in class.

The NumPy documentation: https://numpy.org/doc/stable

In [3]:
# import the NumPy module and display the version number
import numpy as np
np.__version__

'1.20.3'

In [4]:
# Built-in documentation
# np.<TAB> in IPython
np?

In [5]:
# see the things in the np namespace
dir(np)

['ALLOW_THREADS',
 'AxisError',
 'BUFSIZE',
 'Bytes0',
 'CLIP',
 'DataSource',
 'Datetime64',
 'ERR_CALL',
 'ERR_DEFAULT',
 'ERR_IGNORE',
 'ERR_LOG',
 'ERR_PRINT',
 'ERR_RAISE',
 'ERR_WARN',
 'FLOATING_POINT_SUPPORT',
 'FPE_DIVIDEBYZERO',
 'FPE_INVALID',
 'FPE_OVERFLOW',
 'FPE_UNDERFLOW',
 'False_',
 'Inf',
 'Infinity',
 'MAXDIMS',
 'MAY_SHARE_BOUNDS',
 'MAY_SHARE_EXACT',
 'MachAr',
 'NAN',
 'NINF',
 'NZERO',
 'NaN',
 'PINF',
 'PZERO',
 'RAISE',
 'SHIFT_DIVIDEBYZERO',
 'SHIFT_INVALID',
 'SHIFT_OVERFLOW',
 'SHIFT_UNDERFLOW',
 'ScalarType',
 'Str0',
 'Tester',
 'TooHardError',
 'True_',
 'UFUNC_BUFSIZE_DEFAULT',
 'UFUNC_PYVALS_NAME',
 'Uint64',
 'WRAP',
 '_NoValue',
 '_UFUNC_API',
 '__NUMPY_SETUP__',
 '__all__',
 '__builtins__',
 '__cached__',
 '__config__',
 '__deprecated_attrs__',
 '__dir__',
 '__doc__',
 '__expired_functions__',
 '__file__',
 '__getattr__',
 '__git_revision__',
 '__loader__',
 '__mkl_version__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '

In [6]:
# Create a function to display the array attributes and data (optionally).
#  Note the use of the default value for the show_data parameter.  Made
#  this part of the display optional (and off by default) to support large
#  arrays (that you wouldn't want to try to show explicitly)
def show(the_array, show_data=0):
    """Print a labelled summary of an array's attributes.

    Parameters
    ----------
    the_array
        Object exposing the ``ndim``, ``size``, ``shape``, ``dtype``,
        ``itemsize`` and ``data`` attributes (e.g. a NumPy array).
    show_data
        When truthy, the array itself is printed after the summary.
        Off by default so that large arrays are not dumped to the output.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Labels are pre-padded so the colons line up in the printed report.
    report = (
        ("  Dimension", "ndim"),
        ("       Size", "size"),
        ("      Shape", "shape"),
        ("  Data Type", "dtype"),
        ("  Item Size", "itemsize"),
        ("Data Buffer", "data"),
    )
    for label, attribute in report:
        # Attributes are looked up one at a time, in display order.
        print(f"{label}: {getattr(the_array, attribute)}")

    if not show_data:
        return
    print("The data:")
    print(the_array)

print("Example function call:")
show(np.array([1, 2, 3, 4, 5]),1)

Example function call:
  Dimension: 1
       Size: 5
      Shape: (5,)
  Data Type: int32
  Item Size: 4
Data Buffer: <memory at 0x0000022FE60461C0>
The data:
[1 2 3 4 5]


## NumPy Arrays
### Creating Arrays and Accessing Elements

In [7]:
# Create a NumPy array from a list
a = np.array([1.5, 0.7, 22.4])
show(a, 1)

  Dimension: 1
       Size: 3
      Shape: (3,)
  Data Type: float64
  Item Size: 8
Data Buffer: <memory at 0x0000022FE60461C0>
The data:
[ 1.5  0.7 22.4]


In [8]:
# Create a NumPy array from a list and specify the data type
a = np.array([1.5, 0.7, 22.4], dtype="int32")
show(a, 1)

  Dimension: 1
       Size: 3
      Shape: (3,)
  Data Type: int32
  Item Size: 4
Data Buffer: <memory at 0x0000022FE6495400>
The data:
[ 1  0 22]


In [9]:
# The second element (zero-based, so it's index 1)
a[1]

0

In [10]:
# Multi-dimensional array.  Note that NumPy arrays are homogeneous -- all
#   elements are the same data type (dtype) -- integers in this case
b = np.array(
    [ [1,  2,  3,  4,   5]
     ,[6,  7,  8,  9,  10]
     ,[11, 12, 13, 14, 15] ] )
show(b, 1)

  Dimension: 2
       Size: 15
      Shape: (3, 5)
  Data Type: int32
  Item Size: 4
Data Buffer: <memory at 0x0000022FE60316C0>
The data:
[[ 1  2  3  4  5]
 [ 6  7  8  9 10]
 [11 12 13 14 15]]


In [11]:
# the (i, j)th element
i = 2
j = 3
b[i, j]

14

In [12]:
# Note that if one element is a float, all will be upcast since
# NumPy arrays are homogeneous
b = np.array(
    [ [1.0,  2,  3,  4,   5]
     ,[6,  7,  8,  9,  10]
     ,[11, 12, 13, 14, 15] ] )
show(b,1)
# since Python lists are heterogeneous (in data types), there
# is no upcasting and a similar definition would have a list of lists
# with 1 float and the rest integers.

  Dimension: 2
       Size: 15
      Shape: (3, 5)
  Data Type: float64
  Item Size: 8
Data Buffer: <memory at 0x0000022FE60316C0>
The data:
[[ 1.  2.  3.  4.  5.]
 [ 6.  7.  8.  9. 10.]
 [11. 12. 13. 14. 15.]]


In [13]:
# zeros() creates an array and initializes it with zeros; ones() works similarly.
# Note that you can also specify the data type.  Sized types such as float32 are
# not Python built-ins, so pass them as a string ("float32") or as np.float32.
c = np.zeros((3, 8), dtype=int)
show(c, 1)

  Dimension: 2
       Size: 24
      Shape: (3, 8)
  Data Type: int32
  Item Size: 4
Data Buffer: <memory at 0x0000022FE60316C0>
The data:
[[0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]]


In [14]:
# full() for arbitrary values
c = np.full((3, 3, 3), 82.6)
show(c, 1)

  Dimension: 3
       Size: 27
      Shape: (3, 3, 3)
  Data Type: float64
  Item Size: 8
Data Buffer: <memory at 0x0000022FE64564F0>
The data:
[[[82.6 82.6 82.6]
  [82.6 82.6 82.6]
  [82.6 82.6 82.6]]

 [[82.6 82.6 82.6]
  [82.6 82.6 82.6]
  [82.6 82.6 82.6]]

 [[82.6 82.6 82.6]
  [82.6 82.6 82.6]
  [82.6 82.6 82.6]]]


In [15]:
# Create a NumPy array and initialize it with integers from 0 - 14 using
#   the arange() function.
a = np.arange(15)
show(a, 1)

  Dimension: 1
       Size: 15
      Shape: (15,)
  Data Type: int32
  Item Size: 4
Data Buffer: <memory at 0x0000022FE649D880>
The data:
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]


In [16]:
# Create the same array, but use floats and then reshape it to a 3x5 matrix (a 
#   2-dimensional array, technically)
a = np.arange(15, dtype=float).reshape(3,5)
show(a, 1)

  Dimension: 2
       Size: 15
      Shape: (3, 5)
  Data Type: float64
  Item Size: 8
Data Buffer: <memory at 0x0000022FE60316C0>
The data:
[[ 0.  1.  2.  3.  4.]
 [ 5.  6.  7.  8.  9.]
 [10. 11. 12. 13. 14.]]


In [17]:
# Create a 20-element array of floats from 0 - 19 and reshape it to a 5x4 array.
# Note that we create an anonymous array and send that to
# the show() function.  The structure will be garbage-collected
# after the function call since it is anonymous.
show(np.arange(20.0).reshape(5,4), 1)

  Dimension: 2
       Size: 20
      Shape: (5, 4)
  Data Type: float64
  Item Size: 8
Data Buffer: <memory at 0x0000022FE6414BA0>
The data:
[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]
 [12. 13. 14. 15.]
 [16. 17. 18. 19.]]


In [18]:
# Use a comprehension - From VanderPlas
# Nested lists result in multi-dimensional arrays
n = np.array([range(i, i + 3) for i in [2, 4, 6]])
show(n, 1)

  Dimension: 2
       Size: 9
      Shape: (3, 3)
  Data Type: int32
  Item Size: 4
Data Buffer: <memory at 0x0000022FE6414EE0>
The data:
[[2 3 4]
 [4 5 6]
 [6 7 8]]


In [19]:
# Show the details of what's going on in the above assignment statement.
# First the inner comprehension
t = [range(i, i + 3) for i in [2, 4, 6]]
t

[range(2, 5), range(4, 7), range(6, 9)]

In [20]:
# Next create the NumPy array using the list created
# by the comprehension
np.array(t)

array([[2, 3, 4],
       [4, 5, 6],
       [6, 7, 8]])

In [21]:
# Create a 3x3 array of normally distributed random values
# with mean 96 and standard deviation 14
c = np.random.normal(96, 14, (3, 3))
show(c,1)

  Dimension: 2
       Size: 9
      Shape: (3, 3)
  Data Type: float64
  Item Size: 8
Data Buffer: <memory at 0x0000022FE649E2B0>
The data:
[[ 90.95784023  84.24905988 106.41821095]
 [103.13433559 102.38978584  91.11563891]
 [120.65386528 104.09443015 100.78928894]]


In [22]:
# Some more samples from VanderPlas
np.random.seed(0)  # seed for reproducibility

x1 = np.random.randint(10, size=6)  # One-dimensional array
x2 = np.random.randint(10, size=(3, 4))  # Two-dimensional array
x3 = np.random.randint(10, size=(3, 4, 5))  # Three-dimensional array

In [23]:
# Iterating through a Python list of NumPy arrays.
al = [x1, x2, x3]
for a in al:
    show(a, 1)
    print("")

  Dimension: 1
       Size: 6
      Shape: (6,)
  Data Type: int32
  Item Size: 4
Data Buffer: <memory at 0x0000022FE649D940>
The data:
[5 0 3 3 7 9]

  Dimension: 2
       Size: 12
      Shape: (3, 4)
  Data Type: int32
  Item Size: 4
Data Buffer: <memory at 0x0000022FE6414EE0>
The data:
[[3 5 2 4]
 [7 6 8 8]
 [1 6 7 7]]

  Dimension: 3
       Size: 60
      Shape: (3, 4, 5)
  Data Type: int32
  Item Size: 4
Data Buffer: <memory at 0x0000022FE6456400>
The data:
[[[8 1 5 9 8]
  [9 4 3 0 3]
  [5 0 2 3 8]
  [1 3 3 3 7]]

 [[0 1 9 9 0]
  [4 7 3 2 7]
  [2 0 0 4 5]
  [5 6 8 4 1]]

 [[4 9 8 1 1]
  [7 9 9 3 6]
  [7 2 0 3 5]
  [9 4 4 6 4]]]



In [26]:
# Element (1, 2, 3) from x3
x3[1, 2, 3]

4

In [27]:
# flatten a multi-dimensional array
show(x3.flatten(), 1)

  Dimension: 1
       Size: 60
      Shape: (60,)
  Data Type: int32
  Item Size: 4
Data Buffer: <memory at 0x0000022FE649DD00>
The data:
[8 1 5 9 8 9 4 3 0 3 5 0 2 3 8 1 3 3 3 7 0 1 9 9 0 4 7 3 2 7 2 0 0 4 5 5 6
 8 4 1 4 9 8 1 1 7 9 9 3 6 7 2 0 3 5 9 4 4 6 4]


In [28]:
# NumPy arrays are mutable.
x2[0,0] = 12
x2

array([[12,  5,  2,  4],
       [ 7,  6,  8,  8],
       [ 1,  6,  7,  7]])

In [29]:
# Note the behavior when we try to assign a different data type ...
x2[1, 1] = 7.325
show(x2,1)

  Dimension: 2
       Size: 12
      Shape: (3, 4)
  Data Type: int32
  Item Size: 4
Data Buffer: <memory at 0x0000022FE6414EE0>
The data:
[[12  5  2  4]
 [ 7  7  8  8]
 [ 1  6  7  7]]


### More on reshaping and np.newaxis

The concept of "axes" is one of the more confusing aspects of NumPy.  Axes are used throughout the NumPy-based ecosystem -- notably, for us, in Pandas and Matplotlib -- so you need to know them.  There is lots and lots of online material; I found this one particularly easy to understand - https://www.sharpsightlabs.com/blog/numpy-axes-explained/

In [30]:
x = np.array([1, 2, 3])
x

array([1, 2, 3])

In [31]:
# What is the difference between x and y?
y = x.reshape((1,3))
y

array([[1, 2, 3]])

In [32]:
# What is the difference between y and z?
z = x.reshape((3,1))
z

array([[1],
       [2],
       [3]])

In [33]:
x.shape, y.shape,z.shape

((3,), (1, 3), (3, 1))

In [34]:
y[0]

array([1, 2, 3])

In [35]:
x[np.newaxis, :]

array([[1, 2, 3]])

In [36]:
x[:,np.newaxis]

array([[1],
       [2],
       [3]])

In [37]:
a = np.arange(27).reshape((3,3,3))
a

array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8]],

       [[ 9, 10, 11],
        [12, 13, 14],
        [15, 16, 17]],

       [[18, 19, 20],
        [21, 22, 23],
        [24, 25, 26]]])

In [38]:
a = np.arange(81).reshape((3,3,3,3))
a

array([[[[ 0,  1,  2],
         [ 3,  4,  5],
         [ 6,  7,  8]],

        [[ 9, 10, 11],
         [12, 13, 14],
         [15, 16, 17]],

        [[18, 19, 20],
         [21, 22, 23],
         [24, 25, 26]]],


       [[[27, 28, 29],
         [30, 31, 32],
         [33, 34, 35]],

        [[36, 37, 38],
         [39, 40, 41],
         [42, 43, 44]],

        [[45, 46, 47],
         [48, 49, 50],
         [51, 52, 53]]],


       [[[54, 55, 56],
         [57, 58, 59],
         [60, 61, 62]],

        [[63, 64, 65],
         [66, 67, 68],
         [69, 70, 71]],

        [[72, 73, 74],
         [75, 76, 77],
         [78, 79, 80]]]])

 ### Slices - Views and copies
 
 Slice: [i:j:k] - start:stop:stride (for each axis)

In [39]:
# Show the 2D array x2
x2

array([[12,  5,  2,  4],
       [ 7,  7,  8,  8],
       [ 1,  6,  7,  7]])

In [40]:
# slice example - upper-left 2x3 sub-matrix
# :2 - rows 0, 1
# :3 - columns 0, 1, 2
x2[:2, :3]

array([[12,  5,  2],
       [ 7,  7,  8]])

In [41]:
# lower right 2x3
x2[-2:, -3:]

array([[7, 8, 8],
       [6, 7, 7]])

In [42]:
# middle 1x2
x2[1:-1, 1:-1]

array([[7, 8]])

In [43]:
# note that slices are (by default) views of the array, not copies.
x = x2[:2, :3]
x, x2

(array([[12,  5,  2],
        [ 7,  7,  8]]),
 array([[12,  5,  2,  4],
        [ 7,  7,  8,  8],
        [ 1,  6,  7,  7]]))

In [44]:
x[0, 0] = 477
x, x2

(array([[477,   5,   2],
        [  7,   7,   8]]),
 array([[477,   5,   2,   4],
        [  7,   7,   8,   8],
        [  1,   6,   7,   7]]))

In [45]:
# If you want a copy, rather than a view, use the copy() function.
x = x2[:2, :3].copy()
x[0, 0] = 976
x, x2

(array([[976,   5,   2],
        [  7,   7,   8]]),
 array([[477,   5,   2,   4],
        [  7,   7,   8,   8],
        [  1,   6,   7,   7]]))

## Array Concatenation and Splitting - VP 02.02

## Universal Functions and the Slowness of Loops - VP 02.03

## Axes and Aggregate Functions

Also check notebook 02.04 - Aggregations: Min, Max, and Everything In Between from VanderPlas

In [46]:
a = np.random.normal(5, 1, (5, 3))
show(a, 1)

  Dimension: 2
       Size: 15
      Shape: (5, 3)
  Data Type: float64
  Item Size: 8
Data Buffer: <memory at 0x0000022FE6414EE0>
The data:
[[6.25441407 6.41910204 4.25614392]
 [2.4825629  3.49290398 6.14907613]
 [3.80642175 6.14104245 6.50944508]
 [6.06777513 4.31341052 5.01487332]
 [4.6243341  4.96177636 5.36797447]]


In [47]:
# Overall sum -- all elements of the array
a.sum()

75.861256210341

In [48]:
# Sum along an axis -- What is axis 0?
a.sum(axis=0)

array([23.23550795, 25.32823535, 27.29751291])

In [49]:
# Sum along the other axis -- What is axis 1
a.sum(axis=1)

array([16.92966003, 12.124543  , 16.45690928, 15.39605897, 14.95408494])

In [50]:
# the average of the 3rd column
a.mean(axis=0)[2]

5.459502582996934

In [52]:
# or - we can sum along an axis and then divide by the
# number of elements along that axis.
ax = 0
a.sum(axis=ax)[2]/a.shape[ax]

5.459502582996934

In [53]:
# How did that work now?  Look at the components individually
a.sum(axis=ax)

array([23.23550795, 25.32823535, 27.29751291])

In [54]:
a.shape

(5, 3)

In [55]:
# average of the 4th row
a.mean(axis=1)[3]

5.132019656109631

In [56]:
# or
ax = 1
a.sum(axis=ax)[3]/a.shape[ax]

5.132019656109631

In [57]:
b = np.random.randint(1, 6, (3, 4, 6))
show(b, 1)

  Dimension: 3
       Size: 72
      Shape: (3, 4, 6)
  Data Type: int32
  Item Size: 4
Data Buffer: <memory at 0x0000022FE64564F0>
The data:
[[[4 3 2 3 4 4]
  [4 3 4 5 2 3]
  [4 2 3 2 5 3]
  [4 1 4 3 4 1]]

 [[1 1 4 3 4 1]
  [5 1 1 3 4 3]
  [4 1 1 1 4 1]
  [3 3 1 5 4 5]]

 [[1 5 4 4 5 2]
  [4 1 1 1 2 1]
  [5 2 4 2 1 1]
  [5 4 4 2 1 1]]]


In [58]:
# Mean of the "planes"
b.mean(axis=0)

array([[2.        , 3.        , 3.33333333, 3.33333333, 4.33333333,
        2.33333333],
       [4.33333333, 1.66666667, 2.        , 3.        , 2.66666667,
        2.33333333],
       [4.33333333, 1.66666667, 2.66666667, 1.66666667, 3.33333333,
        1.66666667],
       [4.        , 2.66666667, 3.        , 3.33333333, 3.        ,
        2.33333333]])

In [59]:
# Mean of the ?
b.mean(axis=1)

array([[4.  , 2.25, 3.25, 3.25, 3.75, 2.75],
       [3.25, 1.5 , 1.75, 3.  , 4.  , 2.5 ],
       [3.75, 3.  , 3.25, 2.25, 2.25, 1.25]])

In [None]:
# Mean of the ?
b.mean(axis=2)

## Example with A Dataset from a CSV file

In [None]:
# erv.csv data - a (100x15 matrix of floats)
# The relative path uses forward slashes: they work on Windows, macOS and
# Linux alike, whereas a backslash-separated literal would only resolve on
# Windows and would contain the invalid escape sequences "\d" and "\e".
erv = np.genfromtxt('../data/erv.csv', delimiter=',')
show(erv)

In [None]:
# show a sample - upper left 5x5 (first five columns of the first 5 rows)
for r in erv[:5, :5]:
    print(r)

In [None]:
# Use the aggregate mean to find the column averages in one statement.
erv.mean(axis=0)

In [None]:
# row averages
erv.mean(axis=1)

In [None]:
# Load matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Show a histogram of the jth column (j between 0 and 14)
j = 9
plt.hist(erv[:,j])
plt.show()

In [None]:
# One histogram per column, laid out on a single 3x5 grid (15 panels).
# subplot positions are 1-based while array columns are 0-based, hence j - 1.
grid_rows, grid_cols = 3, 5
plt.figure(figsize=(20, 8))
for j in range(1, grid_rows * grid_cols + 1):
    plt.subplot(grid_rows, grid_cols, j)
    column_values = erv[:, j - 1]
    plt.hist(column_values)
plt.show()

In [None]:
# Same 3x5 grid of histograms, but each column gets its own color.
# The list holds one named color per column (15 entries).
c = [
    'orange', 'green', 'red', 'beige', 'brown',
    'dimgray', 'firebrick', 'darkkhaki', 'indigo', 'darksalmon',
    'forestgreen', 'fuchsia', 'darkcyan', 'darkviolet', 'darkgoldenrod',
]
plt.figure(figsize=(20, 8))
for j, shade in enumerate(c):
    # subplot positions start at 1, column indices at 0
    plt.subplot(3, 5, j + 1)
    plt.hist(erv[:, j], color=shade)
plt.show()

In [None]:
# Scatter plot of column col1 vs column col2
col1 = 0
col2 = 14
plt.scatter(erv[:, col1], erv[:, col2]);

In [None]:
plt.scatter?

## Monte Carlo Example - Video Module