Introduction to Numpy
==================

In [1]:
import numpy as np
np.__version__

'1.18.1'

Numpy Arrays
-----------------

np arrays differ from Python lists as follows:
- they have a fixed size (elements cannot be added or removed in place, so
  the length cannot change, but the value stored at a given position can
  be reassigned)
- all elements must be of the same [type](https://numpy.org/devdocs/user/basics.types.html)
  (when creating a numpy array from a list, numpy will coerce the
  elements into a single type; e.g., a mix of floats and ints will
  be coerced into floats)

In [2]:
gpas_as_list = [4.0, 3.286, 3.5]

# Can append element to list
gpas_as_list.append(4.0)

# Can have multiple types
gpas_as_list.insert(1, 'forget')

# Can remove items
gpas_as_list.pop(1)

'forget'

In [3]:
gpas_as_list

[4.0, 3.286, 3.5, 4.0]

In [4]:
gpas = np.array(gpas_as_list)

*Some common properties on np arrays:*

In [5]:
print(gpas.dtype)
print(gpas.itemsize)  # in bytes
print(gpas.size)  # number of elements
print(gpas.nbytes)  # number of bytes

float64
8
4
32


In [6]:
# Create a 100-element array with every element set to 0.0.
# np.zeros defaults to float64 (8 bytes per element).
study_minutes_floats = np.zeros(100)

# We only need to track non-negative whole minutes, and there are only 1440
# minutes in a day, so an unsigned 16-bit integer (uint16, 2 bytes per
# element, max value 65535) is a more compact choice than float64.
study_minutes = np.zeros(100, np.uint16)

In [7]:
%whos

Variable               Type       Data/Info
-------------------------------------------
gpas                   ndarray    4: 4 elems, type `float64`, 32 bytes
gpas_as_list           list       n=4
np                     module     <module 'numpy' from '/Us<...>kages/numpy/__init__.py'>
study_minutes          ndarray    100: 100 elems, type `uint16`, 200 bytes
study_minutes_floats   ndarray    100: 100 elems, type `float64`, 800 bytes


In [8]:
study_minutes[0] = 150
first_day_minutes = study_minutes[0]
print(first_day_minutes)
print(type(first_day_minutes))

150
<class 'numpy.uint16'>


In [9]:
study_minutes[1] = 60
print(study_minutes)

[150  60   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]


In [10]:
# update multiple elements at once
study_minutes[2:6] = [80, 60, 30, 90]
print(study_minutes)

[150  60  80  60  30  90   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]


In [11]:
# The numpy array type is "ndarray", short for n-dimensional array.
# A 2D array is indexed as var[y][x], where y is the row and x is the column.
#
# Create a 2D (matrix) array with one row per student.
# NOTE: float16 keeps only about 3 significant decimal digits, which is why
# 3.286 is displayed as 3.285 in the output below.
students_gpas = np.array([
    [4.0, 3.286, 3.5, 4.0],
    [3.2, 3.8, 4.0, 4.0],
    [3.96, 3.92, 4.0, 4.0],
], np.float16)
students_gpas

array([[4.   , 3.285, 3.5  , 4.   ],
       [3.2  , 3.8  , 4.   , 4.   ],
       [3.96 , 3.92 , 4.   , 4.   ]], dtype=float16)

In [12]:
students_gpas.ndim  # number of dimensions

2

In [13]:
students_gpas.shape  # (rows, columns)

(3, 4)

In [14]:
# Note: .size returns the total number of elements, whereas len() returns
# the length of the first axis (the number of rows for a 2D array).
# `%whos ndarray` lists only the variables whose type is ndarray.
%whos ndarray

Variable               Type       Data/Info
-------------------------------------------
gpas                   ndarray    4: 4 elems, type `float64`, 32 bytes
students_gpas          ndarray    3x4: 12 elems, type `float16`, 24 bytes
study_minutes          ndarray    100: 100 elems, type `uint16`, 200 bytes
study_minutes_floats   ndarray    100: 100 elems, type `float64`, 800 bytes


In [15]:
np.info(students_gpas)

class:  ndarray
shape:  (3, 4)
strides:  (8, 2)
itemsize:  2
aligned:  True
contiguous:  True
fortran:  False
data pointer: 0x7fe4de608270
byteorder:  little
byteswap:  False
type: float16


In [16]:
# Create a 2D version of the study_minutes array: the existing 1D array
# becomes the first row and a fresh row of 100 zeros becomes the second.
# Note: both rows are already uint16 ndarrays, so the new array keeps that
# dtype without it having to be specified again.
# NOTE(review): this cell rebinds `study_minutes` from its own previous
# value, so it is not idempotent -- re-running it would feed the already-2D
# array back in as the first row.
study_minutes = np.array([
    study_minutes,
    np.zeros(100, np.uint16)
])

In [17]:
study_minutes.shape

(2, 100)

In [18]:
# set round 2 day 1 to 60
study_minutes[1][0] = 60

In [19]:
# numpy arrays have an alternative bracket-comma notation for accessing
# the contents of n-dimensional arrays:
study_minutes[1, 0]  # equivalent to [1][0]

60

In [20]:
# Use a dedicated, seeded random generator so the fake data is reproducible.
rand = np.random.RandomState(42)
# Draw 100 whole-minute values from the half-open interval [30, 180).
fake_log = rand.randint(30, 180, size=100, dtype=np.uint16)  # low (inclusive), high (exclusive)

In [21]:
fake_log

array([132, 122, 128,  44, 136, 129, 101,  95,  50, 132, 151,  64, 104,
       175, 117, 146, 139, 129, 133, 176,  98, 160, 179,  99,  82, 142,
        31, 106, 117,  56,  98,  67, 121, 159,  81, 170,  31,  50,  49,
        87, 179,  51, 116, 177, 118,  78, 171, 117,  88, 123, 102,  44,
        79,  31, 108,  80,  59, 137,  84,  93, 155, 160,  67,  80, 166,
       164,  70,  50, 102, 113,  47, 131, 161, 118,  82,  89,  81,  43,
        81,  38, 119,  52,  82,  31, 159,  57, 113,  71, 121, 140,  91,
        70,  37, 106,  64, 127, 110,  58,  93,  79], dtype=uint16)

In [22]:
fake_log[[3, 8]]

array([44, 50], dtype=uint16)

In [23]:
# Indexing with a 2D array of integers returns an array with the same
# shape as the index array, each entry replaced by the fake_log element
# at that position.
index = np.array([
    [3, 8],
    [0, 1],
])
fake_log[index]

array([[ 44,  50],
       [132, 122]], dtype=uint16)

In [24]:
# np.append() takes a numpy array and some new data and a target axis and
# returns a new numpy array
# Note that the rank (# dimensions) of the appended data must match the
# rank of the original array (making this method more like extend than
# append). Here we are putting fake_log inside a list to get a 2D array
study_minutes = np.append(study_minutes, [fake_log], axis=0)

In [25]:
study_minutes[1,1] = 360

In [26]:
study_minutes

array([[150,  60,  80,  60,  30,  90,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0],
       [ 60, 360,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [27]:
mask = fake_log < 60  # check each value in fake_log with the specified comparison operator

In [28]:
# use the boolean array from the previous step as a mask to select only
# the values inside fake_log which were marked True
fake_log[mask]

array([44, 50, 31, 56, 31, 50, 49, 51, 44, 31, 59, 50, 47, 43, 38, 52, 31,
       57, 37, 58], dtype=uint16)

In [29]:
# To make a boolean array using multiple conditions, we need to use the
# bitwise operators, which numpy overloads as elementwise operators.
# The parentheses around each comparison are required: `&` binds more
# tightly than `>` and `<`.
two_way_mask = (study_minutes > 0) & (study_minutes < 60)

In [30]:
two_way_mask

array([[False, False, False, False,  True, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, Fals

In [31]:
study_minutes[two_way_mask]

array([30, 44, 50, 31, 56, 31, 50, 49, 51, 44, 31, 59, 50, 47, 43, 38, 52,
       31, 57, 37, 58], dtype=uint16)

### Elementwise operators ###

- `&`: AND
- `|`: OR
- `^`: XOR
- `~`: NOT (unary operator)

In [32]:
np.array([False, True, True]) & np.array([False, False, True])

array([False, False,  True])

In [33]:
np.array([False, True, True]) | np.array([False, False, True])

array([False,  True,  True])

In [34]:
np.array([False, True, True]) ^ np.array([False, False, True])

array([False,  True, False])

In [35]:
~np.array([False, True, True])

array([ True, False, False])

In [36]:
# Update values in an array with a boolean mask:
study_minutes[study_minutes < 60] = 0

In [37]:
study_minutes

array([[150,  60,  80,  60,   0,  90,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0],
       [ 60, 360,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [38]:
fruit = ["apple", "banana", "cherry", "durian"]

In [39]:
fruit[1:3]

['banana', 'cherry']

In [40]:
fruit[::2]

['apple', 'cherry']

In [41]:
# np.arange is like Python's built-in range(), but it returns a numpy
# ndarray instead of a `range` object
np.arange(20)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [42]:
practice = np.arange(42)
practice

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41])

In [43]:
# Break the range up into 7 rows of 6 items by assigning to .shape, which
# reshapes the existing array in place rather than creating a new one.
# NOTE: the number of elements in the 2D array must still match exactly
# the number of elements in the original array (7 * 6 == 42).
practice.shape = (7, 6)
practice

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35],
       [36, 37, 38, 39, 40, 41]])

In [47]:
# We can use the last component of the tuple to specify a column
practice[:,3]

array([ 3,  9, 15, 21, 27, 33, 39])

In [49]:
# Or use a slice to specify multiple columns
practice[:,2:4]

array([[ 2,  3],
       [ 8,  9],
       [14, 15],
       [20, 21],
       [26, 27],
       [32, 33],
       [38, 39]])

In [54]:
# WARNING: slices in numpy DO NOT return a copy of the data; they return
# a 'view' onto it. Modifying the slice therefore modifies the underlying
# array as well.
#
# Both the original array and a view are ndarrays, so it's not always
# obvious at a glance which one we are holding. We can check the `base`
# attribute: it is None for an array that owns its data, and it is the
# array whose data is being shared for a view.
print(practice.base)  # None: practice owns its data
v = practice[:, 2:4]
print(v.base)  # prints the contents of the array v is a view of
print( type(v.base) )  # the base is itself an ndarray

print( v.base is practice )  # True: v shares practice's data

None
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]
 [24 25 26 27 28 29]
 [30 31 32 33 34 35]
 [36 37 38 39 40 41]]
<class 'numpy.ndarray'>
True


In [55]:
# reshape returns a new array object (a view onto the same 42 elements
# where possible) with shape (3, 14); `practice` keeps its (7, 6) shape.
alternate_view = practice.reshape(3, 14)
# A bare assignment displays nothing, so end the cell with the name to
# actually show the reshaped array.
alternate_view

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13],
       [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27],
       [28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41]])

In [56]:
# we can use -1 to let the reshape method infer one of the dimensions
practice.reshape(-1, 2)

array([[ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9],
       [10, 11],
       [12, 13],
       [14, 15],
       [16, 17],
       [18, 19],
       [20, 21],
       [22, 23],
       [24, 25],
       [26, 27],
       [28, 29],
       [30, 31],
       [32, 33],
       [34, 35],
       [36, 37],
       [38, 39],
       [40, 41]])

In [64]:
# flattened view
rav = practice.ravel()
print(rav)
print(rav.base)  # rav is a view

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41]
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]
 [24 25 26 27 28 29]
 [30 31 32 33 34 35]
 [36 37 38 39 40 41]]


In [65]:
# flattened copy
flat = practice.flatten()
print(flat)
print(flat.base)  # flat is a new array

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41]
None


In [66]:
np.lookfor("flat")

Search results for 'flat'
-------------------------
numpy.diagflat
    Create a two-dimensional array with the flattened input as a diagonal.
numpy.flatiter
    Flat iterator object to iterate over arrays.
numpy.put
    Replaces specified elements of an array with given values.
numpy.flatnonzero
    Return indices that are non-zero in the flattened version of a.
numpy.ravel
    Return a contiguous flattened array.
numpy.ma.diagflat
    Create a two-dimensional array with the flattened input as a diagonal.
numpy.unravel_index
    Converts a flat index or array of flat indices into a tuple
numpy.matrix.flatten
    Return a flattened copy of the matrix.
numpy.ma.flatten_mask
    Returns a completely flattened version of the mask, where nested fields
numpy.chararray.flatten
    Return a copy of the array collapsed into one dimension.
numpy.chararray.put
    Set ``a.flat[n] = values[n]`` for all `n` in indices.
numpy.ravel_multi_index
    Converts a tuple of index arrays into an array of fl

In [67]:
np.ravel?

In [69]:
transp = practice.T  # transpose
print(transp)
print(transp.base)

[[ 0  6 12 18 24 30 36]
 [ 1  7 13 19 25 31 37]
 [ 2  8 14 20 26 32 38]
 [ 3  9 15 21 27 33 39]
 [ 4 10 16 22 28 34 40]
 [ 5 11 17 23 29 35 41]]
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]
 [24 25 26 27 28 29]
 [30 31 32 33 34 35]
 [36 37 38 39 40 41]]


In [70]:
t2 = transp.copy()
print(t2)
print(t2.base)

Numpy Array Programming
--------------------------------


In [75]:
# Consider the following:
#
# 2T + 0B + 0H + 0M = 3
# 4T + 1B + 2H + 2M = 20.5
# 0T + 1B + 0H + 1M = 10
# 6T + 0B + 1H + 2M = 14.25
#
# We can turn the left side of the equations into a matrix and the
# right side into a vector and solve
orders = np.array([
    [2, 0, 0, 0],
    [4, 1, 2, 2],
    [0, 1, 0, 1],
    [6, 0, 1, 2],
])
totals = np.array([3, 20.50, 10, 14.25])
prices = np.linalg.solve(orders, totals)
print(prices)

[1.5  8.   1.25 2.  ]


In [77]:
# The following are equivalent ways to compute the matrix-vector product:
# each row of `orders` is multiplied element by element with `prices` and
# the products are summed, which recovers the original totals.
print( orders.dot(prices) )
print( orders @ prices )

[ 3.   20.5  10.   14.25]
[ 3.   20.5  10.   14.25]


In [78]:
# Numpy has a number of `ufunc`s (universal functions), which operate on
# arrays element by element.
# First build two operands to try them on: split the integers 1..10 into
# two equal halves.
a, b = np.split( np.arange(1, 11), 2)
a, b

(array([1, 2, 3, 4, 5]), array([ 6,  7,  8,  9, 10]))

In [80]:
# regular operators are overloaded for arrays
# note both operands need to be the correct shape
print( a + b )
print( a + 2 )  # spreading a scalar across a vector is known as 'broadcasting'

[ 7  9 11 13 15]
[3 4 5 6 7]


In [81]:
x1 = np.arange(9.0).reshape((3, 3))  # create a 3x3 array holding 0.0 ... 8.0
x2 = np.arange(3.0)  # create a vector holding 0.0, 1.0, 2.0
# x2 has shape (3,), so it is broadcast across every row of x1.
np.add(x1, x2)  # equivalent to x1 + x2

array([[ 0.,  2.,  4.],
       [ 3.,  5.,  7.],
       [ 6.,  8., 10.]])

In [85]:
print( students_gpas.mean() )  # all students
print( students_gpas.mean(axis=1) )  # average for each student
print( students_gpas.mean(axis=0) )  # average of all students for each year

3.805
[3.695 3.75  3.97 ]
[3.72  3.668 3.834 4.   ]
