# Numpy

## Create an array

In [3]:
import numpy as np
# Build the 5x4 sample matrix used throughout this notebook.
# One entry is infinite, so the whole array is stored as floats.
# `np.inf` is used because the `np.Inf` alias was removed in NumPy 2.0.
sims = np.array([
    [3, 1, np.inf, 9],
    [5, 9, 1, 7],
    [1, 8, 6, 2],
    [1, 5, 8, 0],
    [1, 9, 5, -1],
])
print(sims)

[[ 3.  1. inf  9.]
 [ 5.  9.  1.  7.]
 [ 1.  8.  6.  2.]
 [ 1.  5.  8.  0.]
 [ 1.  9.  5. -1.]]


In [32]:
def generate_sample_numpy_array():
    """Return a freshly built copy of the 5x4 sample matrix.

    The matrix contains one infinite entry (row 0, column 2), one zero
    (row 3, column 3) and one negative value (row 4, column 3).

    Returns:
        numpy.ndarray: a new float array of shape (5, 4) on every call.
    """
    # `np.inf` instead of `np.Inf`: the capitalised alias was removed in NumPy 2.0.
    return np.array([
        [3, 1, np.inf, 9],
        [5, 9, 1, 7],
        [1, 8, 6, 2],
        [1, 5, 8, 0],
        [1, 9, 5, -1],
    ])

## Axis of array

```
By definition, the axis number of the dimension is the index of that dimension within the array's shape. It is also the position used to access that dimension during indexing.

If you do .sum(axis=n), for example, then dimension n is collapsed and deleted, with each value in the new matrix equal to the sum of the corresponding collapsed values. For example, if b has shape (5,6,7,8), and you do c = b.sum(axis=2), then axis 2 (dimension with size 7) is collapsed, and the result has shape (5,6,8). Furthermore, c[x,y,z] is equal to the sum of all elements b[x,y,:,z].
```

See [this post](https://stackoverflow.com/a/17079437/4667568).

In [4]:
# Count the entries of `sims` that are different from zero.
nonzero_total = np.count_nonzero(sims)
print(nonzero_total)

19


In [5]:
# Minimum value of each row.
# Reducing over axis=1 collapses the column dimension, which leaves one
# value per row.
sims.min(axis=1)

array([ 1.,  1.,  1.,  0., -1.])

In [7]:
# Restrict the search to a chosen subset of columns and report, for each row,
# which of those columns (by its original column number) holds the minimum.
list_col = [3,2]
winner_positions = np.argmin(sims[:, list_col], axis=1)
# argmin returns positions within `list_col`; map them back to column numbers.
[list_col[pos] for pos in winner_positions]

[3, 2, 3, 3, 3]

In [8]:
# Build a copy of `sims` in which every zero entry is swapped for a sentinel
# value; `sims` itself is left unchanged.
new_val = -2222
zero_mask = sims == 0
np.where(zero_mask, new_val, sims)

array([[ 3.,  1., inf,  9.],
       [ 5.,  9.,  1.,  7.],
       [ 1.,  8.,  6.,  2.],
       [ 1.,  5.,  8., -2.],
       [ 1.,  9.,  5., -1.]])

In [9]:
# Get the row and column indices of the infinite values in the array.
# Called with only a condition, np.where returns one index array per axis:
# the first array holds the row numbers and the second the column numbers.
# `np.inf` is used because the `np.Inf` alias was removed in NumPy 2.0.
print(np.where(sims == np.inf))

(array([0]), array([2]))


In [11]:
# For one chosen column, list the row indices whose value is below a cutoff.
col_number = 1
threshold = 7
below_cutoff = sims[:, col_number] < threshold
np.where(below_cutoff)

(array([0, 3]),)

In [22]:
# Slicing of the array: only slice on one axis in one command.
# Example: select the rows in 'sims' corresponding to elements in x that are
# greater than 1 and less than 5.
x = np.array([5, 2, 3, 1, 4])
print(sims[(x > 1) & (x < 5)])

# How about adding another condition - selecting the second column?
# A boolean row mask combined with a single integer column index works.
print(sims[(x > 1) & (x < 5), 1])

# This does NOT work: the mask selects 3 rows while the column list has
# 2 entries, and the two index arrays cannot be broadcast together.
# The error is caught and its message printed so that the demonstration
# does not abort a full run of the notebook.
try:
    print(sims[(x > 1) & (x < 5), [0,1]])
except IndexError as err:
    print("IndexError: {}".format(err))

[[ 5.  9.  1.  7.]
 [ 1.  8.  6.  2.]
 [ 1.  9.  5. -1.]]
[9. 8. 9.]


IndexError: shape mismatch: indexing arrays could not be broadcast together with shapes (3,) (2,) 

## NumPy broadcasting

*Principle*: When indexing a NumPy multi-dimensional array with several integer index arrays at once, the index arrays must have the same shape or be broadcastable to a common shape. For example, shapes (3,) and (3,) work, as do (2,) and (2,), while (3,) and (2,) do not. An index array of shape (1,) is always fine, because it is broadcast against the others.

NumPy will happily broadcast the index arrays if possible - but for that to be possible, each pair of aligned dimensions must either be equal or have one of the two equal to 1.

NumPy broadcasting aligns dimensions from right to left, not left to right.

Key message:
1. If you want to use one condition to filter the rows and another condition to filter the columns of an array, it is safe to write two commands - one command for filtering rows and the other for filtering columns.

In [31]:
# Pairing a 3-element row selection with a 2-element column list fails:
# the two index arrays cannot be broadcast to a common shape.
try:
    print(sims[(x > 1) & (x < 5), [0, 1]])
except IndexError:
    print("An IndexError occurred")

# Filtering only the columns works, with several columns ...
print(sims[:, [0, 1]])

# ... and with a single column given as a one-element list.
print(sims[:, [1]])

# Two boolean masks work when both select the same number of entries
# (3 rows and 3 columns here).
x = np.array([5, 2, 3, 1, 4])
xx = np.array([5, 2, 3, 1])
row_mask = (x > 1) & (x < 5)
col_mask = (xx > 0) & (xx < 5)
print(sims)
print(sims[row_mask, col_mask])
# The masks are equivalent to these integer indices, which are paired
# element by element: (1, 1), (2, 2), (4, 3).
print(sims[[1, 2, 4], [1, 2, 3]])

# Equal-length integer lists also work; different pairs give a different result.
print(sims[[1, 2, 4], [1, 2, 1]])

An IndexError occurred
[[3. 1.]
 [5. 9.]
 [1. 8.]
 [1. 5.]
 [1. 9.]]
[[1.]
 [9.]
 [8.]
 [5.]
 [9.]]
[[ 3.  1. inf  9.]
 [ 5.  9.  1.  7.]
 [ 1.  8.  6.  2.]
 [ 1.  5.  8.  0.]
 [ 1.  9.  5. -1.]]
[ 9.  6. -1.]
[ 9.  6. -1.]
[9. 6. 9.]


In [36]:
# Are all values in column 0 smaller than the values in column 1 of the same row?
first_below_second = sims[:, 0] < sims[:, 1]
print(np.all(first_below_second))
# Another way: the equivalent method on the boolean array
print(first_below_second.all())

False
False


In [35]:
# Find the elements that are close to a given value.
# np.isclose defaults: rtol=1e-05 (relative tolerance), atol=1e-08 (absolute tolerance).
default_tol_mask = np.isclose(sims, 0.999)
# A looser absolute tolerance widens the accepted neighbourhood of 0.999.
loose_tol_mask = np.isclose(sims, 0.999, atol=0.1)
print(sims[default_tol_mask])
print(sims[loose_tol_mask])

[]
[1. 1. 1. 1. 1.]


In [49]:
# Get the index and the value of the N-th largest element (demonstration).
a = np.array([9, 4, 4, 3, 3, 9, 0, 4, 6, 0])
print(a)

# The N value
target_rank = 4
print("The top {} largest elements".format(target_rank))
# np.argpartition splits the array into two parts, with every element of the
# second part at least as large as every element of the first part. Taking the
# last `target_rank` positions therefore yields the indices of the top-N
# elements. It runs in linear time, but the selected elements are NOT sorted.
ind = np.argpartition(a, -target_rank)[-target_rank:]
top_values = a[ind]
print(top_values)
# Check the selected top-N values themselves, not the whole input array.
top_is_sorted = np.all(top_values[:-1] <= top_values[1:])
print("Is this sorted? {}".format(top_is_sorted))

print("Getting the {}-th largest elements and the index".format(target_rank))
# By default np.argsort sorts in ascending order, so position 0 of the sorted
# top-N group is the N-th largest element of the original array.
top_order = np.argsort(top_values)
print(top_values[top_order])
print(ind[top_order])
print("The value: {}".format(top_values[top_order][0]))
print("The index of the original array: {}".format(ind[top_order][0]))

[9 4 4 3 3 9 0 4 6 0]
The top 4 largest elements
[4 9 6 9]
Is this sorted? False
Getting the 4-th largest elements and the index
[4 6 9 9]
[1 8 5 0]
The value: 4
The index of the original array: 1


In [None]:
# Get the index and the value of the N-th largest element (summary).

# Method 1: if the array is short and computing efficiency is not a concern
a = np.array([9, 4, 4, 3, 3, 9, 0, 4, 6, 0])
N = 4
# kind="stable" keeps tied values in their original index order, which makes
# the reported index deterministic (7 here: the last of the three 4s).
# The default sort kind gives no such guarantee for ties.
order = np.argsort(a, kind="stable")
print("The N-th largest element: value = {}, index = {}".format(a[order[-N]], order[-N]))

# Method 2: if computing efficiency is a concern
ind = np.argpartition(a, -N)[-N:]
print(ind)
# Note: argpartition does not specify which of several tied elements ends up
# in the top-N group, so for a tie the reported index is implementation-dependent.
print("The N-th largest element: value = {}, index = {}".format(np.sort(a[ind])[0], ind[np.argsort(a[ind])[0]]))

# Other methods: see https://stackoverflow.com/a/10337643/4667568

The N-th largest element: value = 4, index = 7
[1 5 8 0]
The N-th largest element: value = 4, index = 1


In [57]:
# Is an array sorted?
def is_sorted(a):
    """Return whether a 1-D sequence is in non-decreasing order.

    Args:
        a: a 1-D NumPy array or any array-like (list, tuple, ...).

    Returns:
        numpy.bool_: True when every element is <= its successor. Empty and
        single-element inputs count as sorted.
    """
    # Convert first: slicing a plain Python list and comparing with `<=`
    # would compare the two lists lexicographically instead of element-wise.
    a = np.asarray(a)
    return np.all(a[:-1] <= a[1:])

a = np.array([1,2,3,4,9])
print(is_sorted(a))
a = np.array([1,2,1,4,9])
print(is_sorted(a))

True
False
