## Performance difference between a NumPy array and a Python list

In [1]:
import numpy as np

In [2]:
# one million consecutive integers, once as an ndarray and once as a plain list
my_arr = np.arange(10 ** 6)
my_list = [*range(10 ** 6)]

In [3]:
%time for _ in range(10): my_arr = my_arr * 2

Wall time: 25.6 ms


In [4]:
# the same doubling done with a list comprehension over the plain Python list
%time for _ in range(10): my_list2 = [x * 2 for x in my_list]

Wall time: 943 ms


## 4.1 The NumPy ndarray: A Multidimensional Array Object

In [10]:
# Generate some random data with np.random.randn(2, 3): randn draws samples
# from the "standard normal" distribution, and the arguments 2, 3 ask for
# two rows and three columns.

data = np.random.randn(2, 3)
data

array([[-0.19957434, -0.47278356, -1.13333717],
       [-0.89966382,  0.96840984,  1.41074012]])

In [12]:
# arithmetic with a scalar is applied to every element at once
data * 10

array([[ -1.99574342,  -4.72783564, -11.33337173],
       [ -8.99663822,   9.6840984 ,  14.1074012 ]])

In [14]:
# adding two arrays of the same shape adds the elements position by position
data + data

array([[-0.39914868, -0.94556713, -2.26667435],
       [-1.79932764,  1.93681968,  2.82148024]])

In [16]:
data.shape # returns tuple indicating size of array (rows, columns)

(2, 3)

In [17]:
data.dtype # all elements of the ndarray must be of the same type

dtype('float64')

### Creating ndarrays

In [18]:
data1 = [6, 7.5, 8, 0, 1]

In [19]:
arr1 = np.array(data1)

In [21]:
# data1 mixes integers with a float, so every element was coerced to float
arr1

array([ 6. ,  7.5,  8. ,  0. ,  1. ])

In [25]:
arr1.dtype

dtype('float64')

In [27]:
# An all-integer list yields an integer dtype (compare with the float64 that
# the mixed int/float list data1 produced). The exact integer width depends
# on the platform.
data3 = [4, 5]
arr3 = np.array(data3)  # build from data3 itself, which this cell defines
arr3.dtype

dtype('int32')

In [28]:
# continuing with book example

In [30]:
# A list of equal-length lists converts to a multidimensional array:
# each inner list becomes one row.
data2 = [
    [1, 2, 3, 4],
    [5, 6, 7, 8],
]
arr2 = np.array(data2)
arr2

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [36]:
arr2.ndim # number of dimensions

2

In [37]:
arr2.shape # (rows, columns) for 2d array

(2, 4)

In [41]:
np.zeros(10) # array of zeroes

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [40]:
# pass a tuple as the shape to create an array with more than one dimension
np.zeros((3, 6))

array([[ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.]])

In [44]:
# Shape (2, 3, 4) gives 2 blocks, each with 3 rows and 4 columns (see output).
# np.empty does not initialise the memory, so the displayed values are
# arbitrary leftovers rather than zeros.
np.empty((2, 3, 4))

array([[[  6.23042070e-307,   1.86918699e-306,   1.69121096e-306,
           1.60219035e-306],
        [  1.37962388e-306,   1.11261027e-306,   1.42419938e-306,
           7.56603881e-307],
        [  8.45603441e-307,   8.45590539e-307,   1.11261570e-306,
           1.29062229e-306]],

       [[  7.56599807e-307,   8.90104239e-307,   1.24610383e-306,
           1.69118108e-306],
        [  8.06632139e-308,   1.20160711e-306,   1.69119330e-306,
           1.29062229e-306],
        [  6.89804133e-307,   1.11261162e-306,   8.34443015e-308,
           1.42404727e-306]]])

In [47]:
np.arange(15) # array-valued version of the built-in Python range function

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

### Data Types for ndarrays

In [48]:
# ask for 64-bit floats explicitly instead of letting NumPy infer the dtype
arr1 = np.array([1, 2, 3], dtype=np.float64)
arr1

array([ 1.,  2.,  3.])

In [50]:
# same values stored as 32-bit integers
arr2 = np.array([1, 2, 3], dtype=np.int32)
arr2

array([1, 2, 3])

In [51]:
arr = np.array([1, 2, 3, 4, 5])
arr.dtype

dtype('int32')

In [55]:
float_arr = arr.astype(np.float64) # casting an array to a different format
float_arr.dtype

dtype('float64')

In [56]:
# astype returned a new array: arr itself still has its original dtype
arr.dtype

dtype('int32')

In [57]:
arr = np.array([3.7, -1.2, -2.6, 0.5, 12.9, 10.1])
arr

array([  3.7,  -1.2,  -2.6,   0.5,  12.9,  10.1])

In [58]:
# casting floating-point numbers to an integer dtype truncates the decimal
# part (the values move toward zero, they are not rounded)
arr.astype(np.int32)

array([ 3, -1, -2,  0, 12, 10])

In [60]:
# Byte-string array of numbers written as text. np.bytes_ is the byte-string
# scalar type; the older alias np.string_ no longer exists in NumPy 2.0+.
numeric_strings = np.array(['1.25', '-9.6', '42'], dtype=np.bytes_)
numeric_strings

array([b'1.25', b'-9.6', b'42'],
      dtype='|S4')

In [64]:
# astype parses the byte strings into floats; by default it returns a new
# array, even when the requested dtype is the same as the existing one
numeric_strings.astype(float)

array([  1.25,  -9.6 ,  42.  ])

### Arithmetic with NumPy Arrays

In [65]:
# 2 x 3 float array used by the arithmetic examples that follow
arr = np.array([
    [1., 2., 3.],
    [4., 5., 6.],
])
arr

array([[ 1.,  2.,  3.],
       [ 4.,  5.,  6.]])

In [67]:
arr * arr # operations are done element-wise, called vectorization

array([[  1.,   4.,   9.],
       [ 16.,  25.,  36.]])

In [68]:
arr - arr

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [69]:
# the scalar numerator is divided by each element in turn
1 / arr

array([[ 1.        ,  0.5       ,  0.33333333],
       [ 0.25      ,  0.2       ,  0.16666667]])

In [70]:
arr ** 0.5

array([[ 1.        ,  1.41421356,  1.73205081],
       [ 2.        ,  2.23606798,  2.44948974]])

In [72]:
arr2 = np.array([[0., 4., 1.], [7., 2., 12.]])
arr2

array([[  0.,   4.,   1.],
       [  7.,   2.,  12.]])

In [74]:
arr2 > arr

array([[False,  True, False],
       [ True, False,  True]], dtype=bool)

### Basic Indexing and Slicing

In [76]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [77]:
arr[5]

5

In [78]:
arr[5:8]

array([5, 6, 7])

In [81]:
# Assigning a scalar to a slice writes that value into every selected element.
# Unlike slices of Python's built-in lists, array slices are views on the
# original array: the data is not copied, so arr is changed in place.
arr[5:8] = 12
arr

array([ 0,  1,  2,  3,  4, 12, 12, 12,  8,  9])

In [84]:
arr_slice = arr[5:8]
arr_slice

array([12, 12, 12])

In [85]:
arr_slice[1] = 12345
arr_slice

array([   12, 12345,    12])

In [87]:
arr # data in arr changed to reflect the modification done to arr_slice

array([    0,     1,     2,     3,     4,    12, 12345,    12,     8,     9])

In [88]:
arr_slice[:] = 64

In [89]:
arr # again, data was modified

array([ 0,  1,  2,  3,  4, 64, 64, 64,  8,  9])

In [95]:
# use .copy() to get an independent copy of the slice, so that later changes
# to it do not modify the original data in arr
arr_slice_copy = arr[5:8].copy()
arr_slice_copy

array([64, 64, 64])

In [96]:
arr_slice_copy[:] = 999
arr_slice_copy

array([999, 999, 999])

In [97]:
arr

array([ 0,  1,  2,  3,  4, 64, 64, 64,  8,  9])

In [99]:
arr2d = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
# indexing a 2-D array with one integer returns a one-dimensional row
arr2d[2]

array([7, 8, 9])

In [100]:
arr2d[0][2]

3

In [101]:
arr2d[0, 2]

3

In [103]:
# 2 x 2 x 3 array: two 2-D blocks, each with two rows of three values
arr3d = np.array([
    [[1, 2, 3], [4, 5, 6]],
    [[7, 8, 9], [10, 11, 12]],
])
arr3d

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [104]:
arr3d[0]

array([[1, 2, 3],
       [4, 5, 6]])

In [105]:
old_values = arr3d[0].copy()

In [107]:
arr3d[0] = 42
arr3d

array([[[42, 42, 42],
        [42, 42, 42]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [109]:
arr3d[0] = old_values
arr3d

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [111]:
# two indices on a 3-D array: pick the second 2-D block (index 1), then its
# first row (index 0), which leaves a one-dimensional array
arr3d[1, 0]

array([7, 8, 9])

### Indexing with slices

In [112]:
arr2d

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [113]:
arr2d[:2]

array([[1, 2, 3],
       [4, 5, 6]])

In [114]:
arr2d[:2, 1:]

array([[2, 3],
       [5, 6]])

In [115]:
arr2d[:, :1]

array([[1],
       [4],
       [7]])

### Boolean Indexing

In [116]:
# seven names, some repeated, paired row-for-row with a 7 x 4 block of
# random values
names = np.array(
    ['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe']
)
data = np.random.randn(7, 4)

In [119]:
names

array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'],
      dtype='<U4')

In [120]:
data

array([[  4.18190924e-01,  -1.00529550e+00,   1.04651262e+00,
         -3.05444686e-01],
       [ -5.59878401e-01,  -1.12666433e+00,  -2.17131136e+00,
         -2.92421457e-01],
       [  1.81442935e+00,  -5.37197328e-01,  -2.02227553e+00,
         -3.85641974e-02],
       [ -1.25910329e+00,  -8.56627806e-03,  -6.12182341e-01,
         -5.45651186e-01],
       [  3.90883065e-01,  -2.06180512e-01,   1.07499888e+00,
         -8.32350513e-01],
       [  1.05238423e+00,  -2.06498762e-01,  -1.88115789e+00,
          8.63970259e-04],
       [  1.33100344e-01,  -8.14097383e-01,   6.72334362e-01,
          6.10437550e-01]])

In [121]:
names == 'Bob'

array([ True, False, False,  True, False, False, False], dtype=bool)

In [122]:
names[names == 'Bob']

array(['Bob', 'Bob'],
      dtype='<U4')

In [123]:
data[names == 'Bob']

array([[ 0.41819092, -1.0052955 ,  1.04651262, -0.30544469],
       [-1.25910329, -0.00856628, -0.61218234, -0.54565119]])

In [124]:
data[names == 'Bob', 2:]

array([[ 1.04651262, -0.30544469],
       [-0.61218234, -0.54565119]])

In [125]:
# to select everything but 'Bob' we can either use != or negate the condition
# using ~:
names != 'Bob'

array([False,  True,  True, False,  True,  True,  True], dtype=bool)

In [126]:
data[names != 'Bob']

array([[ -5.59878401e-01,  -1.12666433e+00,  -2.17131136e+00,
         -2.92421457e-01],
       [  1.81442935e+00,  -5.37197328e-01,  -2.02227553e+00,
         -3.85641974e-02],
       [  3.90883065e-01,  -2.06180512e-01,   1.07499888e+00,
         -8.32350513e-01],
       [  1.05238423e+00,  -2.06498762e-01,  -1.88115789e+00,
          8.63970259e-04],
       [  1.33100344e-01,  -8.14097383e-01,   6.72334362e-01,
          6.10437550e-01]])

In [127]:
data[~(names == 'Bob')]

array([[ -5.59878401e-01,  -1.12666433e+00,  -2.17131136e+00,
         -2.92421457e-01],
       [  1.81442935e+00,  -5.37197328e-01,  -2.02227553e+00,
         -3.85641974e-02],
       [  3.90883065e-01,  -2.06180512e-01,   1.07499888e+00,
         -8.32350513e-01],
       [  1.05238423e+00,  -2.06498762e-01,  -1.88115789e+00,
          8.63970259e-04],
       [  1.33100344e-01,  -8.14097383e-01,   6.72334362e-01,
          6.10437550e-01]])

The Python keywords `and` and `or` do not work with boolean arrays.  Use `&` (and)
and `|` (or) instead, and wrap each comparison in parentheses, because `&` and `|`
bind more tightly than `==`.

In [128]:
mask = (names == 'Bob') | (names == 'Will')
mask

array([ True, False,  True,  True,  True, False, False], dtype=bool)

In [130]:
data[mask] # Selecting data from an array by boolean indexing always creates a
# copy of the data, even if the returned array is unchanged.

array([[ 0.41819092, -1.0052955 ,  1.04651262, -0.30544469],
       [ 1.81442935, -0.53719733, -2.02227553, -0.0385642 ],
       [-1.25910329, -0.00856628, -0.61218234, -0.54565119],
       [ 0.39088307, -0.20618051,  1.07499888, -0.83235051]])

In [133]:
# setting all negative values in data to 0
data[data < 0] = 0
data

array([[  4.18190924e-01,   0.00000000e+00,   1.04651262e+00,
          0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00],
       [  1.81442935e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00],
       [  3.90883065e-01,   0.00000000e+00,   1.07499888e+00,
          0.00000000e+00],
       [  1.05238423e+00,   0.00000000e+00,   0.00000000e+00,
          8.63970259e-04],
       [  1.33100344e-01,   0.00000000e+00,   6.72334362e-01,
          6.10437550e-01]])

In [134]:
# setting whole rows or columns using a one-dimensional boolean array
data[names != 'Joe'] = 7

In [135]:
data

array([[  7.00000000e+00,   7.00000000e+00,   7.00000000e+00,
          7.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00],
       [  7.00000000e+00,   7.00000000e+00,   7.00000000e+00,
          7.00000000e+00],
       [  7.00000000e+00,   7.00000000e+00,   7.00000000e+00,
          7.00000000e+00],
       [  7.00000000e+00,   7.00000000e+00,   7.00000000e+00,
          7.00000000e+00],
       [  1.05238423e+00,   0.00000000e+00,   0.00000000e+00,
          8.63970259e-04],
       [  1.33100344e-01,   0.00000000e+00,   6.72334362e-01,
          6.10437550e-01]])

### Fancy Indexing

Fancy indexing is a term adopted by NumPy to describe indexing using integer
arrays.

In [137]:
arr = np.empty((8, 4))

In [138]:
# fill each row with its own row index
for i, row in enumerate(arr):
    row[:] = i

arr

array([[ 0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.],
       [ 2.,  2.,  2.,  2.],
       [ 3.,  3.,  3.,  3.],
       [ 4.,  4.,  4.,  4.],
       [ 5.,  5.,  5.,  5.],
       [ 6.,  6.,  6.,  6.],
       [ 7.,  7.,  7.,  7.]])

In [140]:
# pass a list or ndarray of integers to select a subset of rows in exactly
# the order given
arr[[4, 3, 0, 6]]

array([[ 4.,  4.,  4.,  4.],
       [ 3.,  3.,  3.,  3.],
       [ 0.,  0.,  0.,  0.],
       [ 6.,  6.,  6.,  6.]])

In [142]:
# negative indices select rows counting back from the end
arr[[-3, -5, -7]]

array([[ 5.,  5.,  5.,  5.],
       [ 3.,  3.,  3.,  3.],
       [ 1.,  1.,  1.,  1.]])

In [143]:
# the integers 0..31 laid out row by row in an 8 x 4 grid
arr = np.arange(32).reshape(8, 4)

In [144]:
# Two index lists are paired up element by element: this selects the items
# at (1, 0), (5, 3), (7, 1) and (2, 2) and returns them as a 1-D array
arr[[1, 5, 7, 2], [0, 3, 1, 2]]

array([ 4, 23, 29, 10])

Fancy indexing, unlike slicing, always copies the data into a new array.

### Transposing Arrays and Swapping Axes