In [22]:
# initialization
import pandas as pd
from IPython.display import Image
from IPython.core.display import HTML

## the innovators

In [1]:
%%html
<iframe src="https://en.wikipedia.org/wiki/Travis_Oliphant" width="1000" height="500"></iframe>

In [2]:
%%html
<iframe src="https://en.wikipedia.org/wiki/Wes_McKinney" width="1000" height="500"></iframe>

## vectorized operations

### “It’s hardware that makes a machine fast. It’s software that makes a fast machine slow.”
### — Craig Bruce

In [28]:
%%html
<iframe src="https://en.wikipedia.org/wiki/Processor_register#Vector_registers" width="1000" height="500"></iframe>

In [37]:
%%html
<iframe src="https://en.wikipedia.org/wiki/Single_instruction%2C_multiple_data" width="1000" height="500"></iframe>

In [31]:
%%html
<iframe src="https://numpy.org/doc/stable/reference/ufuncs.html#available-ufuncs" width="1000" height="500"></iframe>

In [None]:
# numpy underlies pandas, and because numpy implements vectorized operations,
# pandas can make use of them as well.
# conceived of as a matrix, a tabular dataset is made of row and column vectors.
# because each column vector is made up of values of the same datatype,
# we can apply vectorized operations to it.
# this allows us to write a single statement that works on all the data of a single column.
# and the hardware parallelism of SIMD registers,
# which vectorized operations make use of, greatly speeds up such an operation.

In [38]:
testList = list(range(100000))

In [40]:
testList[:10]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [41]:
# multiplying a list by an integer is legal, but it repeats the list
# (the result is twice as long); it does not double each element,
# which is why the first ten values shown below are unchanged.
testList = 2 * testList # not illegal, but...
testList[:10] 

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [42]:
testSeries = pd.Series(range(100000))

In [43]:
testSeries[:10]

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [44]:
# multiplying a Series by a scalar, by contrast, multiplies every element
testSeries = 2 * testSeries
testSeries[:10]

0     0
1     2
2     4
3     6
4     8
5    10
6    12
7    14
8    16
9    18
dtype: int64

In [45]:
# a list has no elementwise arithmetic, so the equivalent of multiplying
# a vector by a scalar means walking the list with an explicit for-loop
# and overwriting each slot with its doubled value
for position, value in enumerate(testList):
    testList[position] = value * 2
testList[:10]

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

### the difference in using vectorized operations lies in execution time...

In [50]:
%%timeit
# note: the list is rebuilt inside the timed body, so each measurement
# covers constructing the list as well as the element-by-element doubling.
testList = list(range(100000))
for index in range(len(testList)):
    testList[index] = testList[index] * 2

17.7 ms ± 4.94 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [49]:
%%timeit
# note: the Series is constructed inside the timed body, just as the list is
# in the list timing cell, so both measurements cover construction plus doubling.
testSeries = pd.Series(range(100000))
testSeries = testSeries * 2

440 µs ± 830 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## boolean selection

In [15]:
# boolean selection
pd.Series(range(10, 21))

0     10
1     11
2     12
3     13
4     14
5     15
6     16
7     17
8     18
9     19
10    20
dtype: int64

In [16]:
s = pd.Series(range(10, 21))
# the comparison is evaluated for every element and gives back a boolean Series
# with the same index as s
s > 15 # which rows have values greater than 15?

0     False
1     False
2     False
3     False
4     False
5     False
6      True
7      True
8      True
9      True
10     True
dtype: bool

In [19]:
# select rows in which values are greater than 15:
# indexing s with the boolean Series keeps only the rows marked True
logicalResults = s > 15
s[logicalResults]

6     16
7     17
8     18
9     19
10    20
dtype: int64

In [18]:
# we can combine conditions with the elementwise logical operators (here & for "and").
# each comparison needs its own parentheses because & binds more tightly than > and <.
s[(s > 15) & (s < 18)]

6    16
7    17
dtype: int64

In [20]:
# are all items >= 9?
(s >= 9).all()

True

In [10]:
pd.Series.all?

## fancy indexing

In [58]:
# initialization
import numpy as np

array([ 1, 63, 59, 20, 32, 75, 57, 21, 88, 48])

In [60]:
# the next statement creates a random number generator with a fixed seed,
# so a freshly created generator always produces the same sequence of draws.
# this is useful when you need reproducible output.
# pass a different seed for different numbers, or no seed at all for draws that vary from run to run.
rand = np.random.RandomState(42) # life, the universe, and everything

# select 10 ints randomly from the integers 0 through 99 (the upper bound 100 is excluded),
# put them into an ndarray, and make the ndarray the value of a variable named 'x'
x = rand.randint(100, size=10)
x # show x

array([61, 50, 54, 63,  2, 50,  6, 20, 72, 38])

In [61]:
# we can extract values from x by indices and put them into a list...
[x[3], x[7], x[2]]

[63, 20, 54]

In [62]:
# ...or we can pass a list (or array) of indices into x to extract the values we want;
# the result comes back as an ndarray rather than a list:
ind = [3, 7, 4]
x[ind]

array([63, 20,  2])

In [63]:
# this is fancy indexing.
# note that the shape of the result is determined by the shape of the index array...
ind = np.array([[3, 7],
                [4, 5]])
x[ind]

array([[63, 20],
       [ 2, 50]])

In [64]:
# ...and not by the shape of the ndarray that the values are taken from.
# for the next demonstration, let's create a matrix:
X = np.arange(12).reshape((3, 4))
X

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [65]:
# and now let's create a fancy index by row and column of the values we want to extract.
# the two index arrays are paired element by element, so this picks
# X[0, 2], X[1, 1] and X[2, 3] and returns them as a one-dimensional array:
row = np.array([0, 1, 2])
col = np.array([2, 1, 3])
X[row, col]

array([ 2,  5, 11])