In [114]:
# Import dependencies
import pandas as pd
import numpy as np

In [115]:
# Universal Functions
# Computation on NumPy arrays can be very fast, or it can be very slow.
# The key to making it fast is to use vectorized operations, which are implemented
# through NumPy's universal functions (ufuncs).

# Python's default implementation, CPython, performs some operations very slowly.
# Other projects have attempted to address this, such as the PyPy project,
# Cython, and the Numba project.

In [116]:
# Timing Code
# NOTE(review): numpy is already imported in the first cell; this re-import is redundant.
import numpy as np
# Seeded generator so the random sample arrays drawn from it are reproducible.
rng = np.random.default_rng(seed=1000)

def compute_reciprocals(values):
    """Return the reciprocal of every element of ``values`` as a float array.

    This is deliberately an element-by-element Python loop: it serves as the
    slow baseline that is timed against the vectorized ``1.0 / values``.

    Parameters
    ----------
    values : sequence supporting ``len()`` and integer indexing
        The numbers to invert.

    Returns
    -------
    numpy.ndarray
        One-dimensional array of length ``len(values)`` holding ``1.0 / values[i]``.
    """
    count = len(values)
    reciprocals = np.empty(count)  # uninitialized; every slot is assigned below
    for position in range(count):
        reciprocals[position] = 1.0 / values[position]
    return reciprocals

# Five integers drawn from 1-9 (the upper bound 10 is exclusive), so no element is zero.
values = rng.integers(1, 10, size=5)
compute_reciprocals(values)

array([0.5       , 0.2       , 0.125     , 0.16666667, 0.125     ])

In [117]:
# This implementation probably feels fairly natural, but it is slow:
# time the Python loop on one million integers drawn from 1-99.
bigArray = rng.integers(1, 100, size=1_000_000)
%timeit compute_reciprocals(bigArray)

1.46 s ± 52.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [118]:
# Inspect the loop's result for the large array.
compute_reciprocals(bigArray)

array([0.0212766 , 0.04761905, 0.04761905, ..., 0.02173913, 0.01470588,
       0.1       ])

In [119]:
# Loop-based result for the small array, for comparison with the vectorized form.
compute_reciprocals(values)

array([0.5       , 0.2       , 0.125     , 0.16666667, 0.125     ])

In [120]:
# Vectorized equivalent: the division is applied to every element of the array at once.
print(1.0/values)

[0.5        0.2        0.125      0.16666667 0.125     ]


In [121]:
# Time the vectorized division on the same one-million-element array.
%timeit (1.0/bigArray)

3.33 ms ± 168 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [122]:
# Looking at the execution time for our big array, we see that the vectorized
# operation completes orders of magnitude faster than the Python loop.

# Vectorized operations in NumPy are implemented via ufuncs, whose main purpose
# is to quickly execute operations on values in NumPy arrays.

In [123]:
# Ufuncs also operate between two arrays, element by element
np.arange(5) / np.arange(1, 6)

array([0.        , 0.5       , 0.66666667, 0.75      , 0.8       ])

In [124]:
# N-dimensions
# Ufuncs are not limited to one dimension: here 2 is raised to each element of a 3x3 array
x = np.arange(9).reshape((3, 3))
2**x

array([[  1,   2,   4],
       [  8,  16,  32],
       [ 64, 128, 256]])

In [125]:
# Specifying output: `out=` makes the ufunc write its result into an existing array
x = np.arange(5)
y = np.empty(5)
np.multiply(x, 10, out=y)
print(y)

[ 0. 10. 20. 30. 40.]


In [126]:
# `out=` also accepts an array view: write 2**x into every other element of y
y = np.zeros(10)
np.power(2, x, out=y[::2])
y

array([ 1.,  0.,  2.,  0.,  4.,  0.,  8.,  0., 16.,  0.])

In [127]:
# Same values without `out=`: 2**x is first computed into a temporary array, then copied into y[::2]
y[::2] = 2**x

In [128]:
# Display y to confirm the plain assignment produced the same contents
y

array([ 1.,  0.,  2.,  0.,  4.,  0.,  8.,  0., 16.,  0.])

In [129]:
# Pandas inherits element-wise binary operations (such as addition, subtraction, multiplication, etc.)
# and unary operations (such as negation, trigonometry, exponentials, etc.) from NumPy.

# For unary operations, Pandas preserves index and column labels.
# For binary operations, Pandas will automatically align indices when passing
# the objects to the ufunc.

In [130]:
# Ufuncs: Index Preservation
# Re-create the generator with a new seed and build a Series of four random integers from 0-9.
rng = np.random.default_rng(42)
ser = pd.Series(rng.integers(0, 10, 4))
ser

0    0
1    7
2    6
3    4
dtype: int64

In [131]:
# 3x4 DataFrame of random integers from 0-9 with labelled columns
df = pd.DataFrame(rng.integers(0, 10, (3, 4)),
                  columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,4,8,0,6
1,2,0,5,9
2,7,7,7,7


In [132]:
# Applying a NumPy ufunc to a Series returns a Series with the index preserved
np.exp(ser)

0       1.000000
1    1096.633158
2     403.428793
3      54.598150
dtype: float64

In [133]:
# A slightly more involved calculation on a DataFrame: row and column labels are preserved
np.sin(df*np.pi / 4)

Unnamed: 0,A,B,C,D
0,1.224647e-16,-2.449294e-16,0.0,-1.0
1,1.0,0.0,-0.707107,0.707107
2,-0.7071068,-0.7071068,-0.707107,-0.707107


In [134]:
# Index alignment
# Pandas aligns both operands on their index labels while performing the operation,
# which is convenient when working with incomplete data.

# General Elections 1996: vote counts per district
votes = pd.Series({
    'Apac': 169_583,
    'Arua': 214_680,
    'Bundibugyo': 53_819,
    'Gulu': 129_200,
})

# Valid vote counts per district; 'Bushenyi' has no entry in `votes`
validVotes = pd.Series({
    'Apac': 159_332,
    'Arua': 201_446,
    'Bundibugyo': 52_463,
    'Bushenyi': 248_730,
    'Gulu': 123_134,
})

# Labels found in only one operand yield NaN in the result
invalidVotes = votes - validVotes
invalidVotes

Apac          10251.0
Arua          13234.0
Bundibugyo     1356.0
Bushenyi          NaN
Gulu           6066.0
dtype: float64

In [135]:
# Compute the invalid votes again, this time by calling the NumPy ufunc directly;
# index alignment happens here too, just as with `votes - validVotes`.
invalidVotes = np.subtract(votes, validVotes)
invalidVotes

Apac          10251.0
Arua          13234.0
Bundibugyo     1356.0
Bushenyi          NaN
Gulu           6066.0
dtype: float64

In [136]:
# The resulting Series contains the union of the indices of the two input Series.
# NaN marks any index label that is not present in both inputs.

In [137]:
# Timing operations
# This is essential to optimize operations
# READ MORE ABOUT: Optimization of operations with Numba

In [138]:
# Ufunc form
%timeit np.subtract(votes, validVotes)

288 µs ± 14.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [139]:
# Operator form
%timeit votes - validVotes

275 µs ± 16.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [140]:
# Pandas method form
%timeit votes.sub(validVotes,)

269 µs ± 21.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [157]:
# Index Alignment in DataFrames
# 1996 Elections: districts used as the row labels
index = ['Apac', 'Arua', 'Bundibugyo', 'Bushenyi']

# One column of vote counts per candidate, one row per district
candidates = pd.DataFrame(
    {
        'Mayanja': [2_013, 7_252, 322, 1_884],
        'Kaguta': [35_532, 34_769, 51_255, 240_307],
        'Semwogerere': [121_787, 159_425, 886, 6_539],
    },
    index=index,
)
candidates

Unnamed: 0,Mayanja,Kaguta,Semwogerere
Apac,2013,35532,121787
Arua,7252,34769,159425
Bundibugyo,322,51255,886
Bushenyi,1884,240307,6539


In [168]:
# Ufunc: Operations between DataFrames and Series
# Votes each candidate got above Mayanja's: subtract the 'Mayanja' column from
# every column, matching on the row labels
candidates.sub(candidates['Mayanja'], axis='index')

Unnamed: 0,Mayanja,Kaguta,Semwogerere
Apac,0,33519,119774
Arua,0,27517,152173
Bundibugyo,0,50933,564
Bushenyi,0,238423,4655


In [174]:
# Select the first row (Apac) by position; the result is a Series indexed by the column labels
candidates.iloc[0]

Mayanja          2013
Kaguta          35532
Semwogerere    121787
Name: Apac, dtype: int64

In [176]:
# Subtracting a row aligns on the column labels, so the Apac row is subtracted from every row
candidates - candidates.iloc[0]

Unnamed: 0,Mayanja,Kaguta,Semwogerere
Apac,0,0,0
Arua,5239,-763,37638
Bundibugyo,-1691,15723,-120901
Bushenyi,-129,204775,-115248


In [None]:
# These operations preserve indices and columns
# NOTE: Contrast with NumPy Arrays