In [1]:
def my_sq(x):
    """Return ``x`` raised to the second power.

    Anything that supports the ``**`` operator is accepted, so this works
    for plain numbers as well as for a whole pandas Series (element-wise).
    """
    return pow(x, 2)

In [2]:
# Quick sanity check of my_sq on a scalar: 2 squared is 4.
my_sq(2)

4

In [21]:
# Deliberately false (my_sq(2) is 4, not 3): shows that a failing assert raises AssertionError.
assert my_sq(2) == 3

AssertionError: 

In [22]:
# A true assertion passes silently and produces no output.
assert my_sq(2) == 4

In [5]:
import pandas as pd

In [7]:
# Build a small demo DataFrame: two integer columns, 'a' and 'b', three rows each.
df = pd.DataFrame(dict(a=[10, 20, 30], b=[20, 30, 40]))

In [8]:
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [9]:
# ** is applied element-wise to the whole column; no explicit loop is needed.
df['a'] ** 2

0    100
1    400
2    900
Name: a, dtype: int64

In [10]:
# Pass the function object itself (no parentheses): apply calls it once per value of the column.
df['a'].apply(my_sq)

0    100
1    400
2    900
Name: a, dtype: int64

In [11]:
# Calling my_sq on the whole Series works here only because ** operates element-wise on a Series.
# A function that must handle one value at a time (e.g. regex processing) would not work this way
# and would need .apply instead.
my_sq(df['a'])

0    100
1    400
2    900
Name: a, dtype: int64

In [12]:
def my_exp(x, e):
    """Return ``x`` raised to the power ``e``.

    Parameters
    ----------
    x : base; anything supporting ``**`` (a number or a pandas Series).
    e : exponent.
    """
    return pow(x, e)

In [14]:
# Extra keyword arguments given to apply (here e=4) are forwarded to the function on every call.
df['a'].apply(my_exp, e=4)

0     10000
1    160000
2    810000
Name: a, dtype: int64

In [15]:
def print_me(x):
    """Print ``x`` to standard output and return ``None``.

    Used below to reveal what ``apply`` actually hands to the function.
    """
    print(x)
    return None

In [16]:
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [17]:
# On a DataFrame, apply hands each column to the function as a whole Series (see the printed output).
# print_me returns None, which is why the result holds None for both 'a' and 'b'.
df.apply(print_me)

0    10
1    20
2    30
Name: a, dtype: int64
0    20
1    30
2    40
Name: b, dtype: int64


a    None
b    None
dtype: object

In [18]:
# axis='columns' hands each ROW to the function as a Series (one call per row).
# You almost never want to do this.
df.apply(print_me, axis='columns')

a    10
b    20
Name: 0, dtype: int64
a    20
b    30
Name: 1, dtype: int64
a    30
b    40
Name: 2, dtype: int64


0    None
1    None
2    None
dtype: object

In [19]:
def avg_3(x, y, z):
    """Return the arithmetic mean of the three values ``x``, ``y`` and ``z``."""
    total = x + y + z
    return total / 3

In [20]:
# Fails: apply passes each whole column as the single first argument (x),
# so y and z are never supplied and a TypeError is raised.
df.apply(avg_3)

TypeError: ("avg_3() missing 2 required positional arguments: 'y' and 'z'", 'occurred at index a')

In [23]:
# An anonymous function that adds 3 to its argument; here it is only created, never named or called.
lambda x: x + 3

<function __main__.<lambda>(x)>

In [24]:
import numpy as np

In [25]:
def avg_3_apply(col):
    """Return the mean of all values in ``col``, computed with ``np.mean``.

    Written to receive a whole column, which is how ``DataFrame.apply``
    calls its function.
    """
    column_mean = np.mean(col)
    return column_mean

In [26]:
# Each column arrives as one Series, so a function that takes a single column argument works.
df.apply(avg_3_apply)

a    20.0
b    30.0
dtype: float64

In [27]:
# np.mean can be passed to apply directly; the avg_3_apply wrapper gives the same result.
df.apply(np.mean)

a    20.0
b    30.0
dtype: float64

In [28]:
def avg_3(col):
    """Average the entries of ``col`` found under the keys 0, 1 and 2.

    Bad practice: the number of entries (three) and their keys are
    hard-coded, so this only works for a column shaped exactly like the
    demo data. This definition also replaces the earlier three-argument
    ``avg_3``.
    """
    x, y, z = col[0], col[1], col[2]
    return (x + y + z) / 3

In [29]:
# Works now because the redefined avg_3 takes one column and each column has entries 0, 1 and 2.
df.apply(avg_3)

a    20.0
b    30.0
dtype: float64

In [30]:
# Element-wise (vectorized) addition of the two columns; no apply needed.
df['a'] + df['b']

0    30
1    50
2    70
dtype: int64

In [31]:
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [32]:
def avg_2_mod(x, y):
    """Return the mean of two scalars, or NaN when ``x`` equals 20.

    Parameters
    ----------
    x, y : scalar numbers.

    Returns
    -------
    float
        ``np.nan`` if ``x == 20``, otherwise ``(x + y) / 2``.

    Notes
    -----
    Scalar-only: the ``if x == 20`` test needs a single truth value, so
    passing a pandas Series raises ``ValueError`` (ambiguous truth value).
    """
    if x == 20:
        # Use the canonical lowercase spelling: the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    return (x + y) / 2

In [33]:
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [34]:
# x == 20 triggers the special case, so the result is NaN regardless of y.
avg_2_mod(20, 100)

nan

In [36]:
# Ordinary case: plain mean of the two scalars.
avg_2_mod(5, 10)

7.5

In [37]:
# Fails: with Series inputs, `x == 20` is a whole boolean Series, and `if` cannot
# reduce it to a single True/False (ValueError: truth value is ambiguous).
avg_2_mod(df.a, df.b)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [39]:
# np.vectorize wraps the scalar function so that it is called element by element on
# array-like inputs (such as Series); the result comes back as a NumPy array.
# NOTE(review): the original remark said object inputs produce an object result — confirm against the np.vectorize docs.
avg_2_mod_vec = np.vectorize(avg_2_mod)

In [40]:
# The vectorized wrapper accepts the two columns; note the result is a NumPy array, not a Series.
avg_2_mod_vec(df.a, df.b)

array([15., nan, 35.])

In [41]:
@np.vectorize  # decorator form of: avg_2_mod = np.vectorize(avg_2_mod)
def avg_2_mod(x, y):
    """Element-wise mean of ``x`` and ``y``, with NaN wherever ``x`` equals 20.

    The body is written for scalars; the ``np.vectorize`` decorator calls it
    once per element pair, so array-like inputs are accepted and the result
    is returned as a NumPy array. Decorating rebinds the name ``avg_2_mod``
    to the vectorized wrapper.
    """
    if x == 20:
        # Use the canonical lowercase spelling: the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    return (x + y) / 2

In [42]:
import numba

In [43]:
@numba.vectorize  # numba's own vectorizer (not np.vectorize); applied to the scalar function below
def avg_2_mod_numba(x, y):
    """Element-wise mean of ``x`` and ``y``, with NaN wherever ``x`` equals 20.

    The body is written for scalars and wrapped by ``numba.vectorize``.
    As the cells below show, it must be given NumPy arrays (e.g.
    ``df.a.values``): passing a pandas Series raises ``ValueError`` because
    numba cannot determine a type for it.
    """
    if x == 20:
        # Use the canonical lowercase spelling: the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    return (x + y) / 2

In [44]:
# Fails: numba cannot determine a type for a pandas Series argument.
avg_2_mod_numba(df.a, df.b)

ValueError: [1mcannot determine Numba type of <class 'pandas.core.series.Series'>[0m

In [45]:
# Passing the underlying NumPy arrays (.values) instead of the Series works.
avg_2_mod_numba(df.a.values, df.b.values)

array([15., nan, 35.])

In [49]:
%%timeit
# Times the np.vectorize-decorated avg_2_mod. NOTE: it is given pandas Series here, while the
# numba timing cell passes .values arrays, so the two timings are not a like-for-like comparison.
avg_2_mod(df.a, df.b)

107 µs ± 2.38 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [50]:
%%timeit
# Times the numba-vectorized version on the raw NumPy arrays (.values), not on the Series.
avg_2_mod_numba(df.a.values, df.b.values)

8.21 µs ± 70.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
