In [2]:
# empty function
def my_func(x):
    """Placeholder function: accepts one argument, does nothing, returns None."""
    return None

In [3]:
# defining a function that gives the square of an input value
def sq_func(x):
    """Return ``x`` raised to the power of two.

    Works for any object that supports the power operator.
    """
    return pow(x, 2)

In [4]:
# testing the user defined function
sq_func(5)

25

In [5]:
# checking the function - if no error - good
assert sq_func(9) == 81

In [6]:
# checking the function - if error - not good
assert sq_func(9) == 80

AssertionError: 

In [7]:
# importing resources
import pandas as pd

In [9]:
# creating a dataframe
df = pd.DataFrame({
    'a': [10, 20, 30],
    'b': [30, 20, 10]
})

In [10]:
df

Unnamed: 0,a,b
0,10,30
1,20,20
2,30,10


In [11]:
# checking values of one column in a df
df['b']

0    30
1    20
2    10
Name: b, dtype: int64

In [12]:
# adding 2 to entire column values
df['b']+2

0    32
1    22
2    12
Name: b, dtype: int64

In [13]:
# applying the user defined function to entire column
df['b'].apply(sq_func)

0    900
1    400
2    100
Name: b, dtype: int64

In [14]:
# defining another user-defined function
def test_func(a, b):
    return a**b

In [15]:
# checking what is the given name (in this case it is a function, as shown by the output)
test_func

<function __main__.test_func(a, b)>

In [16]:
test_func(2, 6)

64

In [17]:
df

Unnamed: 0,a,b
0,10,30
1,20,20
2,30,10


In [18]:
# applying the second user defined function.
# a is the value in the column, and b is given by us
df['a'].apply(test_func, b=2)

0    100
1    400
2    900
Name: a, dtype: int64

In [19]:
# applying the function with a^3
df['a'].apply(test_func, b=3)

0     1000
1     8000
2    27000
Name: a, dtype: int64

In [20]:
# another user defined function
def print_me(x):
    """Print ``x`` to standard output; the function itself returns None."""
    print(x)
    return None

In [21]:
# testing the function
print_me(10)

10


In [22]:
# applying the function to see how it prints the df
# it goes column by column
df.apply(print_me)

0    10
1    20
2    30
Name: a, dtype: int64
0    30
1    20
2    10
Name: b, dtype: int64


a    None
b    None
dtype: object

In [23]:
# user-defined function
def avg_3(x, y, z):
    """Return the arithmetic mean of the three arguments."""
    total = x + y + z
    return total / 3

In [24]:
avg_3(1, 2, 3)

2.0

In [25]:
# cannot apply like this: apply() passes each column of the df as the first argument (x) only
# values for y and z still have to be provided
df.apply(avg_3)

TypeError: avg_3() missing 2 required positional arguments: 'y' and 'z'

In [26]:
# apply() passes each column of the df as the first argument (x)
# The remaining arguments (y and z) are provided here as keyword arguments
df.apply(avg_3, y=2, z=3)

Unnamed: 0,a,b
0,5.0,11.666667
1,8.333333,8.333333
2,11.666667,5.0


In [27]:
import numpy as np

In [28]:
# a user-defined function that will give us mean of the whole column
def avg_3_apply(col):
    """Return the mean of all values in ``col``, as computed by ``np.mean``.

    NOTE(review): a later cell defines another function with this same name,
    which replaces this definition once that cell has been run.
    """
    column_mean = np.mean(col)
    return column_mean

In [29]:
df

Unnamed: 0,a,b
0,10,30
1,20,20
2,30,10


In [30]:
# applying the function to get the mean of the column
df.apply(avg_3_apply)

a    20.0
b    20.0
dtype: float64

In [33]:
# df.apply passes each column to the function, one column at a time
# this user-defined function averages the 1st, 2nd and 3rd row values of a column
# rows are taken by position (the 1st row is position 0), not by index label
def avg_3_apply(col):
    """Return the mean of the first three values of ``col``, taken by position.

    Parameters
    ----------
    col : a pandas Series (as handed over by ``DataFrame.apply``) or any
        sequence that supports integer indexing, e.g. a list or NumPy array.

    Returns
    -------
    The sum of the first three values divided by 3.

    Raises
    ------
    IndexError
        If ``col`` holds fewer than three values.
    """
    # Plain ``col[0]`` on a Series is a *label* lookup, so it fails when the
    # index is not 0, 1, 2, ... (e.g. after filtering rows or with string
    # labels). ``.iloc`` always selects by position.
    rows = col.iloc if hasattr(col, "iloc") else col
    x = rows[0]
    y = rows[1]
    z = rows[2]
    return (x + y + z) / 3

In [34]:
# applying the function
df.apply(avg_3_apply)

a    20.0
b    20.0
dtype: float64

In [35]:
# new function defined - with a conditional sentence (if)
def test2(x, y):
    if (x == 20):
        return np.NaN
    else:
        return (x+y)/2

In [36]:
# function will not work like this as two values should be provided for x and y
df.apply(test2)

TypeError: test2() missing 1 required positional argument: 'y'

In [37]:
# function will not work like this as the function needs to be vectorized first
test2(df['a'], df['b'])

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [38]:
test2_vec = np.vectorize(test2)

In [39]:
# function will not work like this as two values should be provided for x and y
df.apply(test2_vec)

TypeError: test2() missing 1 required positional argument: 'y'

In [40]:
# now we will get the output
# a vectorized function will always give us an array as output when applied
test2_vec(df['a'], df['b'])

array([20., nan, 20.])

In [41]:
# we can vectorize right when we define the function by putting @np.vectorize on the line before the function
@np.vectorize
def avg_2(x, y):
    """Element-wise mean of ``x`` and ``y``, with NaN wherever ``x`` equals 20.

    The ``np.vectorize`` decorator calls this scalar function once per pair
    of elements and collects the results in a NumPy array.

    Parameters
    ----------
    x, y : scalars or array-likes of numbers.

    Returns
    -------
    NumPy array holding ``np.nan`` where ``x == 20`` and ``(x + y) / 2``
    everywhere else.
    """
    # ``np.NaN`` was removed in NumPy 2.0; ``np.nan`` works on every version.
    if x == 20:
        return np.nan
    return (x + y) / 2

In [42]:
avg_2(5,4)

array(4.5)

In [43]:
# initially vectorized function gives us same output
avg_2(df['a'], df['b'])

array([20., nan, 20.])

In [44]:
# importing resources
import numba

In [45]:
# can use numba to vectorize, but the columns have to be converted to values first
@numba.vectorize
def avg_2_numba(x, y):
    """Element-wise mean of ``x`` and ``y`` via numba, NaN wherever ``x`` equals 20.

    Parameters
    ----------
    x, y : scalars or NumPy arrays of numbers. pandas Series are not
        accepted (numba cannot determine their type), so pass
        ``series.values`` instead.

    Returns
    -------
    ``np.nan`` where ``x == 20`` and ``(x + y) / 2`` everywhere else.
    """
    # ``np.NaN`` was removed in NumPy 2.0; ``np.nan`` works on every version.
    if x == 20:
        return np.nan
    return (x + y) / 2

In [46]:
# the function will not work like this as the columns have to be
# converted to NumPy arrays using the .values attribute
avg_2_numba(df['a'], df['b'])

ValueError: [1mcannot determine Numba type of <class 'pandas.core.series.Series'>[0m

In [47]:
avg_2_numba(df['a'].values, df['b'].values)

array([20., nan, 20.])

In [51]:
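# comparing speed: on the two columns the numba version is faster than the np.vectorize version
# note: the test2(2, 3) timing further down is a single scalar call, so it is not comparable to the other two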

%%timeit
avg_2_numba(df['a'].values, df['b'].values)

5.46 µs ± 145 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [52]:
%%timeit
avg_2(df['a'], df['b'])

53.7 µs ± 542 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [53]:
%%timeit
test2(2,3)

182 ns ± 4.81 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [55]:
# exercise using the split function
# NOTE(review): absolute, machine-specific path - this cell only runs where this exact file exists;
# consider a path relative to the notebook or a configurable data directory
tb3 = pd.read_csv('C:/Users/jaska/Desktop/tutorial/data/table3.csv')

In [56]:
tb3.head()

Unnamed: 0,country,year,rate
0,Afghanistan,1999,745/19987071
1,Afghanistan,2000,2666/20595360
2,Brazil,1999,37737/172006362
3,Brazil,2000,80488/174504898
4,China,1999,212258/1272915272


In [57]:
# defining function to split the values in a column based on a 'splitter'
def split_func(x):
    """Split every string in ``x`` on '/' and return the pieces as separate columns.

    ``x`` is expected to offer the pandas ``.str`` accessor (e.g. a Series of
    strings); ``expand=True`` yields one output column per piece.
    """
    string_ops = x.str
    pieces = string_ops.split('/', expand=True)
    return pieces

In [58]:
# applying the function to a column of the df
split_func(tb3['rate'])

Unnamed: 0,0,1
0,745,19987071
1,2666,20595360
2,37737,172006362
3,80488,174504898
4,212258,1272915272
5,213766,1280428583


In [59]:
# assigning a variable name to the output of the function
tb3_split = split_func(tb3['rate'])

In [60]:
# adding column to the existing df (tb3) 
# and assigning it the second column of the output (indexed as [1])
tb3['population'] = tb3_split[1]

In [61]:
tb3.head()

Unnamed: 0,country,year,rate,population
0,Afghanistan,1999,745/19987071,19987071
1,Afghanistan,2000,2666/20595360,20595360
2,Brazil,1999,37737/172006362,172006362
3,Brazil,2000,80488/174504898,174504898
4,China,1999,212258/1272915272,1272915272


In [62]:
# dropping the rate column (drop returns a new dataframe; tb3 itself is unchanged because the result is not assigned)
tb3.drop('rate', axis = 'columns')

Unnamed: 0,country,year,population
0,Afghanistan,1999,19987071
1,Afghanistan,2000,20595360
2,Brazil,1999,172006362
3,Brazil,2000,174504898
4,China,1999,1272915272
5,China,2000,1280428583


In [63]:
# another way to do the same thing
# here we split a column and extract the second column from the resulting output ([1])
def another_way(x):
    """Split the string ``x`` on '/' and return the second piece (index 1) as a string."""
    return x.split('/')[1]

In [64]:
# reading the same CSV again, so tb3 starts over without the 'population' column added earlier
# NOTE(review): absolute, machine-specific path - consider a relative or configurable location
tb3 = pd.read_csv('C:/Users/jaska/Desktop/tutorial/data/table3.csv')

In [65]:
tb3['rate'].apply(another_way)

0      19987071
1      20595360
2     172006362
3     174504898
4    1272915272
5    1280428583
Name: rate, dtype: object

In [66]:
pops = tb3['rate'].apply(another_way)

In [67]:
tb3['pop2'] = pops

In [68]:
tb3.head()

Unnamed: 0,country,year,rate,pop2
0,Afghanistan,1999,745/19987071,19987071
1,Afghanistan,2000,2666/20595360,20595360
2,Brazil,1999,37737/172006362,172006362
3,Brazil,2000,80488/174504898,174504898
4,China,1999,212258/1272915272,1272915272


In [69]:
# one more way to do the same thing
# similar to last function 
# but we define the 'splitter' and the column to extract in the first line of the function itself
def yet_another_way(x, delim='/', position=1):
    """Split ``x`` on ``delim`` and return the piece at ``position`` as an int.

    With the defaults this yields the number after the first '/'.
    """
    pieces = x.split(delim)
    return int(pieces[position])

In [70]:
# adding the second part of the split column (population)
# to the existing df as a new column in a single line
tb3['pop3'] = tb3['rate'].apply(yet_another_way)

In [71]:
tb3.head()

Unnamed: 0,country,year,rate,pop2,pop3
0,Afghanistan,1999,745/19987071,19987071,19987071
1,Afghanistan,2000,2666/20595360,20595360,20595360
2,Brazil,1999,37737/172006362,172006362,172006362
3,Brazil,2000,80488/174504898,174504898,174504898
4,China,1999,212258/1272915272,1272915272,1272915272
