Write a function and apply it to every row

In [1]:
import pandas as pd

In [14]:
def avg_func(x, y):
    """Return the arithmetic mean of ``x`` and ``y``.

    True division is used, so two ints still produce a float
    (``avg_func(10, 20)`` gives ``15.0``).
    """
    total = x + y
    return total / 2

In [15]:
# Smoke test on plain scalars: the mean of 10 and 20 should come back as 15.0.
avg_func(10, 20)

15.0

Use `assert` for quick tests. If the condition is true, the cell produces no output;
if the condition is false, it raises an `AssertionError`.

In [19]:
# Passes silently when the check holds; raises AssertionError otherwise.
assert avg_func(10, 20) == 15

Manually create a DataFrame

In [4]:
# Each dict key becomes a column label; each list supplies that column's
# values, one per row.
df = pd.DataFrame(data={'a': [10, 20, 30], 'b': [20, 30, 40]})

In [46]:
# Display the frame to confirm it has columns a and b with three rows.
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [40]:
# Broadcasting: the scalar exponent is applied element-wise to every row of
# column 'a', with no explicit loop.
df['a'] ** 2

0    100
1    400
2    900
Name: a, dtype: int64

In [50]:
def sqr_func(x):
    """Return ``x`` raised to the second power."""
    squared = pow(x, 2)
    return squared

In [51]:
# Check the function on a plain scalar before applying it to a column.
sqr_func(4)

16

In [52]:
# Series.apply calls sqr_func on each element of 'a'; the result matches df['a'] ** 2.
df['a'].apply(sqr_func)

0    100
1    400
2    900
Name: a, dtype: int64

In [53]:
def my_exp(x, y):
    """Return ``x`` raised to the power ``y``."""
    return pow(x, y)

In [54]:
# Scalar check: 2 raised to the power 2 should give 4.
my_exp(2, 2)

4

In [57]:
# Each element of 'a' is passed as x; the extra keyword argument y=4 is
# forwarded to my_exp on every call.
df['a'].apply(my_exp, y=4)

0     10000
1    160000
2    810000
Name: a, dtype: int64

In [59]:
def print_me(x):
    """Print ``x`` and implicitly return ``None``."""
    print(x)

In [60]:
# DataFrame.apply works column by column by default: the whole Series 'a' is
# passed to x in one call, then the whole Series 'b' -- not 10, then 20, then 30.
# print_me returns None, so the result is a Series of None, one entry per column.
df.apply(print_me)

0    10
1    20
2    30
Name: a, dtype: int64
0    20
1    30
2    40
Name: b, dtype: int64


a    None
b    None
dtype: object

In [61]:
def avg_3(x, y, z):
    """Return the arithmetic mean of the three values ``x``, ``y`` and ``z``."""
    total = x + y + z
    return total / 3

In [62]:
# Raises TypeError: apply passes each column as a single positional argument
# (bound to x), so avg_3 never receives values for y and z. The number of
# columns in df is irrelevant -- columns are not spread across parameters.
df.apply(avg_3)

TypeError: avg_3() missing 2 required positional arguments: 'y' and 'z'

In [2]:
import numpy as np

In [64]:
def avg_3_apply(col):
    """Return the mean of all values in ``col`` (for example, one DataFrame column)."""
    col_mean = np.mean(col)
    return col_mean

In [65]:
# This works because avg_3_apply accepts exactly one argument -- the column
# Series that apply passes in -- and np.mean reduces it to a single number.
df.apply(avg_3_apply)

a    20.0
b    30.0
dtype: float64

In [66]:
# The built-in Series.mean() gives the same answer for column 'a' without
# writing a custom function.
df['a'].mean()

20.0

In [67]:
# Arithmetic between two columns is element-wise: the values in each row are added.
df['a'] + df['b']

0    30
1    50
2    70
dtype: int64

In [70]:
# Display the frame again for reference.
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [69]:
# Takes one value from column a (x) and one from column b (y).
def avg_2_mod(x, y):
    """Return the mean of two scalars, or NaN when ``x`` equals 20.

    Written for scalars only: the ``if`` needs a single True/False, so
    passing whole Series raises ValueError unless the function is wrapped
    (for example with ``np.vectorize``).
    """
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    return (x + y) / 2

In [72]:
# Raises ValueError: with Series inputs, `x == 20` produces a boolean Series,
# and `if` cannot collapse that into a single True/False. The function only
# handles scalars, so it must be vectorized before it can take whole columns.
avg_2_mod(df['a'], df['b'])

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [73]:
# Fix for the error above: np.vectorize wraps the scalar function so it is
# called once per pair of elements. This is a convenience, not a speed-up.
avg_2_mod_vec = np.vectorize(avg_2_mod)  # bind the wrapper to a name so it can be called later

In [5]:
# Display the frame so the inputs to the vectorized call are visible.
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [74]:
# The vectorized wrapper accepts whole columns and returns a NumPy array;
# the row where a == 20 comes back as nan.
avg_2_mod_vec(df['a'], df['b'])

array([15., nan, 35.])

In [11]:
# Decorator form: vectorize at definition time instead of wrapping the
# function and assigning the result afterwards.
@np.vectorize
def avg_2_vect(x, y):
    """Return the mean of ``x`` and ``y`` element by element, or NaN where ``x`` equals 20.

    The body is written for scalars; the ``np.vectorize`` decorator lets it
    accept array-like inputs such as two DataFrame columns.
    """
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    return (x + y) / 2

In [12]:
# Same result as avg_2_mod_vec: an array with nan in the row where a == 20.
avg_2_vect(df['a'], df['b'])

array([15., nan, 35.])

In [14]:
#time code using %%timeit
%%timeit
avg_2_mod(df['a'], df['b'])

30 µs ± 2.09 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [15]:
%%timeit
# Time the decorator-based vectorized function on the same two columns.
avg_2_vect(df['a'], df['b'])

28.7 µs ± 1.49 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


# Exercise

In [17]:
# Load the exercise data (the path is relative to the working directory) and display it.
tbl3 = pd.read_csv('table3.csv')
tbl3

Unnamed: 0,country,year,rate
0,Afghanistan,1999,745/19987071
1,Afghanistan,2000,2666/20595360
2,Brazil,1999,37737/172006362
3,Brazil,2000,80488/174504898
4,China,1999,212258/1272915272
5,China,2000,213766/1280428583


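Write a function that takes a `rate` value (a string such as `745/19987071`) and parses out the total population, i.e. the number after the slash. Then assign the extracted population to a new column of the DataFrame.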


In [18]:
# rate is an object (string) column, so the population has to be parsed out of the text.
tbl3.dtypes

country    object
year        int64
rate       object
dtype: object

In [26]:
def extract_pop(rate, delim='/', position=1):
    """Split ``rate`` on ``delim`` and return the piece at ``position`` as an int.

    With the defaults, a string like ``'745/19987071'`` yields the part after
    the slash (``19987071``).
    """
    pieces = rate.split(delim)
    return int(pieces[position])

In [27]:
# Test: with the default position (1) the part after the slash is returned as an int.
assert extract_pop('123/456') == 456

In [34]:
# Preview the parsed values before storing them; the result is an int64 Series.
tbl3['rate'].apply(extract_pop)

0      19987071
1      20595360
2     172006362
3     174504898
4    1272915272
5    1280428583
Name: rate, dtype: int64

In [35]:
# Store the parsed population as a new column on tbl3, then display the updated frame.
tbl3['population'] = tbl3['rate'].apply(extract_pop)
tbl3

Unnamed: 0,country,year,rate,population
0,Afghanistan,1999,745/19987071,19987071
1,Afghanistan,2000,2666/20595360,20595360
2,Brazil,1999,37737/172006362,172006362
3,Brazil,2000,80488/174504898,174504898
4,China,1999,212258/1272915272,1272915272
5,China,2000,213766/1280428583,1280428583
