# Article "What If .apply() to Your Entire Pandas Dataframe Is Too Slow?"
This notebook contains the code to reproduce the results in my Medium article,<br>
"What If .apply() to Your Entire Pandas Dataframe Is Too Slow?"

In [12]:
import pandas as pd
import numpy as np
import timeit

## Create initial toy dataset

In [4]:
# Toy dataset: three fruits, each with a category name, a radius and a sweetness value.
d = {
    'category': ['apple', 'pear', 'peach'],
    'radius': [3, 4, 2],
    'sweetness': [1, 2, 3],
}
df = pd.DataFrame(d)
df

Unnamed: 0,category,radius,sweetness
0,apple,3,1
1,pear,4,2
2,peach,2,3


In [6]:
# Save the initial toy table as tab1.csv so it can be embedded in the Medium article.
df.to_csv('tab1.csv')

## Add a new column 'diameter'
### Use .apply()

In [8]:
# .apply() calls the lambda once for each value in the 'radius' column;
# the doubled values are stored in a new 'diameter' column.
df['diameter'] = df['radius'].apply(lambda x: x*2)
df

Unnamed: 0,category,radius,sweetness,diameter
0,apple,3,1,6
1,pear,4,2,8
2,peach,2,3,4


In [16]:
# Timing: measure the .apply() version of the diameter calculation.
# The code is passed to timeit as strings, so the setup string rebuilds the
# toy DataFrame itself instead of relying on the notebook's df.
setup_code = """
import pandas as pd
d = {'category': ['apple', 'pear', 'peach'], 'radius': [3, 4, 2], 'sweetness': [1, 2, 3]}
df = pd.DataFrame(data=d)
"""

# Statement being measured (the result is not assigned back to df).
mycode = '''
df['radius'].apply(lambda x: x*2)
'''
 
# timeit statement: t1 is the total time in seconds for all 10000 executions
t1 = timeit.timeit(setup=setup_code,
                     stmt = mycode,
                     number = 10000)
print(f"10000 runs of mycode is {t1}")

10000 runs of For Loop is 0.5491451249999955


In [9]:
# Save the table with the 'diameter' column as tab2.csv for the Medium article.
df.to_csv('tab2.csv')

### Not using .apply()

In [10]:
# Same 'diameter' column without .apply(): the whole 'radius' Series is
# multiplied by 2 in a single expression.
df['diameter'] = df['radius']*2
df

Unnamed: 0,category,radius,sweetness,diameter
0,apple,3,1,6
1,pear,4,2,8
2,peach,2,3,4


In [11]:
# Save the table produced without .apply() as tab3.csv for the Medium article.
df.to_csv('tab3.csv')

In [17]:
# Timing: measure the plain Series multiplication (no .apply()).
# The setup string rebuilds the toy DataFrame so the timed statement has its own df.
setup_code = """
import pandas as pd
d = {'category': ['apple', 'pear', 'peach'], 'radius': [3, 4, 2], 'sweetness': [1, 2, 3]}
df = pd.DataFrame(data=d)
"""

# Statement being measured (the result is not assigned back to df).
mycode = '''
df['radius']*2
'''
 
# timeit statement: t1 is the total time in seconds for all 10000 executions
t1 = timeit.timeit(setup=setup_code,
                     stmt = mycode,
                     number = 10000)
print(f"10000 runs of mycode is {t1}")

10000 runs of For Loop is 0.32115258299972993


## Compare and select larger value
Compare each 'radius' value to 3 and keep the larger of the two.

In [18]:
# generate error on purpose: the builtin max() cannot compare a whole Series
# with a scalar, so pandas raises ValueError ("truth value of a Series is
# ambiguous"). The exception is caught and printed so the message is still
# shown while "Restart & Run All" can continue past this cell.
try:
    max(df['radius'],3)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

### Use .apply()

In [24]:
# The builtin max() works on one scalar at a time, so .apply() feeds it each
# 'radius' value separately; the larger of (value, 3) goes into 'radius_or_3'.
df['radius_or_3'] = df['radius'].apply(lambda x: max(x,3))
df

Unnamed: 0,category,radius,sweetness,diameter,radius_or_3
0,apple,3,1,6,3
1,pear,4,2,8,4
2,peach,2,3,4,3


In [25]:
# Save the table with the 'radius_or_3' column as tab4.csv for the Medium article.
df.to_csv('tab4.csv')

In [26]:
# Timing: measure the .apply() + builtin max() version of the comparison.
# The setup string rebuilds the toy DataFrame so the timed statement has its own df.
setup_code = """
import pandas as pd
d = {'category': ['apple', 'pear', 'peach'], 'radius': [3, 4, 2], 'sweetness': [1, 2, 3]}
df = pd.DataFrame(data=d)
"""

# Statement being measured (the result is not assigned back to df).
mycode = '''
df['radius'].apply(lambda x: max(x,3))
'''
 
# timeit statement: t1 is the total time in seconds for all 10000 executions
t1 = timeit.timeit(setup=setup_code,
                     stmt = mycode,
                     number = 10000)
print(f"10000 runs of mycode is {t1}")

10000 runs of mycode is 0.5552678340000057


### Not using .apply()

In [19]:
# Same 'radius_or_3' column without .apply(): np.maximum compares the whole
# 'radius' Series against 3 element-wise in one call.
df['radius_or_3'] = np.maximum(df['radius'],3)
df

Unnamed: 0,category,radius,sweetness,diameter,radius_or_3
0,apple,3,1,6,3
1,pear,4,2,8,4
2,peach,2,3,4,3


In [20]:
# output to csv for Medium
# NOTE(review): 'tab4.csv' is also the file name written after the .apply()
# version of this column, so this call overwrites that file — confirm the
# name is intended.
df.to_csv('tab4.csv')

In [22]:
# Timing: measure the np.maximum version of the comparison.
# numpy is imported inside the setup string because the timed statement
# refers to np.
setup_code = """
import pandas as pd
import numpy as np
d = {'category': ['apple', 'pear', 'peach'], 'radius': [3, 4, 2], 'sweetness': [1, 2, 3]}
df = pd.DataFrame(data=d)
"""

# Statement being measured (the result is not assigned back to df).
mycode = '''
np.maximum(df['radius'],3)
'''
 
# timeit statement: t1 is the total time in seconds for all 10000 executions
t1 = timeit.timeit(setup=setup_code,
                     stmt = mycode,
                     number = 10000)
print(f"10000 runs of mycode is {t1}")

10000 runs of mycode is 0.3092745419999119


## Add a new column of lists
### Use .apply()


In [28]:
# axis=1 makes .apply() pass one row at a time to the lambda, which builds
# the list of integers from radius_or_3 (inclusive) up to diameter (exclusive).
df['sizes'] = df.apply(lambda x: list(range(x.radius_or_3,x.diameter)), axis=1)
df

Unnamed: 0,category,radius,sweetness,diameter,radius_or_3,sizes
0,apple,3,1,6,3,"[3, 4, 5]"
1,pear,4,2,8,4,"[4, 5, 6, 7]"
2,peach,2,3,4,3,[3]


In [29]:
# Save the table with the list-valued 'sizes' column as tab5.csv for the Medium article.
df.to_csv('tab5.csv')

In [33]:
# Timing: measure the row-wise .apply() version that builds the 'sizes' lists.
# The setup string also recreates the 'diameter' and 'radius_or_3' columns,
# because the timed lambda reads both of them.
setup_code = """
import pandas as pd
import numpy as np
d = {'category': ['apple', 'pear', 'peach'], 'radius': [3, 4, 2], 'sweetness': [1, 2, 3]}
df = pd.DataFrame(data=d)
df['diameter'] = df['radius']*2
df['radius_or_3'] = np.maximum(df['radius'],3)
"""

# Statement being measured (the result is not assigned back to df).
mycode = '''
df.apply(lambda x: list(range(x.radius_or_3,x.diameter)), axis=1)
'''
 
# timeit statement: t1 is the total time in seconds for all 10000 executions
t1 = timeit.timeit(setup=setup_code,
                     stmt = mycode,
                     number = 10000)
print(f"10000 runs of mycode is {t1}")

10000 runs of mycode is 1.8389610000003813


### Not using .apply()

In [34]:
def create_range(a,b):
    """Build one list of consecutive integers per (start, stop) pair.

    Parameters
    ----------
    a : sequence of int
        Start values (inclusive). May be a list, NumPy array or pandas Series.
    b : sequence of int
        Stop values (exclusive); ``b[i]`` is paired with ``a[i]`` by position.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(len(a), 1)`` whose entry ``[i, 0]`` is
        ``list(range(a[i], b[i]))``.

    Raises
    ------
    IndexError
        If ``b`` has fewer elements than ``a``.
    """
    # Convert to plain arrays so the i-th stop is always taken by position.
    # Indexing a pandas Series with b[i] is label-based and fails (or picks
    # the wrong row) when the index is not 0..n-1.
    starts = np.asarray(a)
    stops = np.asarray(b)

    range_l = np.empty((len(starts), 1), dtype=object)
    for i, start in enumerate(starts):
        range_l[i, 0] = list(range(start, stops[i]))
    return range_l

# .values hands create_range the columns' underlying NumPy arrays instead of
# the Series; the returned array of lists becomes the 'sizes' column.
df['sizes'] = create_range(df['radius_or_3'].values,df['diameter'].values)
df

Unnamed: 0,category,radius,sweetness,diameter,radius_or_3,sizes
0,apple,3,1,6,3,"[3, 4, 5]"
1,pear,4,2,8,4,"[4, 5, 6, 7]"
2,peach,2,3,4,3,[3]


In [35]:
# Timing: measure the explicit-loop version that builds the 'sizes' lists.
# timeit runs the setup string once and excludes it from the measurement, so
# create_range is defined there: only the call is timed, not the function
# definition.
setup_code = """
import pandas as pd
import numpy as np
d = {'category': ['apple', 'pear', 'peach'], 'radius': [3, 4, 2], 'sweetness': [1, 2, 3]}
df = pd.DataFrame(data=d)
df['diameter'] = df['radius']*2
df['radius_or_3'] = np.maximum(df['radius'],3)

def create_range(a,b):
    range_l = np.empty((len(a),1),object)
    for i,val in enumerate(a):
        range_l[i,0] = list(range(val,b[i]))
    return range_l
"""

# Statement being measured (the result is not assigned back to df).
mycode = '''
create_range(df['radius_or_3'].values,df['diameter'].values)
'''

# timeit statement: t1 is the total time in seconds for all 10000 executions
t1 = timeit.timeit(setup=setup_code,
                     stmt = mycode,
                     number = 10000)
print(f"10000 runs of mycode is {t1}")

10000 runs of mycode is 0.07087879200025782


In [36]:
# NOTE(review): presumably the speed-up of the loop version over .apply() for
# the list column, i.e. the two timings above rounded to 1.84 s and 0.07 s
# (scaled by 100 -> 184/7) — confirm.
184/7


26.285714285714285