### Link: [Satish Gupta](https://towardsdatascience.com/apply-function-to-pandas-dataframe-rows-76df74165ee4)

In [4]:
import pandas as pd
import numpy as np

from faker import Faker

In [5]:
# Show one example value from several Faker providers.
fake = Faker()
for provider in (fake.email, fake.country, fake.name, fake.text):
    print(provider())
print(fake.latitude(), fake.longitude())
print(fake.url())

figueroajohn@doyle.net
Benin
Amanda Davis
Own night respond red information last everything. Serve civil institution.
Democratic shake bill here. Suggest page southern role movie win.
83.9556745 -141.666070
http://www.ford.net/


In [6]:
def generate_test_data(size: int, days: int = 30, seed: int = 42):
  """Build a DataFrame of synthetic tasks.

  Args:
    size: Number of rows (tasks) to generate.
    days: Due dates are drawn between today and ``days`` days from today.
    seed: Value passed to ``Faker.seed``. Defaults to 42, the value that
      used to be hard-coded, so existing calls behave as before.

  Returns:
    A DataFrame with three columns:
      * ``task_name`` - 'Task 1' ... 'Task <size>'
      * ``due_date``  - values returned by ``fake.date_between``
      * ``priority``  - one of 'HIGH', 'MEDIUM', 'LOW'
  """
  fake = Faker()
  # Seed Faker so that repeated calls replay the same sequence of random
  # draws. Due dates are still relative to the day the function is run.
  Faker.seed(seed)

  PRIORITIES = {
    0: 'HIGH',
    1: 'MEDIUM',
    2: 'LOW'
  }

  return pd.DataFrame({
    'task_name': [
      f'Task {i + 1}'
      for i in range(size)
    ],
    'due_date': [
      fake.date_between(start_date='today', end_date=f'+{days}d')
      for _ in range(size)
    ],
    'priority': [
      # Draw a random key of PRIORITIES and map it to its label.
      PRIORITIES[fake.pyint(min_value=0, max_value=(len(PRIORITIES) - 1))]
      for _ in range(size)
    ]
  })


In [7]:
# Smoke-test generate_test_data: 10 tasks due within the next 5 days.
tmp_df = generate_test_data(size=10, days=5)
tmp_df.info()
tmp_df.head(n=10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   task_name  10 non-null     object
 1   due_date   10 non-null     object
 2   priority   10 non-null     object
dtypes: object(3)
memory usage: 368.0+ bytes


Unnamed: 0,task_name,due_date,priority
0,Task 1,2021-06-27,LOW
1,Task 2,2021-06-24,LOW
2,Task 3,2021-06-24,LOW
3,Task 4,2021-06-28,HIGH
4,Task 5,2021-06-25,LOW
5,Task 6,2021-06-25,MEDIUM
6,Task 7,2021-06-25,HIGH
7,Task 8,2021-06-24,HIGH
8,Task 9,2021-06-28,HIGH
9,Task 10,2021-06-24,HIGH


___

In [13]:
# Build the master data set: 2 ** 21 + 1 = 2,097,153 rows (about two million).
# Smaller data sets of various sizes are later drawn from it as samples.
# This takes a while because Faker is called sequentially for every row.
K_MAX = 21
test_data_set = generate_test_data(size=2 ** K_MAX + 1, days=30)
test_data_set.head(n=5)

Unnamed: 0,task_name,due_date,priority
0,Task 1,2021-06-29,LOW
1,Task 2,2021-06-25,HIGH
2,Task 3,2021-07-07,MEDIUM
3,Task 4,2021-07-05,LOW
4,Task 5,2021-07-04,HIGH


In [14]:
def test_data_sample(size: int, random_state=None):
    """Return ``size`` randomly chosen rows of ``test_data_set``.

    Args:
        size: Number of rows to draw.
        random_state: Optional seed forwarded to ``DataFrame.sample``.
            The default ``None`` keeps the previous behavior (a different
            sample on every call); pass an int for a reproducible sample.

    Returns:
        A new DataFrame whose index is reset to 0 .. size - 1.
    """
    # sample() already returns a new DataFrame, so no extra .copy() is needed.
    sampled = test_data_set.sample(n=size, random_state=random_state)
    return sampled.reset_index(drop=True)

In [15]:
# Draw a 10-row sample and preview its first 5 rows.
test_data_sample(size=10).head(n=5)

Unnamed: 0,task_name,due_date,priority
0,Task 572555,2021-07-08,HIGH
1,Task 519535,2021-06-30,HIGH
2,Task 1884514,2021-07-11,HIGH
3,Task 818383,2021-06-25,MEDIUM
4,Task 583372,2021-07-21,MEDIUM


In [16]:
# Report the column dtypes and memory usage of the full data set.
test_data_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2097153 entries, 0 to 2097152
Data columns (total 3 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   task_name  object
 1   due_date   object
 2   priority   object
dtypes: object(3)
memory usage: 48.0+ MB


In [19]:
# Store `priority` as an ordered pandas categorical (LOW < MEDIUM < HIGH)
# rather than as plain strings.
priority_dtype = pd.api.types.CategoricalDtype(
    categories=['LOW', 'MEDIUM', 'HIGH'],
    ordered=True,
)
test_data_set['priority'] = test_data_set['priority'].astype(priority_dtype)

In [20]:
# Report dtypes and memory usage again, now that `priority` is categorical.
test_data_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2097153 entries, 0 to 2097152
Data columns (total 3 columns):
 #   Column     Dtype   
---  ------     -----   
 0   task_name  object  
 1   due_date   object  
 2   priority   category
dtypes: category(1), object(2)
memory usage: 34.0+ MB


#### Memory usage drops from about `48 MB` to about `34 MB` once `priority` is stored as a categorical

In [23]:
import datetime 

def eisenhower_action(is_important: bool, is_urgent: bool) -> int:
    """Combine the two flags into a single action code.

    Importance contributes 2 and urgency contributes 1, so the result is:
    0 (neither), 1 (urgent only), 2 (important only), 3 (both).
    """
    importance_part = 2 * is_important
    return importance_part + is_urgent


# Example: classify the first task. A task counts as urgent when it is due
# no later than two days from today, and as important when its priority is HIGH.
cutoff_date = datetime.date.today() + datetime.timedelta(days=2)
eisenhower_action(
    is_important=test_data_set.loc[0, 'priority'] == 'HIGH',
    is_urgent=test_data_set.loc[0, 'due_date'] <= cutoff_date,
)

0

## Method 1. Loop Over All Rows of a DataFrame

In [30]:
def loop_impl(df):
    """Compute the Eisenhower action of every row with an explicit loop.

    Rows are fetched one at a time by position with ``df.iloc``. A task is
    treated as important when its ``priority`` equals 'HIGH' and as urgent
    when its ``due_date`` is no later than two days from today.

    Args:
        df: DataFrame with ``priority`` and ``due_date`` columns.

    Returns:
        A Series of action codes that shares ``df``'s index, so it can be
        assigned back to ``df`` as a new column.
    """
    cutoff_date = datetime.date.today() + datetime.timedelta(days=2)
    result = []
    for i in range(len(df)):
        row = df.iloc[i]
        result.append(
            eisenhower_action(
                row.priority == 'HIGH', row.due_date <= cutoff_date)
        )
    # Reuse df's index. Column assignment aligns on index labels, so a
    # default 0..n-1 index would yield NaN for frames indexed differently.
    return pd.Series(result, index=df.index)

In [34]:
%timeit data_sample['action_loop'] = loop_impl(data_sample)

TypeError: object of type 'function' has no len()

In [35]:
def iterrows_impl(df):
    """Compute the Eisenhower action of every row using ``df.iterrows()``.

    A task is treated as important when its ``priority`` equals 'HIGH' and
    as urgent when its ``due_date`` is no later than two days from today.

    Args:
        df: DataFrame with ``priority`` and ``due_date`` columns.

    Returns:
        A Series of action codes that shares ``df``'s index, so it can be
        assigned back to ``df`` as a new column.
    """
    cutoff_date = datetime.date.today() + datetime.timedelta(days=2)
    actions = [
        eisenhower_action(row.priority == 'HIGH', row.due_date <= cutoff_date)
        for _, row in df.iterrows()
    ]
    # Reuse df's index. Column assignment aligns on index labels, so a
    # default 0..n-1 index would yield NaN for frames indexed differently.
    return pd.Series(actions, index=df.index)

In [36]:
%timeit data_sample['action_iterrow'] = iterrows_impl(data_sample)

NameError: name 'data_sample' is not defined