In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a small sample table: one row per person, one column per attribute
df = pd.DataFrame(
    dict(
        Name=['Alex Smith', 'Anna McDowell', 'Jodie King', 'Liza Foster'],
        Gender=['M', 'F', 'F', 'T'],
        Age=[24, 18, 32, 34],
    )
)
df

Unnamed: 0,Name,Gender,Age
0,Alex Smith,M,24
1,Anna McDowell,F,18
2,Jodie King,F,32
3,Liza Foster,T,34


In [3]:
# Summarise the frame: index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Gender  4 non-null      object
 2   Age     4 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 224.0+ bytes


#### Concept

In [4]:
def printme(x):
    """Print the type of ``x`` and then its value.

    Used as a probe passed to ``apply`` so that each call reveals exactly
    what pandas hands to the applied function. Returns ``None``.
    """
    kind = type(x)
    # First line: the Python type of the object that was received
    print(f"type(x) = {kind}")
    # Second part: the object itself, starting on a fresh line
    print(f"print(x) = \n{x}")

In [5]:
# Pass the entire dataframe to the printme function, one column at a time (axis = 0):
# each call receives a single column as a Series indexed by the row labels.
# printme returns nothing, so the result is a Series of None indexed by column name
df.apply(printme, axis = 0)

type(x) = <class 'pandas.core.series.Series'>
print(x) = 
0       Alex Smith
1    Anna McDowell
2       Jodie King
3      Liza Foster
Name: Name, dtype: object
type(x) = <class 'pandas.core.series.Series'>
print(x) = 
0    M
1    F
2    F
3    T
Name: Gender, dtype: object
type(x) = <class 'pandas.core.series.Series'>
print(x) = 
0    24
1    18
2    32
3    34
Name: Age, dtype: int64


Name      None
Gender    None
Age       None
dtype: object

In [6]:
# Pass the entire dataframe to the printme function, one row at a time (axis = 1):
# each call receives a single row as a Series indexed by the column names.
# printme returns nothing, so the result is a Series of None indexed by row label
df.apply(printme, axis = 1)

type(x) = <class 'pandas.core.series.Series'>
print(x) = 
Name      Alex Smith
Gender             M
Age               24
Name: 0, dtype: object
type(x) = <class 'pandas.core.series.Series'>
print(x) = 
Name      Anna McDowell
Gender                F
Age                  18
Name: 1, dtype: object
type(x) = <class 'pandas.core.series.Series'>
print(x) = 
Name      Jodie King
Gender             F
Age               32
Name: 2, dtype: object
type(x) = <class 'pandas.core.series.Series'>
print(x) = 
Name      Liza Foster
Gender              T
Age                34
Name: 3, dtype: object


0    None
1    None
2    None
3    None
dtype: object

In [7]:
# Use apply on a single ROW
# NOTE: df.loc[2] is itself a Series, so here apply() takes no axis parameter and no longer
# passes a Series to the function. Instead, each "scalar" element of the row is passed in turn
df.loc[2].apply(printme)

type(x) = <class 'str'>
print(x) = 
Jodie King
type(x) = <class 'str'>
print(x) = 
F
type(x) = <class 'numpy.int64'>
print(x) = 
32


Name      None
Gender    None
Age       None
Name: 2, dtype: object

In [8]:
# Use apply on a single COLUMN
# NOTE: a single column is a Series, so here apply() takes no axis parameter and the values
# passed are not Series any more. Instead, each "scalar" element of the column is passed in turn
df.loc[:,'Age'].apply(printme)

type(x) = <class 'int'>
print(x) = 
24
type(x) = <class 'int'>
print(x) = 
18
type(x) = <class 'int'>
print(x) = 
32
type(x) = <class 'int'>
print(x) = 
34


0    None
1    None
2    None
3    None
Name: Age, dtype: object

### Using native functions

In [9]:
# Display the dataframe again as a reference for the examples that follow
df

Unnamed: 0,Name,Gender,Age
0,Alex Smith,M,24
1,Anna McDowell,F,18
2,Jodie King,F,32
3,Liza Foster,T,34


In [10]:
# Find the length of each Name
# This means: for every value x in column 'Name', return len(x), indexed by the original row index
df['Name'].apply(len)

0    10
1    13
2    10
3    11
Name: Name, dtype: int64

In [11]:
# Convert each Age to a string
# This means: for every value x in column 'Age', return str(x), indexed by the original row index
# Note that the dtype has changed to object, because the values are now strings
df['Age'].apply(str)

0    24
1    18
2    32
3    34
Name: Age, dtype: object

In [12]:
# Find the square of the age values
# This means: for every value x in column 'Age', return square(x), indexed by the original row index
# Note that the dtype is still int64 here: squaring integers yields integers
df['Age'].apply(np.square)

0     576
1     324
2    1024
3    1156
Name: Age, dtype: int64

### Using lambda functions

In [13]:
# Translate each Gender code into its description using a lookup dictionary
# This means: for every value x in column 'Gender', return gender_dict[x], indexed by the original row index
# A code that is not a key of gender_dict raises KeyError

gender_dict = dict(T='Transgender', O='Other', M='Male', F='Female')

df['Gender'].apply(lambda code: gender_dict[code])

0           Male
1         Female
2         Female
3    Transgender
Name: Gender, dtype: object

In [14]:
# Convert each name to lower case
# This means: for every value x in column 'Name', return x.lower(), indexed by the original row index

df['Name'].apply(lambda x: x.lower())

0       alex smith
1    anna mcdowell
2       jodie king
3      liza foster
Name: Name, dtype: object

In [15]:
# Find which age bucket each person falls into (1 = 10-20 yrs, 2 = 20-30 yrs, 3 = 30-40 yrs)
# This means: for every value x in column 'Age', find the matching bin using np.digitize(x, bins)
# With np.digitize's default settings each bucket includes its lower edge and excludes its
# upper edge, so an age of exactly 20 falls in bucket 2

df['Age'].apply(lambda age: np.digitize(age, [10, 20, 30, 40]))

0    2
1    1
2    3
3    3
Name: Age, dtype: int64

In [16]:
# Find all first names from the 'Name' column
# This means: for every value in column 'Name', split it on whitespace and keep the first piece

df['Name'].apply(lambda full_name: full_name.split()[0])

0     Alex
1     Anna
2    Jodie
3     Liza
Name: Name, dtype: object

#### Appending output of apply() as a new column in the dataframe

In [17]:
# Calculate the age group with apply, exactly as shown above
# The output is a Series carrying the same row index, so it can be assigned directly as a new
# column of the dataframe. The assignment matches values up by row index and adds them accordingly

df['AgeGroup'] = df['Age'].apply(lambda age: np.digitize(age, [10, 20, 30, 40]))
df

Unnamed: 0,Name,Gender,Age,AgeGroup
0,Alex Smith,M,24,2
1,Anna McDowell,F,18,1
2,Jodie King,F,32,3
3,Liza Foster,T,34,3


In [18]:
# Split Name into First Name and Last Name and store them as new columns
# The first whitespace-separated piece becomes FirstName, the last piece becomes LastName
df['FirstName'] = df['Name'].apply(lambda full_name: full_name.split()[0])
df['LastName'] = df['Name'].apply(lambda full_name: full_name.split()[-1])
df

Unnamed: 0,Name,Gender,Age,AgeGroup,FirstName,LastName
0,Alex Smith,M,24,2,Alex,Smith
1,Anna McDowell,F,18,1,Anna,McDowell
2,Jodie King,F,32,3,Jodie,King
3,Liza Foster,T,34,3,Liza,Foster


In [19]:
# The above can also be done WITHOUT apply in one line, using the expand parameter of the
# split method on the .str (string) accessor. expand = True spreads the split pieces across columns.
# set_axis labels the two resulting columns, and the result is assigned to two new dataframe columns
# NOTE: set_axis is given two labels, so this relies on the split producing exactly two columns
df[['FirstName1','LastName1']] = df['Name'].str.split(expand = True).set_axis(['FName','LName'], axis =1)
df

Unnamed: 0,Name,Gender,Age,AgeGroup,FirstName,LastName,FirstName1,LastName1
0,Alex Smith,M,24,2,Alex,Smith,Alex,Smith
1,Anna McDowell,F,18,1,Anna,McDowell,Anna,McDowell
2,Jodie King,F,32,3,Jodie,King,Jodie,King
3,Liza Foster,T,34,3,Liza,Foster,Liza,Foster
