# Applying functions to our dataframe

In [2]:
import pandas as pd

# Read the artists table; the path is relative, so "artists.csv" must be
# reachable from the current working directory.
df = pd.read_csv("artists.csv")

# A bare expression on the last line of a cell displays its value.
df

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year
0,1,Robert Arneson,American,Male,1930.0,1992.0
1,2,Doroteo Arnaiz,Spanish,Male,1936.0,
2,3,Bill Arnold,American,Male,1941.0,
3,4,Charles Arnoldi,American,Male,1946.0,
4,5,Per Arnoldi,Danish,Male,1941.0,
...,...,...,...,...,...,...
15086,67452,Liu Jianhua,Chinese,Male,1962.0,
15087,67453,Leng Lin,Chinese,Male,1965.0,
15088,67652,Ellie Nagler,,,,
15089,67694,Glenn Williams,,Male,,


## Applying functions to columns with text

In [3]:
def last_name(name):
    """Return the final whitespace-separated word of ``name``.

    Example: "Thiago Serra" -> "Serra". A single-word name is returned
    as is. An empty or all-whitespace string contains no words, so the
    lookup raises IndexError.
    """
    # Split at most once, starting from the right: the last piece is the
    # final word regardless of how many words precede it.
    return name.rsplit(maxsplit=1)[-1]

last_name("Thiago Serra")

'Serra'

In [4]:
# Series.apply calls last_name once per value of the "Name" column and
# collects the results into a new column.
df["Last Name"] = df["Name"].apply(last_name)

In [5]:
df

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year,Last Name
0,1,Robert Arneson,American,Male,1930.0,1992.0,Arneson
1,2,Doroteo Arnaiz,Spanish,Male,1936.0,,Arnaiz
2,3,Bill Arnold,American,Male,1941.0,,Arnold
3,4,Charles Arnoldi,American,Male,1946.0,,Arnoldi
4,5,Per Arnoldi,Danish,Male,1941.0,,Arnoldi
...,...,...,...,...,...,...,...
15086,67452,Liu Jianhua,Chinese,Male,1962.0,,Jianhua
15087,67453,Leng Lin,Chinese,Male,1965.0,,Lin
15088,67652,Ellie Nagler,,,,,Nagler
15089,67694,Glenn Williams,,Male,,,Williams


In [6]:
# Same idea with an inline lambda: keep the first whitespace-separated word.
df["First Name"] = df["Name"].apply(lambda name : name.split()[0])

In [7]:
df

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year,Last Name,First Name
0,1,Robert Arneson,American,Male,1930.0,1992.0,Arneson,Robert
1,2,Doroteo Arnaiz,Spanish,Male,1936.0,,Arnaiz,Doroteo
2,3,Bill Arnold,American,Male,1941.0,,Arnold,Bill
3,4,Charles Arnoldi,American,Male,1946.0,,Arnoldi,Charles
4,5,Per Arnoldi,Danish,Male,1941.0,,Arnoldi,Per
...,...,...,...,...,...,...,...,...
15086,67452,Liu Jianhua,Chinese,Male,1962.0,,Jianhua,Liu
15087,67453,Leng Lin,Chinese,Male,1965.0,,Lin,Leng
15088,67652,Ellie Nagler,,,,,Nagler,Ellie
15089,67694,Glenn Williams,,Male,,,Williams,Glenn


In [8]:
import numpy as np

# A row is flagged when "First Last" does not rebuild the full name, e.g. the
# name has a middle part, or it is a single word (first and last coincide).
# The != comparison already yields a boolean Series, so it is stored directly
# instead of being routed through np.where(cond, True, False).
df["Name Conflict"] = df["Name"] != (df["First Name"] + " " + df["Last Name"])

In [9]:
df

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year,Last Name,First Name,Name Conflict
0,1,Robert Arneson,American,Male,1930.0,1992.0,Arneson,Robert,False
1,2,Doroteo Arnaiz,Spanish,Male,1936.0,,Arnaiz,Doroteo,False
2,3,Bill Arnold,American,Male,1941.0,,Arnold,Bill,False
3,4,Charles Arnoldi,American,Male,1946.0,,Arnoldi,Charles,False
4,5,Per Arnoldi,Danish,Male,1941.0,,Arnoldi,Per,False
...,...,...,...,...,...,...,...,...,...
15086,67452,Liu Jianhua,Chinese,Male,1962.0,,Jianhua,Liu,False
15087,67453,Leng Lin,Chinese,Male,1965.0,,Lin,Leng,False
15088,67652,Ellie Nagler,,,,,Nagler,Ellie,False
15089,67694,Glenn Williams,,Male,,,Williams,Glenn,False


In [10]:
# Tally how many names were flagged (True) versus not flagged (False).
df["Name Conflict"].value_counts()

False    11397
True      3694
Name: Name Conflict, dtype: int64

In [11]:
# "Name Conflict" is a boolean column, so it can serve as the row mask
# directly; comparing it with True adds nothing.
df[df["Name Conflict"]]

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year,Last Name,First Name,Name Conflict
9,11,Jean (Hans) Arp,French,Male,1886.0,1966.0,Arp,Jean,True
11,13,J. Arrelano Fischer,Mexican,Male,1911.0,1995.0,Fischer,J.,True
14,18,Artko,,,,,Artko,Artko,True
18,23,Charles Robert Ashbee,British,Male,1863.0,1942.0,Ashbee,Charles,True
20,25,E. M. Ashe,American,Male,1867.0,1941.0,Ashe,E.,True
...,...,...,...,...,...,...,...,...,...
15044,67272,Toyin Ojih Odutola,American,Female,1985.0,,Odutola,Toyin,True
15045,67275,JR,French,,,,JR,JR,True
15046,67283,Polit-Sheer-Form Office (PSFO),,,,,(PSFO),Polit-Sheer-Form,True
15074,67340,M. Henry Jones,American,,1957.0,,Jones,M.,True


## Applying functions to columns with numbers

In [12]:
def boomer(year):
    """Return False when ``year`` is before 1946 or after 1964, else True.

    Caveat: the test is phrased as an exclusion. A NaN ``year`` compares
    False against both bounds, so it is not excluded and the function
    returns True for missing birth years.
    """
    return not (year < 1946 or year > 1964)

boomer(1962)

True

In [13]:
# Apply boomer to every birth year; the results become a new boolean column.
df["Boomer"] = df["Birth Year"].apply(boomer)
df

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year,Last Name,First Name,Name Conflict,Boomer
0,1,Robert Arneson,American,Male,1930.0,1992.0,Arneson,Robert,False,False
1,2,Doroteo Arnaiz,Spanish,Male,1936.0,,Arnaiz,Doroteo,False,False
2,3,Bill Arnold,American,Male,1941.0,,Arnold,Bill,False,False
3,4,Charles Arnoldi,American,Male,1946.0,,Arnoldi,Charles,False,True
4,5,Per Arnoldi,Danish,Male,1941.0,,Arnoldi,Per,False,False
...,...,...,...,...,...,...,...,...,...,...
15086,67452,Liu Jianhua,Chinese,Male,1962.0,,Jianhua,Liu,False,True
15087,67453,Leng Lin,Chinese,Male,1965.0,,Lin,Leng,False,False
15088,67652,Ellie Nagler,,,,,Nagler,Ellie,False,True
15089,67694,Glenn Williams,,Male,,,Williams,Glenn,False,True


In [14]:
# Summing a boolean column counts its True values. With this version of
# boomer the count also includes rows whose "Birth Year" is NaN.
df["Boomer"].sum()

6177

In [15]:
def boomer(year):
    """Return True when ``year`` lies in 1946-1964, both ends included.

    The test is phrased as an inclusion: a NaN ``year`` fails the
    comparisons, so missing birth years come out as False.
    """
    in_range = year >= 1946 and year <= 1964
    return bool(in_range)

# Recompute the column with the redefined boomer and count the True values;
# NaN birth years fail the range test and are no longer counted.
df["Boomer"] = df["Birth Year"].apply(boomer)
df["Boomer"].sum()

2323

In [16]:
def boomer(year):
    """Return True when ``year`` is a non-missing value within 1946-1964.

    Missing values (as detected by ``pd.isnull``) are rejected first, so a
    NaN birth year yields False; years before 1946 or after 1964 also
    yield False.
    """
    # The null check comes first: without it NaN would pass the exclusion
    # test, because NaN compares False against both bounds.
    return not (pd.isnull(year) or year < 1946 or year > 1964)

# Recompute the column with the NaN-aware boomer and count the True values.
df["Boomer"] = df["Birth Year"].apply(boomer)
df["Boomer"].sum()

2323

In [17]:
def born_between(year, first, last):
    """Return True when ``year`` is non-missing and within ``first``..``last``.

    Parameters
    ----------
    year : value to test; missing values (per ``pd.isnull``) yield False.
    first : lower bound; a ``year`` below it yields False.
    last : upper bound; a ``year`` above it yields False.

    Both bounds are inclusive.
    """
    if pd.isnull(year):
        return False
    out_of_range = year < first or year > last
    return not out_of_range

# ``args`` carries the extra positional arguments (first, last) that apply
# passes to born_between after each birth year.
df["Boomer"] = df["Birth Year"].apply(born_between, args=(1946, 1964))
df["Boomer"].sum()

2323

## Applying functions to multiple columns

In [18]:
# Print the documentation of DataFrame.apply; the ``axis`` parameter chooses
# whether the function receives each column (0) or each row (1).
help(df.apply)

Help on method apply in module pandas.core.frame:

apply(func, axis=0, raw=False, result_type=None, args=(), **kwds) method of pandas.core.frame.DataFrame instance
    Apply a function along an axis of the DataFrame.
    
    Objects passed to the function are Series objects whose index is
    either the DataFrame's index (``axis=0``) or the DataFrame's columns
    (``axis=1``). By default (``result_type=None``), the final return type
    is inferred from the return type of the applied function. Otherwise,
    it depends on the `result_type` argument.
    
    Parameters
    ----------
    func : function
        Function to apply to each column or row.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Axis along which the function is applied:
    
        * 0 or 'index': apply function to each column.
        * 1 or 'columns': apply function to each row.
    
    raw : bool, default False
        Determines if row or column is passed as a Series or ndarray object:
    
    

In [20]:
def American_Boomer(row):
    """Return True when ``row`` describes an American born in 1946-1964.

    ``row`` is indexed by the keys "Nationality" and "Birth Year". The
    result is False when either value is missing (per ``pd.isnull``), when
    the birth year is before 1946 or after 1964, or when the nationality
    is anything other than "American".
    """
    nationality = row["Nationality"]
    if pd.isnull(nationality):
        return False  # missing nationality
    birth_year = row["Birth Year"]
    if pd.isnull(birth_year):
        return False  # missing birth year
    if birth_year < 1946 or birth_year > 1964:
        return False  # born outside the 1946-1964 window
    return bool(nationality == "American")

# axis=1 hands each row to American_Boomer, so the function can read several
# columns at once; summing the boolean results counts the matching rows.
df["American Boomer"] = df.apply(American_Boomer, axis=1)
df["American Boomer"].sum()

1097