In [2]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt

In [1]:
# apply take a function and "applies" across each row or column of a dataframe
# it is similar to writing a for loop across each row or column and calling the function

In [2]:
def my_sq(x) : 
    return x ** 2

In [3]:
def avg_2(x,y) : 
    return (x + y) / 2

In [4]:
import pandas as pd 

df = pd.DataFrame({"a" : [10,20,30], "b" : [20,30,40]})
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [5]:
# say we want to square the value of a column and we do not want to use a function to square the value, we can do as follows : 
print(df["a"] ** 2)

0    100
1    400
2    900
Name: a, dtype: int64


# APPLY OVER A SERIES 

In [6]:
type(df["a"])

pandas.core.series.Series

In [7]:
# the Series object has a method called apply which allow us to pass the function that we want to apply across each element in the Series 

sq = df["a"].apply(my_sq) # this will create a brand new variable
print(sq)

# df["sq"] = df["a"].apply(my_sq) # this will will create a new column in the existing dataframe 

0    100
1    400
2    900
Name: a, dtype: int64


In [8]:
# noticed that there is no round bracket when we pass the function, i.e. v.s. ....apply(my_sq())
# suppose we have another function that takes in two parameters as follwows : 
def my_exp(x,y): 
    return x**y

In [9]:
# df["sq3"] = df["a"].apply(my_exp, y = 3) # this will will create a new column in the existing dataframe 

testing_1 = df["a"].apply(my_exp, y =3)

# the first parameter in the function will take the values in the Series as the first agruement  
# we only need to define the second parameter using the keyword in the function

# APPLY OVER A DATAFRAME

In [10]:
# dataframes typically have two or more dimensions, therefore when we uses a apply function over a dataframe, we need to specify which axis (columns vs rows) to apply the function

In [11]:
# if we want the apply function to work column-wise, we can pass the axis=0 parameter 
# and if we want the apply function to work row-wise, we can pass the axis=1 parameter
def print_me(x):
    print(x)

# Column-wise operation

In [12]:
# Column-wise operation
x = df.apply(print_me, axis = 0) # axis=0 dictate that the operation is a column-wise operation
print(x)
# the entire column(2 columns in total plus the headers) is passed into the first agrument of the function 

0    10
1    20
2    30
Name: a, dtype: int64
0    20
1    30
2    40
Name: b, dtype: int64
a    None
b    None
dtype: object


In [13]:
def average_over_3(a,b,c): 
    return (a + b + c) / 3

In [21]:
# this will generate an error ---> df.apply(average_over_3)

# the reason is becasue we failed to pass in "b" and "c" agruments. the fucntion takes in 3 agruments 
# when we use apply, the entire column is passed into the first agruments , i.e. a

# we first need to create a function that define the columns where the function will retreive the arugements from, as follows : 
def avg3_apply_col(col) : 
    a = col[0] # this looks at the first element in the column
    b = col[1] # this looks at the second element in the column
    c = col[2] # this looks at the third element in the column
    return (a+b+c) / 3

In [22]:
df.apply(avg3_apply_col)

a    10.000000
b    16.666667
dtype: float64

# Row-wise operation

In [16]:
# once again, this will generate an error --> df.apply(avg3_apply_col, axis = 1)

In [20]:
df.apply(avg3_apply_col, axis = 1) # to calculate row-level average

# the main issue here is the "index out of bound"
# note that we only have three rows of data and in each row, we have two column items 
# when we tried to get index 2, which refer to the third element, it does not exist


IndexError: ('index out of bounds', 'occurred at index 0')

In [24]:
# if we want the row average, we have to create a new function as follows : 
def row_average_2(row) : 
    a = row[0]
    b = row[1]
    return ( a + b ) / 2

In [25]:
df.apply(row_average_2)

a    15.0
b    25.0
dtype: float64

# Apply (More Advanced)

In [5]:
import seaborn as sns
titanic = sns.load_dataset("titanic")
titanic.info()
# the columns (age and deck) have missing values
# we can write functions to find out the percentage of missing values 
# there is a total of 891 entries  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            714 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null category
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null category
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB


In [1]:
import numpy as np 

In [2]:
def count_missing_values(vec): 
    x = pd.isnull(vec)  # note that isnull is a PANDAS function 
    null_count = np.sum(x)

    return null_count

In [3]:
def proportion_missing(vec):
    dem = vec.size # this will give the total number of row entries in the vec 
    num = count_missing_values(vec)

    return num / dem * 100

In [4]:
def proportion_filled_values(vec): 
    return 100 - proportion_missing(vec)

# Column-wise operation 


In [None]:
# lets use our newly created function on each column of our data

In [73]:
miss_values = titanic.apply(count_missing_values)
print(miss_values)

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


In [74]:
pro_miss_values = titanic.apply(proportion_missing)
print(pro_miss_values)

survived        0.000000
pclass          0.000000
sex             0.000000
age            19.865320
sibsp           0.000000
parch           0.000000
fare            0.000000
embarked        0.224467
class           0.000000
who             0.000000
adult_male      0.000000
deck           77.216611
embark_town     0.224467
alive           0.000000
alone           0.000000
dtype: float64


In [76]:
pro_fill_values = titanic.apply(proportion_filled_values)
print(pro_fill_values)

survived       100.000000
pclass         100.000000
sex            100.000000
age             80.134680
sibsp          100.000000
parch          100.000000
fare           100.000000
embarked        99.775533
class          100.000000
who            100.000000
adult_male     100.000000
deck            22.783389
embark_town     99.775533
alive          100.000000
alone          100.000000
dtype: float64


In [79]:
# we can use the information above to determine if a variable is worth analyzing 
# if a particular column has a high number of missing values, we can not use it for analysis

# Row-wise operation

In [None]:
# Since our functions are vectorized, we can apply them across the rows of data withough changing them 

In [8]:
miss_values_row = titanic.apply(count_missing_values, axis = 1) # axis = 1 is at the row level
print(miss_values_row.head())

0    1
1    0
2    1
3    0
4    1
dtype: int64


In [83]:
pro_miss_values_row = titanic.apply(proportion_missing, axis = 1)
print(pro_miss_values_row.head())

0    6.666667
1    0.000000
2    6.666667
3    0.000000
4    6.666667
dtype: float64


In [85]:
pro_fill_values_row = titanic.apply(proportion_filled_values, axis = 1)
print(pro_fill_values_row.head())

0     93.333333
1    100.000000
2     93.333333
3    100.000000
4     93.333333
dtype: float64


In [15]:
# since we are using apply at the row level, we can create a column containing these values
titanic["num_missing"] = titanic.apply(count_missing_values, axis = 1)
titanic.head(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,num_missing
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,1
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,1
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True,2
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True,0
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False,1
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False,1
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False,1


In [18]:
rows_with_missing_entries = titanic.loc[titanic.num_missing > 1,:]
len(rows_with_missing_entries)

160

In [19]:
rows_with_missing_entries.sample(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,num_missing
589,0,3,male,,0,0,8.05,S,Third,man,True,,Southampton,no,True,2
738,0,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True,2
260,0,3,male,,0,0,7.75,Q,Third,man,True,,Queenstown,no,True,2
667,0,3,male,,0,0,7.775,S,Third,man,True,,Southampton,no,True,2
573,1,3,female,,0,0,7.75,Q,Third,woman,False,,Queenstown,yes,True,2


# Vectorized Function

In [20]:
# Vectorized Function

# as shown above, sometimes we have to rewrite our function when we want to apply it when the entire column or row was passed into the first parameter of the function
# however, the above may not be feasible in certain sitution
# we can use vectorize function and decorator to vectorize any function

df = pd.DataFrame({"a" : [10,20,30], "b" : [20,30,40]})
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [21]:
def average_over_2(x,y):
    return (x+y) / 2

In [29]:
def average_over_2_row(row) :
    a = row[0]
    b = row[1]
    return (a+b) / 2

In [32]:
df.apply(average_over_2)

TypeError: ("average_over_2() missing 1 required positional argument: 'y'", 'occurred at index a')

In [31]:
df.apply(average_over_2_row, axis = 1)

0    15.0
1    25.0
2    35.0
dtype: float64

In [33]:
def average_over_2(x,y):
    return (x+y) / 2

In [34]:
# note that we want to apply the function at the row level

average_over_2(df["a"], df["b"])
# for a vectorized function, we want to able to pass in a vector of values for x and a vector of values for y
# the actual calculation within our function are inherently vectorized
# that is if we add two columns together, pandas and numpy will automatically perform element wise additions

0    15.0
1    25.0
2    35.0
dtype: float64

In [35]:
import numpy as np

In [37]:
def average2mod(x,y):
    if (x == 20): # if x = 20, return NaN
        return (np.NaN)
    else : 
        return (x + y) / 2

In [38]:
average2mod(df["a"], df["b"])

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [41]:
average2mod(20,20) # if we pass it individual number instead of a vector, it will work as expected

nan

In [42]:
average2mod(10,25)

17.5

## Vectorized Function - using numpy

In [43]:
# for the above error - "ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()."

# we want to change our function above such that when it is given a vector of values, it will perform the calculation in an element wise mananer
# we can use the vectorize function from numpy which will create a new function

average2mod_vectorized = np.vectorize(average2mod) # this create a new function 
average2mod_vectorized(df["a"], df["b"])

array([15., nan, 35.])

In [None]:
# the above method works well when you do not have the souce code for a function
# we can use a Python decorator to "automatically" vectorize the function without creating a new function
# Decorators are "functions" that take another function as input and modify how that function's output behaves

# to use the vectorize decorator, we have to use the @ before our function definition

In [46]:
@np.vectorize
def average2mod(x,y):
    if x == 20: # if x = 20, return NaN
        return np.NaN
    else : 
        return (x + y) / 2

In [47]:
average2mod(df["a"], df["b"])

array([15., nan, 35.])

## Vectorized Function - using numba

In [None]:
# the numba library is used to optimize Python code esp calculations on arrays performing math calculation

In [7]:
import numba
@numba.vectorize

def average_2_numba(x,y):
    if int(x) == 20: # we now have to add type information to our function
        return np.NaN
    else:
        return (x+y) / 2

In [8]:
average_2_numba(df["a"], df["b"])
# numba does not understand Pandas object and hence the error below

ValueError: [1mcannot determine Numba type of <class 'pandas.core.series.Series'>[0m

In [9]:
average_2_numba(df["a"].values, df["b"].values)

array([15., nan, 35.])

# Lambda Function

In [10]:
import re
import os

In [13]:
os.getcwd()

'c:\\Users\\tanzh\\Documents\\Python\\Pandas for Everyone'

In [14]:
os.listdir(os.getcwd())

['c10a_groupby_operation_large_dataset.ipynb',
 'c10_groupby_operation.ipynb',
 'c11_datetime_data_type.ipynb',
 'c1_pandas_dataframe_basic.ipynb',
 'c3_intro_to_plotting.ipynb',
 'c4_data_assembly.ipynb',
 'c9_apply.ipynb',
 'datasets']

In [16]:
docs  = pd.read_csv(r"C:\Users\tanzh\Documents\Python\Pandas for Everyone\datasets\doctors.csv")
docs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 1 columns):
William Hartnell (1963-66)    12 non-null object
dtypes: object(1)
memory usage: 224.0+ bytes


In [19]:
docs # we want to extract just the name, i.e. "Patrick Troughton" from "Patrick Troughton (1966-69)"

Unnamed: 0,William Hartnell (1963-66)
0,Patrick Troughton (1966-69)
1,Jon Pertwee (1970 74)
2,Tom Baker (1974-81)
3,Peter Davison (1982-84)
4,Colin Baker (1984-86)
5,Sylvester McCoy (1987-89)
6,Paul McGann (1996)
7,Christopher Eccleston (2005)
8,David Tennant (2005-10)
9,Matt Smith (2010-13)


In [23]:
pattern = re.compile(r"\w+\s+\w+")

In [24]:
def get_names(strin):
    return pattern.match(strin).group()

In [27]:
docs["name"] = docs["William Hartnell (1963-66)"].apply()

In [29]:
docs

Unnamed: 0,William Hartnell (1963-66),name
0,Patrick Troughton (1966-69),Patrick Troughton
1,Jon Pertwee (1970 74),Jon Pertwee
2,Tom Baker (1974-81),Tom Baker
3,Peter Davison (1982-84),Peter Davison
4,Colin Baker (1984-86),Colin Baker
5,Sylvester McCoy (1987-89),Sylvester McCoy
6,Paul McGann (1996),Paul McGann
7,Christopher Eccleston (2005),Christopher Eccleston
8,David Tennant (2005-10),David Tennant
9,Matt Smith (2010-13),Matt Smith


In [None]:
# the function above is a simple one-liner
# we can choose to write the above directly in the apply function. this method is called using the lambda function

In [31]:
docs["name_lambda"] = docs["William Hartnell (1963-66)"].apply(lambda x : pattern.match(x).group())

In [32]:
docs

Unnamed: 0,William Hartnell (1963-66),name,name_lambda
0,Patrick Troughton (1966-69),Patrick Troughton,Patrick Troughton
1,Jon Pertwee (1970 74),Jon Pertwee,Jon Pertwee
2,Tom Baker (1974-81),Tom Baker,Tom Baker
3,Peter Davison (1982-84),Peter Davison,Peter Davison
4,Colin Baker (1984-86),Colin Baker,Colin Baker
5,Sylvester McCoy (1987-89),Sylvester McCoy,Sylvester McCoy
6,Paul McGann (1996),Paul McGann,Paul McGann
7,Christopher Eccleston (2005),Christopher Eccleston,Christopher Eccleston
8,David Tennant (2005-10),David Tennant,David Tennant
9,Matt Smith (2010-13),Matt Smith,Matt Smith
