In [1]:
import pandas
# Read the passenger records from titanic.csv (path is relative to the working directory)
titanic = pandas.read_csv(filepath_or_buffer='titanic.csv')

In [2]:
# apply a standard aggregation operation such as computing the mean
(
    titanic
    .groupby('Sex')['Survived']
    .agg('mean')
)

Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

In [3]:
# We can also create our own custom aggregation function that takes as input a group of values to summarize
# Initially we just return 42 as a dummy value and use a print statement to better understand the shape of data passed to our function for each gender
def survival_percentage(group):
    """Placeholder aggregation that reports its input and returns a dummy value.

    Prints the name, type and shape of ``group`` so the data handed to the
    function can be inspected, then always returns 42.
    """
    received = (group.name, type(group), group.shape)
    print(*received)
    return 42

In [4]:
# Apply our custom aggregation function to the Survived column, called separately for each gender
# Notice how our function gets called twice, once for the female group with 314 rows and once for the male group with 577 rows.
(
    titanic
    .groupby('Sex')['Survived']
    .agg(['mean', survival_percentage])
)

Survived <class 'pandas.core.series.Series'> (314,)
Survived <class 'pandas.core.series.Series'> (577,)


Unnamed: 0_level_0,mean,survival_percentage
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.742038,42
male,0.188908,42


In [5]:
# we now change our custom aggregation function to compute a more sensible survival percentage by counting the number of survivors
def survival_percentage(group):
    """Return the fraction of entries in ``group`` that are equal to 1.

    The matching entries are counted with an explicit loop and the count is
    divided by the total number of entries.
    """
    total = len(group)
    survived_count = 0
    for outcome in group:
        survived_count += 1 if outcome == 1 else 0
    return survived_count / total

In [6]:
# Both columns should now agree: the custom function computes the same ratio as the built-in mean
(
    titanic
    .groupby('Sex')['Survived']
    .agg(['mean', survival_percentage])
)

Unnamed: 0_level_0,mean,survival_percentage
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.742038,0.742038
male,0.188908,0.188908


In [7]:
# Modify our function to format the result as a percentage to 1 decimal place, with a % character appended
def survival_percentage(group):
    """Return the share of entries in ``group`` equal to 1 as a percentage string.

    The result is formatted to one decimal place and followed by ``%``.
    Matching entries are still counted with an explicit loop.
    """
    survived_count = 0
    for outcome in group:
        if outcome == 1:
            survived_count = survived_count + 1
    percent = 100 * survived_count / len(group)
    return f"{percent:.1f}%"

In [8]:
# The custom column is now a formatted percentage string rather than a number
(
    titanic
    .groupby('Sex')['Survived']
    .agg(['mean', survival_percentage])
)

Unnamed: 0_level_0,mean,survival_percentage
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.742038,74.2%
male,0.188908,18.9%


In [9]:
# Generally we should use a for loop as a last resort when processing Pandas data frames.
# We should instead use filtering and the len function to compute the number of survivors
def survival_percentage(group):
    """Return the share of entries in ``group`` equal to 1 as a percentage string.

    Parameters
    ----------
    group : pandas.Series
        Values to summarise; entries equal to 1 are counted as survivors.

    Returns
    -------
    str
        The percentage to one decimal place followed by ``%`` (e.g. ``"74.2%"``),
        or ``"nan%"`` when ``group`` is empty.
    """
    total = len(group)
    if total == 0:
        # An empty group has no defined rate; report NaN instead of raising ZeroDivisionError
        return "nan%"
    # Boolean filtering keeps only the survivors, so len() gives their count without a loop
    survivors = len(group[group == 1])
    return f"{100 * survivors / total:.1f}%"

In [10]:
# The loop-free version should produce the same output as the loop-based one
(
    titanic
    .groupby('Sex')['Survived']
    .agg(['mean', survival_percentage])
)

Unnamed: 0_level_0,mean,survival_percentage
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.742038,74.2%
male,0.188908,18.9%


In [11]:
# Next we are going to apply a new custom function to each group, but this time our function will receive all columns from which to compute a result
def foo(group):
    """Placeholder function that reports its input and returns a dummy value.

    Prints the name, type and shape of ``group`` on one line, separated by
    spaces, then always returns 42.
    """
    parts = (group.name, type(group), group.shape)
    print(' '.join(str(part) for part in parts))
    return 42

In [12]:
# Notice how our function gets called twice, once for the female group with 314 rows and 12 columns and again for the male group with 577 rows and 12 columns
(
    titanic
    .groupby('Sex')
    .apply(foo)
)

female <class 'pandas.core.frame.DataFrame'> (314, 12)
male <class 'pandas.core.frame.DataFrame'> (577, 12)


Sex
female    42
male      42
dtype: int64

In [13]:
# We can then access any of the columns to perform our custom calculations, for example to compute the mean of the Age column
def foo(group):
    """Return the mean of the ``Age`` column of ``group``."""
    ages = group['Age']
    return ages.mean()

In [14]:
# One value per gender: the mean age computed by foo
(
    titanic
    .groupby('Sex')
    .apply(foo)
)

Sex
female    27.915709
male      30.726645
dtype: float64

In [15]:
# Our function can also return a row containing multiple values, and we can assign a name to each column.
# In this simple example we always return 12 as the value for column P and 45 for column Q
def foo(group):
    """Return the same two-value row, labelled ``P`` and ``Q``, for every group."""
    row = {'P': 12, 'Q': 45}
    return pandas.Series(row)

In [16]:
# Each Series returned by foo becomes one row; its index labels become the column names
(
    titanic
    .groupby('Sex')
    .apply(foo)
)

Unnamed: 0_level_0,P,Q
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,12,45
male,12,45


In [17]:
# More sensibly the values returned would be some kind of calculation, for example the mean age and survival rate
def foo(group):
    """Summarise ``group`` as a row holding the mean of ``Age`` and the mean of ``Survived``."""
    summary = {
        'avgAge': group['Age'].mean(),
        'survived': group['Survived'].mean(),
    }
    return pandas.Series(summary)

In [18]:
# One row per gender, with the mean age and survival rate as columns
(
    titanic
    .groupby('Sex')
    .apply(foo)
)

Unnamed: 0_level_0,avgAge,survived
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,27.915709,0.742038
male,30.726645,0.188908


In [19]:
# The primary reason we use the .apply method rather than the .agg method is that with .apply our function can access multiple columns (e.g. Age and Survived in the following example)
def foo(group):
    """Combine two columns: the mean of ``Age`` multiplied by the mean of ``Survived``.

    The product is returned as a one-value row labelled ``sfsff``.
    """
    mean_age = group['Age'].mean()
    survival_rate = group['Survived'].mean()
    return pandas.Series({'sfsff': mean_age * survival_rate})

In [20]:
# One row per gender, holding the single value computed from both columns
(
    titanic
    .groupby('Sex')
    .apply(foo)
)

Unnamed: 0_level_0,sfsff
Sex,Unnamed: 1_level_1
female,20.714523
male,5.804513


In [21]:
# The custom function can also return multiple rows (as well as multiple columns)
# For example if our function returned the group of rows and columns passed to it unchanged as the result, then the apply function will also return the original data frame unchanged
def foo(group):
    """Identity function: hand back the rows and columns received, unmodified."""
    unchanged = group
    return unchanged

In [22]:
# Of course, this would be more interesting if our function returned a group that has been modified or processed in some way
(
    titanic
    .groupby('Sex')
    .apply(foo)
)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [23]:
# Try creating some examples of your own ...