# pandas groupby & aggregation

In [3]:
import pandas as pd

sales = pd.DataFrame(
    {
        'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],
        'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],
        'bread': [139, 237, 326, 456],
        'butter': [20, 45, 70, 98]
    }
)

sales

Unnamed: 0,weekday,city,bread,butter
0,Sun,Austin,139,20
1,Sun,Dallas,237,45
2,Mon,Austin,326,70
3,Mon,Dallas,456,98


### Groupby and count

In [2]:
sales.groupby('weekday').count()

Unnamed: 0_level_0,bread,butter,city
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mon,2,2,2
Sun,2,2,2


In [1]:
# Aggregation functions:

# mean()
# std()
# sum()
# first(), last() # first or last row in the group
# min(), max()

Total amount of bread sold on each day

In [4]:
sales.groupby('weekday')['bread'].sum()

weekday
Mon    782
Sun    376
Name: bread, dtype: int64

In [5]:
sales.groupby('weekday').bread.sum()

weekday
Mon    782
Sun    376
Name: bread, dtype: int64

Total amount of bread and butter

In [6]:
sales.groupby('weekday')[['bread', 'butter']].sum()

Unnamed: 0_level_0,bread,butter
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,782,168
Sun,376,65


Groupby by multi-index

In [7]:
sales.groupby(['city', 'weekday']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,bread,butter
city,weekday,Unnamed: 2_level_1,Unnamed: 3_level_1
Austin,Mon,326,70
Austin,Sun,139,20
Dallas,Mon,456,98
Dallas,Sun,237,45


Lets make a series: Customers, which show who made the purchases in our sales DF

In [8]:
customers = pd.Series(['Dave', 'Alice', 'Bob', 'Alice'])
customers

0     Dave
1    Alice
2      Bob
3    Alice
dtype: object

Now lets group by customers to see how much each customer bought. Notice 'Alice' is in the list twice but her purchases are summed.

In [9]:
sales.groupby(customers).bread.sum()

Alice    693
Bob      326
Dave     139
Name: bread, dtype: int64

## Categorical data

To get unique values in each column or index:

In [10]:
sales.weekday.unique()

array(['Sun', 'Mon'], dtype=object)

To see how many times each entry occured in the column use value_counts() method

In [11]:
sales.weekday.value_counts()

Change the type of the data to 'category' with .astype() method

In [12]:
sales.weekday = sales.weekday.astype('category')

sales.weekday

0    Sun
1    Sun
2    Mon
3    Mon
Name: weekday, dtype: category
Categories (2, object): [Mon, Sun]

Creating categories uses less memory and speeds up groupby() operations

### Titanic dataset

In [4]:
filepath = 'csv_files/titanic.csv'

titanic = pd.read_csv(filepath, delimiter='\t')

titanic.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [14]:
titanic.columns = titanic.columns.str.lower() # lets mark the column labels with lower cases

In [15]:
titanic.head(2)

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [16]:
titanic.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [17]:
titanic.pclass.unique() # there were 3 different classes on Titanic

array([3, 1, 2])

Lets find how many people from each class survived

In [18]:
surv_by_class = titanic.groupby('pclass')['survived'].count()
surv_by_class

pclass
1    30
2    30
3    96
Name: survived, dtype: int64

In [19]:
titanic.embarked.unique() # three different ports where people came from:
                        # S: England, C: France, Q: Ireland

array(['S', 'C', 'Q', nan], dtype=object)

Lets find the number of survivors from each port people came from

In [20]:
surv_by_port_class = titanic.groupby(['embarked', 'pclass']).survived.count()
surv_by_port_class

embarked  pclass
C         1         12
          2          4
          3         16
Q         3         13
S         1         17
          2         26
          3         67
Name: survived, dtype: int64

In [21]:
oldest = titanic.age.max()
youngest = titanic.age.min()

print('The oldest person on Titanic was %d and youngest %d years old.' % (oldest, youngest))

The oldest person on Titanic was 71 and youngest 0 years old.


### Groupy and aggregation

Maximum purchase in either city of bread and butter. This is a single aggregation.

In [22]:
sales.groupby('city')[['bread', 'butter']].max()

Unnamed: 0_level_0,bread,butter
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Austin,326,70
Dallas,456,98


One can do multiple aggregations with '.agg()' method:

In [23]:
sales.groupby('city')[['bread', 'butter']].agg(['max', 'sum'])

Unnamed: 0_level_0,bread,bread,butter,butter
Unnamed: 0_level_1,max,sum,max,sum
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Austin,326,465,70,90
Dallas,456,693,98,143


### Custom aggregation

In [24]:
def data_range(series):
    return series.max() - series.min()

sales.groupby('weekday')[['bread', 'butter']].agg(data_range)

Unnamed: 0_level_0,bread,butter
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,130,28
Sun,98,25


We can use dicitonaries as well

In [25]:
sales.groupby(customers)[['bread', 'butter']].agg({'bread': 'sum', 'butter': data_range})

Unnamed: 0,bread,butter
Alice,693,53
Bob,326,0
Dave,139,0


In [26]:
aggregated = titanic.groupby('pclass')[['age', 'fare']].agg(['max', 'median'])
aggregated

Unnamed: 0_level_0,age,age,fare,fare
Unnamed: 0_level_1,max,median,max,median
pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,71.0,38.0,263.0,61.6792
2,66.0,29.0,73.5,21.0
3,70.5,22.0,56.4958,8.05


In [27]:
aggregated.loc[:, ('fare', 'median')]

pclass
1    61.6792
2    21.0000
3     8.0500
Name: (fare, median), dtype: float64

### Convert datetime to string

In [5]:
path = 'csv_files/aapl.csv'

aapl_df = pd.read_csv(path, usecols=['Date', 'Close', 'High'], index_col='Date', parse_dates=True)

aapl_df.head(3)

Unnamed: 0_level_0,High,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2008-10-14,116.4,104.08
2008-10-13,110.53,110.26
2008-10-10,100.0,96.8


transform the index datetime values to abbreviated days of the week with '.strftime('%a')' method

In [29]:
by_day = aapl_df.groupby(aapl_df.index.strftime('%a'))

units_sum = by_day['High'].sum()

units_sum

Fri    10192.27
Mon     9731.65
Thu    10224.55
Tue    10150.78
Wed    10414.92
Name: High, dtype: float64

### Groupby and filtering

In [6]:
file_path = 'csv_files/mpg.csv'

df_auto = pd.read_csv(file_path)

df_auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


#### Mean MPG by year

In [35]:
df_auto.groupby('model_year')['mpg'].mean()

model_year
70    17.689655
71    21.250000
72    18.714286
73    17.100000
74    22.703704
75    20.266667
76    21.573529
77    23.375000
78    24.061111
79    25.093103
80    33.696552
81    30.334483
82    31.709677
Name: mpg, dtype: float64

What if we want the yearly average only for cars built by Chervolet? To do this, we need to filter the groups before aggregating:

In [36]:
# we save the output of gtoupy() with splitting before aggregating
splitting = df_auto.groupby('model_year')

type(splitting)

pandas.core.groupby.DataFrameGroupBy

In [37]:
type(splitting.groups)

dict

In [38]:
splitting.groups.keys()

# the keys are the years and values are the corresponding rows

dict_keys([70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82])

In [41]:
# we can iterate over the splitting object and carry out computations

for group_name, group in splitting:
    avg = group['mpg'].mean()
    print(group_name, avg)

70 17.689655172413794
71 21.25
72 18.714285714285715
73 17.1
74 22.703703703703702
75 20.266666666666666
76 21.573529411764707
77 23.375
78 24.061111111111114
79 25.09310344827585
80 33.69655172413793
81 30.334482758620695
82 31.70967741935484


In [42]:
# we can do the same thing now only for Chevrolet

for group_name, group in splitting:
    avg = group.loc[group['name'].str.contains('chevrolet'), 'mpg'].mean()
    print(group_name, avg)

70 15.666666666666666
71 20.25
72 15.333333333333334
73 14.833333333333334
74 18.666666666666668
75 17.666666666666668
76 23.25
77 20.25
78 23.233333333333334
79 21.666666666666668
80 30.05
81 23.5
82 29.0


We can write the same thing as dictionary comperhension:

In [43]:
chevy_means = {year: group.loc[group['name'].str.contains('chevrolet'), 'mpg'].mean() 
               for year, group in splitting}
pd.Series(chevy_means)

70    15.666667
71    20.250000
72    15.333333
73    14.833333
74    18.666667
75    17.666667
76    23.250000
77    20.250000
78    23.233333
79    21.666667
80    30.050000
81    23.500000
82    29.000000
dtype: float64

#### One to all Boolean comparison

In [46]:
chevy = df_auto['name'].str.contains('chevrolet')

df_auto.groupby(['model_year', chevy])['mpg'].mean()

# this produces boolean column that is true if each automobile is manufactured by chevrolet
# we do a multi-level groupby by model year and chevy. Chaning after the mean() method will
# produces a yearly average mpg rating for chervolet and for all automobiles. We see that 
# Chevrolet produced worse 'mpg' rating for cars each year than its competitors, except in 1976.

model_year  name 
70          False    17.923077
            True     15.666667
71          False    21.416667
            True     20.250000
72          False    19.120000
            True     15.333333
73          False    17.500000
            True     14.833333
74          False    23.208333
            True     18.666667
75          False    20.555556
            True     17.666667
76          False    21.350000
            True     23.250000
77          False    23.895833
            True     20.250000
78          False    24.136364
            True     23.233333
79          False    25.488462
            True     21.666667
80          False    33.966667
            True     30.050000
81          False    30.578571
            True     23.500000
82          False    32.111111
            True     29.000000
Name: mpg, dtype: float64

### apply() method

By using .apply(), you can write functions that filter rows within groups. The .apply() method will handle the iteration over individual groups and then re-combine them back into a Series or DataFrame.

In [47]:
titanic.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In this exercise you'll take the Titanic data set and analyze survival rates from the 'C' deck, which contained the most passengers. To do this you'll group the dataset by 'sex' and then use the .apply() method on a provided user defined function which calculates the mean survival rates on the 'C' deck:

In [52]:
# create a function that calculates the mean survival rates on the 'C' deck
def c_deck_survival(gr):
    
    c_passengers = gr['cabin'].str.startswith('C').fillna(False)
    
    return gr.loc[c_passengers, 'survived'].mean()

In [53]:
# creating the grouby object by_sex
by_sex = titanic.groupby('sex')

by_sex.apply(c_deck_survival)

sex
female    1.0
male      0.2
dtype: float64

#### Grouping and filtering with .filter()

You can use groupby with the .filter() method to remove whole groups of rows from a DataFrame based on a boolean condition.

In [54]:
sales.head()

Unnamed: 0,bread,butter,city,weekday
0,139,20,Austin,Sun
1,237,45,Dallas,Sun
2,326,70,Austin,Mon
3,456,98,Dallas,Mon
