In [53]:
#####################################################################
# Examples below are (more or less) taken from 
# "Python for Data Analysis" by William Wesley McKinney (O’Reilly).
# Copyright 2012 William McKinney, 978-1-449-31979-3.
#
# TOPICS COVERED:
# - GroupBy
# - Data Aggregation
# - Group-wise Operations & Transformations
# - Pivot Tables & Cross-Tabulation
#####################################################################

# This notebook is written in Python 3.

from pandas import Series, DataFrame
import pandas as pd
import numpy as np
from numpy import nan as NA

## GroupBy

In [54]:
# Toy frame for the GroupBy examples: two label columns (key1, key2) and two
# columns of standard-normal draws. No seed is set, so the numeric values
# change on every run.
df = DataFrame(
    {
        'key1': ['a', 'a', 'b', 'b', 'a'],
        'key2': ['one', 'two', 'one', 'two', 'one'],
        'data1': np.random.randn(5),
        'data2': np.random.randn(5),
    }
)
df

Unnamed: 0,data1,data2,key1,key2
0,-0.40403,2.050861,a,one
1,0.212485,-0.451916,a,two
2,-2.221443,0.158235,b,one
3,1.05987,-0.729253,b,two
4,0.337107,-0.280594,a,one


In [55]:
# Split the data1 values by the labels found in key1; the result is a
# GroupBy object, and calling mean() on it averages each group.
grouped = df['data1'].groupby(by=df['key1'])
grouped.mean()

key1
a    0.048521
b   -0.580787
Name: data1, dtype: float64

In [56]:
# Grouping by a list of keys instead yields a result with a hierarchical
# (key1, key2) index.
df.groupby(['key1', 'key2'])['data1'].mean()

key1  key2
a     one    -0.033461
      two     0.212485
b     one    -2.221443
      two     1.059870
Name: data1, dtype: float64

In [57]:
# Compute the mean of every numeric column using group labels from 'key1'.
# numeric_only=True skips the string column key2; recent pandas versions raise
# a TypeError instead of silently dropping columns that cannot be averaged.
print(df.groupby('key1').mean(numeric_only=True))
print(df.groupby('key1').size())  # compute group sizes

         data1     data2
key1                    
a     0.048521  0.439450
b    -0.580787 -0.285509
key1
a    3
b    2
dtype: int64


In [58]:
# A GroupBy object is iterable: each step yields the group key (a tuple here,
# because two keys are used) together with that group's rows.
for keys, group in df.groupby(['key1', 'key2']):
    k1, k2 = keys
    print(k1, k2)
    print(group)

a one
      data1     data2 key1 key2
0 -0.404030  2.050861    a  one
4  0.337107 -0.280594    a  one
a two
      data1     data2 key1 key2
1  0.212485 -0.451916    a  two
b one
      data1     data2 key1 key2
2 -2.221443  0.158235    b  one
b two
     data1     data2 key1 key2
3  1.05987 -0.729253    b  two


## Data Aggregation

In [59]:
# Redisplay the sample frame before aggregating it.
df

Unnamed: 0,data1,data2,key1,key2
0,-0.40403,2.050861,a,one
1,0.212485,-0.451916,a,two
2,-2.221443,0.158235,b,one
3,1.05987,-0.729253,b,two
4,0.337107,-0.280594,a,one


In [60]:
# Group the whole frame by key1, then report the 90th percentile of data1
# within each group.
grouped = df.groupby(by='key1')
grouped['data1'].quantile(q=0.9)

key1
a    0.312183
b    0.731738
Name: data1, dtype: float64

In [61]:
# User-defined aggregation functions can be passed to agg() as well.
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum."""
    lowest, highest = arr.min(), arr.max()
    return highest - lowest

# Restrict the aggregation to the numeric columns: peak_to_peak subtracts
# values, which is undefined for the string column key2, and recent pandas
# versions raise instead of silently dropping such columns.
grouped[['data1', 'data2']].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.741137,2.502776
b,3.281313,0.887488


In [62]:
# describe() also works on a GroupBy object and produces summary statistics
# for each group.
group_summary = grouped.describe()
print(group_summary)

               data1     data2
key1                          
a    count  3.000000  3.000000
     mean   0.048521  0.439450
     std    0.396843  1.398149
     min   -0.404030 -0.451916
     25%   -0.095772 -0.366255
     50%    0.212485 -0.280594
     75%    0.274796  0.885134
     max    0.337107  2.050861
b    count  2.000000  2.000000
     mean  -0.580787 -0.285509
     std    2.320239  0.627548
     min   -2.221443 -0.729253
     25%   -1.401115 -0.507381
     50%   -0.580787 -0.285509
     75%    0.239541 -0.063637
     max    1.059870  0.158235


In [63]:
# Load the tipping data set and append tip_pct, the tip expressed as a
# fraction of the total bill.
tips = pd.read_csv("Datasets/tips.csv").assign(
    tip_pct=lambda frame: frame['tip'] / frame['total_bill']
)
tips.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [64]:
grouped = tips.groupby(['sex', 'smoker'])
grouped_pct = grouped['tip_pct']  # pull out tip_pct column

# Passing a list of functions results in a DataFrame with one column per
# function. Only the last expression of a cell is displayed automatically, so
# this intermediate result is printed explicitly rather than discarded.
print(grouped_pct.agg(['mean', 'std', peak_to_peak]))

# The column names can also be changed by passing (name, function) pairs.
grouped_pct.agg([('mean', 'mean'), ('std dev', 'std'), ('range', peak_to_peak)])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std dev,range
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,No,0.156921,0.036421,0.195876
Female,Yes,0.18215,0.071595,0.360233
Male,No,0.160669,0.041849,0.220186
Male,Yes,0.152771,0.090588,0.674707


In [65]:
# Aggregation can be done over more than one column at a time.
functions = ['mean', 'max']
# Select the columns with a list: indexing a GroupBy with a bare tuple of
# names (grouped['a', 'b']) is no longer accepted by current pandas.
grouped[['tip_pct', 'total_bill']].agg(functions)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,max,mean,max
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Female,No,0.156921,0.252672,18.105185,35.83
Female,Yes,0.18215,0.416667,17.977879,44.3
Male,No,0.160669,0.29199,19.791237,48.33
Male,Yes,0.152771,0.710345,22.2845,50.81


In [66]:
# A dict maps each column to the function (or list of functions) that should
# be applied to it.
grouped.agg(
    {
        'tip': ['min', 'max'],
        'size': 'sum',
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,tip,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,sum
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,No,1.0,5.2,140
Female,Yes,1.0,6.5,74
Male,No,1.25,9.0,263
Male,Yes,1.0,10.0,150


## Group-wise Operations & Transformations

In [67]:
# 5x5 frame of standard-normal draws, indexed by name.
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Add a few NA values: blank out columns b and c for the row at position 2.
# The removed .ix indexer is replaced by .loc, with the positional row slice
# translated to labels via people.index.
people.loc[people.index[2:3], ['b', 'c']] = NA
people

Unnamed: 0,a,b,c,d,e
Joe,-1.009083,1.679127,-0.403593,0.724266,0.920267
Steve,1.41922,0.304574,-0.130188,1.226571,-0.176429
Wes,0.63002,,,0.401222,0.570864
Jim,-0.611941,-0.042945,-1.070722,-1.689273,-0.43795
Travis,-0.612303,0.063867,0.245224,0.883025,-0.423519


In [68]:
# A plain list of labels (one per row) can serve as the grouping key.
key = ['one', 'two', 'one', 'two', 'one']
people.groupby(by=key).mean()

Unnamed: 0,a,b,c,d,e
one,-0.330455,0.871497,-0.079185,0.669504,0.355871
two,0.40364,0.130815,-0.600455,-0.231351,-0.307189


In [69]:
# transform applies a function to each group, then places the results in
# the appropriate locations of a frame shaped like the original.
# The string alias 'mean' is used because passing the NumPy callable np.mean
# to transform is deprecated in recent pandas versions.
people.groupby(key).transform('mean')

Unnamed: 0,a,b,c,d,e
Joe,-0.330455,0.871497,-0.079185,0.669504,0.355871
Steve,0.40364,0.130815,-0.600455,-0.231351,-0.307189
Wes,-0.330455,0.871497,-0.079185,0.669504,0.355871
Jim,0.40364,0.130815,-0.600455,-0.231351,-0.307189
Travis,-0.330455,0.871497,-0.079185,0.669504,0.355871


In [70]:
# Back to the tipping example: say we want the top 5 tip_pct values by group.
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Rows come back in ascending order of ``column``, so the largest value is
    last.

    Parameters
    ----------
    df : DataFrame
        Frame to select rows from.
    n : int, default 5
        Number of rows to keep.
    column : str, default 'tip_pct'
        Name of the column to rank by.
    """
    # sort_values replaces the removed sort_index(by=...) spelling. tail(n) is
    # used rather than [-n:] so that n=0 yields an empty frame instead of
    # every row.
    return df.sort_values(by=column).tail(n)

# Apply top to the whole tips table with its defaults (n=5, column='tip_pct').
top(tips)

  app.launch_new_instance()


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [71]:
# apply() runs top once per smoker group and then glues the per-group
# frames back together (as pd.concat would).
tips.groupby(by='smoker').apply(top)

  app.launch_new_instance()


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [72]:
# Extra keyword arguments given to apply() are forwarded to the function
# being applied.
tips.groupby(['smoker', 'day']).apply(
    top,
    n=1,
    column='total_bill',
)

  app.launch_new_instance()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
No,Fri,94,22.75,3.25,Female,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,Male,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,Male,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Male,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Male,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Female,Yes,Thur,Lunch,4,0.115982


## Pivot Tables & Cross-Tabulation

In [73]:
# Create a pivot table; if not specified, the aggregation function is mean.
# index sets the hierarchical row index. The numeric columns are listed
# explicitly in values because a mean over the string columns (day, time) is
# undefined, and recent pandas versions raise on it instead of dropping them.
tips.pivot_table(values=['size', 'tip', 'tip_pct', 'total_bill'],
                 index=['sex', 'smoker'])

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,No,2.592593,2.773519,0.156921,18.105185
Female,Yes,2.242424,2.931515,0.18215,17.977879
Male,No,2.71134,3.113402,0.160669,19.791237
Male,Yes,2.5,3.051167,0.152771,22.2845


In [74]:
# Restrict the table to tip_pct and size, with sex and day as the
# hierarchical row index and smoker spread across the columns.
# margins=True adds the "All" row and column of marginal statistics.
tips.pivot_table(
    values=['tip_pct', 'size'],
    index=['sex', 'day'],
    columns='smoker',
    margins=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,size,size,size
Unnamed: 0_level_1,smoker,No,Yes,All,No,Yes,All
sex,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Female,Fri,0.165296,0.209129,0.199388,2.5,2.0,2.111111
Female,Sat,0.147993,0.163817,0.15647,2.307692,2.2,2.25
Female,Sun,0.16571,0.237075,0.181569,3.071429,2.5,2.944444
Female,Thur,0.155971,0.163073,0.157525,2.48,2.428571,2.46875
Male,Fri,0.138005,0.14473,0.143385,2.0,2.125,2.1
Male,Sat,0.162132,0.139067,0.151577,2.65625,2.62963,2.644068
Male,Sun,0.158291,0.173964,0.162344,2.883721,2.6,2.810345
Male,Thur,0.165706,0.164417,0.165276,2.5,2.3,2.433333
All,,0.159328,0.163196,0.160803,2.668874,2.408602,2.569672


In [75]:
# aggfunc selects a different aggregation function; len gives a count per cell.
tips.pivot_table(
    values='tip_pct',
    index=['sex', 'smoker'],
    columns='day',
    aggfunc=len,
)

Unnamed: 0_level_0,day,Fri,Sat,Sun,Thur
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,No,2,13,14,25
Female,Yes,7,15,4,7
Male,No,2,32,43,20
Male,Yes,8,27,15,10


In [76]:
# Sum of party sizes by (time, sex, smoker) and day. The same table is printed
# twice: first as-is, then with fill_value=0 replacing the NAs.
size_pivot_args = dict(values='size',
                       index=['time', 'sex', 'smoker'],
                       columns='day',
                       aggfunc='sum')
print(tips.pivot_table(**size_pivot_args))
print(tips.pivot_table(fill_value=0, **size_pivot_args))

day                   Fri  Sat  Sun  Thur
time   sex    smoker                     
Dinner Female No        2   30   43     2
              Yes       8   33   10   NaN
       Male   No        4   85  124   NaN
              Yes      12   71   39   NaN
Lunch  Female No        3  NaN  NaN    60
              Yes       6  NaN  NaN    17
       Male   No      NaN  NaN  NaN    50
              Yes       5  NaN  NaN    23
day                   Fri  Sat  Sun  Thur
time   sex    smoker                     
Dinner Female No        2   30   43     2
              Yes       8   33   10     0
       Male   No        4   85  124     0
              Yes      12   71   39     0
Lunch  Female No        3    0    0    60
              Yes       6    0    0    17
       Male   No        0    0    0    50
              Yes       5    0    0    23


In [77]:
# Cross-tabulation: count rows for each (time, day) pair against smoker,
# with margins=True adding the "All" totals.
pd.crosstab(
    index=[tips['time'], tips['day']],
    columns=tips['smoker'],
    margins=True,
)

Unnamed: 0_level_0,smoker,No,Yes,All
time,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dinner,Fri,3,9,12
Dinner,Sat,45,42,87
Dinner,Sun,57,19,76
Dinner,Thur,1,0,1
Lunch,Fri,1,6,7
Lunch,Thur,44,17,61
All,,151,93,244
