# Chapter 10. Data Aggregation and Group Operations

In [5]:
import pandas as pd
import numpy as np

In [6]:
# Small demo frame: two label columns (key1, key2) plus two columns of
# standard-normal noise. No seed is set, so the numbers differ on every run.
df = pd.DataFrame(
    {
        'key1': list('aabba'),
        'key2': ['one', 'two', 'one', 'two', 'one'],
        'data1': np.random.randn(5),
        'data2': np.random.randn(5),
    }
)
df

Unnamed: 0,data1,data2,key1,key2
0,2.131483,-0.370842,a,one
1,-0.521067,-1.068204,a,two
2,-1.419241,0.496776,b,one
3,-0.051054,0.14372,b,two
4,-1.398996,0.629351,a,one


In [7]:
# Group the data1 values by the labels held in key1.
grouped = df['data1'].groupby(by=df['key1'])

# No aggregation has been requested yet, so displaying `grouped` only shows
# the GroupBy object itself rather than any computed result.
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x1081f8748>

In [8]:
# Mean of the values in each group of `grouped`.
grouped.mean()

key1
a    0.070473
b   -0.735148
Name: data1, dtype: float64

In [9]:
# Aggregate by several keys at once: the result is indexed by (key1, key2).
means = (
    df['data1']
    .groupby(by=[df['key1'], df['key2']])
    .mean()
)
# Pivot the innermost index level into columns for a 2-D view.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.366243,-0.521067
b,-1.419241,-0.051054


In [10]:
# Grouping keys do not have to come from the frame itself: any newly supplied
# arrays work as long as their length matches the data.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])

(
    df['data1']
    .groupby(by=[states, years])
    .mean()
    .unstack()
)

Unnamed: 0,2005,2006
California,-0.521067,-1.419241
Ohio,1.040214,-1.398996


In [11]:
# When the keys are columns of the same DataFrame, passing the column names
# is enough; the key Series do not have to be spelled out.
df.groupby(by=['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.366243,0.129254
a,two,-0.521067,-1.068204
b,one,-1.419241,0.496776
b,two,-0.051054,0.14372


In [12]:
# GroupBy objects support iteration: each step yields the group key
# (a 2-tuple here, one element per grouping column) and that group's rows.
for (first_key, second_key), subframe in df.groupby(['key1', 'key2']):
    print(first_key, second_key)
    print(subframe)

a one
      data1     data2 key1 key2
0  2.131483 -0.370842    a  one
4 -1.398996  0.629351    a  one
a two
      data1     data2 key1 key2
1 -0.521067 -1.068204    a  two
b one
      data1     data2 key1 key2
2 -1.419241  0.496776    b  one
b two
      data1    data2 key1 key2
3 -0.051054  0.14372    b  two


In [13]:
# Convert the groups into a dict of {key1 value: sub-frame}.
dic = {label: subframe for label, subframe in df.groupby('key1')}
dic['b']

Unnamed: 0,data1,data2,key1,key2
2,-1.419241,0.496776,b,one
3,-0.051054,0.14372,b,two


In [14]:
# Split the frame column-wise into one sub-frame per dtype.
# `DataFrame.groupby(..., axis=1)` is deprecated in recent pandas, so instead
# group the column *labels* by their dtype and select each group's columns.
{
    dtype: df[list(labels)]
    for dtype, labels in df.columns.to_series().groupby(df.dtypes).groups.items()
}

{dtype('float64'):       data1     data2
 0  2.131483 -0.370842
 1 -0.521067 -1.068204
 2 -1.419241  0.496776
 3 -0.051054  0.143720
 4 -1.398996  0.629351, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

In [15]:
 people = pd.DataFrame(np.random.randn(5, 5),columns=['a', 'b', 'c', 'd', 'e'],index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])

people.iloc[2,3:5]=np.nan

# columnをmappingして集計
mapping = {'a': 'red', 'b': 'red', 'c': 'blue','d': 'blue', 'e': 'red', 'f' : 'orange'}
# countはsizeと違って、nanを含まない
people.groupby(mapping,axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


In [16]:
# Group by a value derived from the index: the function (here `len`) is
# applied to each index label and its return value becomes the group key.
people.groupby(by=len).count()

Unnamed: 0,a,b,c,d,e
3,3,3,3,2,2
5,1,1,1,1,1
6,1,1,1,1,1


In [43]:
# Apply an arbitrary user-defined function to grouped data.
def diff(x):
    """Return the spread (max - min) of the values in ``x``."""
    return x.max() - x.min()


# Rebuild the demo frame and index it by key1 in a single expression, so the
# name `df` is bound exactly once in this cell.
df = pd.DataFrame(
    {
        'key1': list('aabba'),
        'key2': ['one', 'two', 'one', 'two', 'one'],
        'data1': np.random.randn(5),
        'data2': np.random.randn(5),
    }
).set_index('key1')

# 'key1' here refers to the index level of that name.
# Select the numeric columns explicitly: key2 holds strings, for which
# max - min is undefined, and recent pandas raises instead of silently
# dropping such columns. The redundant (and now deprecated) axis=0 is omitted.
spread_by_key1 = df.groupby('key1')[['data1', 'data2']].agg(diff)
spread_by_key1

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.885399,0.544122
b,2.364502,1.756367


In [69]:
# Load the tips data to analyse whether tipping varies by day and by
# whether the party included a smoker.
tips = pd.read_csv('./examples/tips.csv')

# Tip expressed as a fraction of the total bill.
tips['tips_pct'] = tips['tip'].div(tips['total_bill'])
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tips_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


In [74]:
grouped = tips.groupby(['day', 'smoker'])
# Per-group mean of every numeric column.
# numeric_only=True keeps non-numeric columns out of the aggregation;
# recent pandas raises on them instead of dropping them silently.
grouped.mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,size,tips_pct
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,18.42,2.8125,2.25,0.15165
Fri,Yes,16.813333,2.714,2.066667,0.174783
Sat,No,19.661778,3.102889,2.555556,0.158048
Sat,Yes,21.276667,2.875476,2.47619,0.147906
Sun,No,20.506667,3.167895,2.929825,0.160113
Sun,Yes,24.12,3.516842,2.578947,0.18725
Thur,No,17.113111,2.673778,2.488889,0.160298
Thur,Yes,19.190588,3.03,2.352941,0.163863


In [81]:
# Focus on tips_pct only and report, per group, the mean, the standard
# deviation and the max-min spread.
grouped_pct = grouped['tips_pct']
max_min = lambda x: x.max() - x.min()
# A (name, function) tuple labels the resulting column 'min_max'.
grouped_pct.agg(
    [
        'mean',
        'std',
        ('min_max', max_min),
    ]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,min_max
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


In [86]:
# Apply the same list of functions to each selected column.
# A (name, function) tuple labels that result column with `name`.
funcs = ['min', 'max', 'mean', 'count', ('min_max', max_min)]
# Select several columns with a list (double brackets): indexing a GroupBy
# with a bare tuple of keys was deprecated and later removed from pandas.
grouped[['tips_pct', 'total_bill']].agg(funcs)

Unnamed: 0_level_0,Unnamed: 1_level_0,tips_pct,tips_pct,tips_pct,tips_pct,tips_pct,total_bill,total_bill,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,count,min_max,min,max,mean,count,min_max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
Fri,No,0.120385,0.187735,0.15165,4,0.067349,12.46,22.75,18.42,4,10.29
Fri,Yes,0.103555,0.26348,0.174783,15,0.159925,5.75,40.17,16.813333,15,34.42
Sat,No,0.056797,0.29199,0.158048,45,0.235193,7.25,48.33,19.661778,45,41.08
Sat,Yes,0.035638,0.325733,0.147906,42,0.290095,3.07,50.81,21.276667,42,47.74
Sun,No,0.059447,0.252672,0.160113,57,0.193226,8.77,48.17,20.506667,57,39.4
Sun,Yes,0.06566,0.710345,0.18725,19,0.644685,7.25,45.35,24.12,19,38.1
Thur,No,0.072961,0.266312,0.160298,45,0.19335,7.51,41.19,17.113111,45,33.68
Thur,Yes,0.090014,0.241255,0.163863,17,0.15124,10.34,43.11,19.190588,17,32.77


In [91]:
# Apply a different function to each column: a dict maps column name to the
# aggregation to use for that column.
grouped.agg(
    {
        'tips_pct': 'mean',
        'size': 'count',
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,tips_pct,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,4
Fri,Yes,0.174783,15
Sat,No,0.158048,45
Sat,Yes,0.147906,42
Sun,No,0.160113,57
Sun,Yes,0.18725,19
Thur,No,0.160298,45
Thur,Yes,0.163863,17


In [99]:
def top(df, n=5, column='tips_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Parameters
    ----------
    df : DataFrame
        Frame to select rows from.
    n : int, default 5
        Number of rows to keep.
    column : str, default 'tips_pct'
        Column to rank by.

    Returns
    -------
    DataFrame
        The selected rows in ascending order of ``column`` (largest last).
    """
    # .tail(n) rather than [-n:]: with n == 0 the slice [-0:] is [0:] and
    # would return the whole frame instead of an empty one.
    return df.sort_values(by=column).tail(n)

# From each smoker group, take the top 5 rows (top's default n) ranked by `size`.
tips.groupby('smoker').apply(top,column='size')

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tips_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,155,29.85,5.14,No,Sun,Dinner,5,0.172194
No,143,27.05,5.0,No,Thur,Lunch,6,0.184843
No,141,34.3,6.7,No,Thur,Lunch,6,0.195335
No,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,125,29.8,4.2,No,Thur,Lunch,6,0.14094
Yes,180,34.65,3.68,Yes,Sun,Dinner,4,0.106205
Yes,219,30.14,3.09,Yes,Sat,Dinner,4,0.102522
Yes,56,38.01,3.0,Yes,Sat,Dinner,4,0.078927
Yes,216,28.15,3.0,Yes,Sat,Dinner,5,0.106572
Yes,187,30.46,2.0,Yes,Sun,Dinner,5,0.06566


In [109]:
# Random sampling with replacement from each group.
def sample(df, n=10, random_state=None):
    """Return ``n`` rows drawn at random from ``df`` with replacement.

    Parameters
    ----------
    df : DataFrame
        Frame to draw rows from.
    n : int, default 10
        Number of rows to draw. Because sampling is with replacement, this
        may exceed ``len(df)``.
    random_state : optional
        Passed straight through to ``DataFrame.sample``. Supply a seed to
        make the draw reproducible; the default ``None`` keeps the original
        unseeded behaviour.

    Returns
    -------
    DataFrame
        The sampled rows (the same row can appear more than once).
    """
    return df.sample(n=n, replace=True, random_state=random_state)
    
# Apply `sample` to each smoker group, requesting 5 rows per group.
tips.groupby('smoker').apply(sample,n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tips_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,155,29.85,5.14,No,Sun,Dinner,5,0.172194
No,71,17.07,3.0,No,Sat,Dinner,3,0.175747
No,135,8.51,1.25,No,Thur,Lunch,2,0.146886
No,8,15.04,1.96,No,Sun,Dinner,2,0.130319
No,48,28.55,2.05,No,Sun,Dinner,3,0.071804
Yes,63,18.29,3.76,Yes,Sat,Dinner,4,0.205577
Yes,236,12.6,1.0,Yes,Sat,Dinner,2,0.079365
Yes,101,15.38,3.0,Yes,Fri,Dinner,2,0.195059
Yes,138,16.0,2.0,Yes,Thur,Lunch,2,0.125
Yes,107,25.21,4.29,Yes,Sat,Dinner,2,0.170171


In [None]:
# TODO 10.4 Pivot Tables and Cross-Tabulation