## Data Aggregation and Group Operations


In [1]:
import pandas as pd
import numpy as np

### GroupBy Mechanics

In [2]:
# Sample frame: two label columns to group on and two columns of
# standard-normal noise (unseeded, so the numbers change on every run).
df = pd.DataFrame(
    dict(
        key1=['a', 'a', 'b', 'b', 'a'],
        key2=['one', 'two', 'one', 'two', 'one'],
        data1=np.random.randn(5),
        data2=np.random.randn(5),
    )
)
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-2.508055,-0.52035
1,a,two,-0.121357,-0.741173
2,b,one,-0.380002,0.268688
3,b,two,-0.133894,0.13175
4,a,one,0.960146,-0.533307


In [3]:
# Group the data1 values by the labels in key1. Displaying the result shows
# only the GroupBy object itself, not any computed values.
grouped = df['data1'].groupby(by=df['key1'])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000216CAF479A0>

In [4]:
# Sum of data1 within each key1 group.
grouped.sum()

key1
a   -1.669266
b   -0.513896
Name: data1, dtype: float64

In [5]:
# The groups attribute of a GroupBy object shows which row labels belong to each group.
grouped.groups

{'a': [0, 1, 4], 'b': [2, 3]}

In [6]:
# Group data1 by two key Series at once; the sums are indexed by (key1, key2).
df['data1'].groupby([df['key1'], df['key2']]).sum()

key1  key2
a     one    -1.547909
      two    -0.121357
b     one    -0.380002
      two    -0.133894
Name: data1, dtype: float64

In [7]:
# When calling groupby on a DataFrame, by= may also be given a list of column names.
# numeric_only=True keeps the string column key2 out of the sum, so only data1
# and data2 are aggregated regardless of the pandas version in use.
df.groupby(by=['key1']).sum(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-1.669266,-1.794831
b,-0.513896,0.400437


In [8]:
# Group by both key columns; the result is indexed by (key1, key2) pairs.
df.groupby(by=['key1', 'key2']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-1.547909,-1.053657
a,two,-0.121357,-0.741173
b,one,-0.380002,0.268688
b,two,-0.133894,0.13175


각 그룹의 크기(행의 개수)를 세려면 size() 또는 count() 메소드를 사용하면 된다. size()는 NaN을 포함한 모든 행을 세고, count()는 NaN을 세지 않는다.

In [9]:
# Number of rows in each key1 group.
grouped.size()

key1
a    3
b    2
Name: data1, dtype: int64

In [10]:
# Number of non-NaN data1 values in each key1 group; data1 has no NaN here,
# so the result matches size().
grouped.count()

key1
a    3
b    2
Name: data1, dtype: int64

### Iterating over groups

In [11]:
# Iterating a GroupBy yields (group key, sub-frame) pairs; print the key
# and then its rows, each on its own line.
for name, data in df.groupby('key1'):
    print(name, data, sep='\n')

a
  key1 key2     data1     data2
0    a  one -2.508055 -0.520350
1    a  two -0.121357 -0.741173
4    a  one  0.960146 -0.533307
b
  key1 key2     data1     data2
2    b  one -0.380002  0.268688
3    b  two -0.133894  0.131750


In [12]:
# With several grouping keys, the group key in each pair is a tuple of key values.
for name, data in df.groupby(['key1', 'key2']):
    print(name, data, sep='\n')

('a', 'one')
  key1 key2     data1     data2
0    a  one -2.508055 -0.520350
4    a  one  0.960146 -0.533307
('a', 'two')
  key1 key2     data1     data2
1    a  two -0.121357 -0.741173
('b', 'one')
  key1 key2     data1     data2
2    b  one -0.380002  0.268688
('b', 'two')
  key1 key2     data1    data2
3    b  two -0.133894  0.13175


In [13]:
# Materialise the groups as a dict keyed by key1 value, each value being
# that group's sub-frame.
pieces = {group_key: group_frame for group_key, group_frame in df.groupby('key1')}

In [14]:
# Show the dict: keys are the key1 values, values are the matching sub-frames.
pieces

{'a':   key1 key2     data1     data2
 0    a  one -2.508055 -0.520350
 1    a  two -0.121357 -0.741173
 4    a  one  0.960146 -0.533307,
 'b':   key1 key2     data1     data2
 2    b  one -0.380002  0.268688
 3    b  two -0.133894  0.131750}

In [15]:
# Pull out the sub-frame for group 'a'.
pieces['a']

Unnamed: 0,key1,key2,data1,data2
0,a,one,-2.508055,-0.52035
1,a,two,-0.121357,-0.741173
4,a,one,0.960146,-0.533307


In [16]:
# Per-column dtypes of df; used as the grouping key in the next cell.
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [17]:
# Group the columns by dtype and combine each group row-wise.
# groupby(axis=1) is deprecated, so transpose, group the rows, and transpose
# back. The transpose of this mixed-type frame is object-typed, so
# infer_objects() is applied to restore a numeric dtype for the float64 group.
df.T.groupby(df.dtypes).sum().T.infer_objects()

Unnamed: 0,float64,object
0,-3.028406,aone
1,-0.86253,atwo
2,-0.111315,bone
3,-0.002144,btwo
4,0.42684,aone


### Selecting a Column or Subset of Columns

In [18]:
# Indexing a DataFrame GroupBy with a column name aggregates only that column;
# this gives the same sums as grouping df['data1'] by df['key1'].
df.groupby('key1')['data1'].sum()

key1
a   -1.669266
b   -0.513896
Name: data1, dtype: float64

In [19]:
# Sum only data2 within each (key1, key2) group.
df.groupby(['key1','key2'])['data2'].sum()

key1  key2
a     one    -1.053657
      two    -0.741173
b     one     0.268688
      two     0.131750
Name: data2, dtype: float64

### Grouping with Dicts and Series

In [20]:

# 5x5 frame of standard-normal noise (unseeded) with named rows and columns.
people = pd.DataFrame(
    data=np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)
# Blank out Wes's 'b' and 'c' entries (row position 2, column positions 1 and 2)
# so the later examples have missing values to deal with.
people.loc['Wes', ['b', 'c']] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,0.761292,-0.473329,2.334285,0.824671,-0.382821
Steve,1.896075,0.700537,0.080597,0.412084,0.648624
Wes,-0.397528,,,-1.963447,1.116773
Jim,0.458076,0.403308,0.344357,0.361666,-0.763974
Travis,-0.586113,0.360399,-1.364111,0.443751,0.211214


In [21]:
# Column label -> colour group. 'f' is not a column of `people`.
mapping = dict(
    a='red',
    b='red',
    c='blue',
    d='blue',
    e='red',
    f='orange',
)

In [22]:
# Sum the columns that share a colour in `mapping`.
# groupby(axis=1) is deprecated, so group the rows of the transpose and
# transpose the result back; the output keeps one row per person and one
# column per colour.
people.T.groupby(mapping).sum().T

Unnamed: 0,blue,red
Joe,3.158955,-0.094858
Steve,0.492681,3.245236
Wes,-1.963447,0.719245
Jim,0.706022,0.097411
Travis,-0.92036,-0.0145


In [23]:
# The same mapping as a Series: index = column label, value = group name.
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [24]:
# Count the non-NaN values per person within each colour group.
# groupby(axis=1) is deprecated, so group the rows of the transpose by the
# Series and transpose the result back.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


### Grouping with Functions

In [25]:
# len('Joe') == len('Wes') == len('Jim') == 3, so those three rows fall into the same group.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,0.821841,-0.070021,2.678641,-0.777111,-0.030021
5,1.896075,0.700537,0.080597,0.412084,0.648624
6,-0.586113,0.360399,-1.364111,0.443751,0.211214


In [26]:
# One extra grouping label per row of `people`: three 'one's, then two 'two's.
key_list = ['one'] * 3 + ['two'] * 2

In [27]:
# A function and a list can be mixed as grouping keys: rows are grouped by
# (name length, key_list label) and the minimum is taken within each pair.
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.397528,-0.473329,2.334285,-1.963447,-0.382821
3,two,0.458076,0.403308,0.344357,0.361666,-0.763974
5,one,1.896075,0.700537,0.080597,0.412084,0.648624
6,two,-0.586113,0.360399,-1.364111,0.443751,0.211214


### Grouping by Index Levels

In [28]:
# Two-level column index: country code ('cty') on top, tenor underneath.
columns = pd.MultiIndex.from_arrays(
    [
        ['US'] * 3 + ['JP'] * 2,
        [1, 3, 5, 1, 3],
    ],
    names=['cty', 'tenor'],
)
# 4x5 frame of standard-normal noise (unseeded) under those columns.
hier_df = pd.DataFrame(data=np.random.randn(4, 5), columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-1.749986,-1.778943,0.95787,-0.07871,-1.975036
1,1.705323,-0.301071,0.72,-0.617921,0.201318
2,-0.159259,0.033003,1.302284,-0.260636,0.647113
3,-0.994025,-0.982363,-0.867692,-0.369924,1.08556


In [29]:
# Count the columns per country by grouping on the 'cty' level of the
# column index. groupby(axis=1) is deprecated, so group the rows of the
# transpose by that level and transpose the result back.
hier_df.T.groupby(level='cty').count().T

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


## 자료 집계

In [30]:
# Show the sample frame again before aggregating it.
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-2.508055,-0.52035
1,a,two,-0.121357,-0.741173
2,b,one,-0.380002,0.268688
3,b,two,-0.133894,0.13175
4,a,one,0.960146,-0.533307


In [31]:
# Restrict the groups to the numeric columns: quantile (and the aggregations
# applied to `grouped` in the following cells) cannot be computed on the
# string column key2, and should not depend on pandas dropping it silently.
grouped = df.groupby('key1')[['data1', 'data2']]
# 90th percentile of each numeric column within each key1 group.
grouped.quantile(0.9)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.743846,-0.522942
b,-0.158505,0.254994


In [33]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its largest value minus its smallest.

    ``arr`` must provide ``max()`` and ``min()`` methods.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [34]:
# To use your own function, pass it to the aggregate (or agg) method. A custom aggregation function takes an array as its argument and should return a scalar.
grouped.agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3.468202,0.220823
b,0.246108,0.136938


In [35]:
# Summary statistics (count, mean, std, min, quartiles, max) per group for each data column.
grouped.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,-0.556422,1.774561,-2.508055,-1.314706,-0.121357,0.419395,0.960146,3.0,-0.598277,0.123922,-0.741173,-0.63724,-0.533307,-0.526829,-0.52035
b,2.0,-0.256948,0.174025,-0.380002,-0.318475,-0.256948,-0.195421,-0.133894,2.0,0.200219,0.09683,0.13175,0.165984,0.200219,0.234453,0.268688


In [36]:
# An aggregation can be requested by name as a string.
grouped.agg('mean')

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.556422,-0.598277
b,-0.256948,0.200219


In [37]:
# A list of aggregations (names and/or functions) produces one result column
# per entry under each data column; a function's column is labelled with its name.
grouped.agg(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,mean,std,peak_to_peak,mean,std,peak_to_peak
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
a,-0.556422,1.774561,3.468202,-0.598277,0.123922,0.220823
b,-0.256948,0.174025,0.246108,0.200219,0.09683,0.136938


In [38]:
# (label, aggregation) tuples set the result column names; here the standard
# deviation is given as the NumPy function np.std.
grouped.agg([('평균', 'mean'), ('표준편차', np.std)])

Unnamed: 0_level_0,data1,data1,data2,data2
Unnamed: 0_level_1,평균,표준편차,평균,표준편차
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,-0.556422,1.774561,-0.598277,0.123922
b,-0.256948,0.174025,0.200219,0.09683


In [39]:
# Same as the previous cell, but with the standard deviation requested by the
# string name 'std'; the displayed result is identical.
grouped.agg([('평균', 'mean'), ('표준편차', 'std')])

Unnamed: 0_level_0,data1,data1,data2,data2
Unnamed: 0_level_1,평균,표준편차,평균,표준편차
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,-0.556422,1.774561,-0.598277,0.123922
b,-0.256948,0.174025,0.200219,0.09683
