# 9장 데이터 수집과 그룹 연산
## GroupBy 메카닉

In [2]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

In [9]:
# Demo frame used throughout: two label columns to group on (key1, key2)
# and two columns of standard-normal random values (data1, data2).
df = DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
df

Unnamed: 0,data1,data2,key1,key2
0,-0.397912,-0.128698,a,one
1,-1.639724,0.435665,a,two
2,0.869065,0.200615,b,one
3,-0.338114,-0.065233,b,two
4,0.66973,-0.493238,a,one


In [6]:
# Group the data1 values by the labels in key1. The key Series is passed
# directly instead of being wrapped in a one-element list: this matches the
# df['data1'].groupby(df['key1']) form used later in this notebook, and a
# one-element list makes newer pandas yield 1-tuples as group names when the
# groups are iterated.
# Nothing is computed here; the cell only displays the SeriesGroupBy object.
grouped = df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x000001AC1E67F908>

- 위의 결과는 GroupBy 객체로, 아직 아무 계산도 수행되지 않은 상태이며 아래와 같이 mean() 등의 메서드를 호출해야 실제 결과를 얻을 수 있음

In [7]:
# Aggregate each key1 group of data1 down to its mean.
grouped.mean()

key1
a    0.451667
b   -0.513408
Name: data1, dtype: float64

In [11]:
# Passing a list of two key Series groups on both at once; the result is a
# Series indexed by the (key1, key2) pairs that occur in the data.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

key1  key2
a     one     0.135909
      two    -1.639724
b     one     0.869065
      two    -0.338114
Name: data1, dtype: float64

In [12]:
# Pivot the inner index level (key2) into columns.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.135909,-1.639724
b,0.869065,-0.338114


- 길이만 같다면 (Series가 아닌) 어떤 배열이든 groupby의 그룹 키로 사용 가능

In [14]:
# Group keys do not have to come from df: any arrays with one entry per row
# of the grouped data can be used.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()

California  2005   -1.639724
            2006    0.869065
Ohio        2005   -0.368013
            2006    0.669730
Name: data1, dtype: float64

- 그룹 키가 같은 DataFrame 안에 있는 경우, 칼럼 이름(문자열)만 넘겨서 그룹의 색인으로 사용 가능

In [15]:
# Mean of every numeric column per key1 group. key2 holds strings, so it is
# excluded explicitly with numeric_only=True: older pandas dropped such
# columns silently, while newer pandas raises a TypeError instead.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.455969,-0.06209
b,0.265475,0.067691


In [16]:
# Both label columns are consumed as group keys, leaving only data1 and data2
# to be averaged.
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.135909,-0.310968
a,two,-1.639724,0.435665
b,one,0.869065,0.200615
b,two,-0.338114,-0.065233


- mean()과 같은 집계는 숫자 데이터에만 적용되므로, 숫자가 아닌 *성가신 칼럼*(위 예제의 key2)은 결과에서 제외
___
## 1. 그룹 간 순회하기

In [18]:
# Iterating a GroupBy yields (group name, sub-frame) pairs; print each pair
# with the name on its own line followed by the rows of that group.
for name, group in df.groupby('key1'):
    print(name, group, sep='\n')

a
      data1     data2 key1 key2
0 -0.397912 -0.128698    a  one
1 -1.639724  0.435665    a  two
4  0.669730 -0.493238    a  one
b
      data1     data2 key1 key2
2  0.869065  0.200615    b  one
3 -0.338114 -0.065233    b  two


- GroupBy객체는 이터레이션 지원
- 그룹 이름과 하위 데이터 묶음을 튜플로 반환

In [19]:
# With several keys the group name is a tuple, unpacked here into k1 and k2.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2), group, sep='\n')

('a', 'one')
      data1     data2 key1 key2
0 -0.397912 -0.128698    a  one
4  0.669730 -0.493238    a  one
('a', 'two')
      data1     data2 key1 key2
1 -1.639724  0.435665    a  two
('b', 'one')
      data1     data2 key1 key2
2  0.869065  0.200615    b  one
('b', 'two')
      data1     data2 key1 key2
3 -0.338114 -0.065233    b  two


In [23]:
# Materialise the groups as a {group name: sub-frame} dict so a single group
# can be looked up by its key.
pieces = {label: rows for label, rows in df.groupby('key1')}
pieces['b']

Unnamed: 0,data1,data2,key1,key2
2,0.869065,0.200615,b,one
3,-0.338114,-0.065233,b,two


- 원하는 데이터만 filtering 가능

In [24]:
# Per-column dtypes; this Series is used as the grouping key in the next cell.
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [26]:
# axis=1 groups columns instead of rows; here the columns are bucketed by dtype.
# NOTE(review): the axis argument of groupby is deprecated in recent pandas
# releases - confirm against the pandas version this notebook targets.
grouped = df.groupby(df.dtypes, axis=1)
{dtype: block for dtype, block in grouped}

{dtype('float64'):       data1     data2
 0 -0.397912 -0.128698
 1 -1.639724  0.435665
 2  0.869065  0.200615
 3 -0.338114 -0.065233
 4  0.669730 -0.493238, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

- 열 단위 GroupBy도 가능
___
## 2. 칼럼 또는 칼럼의 일부만 선택

In [28]:
# Indexing a GroupBy with a single column name gives a SeriesGroupBy for
# that column only.
df.groupby('key1')['data1']

<pandas.core.groupby.SeriesGroupBy object at 0x000001AC20485978>

In [29]:
# The same grouping spelled out: select the column first, then group it by
# the key1 Series.
df['data1'].groupby(df['key1'])

<pandas.core.groupby.SeriesGroupBy object at 0x000001AC20485D68>

- 위의 두 코드는 같은 결과 반환: df.groupby('key1')['data1']은 df['data1'].groupby(df['key1'])의 편의 문법(syntactic sugar)

In [30]:
# Selecting with a list of column names ([['data2']]) keeps the result a
# DataFrame containing just that column.
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.310968
a,two,0.435665
b,one,0.200615
b,two,-0.065233


- 위 데이터(df)에서 data2 칼럼에 대한 그룹별 평균값만 원한다면 상기와 같이 칼럼을 선택한 뒤 집계

In [31]:
# Same selection as the previous cell, but the intermediate GroupBy object is
# kept in a variable before aggregating.
# NOTE(review): the s_ prefix suggests a Series-based group object, but the
# double brackets select a DataFrame-style one (the output below is tabular);
# ['data2'] would give a Series result - confirm which was intended.
s_grouped = df.groupby(['key1', 'key2'])[['data2']]
s_grouped.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.310968
a,two,0.435665
b,one,0.200615
b,two,-0.065233


___
## 3. 사전과 Series에서 묶기

In [40]:
# 5x5 frame of standard-normal random values: one row per person,
# columns labelled a through e.
people = DataFrame(
    np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=['a', 'b', 'c', 'd', 'e'],
)

In [37]:
# Blank out two cells (row position 2, column positions 1 and 2) so the
# later aggregations have missing values to deal with.
people.iloc[2:3, 1:3] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,1.618293,1.182157,-0.336658,0.003557,-0.970711
Steve,0.229644,0.803782,-1.193983,1.179402,0.333846
Wes,0.160467,,,-0.145195,-0.043508
Jim,-1.007668,-1.517826,0.692266,-0.70733,-0.517383
Travis,0.377253,2.443255,-0.604192,-0.587289,0.854308


In [43]:
# Column label -> group label. The 'f' entry has no matching column in people.
mapping = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',
}
# Group the columns (axis=1) by their mapped colour and average within each row.
by_column = people.groupby(mapping, axis=1)
by_column.mean()

Unnamed: 0,blue,red
Joe,-1.470722,0.750006
Steve,0.518831,-0.364434
Wes,-0.363444,0.084709
Jim,0.362093,0.768711
Travis,-1.713729,0.336255


- np.nan은 제외하고 계산
- groupby에 사전(dict)을 넘기면 칼럼 이름을 사전의 값(red, blue)으로 매핑하여 묶을 수 있음 (사용되지 않는 키 'f'가 있어도 무방)

In [41]:
# The same column -> group mapping as a Series: dict keys become the index,
# dict values become the data.
map_series = Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [42]:
# A Series can be used as the column mapping in the same way as the dict.
people.groupby(map_series, axis = 1).mean()

Unnamed: 0,blue,red
Joe,-1.470722,0.750006
Steve,0.518831,-0.364434
Wes,-0.363444,0.084709
Jim,0.362093,0.768711
Travis,-1.713729,0.336255


- Dictionary 외 Series 객체에 대해서도 같은 기능 수행
___
## 4. 함수로 묶기

In [44]:
# A function used as the key is applied to each index label; len therefore
# groups the rows by the length of the person's name.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,1.399858,0.632682,-2.757285,-0.186862,2.777738
5,-0.758414,0.327546,0.189782,0.847881,-0.662433
6,-0.130436,0.281324,-3.609484,0.182026,0.857878


In [47]:
people

Unnamed: 0,a,b,c,d,e
Joe,-0.430254,0.468955,-2.02954,-0.911904,2.211317
Steve,-0.758414,0.327546,0.189782,0.847881,-0.662433
Wes,0.120106,-0.096854,-0.811897,0.085008,0.230874
Jim,1.710006,0.260581,0.084152,0.640034,0.335546
Travis,-0.130436,0.281324,-3.609484,0.182026,0.857878


- 이름 길이 별로 묶기 위해 간단하게 함수 사용 가능

In [48]:
# Functions can be mixed with plain lists: group by name length and by the
# per-row labels in key_list together, then take the minimum of each group.
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.430254,-0.096854,-2.02954,-0.911904,0.230874
3,two,1.710006,0.260581,0.084152,0.640034,0.335546
5,one,-0.758414,0.327546,0.189782,0.847881,-0.662433
6,two,-0.130436,0.281324,-3.609484,0.182026,0.857878


- 내부적으로 모두 배열로 변환되기에 함수와 배열, 사전과 Series를 혼용 가능
___
## 5. 색인 단계로 묶기

In [49]:
# Two-level column index: country ('cty') on the outer level and 'tenor' on
# the inner level, built from two parallel arrays.
columns = pd.MultiIndex.from_arrays(
    [
        ['US', 'US', 'US', 'JP', 'JP'],
        [1, 3, 5, 1, 3],
    ],
    names=['cty', 'tenor'],
)
# 4 rows of standard-normal random values under those hierarchical columns.
hier_df = DataFrame(data=np.random.randn(4, 5), columns=columns)

In [50]:
columns

MultiIndex(levels=[['JP', 'US'], [1, 3, 5]],
           labels=[[1, 1, 1, 0, 0], [0, 1, 2, 0, 1]],
           names=['cty', 'tenor'])

In [51]:
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-1.103148,2.137226,1.07194,1.281704,-0.529814
1,-0.182017,0.313248,0.152995,-0.965229,0.863048
2,0.028293,-0.514668,-2.703645,0.175814,0.088749
3,-1.725205,-0.303504,-0.513614,0.455627,-0.198956


In [53]:
# level='cty' groups the columns (axis=1) by the 'cty' level of the column
# MultiIndex; count() gives the number of non-missing values per country in each row.
hier_df.groupby(level = 'cty', axis = 1).count()

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


- 위와 같이 level 인자에 색인 단계의 이름(또는 번호)을 넘기면 계층적 색인(MultiIndex)의 특정 단계를 기준으로 groupby 가능