# 42. データをまとめるgroupby

In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [3]:
# Build a small demo frame: two label columns (k1, k2) to group on and two
# columns of standard-normal noise to aggregate.
# A dedicated, seeded generator is used so that the random values are the same
# on every fresh run of the notebook (the unseeded global generator produced
# different numbers each time the kernel was restarted).
rng = np.random.RandomState(0)
dframe = DataFrame({'k1':['X', 'X', 'Y', 'Y', 'Z'],
                    'k2':['alpha', 'beta', 'alpha', 'beta', 'alpha'],
                    'dataset1':rng.randn(5),
                    'dataset2':rng.randn(5)})
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.234081,-0.311962,X,alpha
1,-1.562918,-0.252104,X,beta
2,0.40891,0.048101,Y,alpha
3,-0.333584,-0.568942,Y,beta
4,-0.538969,-0.014327,Z,alpha


In [4]:
# Bucket the dataset1 values into groups, using the k1 column as the key.
# The result is a lazy SeriesGroupBy object; nothing is aggregated yet.
group1 = dframe['dataset1'].groupby(by=dframe['k1'])
group1

<pandas.core.groupby.SeriesGroupBy object at 0x113b484e0>

In [5]:
# Compute the mean of each group
group1.mean()

k1
X   -0.898499
Y    0.037663
Z   -0.538969
Name: dataset1, dtype: float64

In [6]:
# The grouping and the aggregation can also be chained in a single expression.
dframe['dataset1'].groupby(dframe['k1']).mean()

k1
X   -0.898499
Y    0.037663
Z   -0.538969
Name: dataset1, dtype: float64

In [7]:
# The grouping keys can be changed: define two key arrays,
# each holding one label per row of dframe.
cities = np.asarray(['NY', 'LA', 'LA', 'NY', 'NY'])
month = np.asarray(['JAN', 'FEB', 'JAN', 'FEB', 'JAN'])

In [8]:
# Display dframe again for reference.
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.234081,-0.311962,X,alpha
1,-1.562918,-0.252104,X,beta
2,0.40891,0.048101,Y,alpha
3,-0.333584,-0.568942,Y,beta
4,-0.538969,-0.014327,Z,alpha


In [9]:
# Group dataset1 by both key arrays at once (cities first, then month)
# and average each (city, month) combination.
(
    dframe['dataset1']
    .groupby([cities, month])
    .mean()
)

LA  FEB   -1.562918
    JAN    0.408910
NY  FEB   -0.333584
    JAN   -0.386525
Name: dataset1, dtype: float64

In [7]:
# Average every numeric column within each k1 group.
# numeric_only=True is required because the string column k2 is still part of
# the frame: since pandas 2.0 the default is numeric_only=False, and taking the
# mean of a string column raises TypeError instead of silently dropping it.
dframe.groupby('k1').mean(numeric_only=True)

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,0.993228,0.324407
Y,-0.540088,-0.785704
Z,-1.449955,0.31173


In [8]:
# A list of column names is accepted as well; the result is indexed by
# every (k1, k2) combination present in the data.
dframe.groupby(by=['k1', 'k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,1.3327,-0.087078
X,beta,0.653757,0.735893
Y,alpha,0.844662,-0.41684
Y,beta,-1.924838,-1.154567
Z,alpha,-1.449955,0.31173


In [9]:
# The grouped object can be restricted to selected columns before
# aggregating; the double brackets keep the result a DataFrame.
dataset2_group = dframe.groupby(by=['k1', 'k2'])[['dataset2']]
dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,-0.087078
X,beta,0.735893
Y,alpha,-0.41684
Y,beta,-1.154567
Z,alpha,0.31173


In [10]:
# size() is handy alongside groupby: it reports the number of rows per group.
dframe.groupby('k1').size()

k1
X    2
Y    2
Z    1
dtype: int64

In [11]:
# A groupby object can be iterated over: each step yields the group key and
# the sub-frame holding that group's rows.
for name, group in dframe.groupby('k1'):
    # One print call: header, frame and a blank spacer, newline-separated.
    print('This is the {} group'.format(name), group, '\n', sep='\n')

This is the X group
   dataset1  dataset2 k1     k2
0  1.332700 -0.087078  X  alpha
1  0.653757  0.735893  X   beta


This is the Y group
   dataset1  dataset2 k1     k2
2  0.844662 -0.416840  Y  alpha
3 -1.924838 -1.154567  Y   beta


This is the Z group
   dataset1  dataset2 k1     k2
4 -1.449955   0.31173  Z  alpha




In [12]:
# The same works with several keys: the group key is then a tuple,
# unpacked here into k1 and k2.
for (k1, k2), group in dframe.groupby(['k1', 'k2']):
    # One print call: header, frame and a blank spacer, newline-separated.
    print('Key1 = {} Key2 = {}'.format(k1, k2), group, '\n', sep='\n')

Key1 = X Key2 = alpha
   dataset1  dataset2 k1     k2
0    1.3327 -0.087078  X  alpha


Key1 = X Key2 = beta
   dataset1  dataset2 k1    k2
1  0.653757  0.735893  X  beta


Key1 = Y Key2 = alpha
   dataset1  dataset2 k1     k2
2  0.844662  -0.41684  Y  alpha


Key1 = Y Key2 = beta
   dataset1  dataset2 k1    k2
3 -1.924838 -1.154567  Y  beta


Key1 = Z Key2 = alpha
   dataset1  dataset2 k1     k2
4 -1.449955   0.31173  Z  alpha




In [13]:
# Keep the grouped object around and pull out a single group by its key.
gr = dframe.groupby(by='k1')
gr.get_group('X')

Unnamed: 0,dataset1,dataset2,k1,k2
0,1.3327,-0.087078,X,alpha
1,0.653757,0.735893,X,beta


In [14]:
# The (key, sub-frame) pairs produced by iterating a groupby can be collected
# into a dictionary keyed by group label.
group_dict = {label: frame for label, frame in dframe.groupby('k1')}
group_dict

{'X':    dataset1  dataset2 k1     k2
 0  1.332700 -0.087078  X  alpha
 1  0.653757  0.735893  X   beta, 'Y':    dataset1  dataset2 k1     k2
 2  0.844662 -0.416840  Y  alpha
 3 -1.924838 -1.154567  Y   beta, 'Z':    dataset1  dataset2 k1     k2
 4 -1.449955   0.31173  Z  alpha}

In [15]:
# Look up the sub-frame stored under key 'X'.
group_dict['X']

Unnamed: 0,dataset1,dataset2,k1,k2
0,1.3327,-0.087078,X,alpha
1,0.653757,0.735893,X,beta


In [16]:
# Something similar can be done along the column direction: split the frame
# into one sub-frame per column dtype, keyed by that dtype.
# groupby(..., axis=1) is deprecated since pandas 2.1, so the columns of each
# dtype are selected directly with a boolean mask over dframe.dtypes instead.
group_dict_axis1 = {
    dtype: dframe.loc[:, dframe.dtypes == dtype]
    for dtype in dframe.dtypes.unique()
}
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0  1.332700 -0.087078
 1  0.653757  0.735893
 2  0.844662 -0.416840
 3 -1.924838 -1.154567
 4 -1.449955  0.311730, dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}