In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [2]:
# Build a small demo frame: two string key columns (k1, k2) and two columns
# of standard-normal samples (dataset1, dataset2).
# A dedicated, seeded generator is used so that "Restart & Run All" yields the
# same numbers every time; the unseeded np.random.randn calls produced
# different values on each run.
rng = np.random.RandomState(0)
dframe = DataFrame({'k1': ['X', 'X', 'Y', 'Y', 'Z'],
                    'k2': ['alpha', 'beta', 'alpha', 'beta', 'alpha'],
                    'dataset1': rng.randn(5),
                    'dataset2': rng.randn(5)})
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.046462,0.713786,X,alpha
1,-1.031153,0.124382,X,beta
2,-0.602619,0.517953,Y,alpha
3,0.675927,1.028757,Y,beta
4,-1.215498,0.013589,Z,alpha


In [3]:
# Group the 'dataset1' values using the 'k1' column as the key.
key_column = dframe['k1']
group1 = dframe['dataset1'].groupby(key_column)
# The result is a new kind of object: a GroupBy object, not a Series.
group1

<pandas.core.groupby.SeriesGroupBy object at 0x10d392b00>

In [4]:
# Compute the mean of each group held by group1.
group1.mean()

k1
X   -0.538807
Y    0.036654
Z   -1.215498
Name: dataset1, dtype: float64

In [5]:
# The whole DataFrame can be grouped in the same way, again keyed on 'k1'.
# Note: this rebinds group1, which previously held the Series-based grouping.
key_column = dframe['k1']
group1 = dframe.groupby(key_column)
# The result is a new kind of object: a GroupBy object, not a DataFrame.
group1

<pandas.core.groupby.DataFrameGroupBy object at 0x10d392a58>

In [6]:
# Compute the mean of each group.
# numeric_only=True is passed explicitly: the grouped frame still contains the
# string column 'k2', and pandas >= 2.0 raises TypeError for it instead of
# silently dropping non-numeric columns.
group1.mean(numeric_only=True)

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,-0.538807,0.419084
Y,0.036654,0.773355
Z,-1.215498,0.013589


In [7]:
# The grouping keys do not have to be columns of the frame: any arrays with
# one entry per row can be used instead.
cities = np.array(['NY','LA','LA','NY','NY'])
month = np.array(['JAN','FEB','JAN','FEB','JAN'])
# Group 'dataset1' by both arrays at once and average each (city, month) pair.
city_month_groups = dframe['dataset1'].groupby([cities, month])
city_month_groups.mean()

LA  FEB   -1.031153
    JAN   -0.602619
NY  FEB    0.675927
    JAN   -0.630980
Name: dataset1, dtype: float64

In [8]:
# Display the original frame again for reference.
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.046462,0.713786,X,alpha
1,-1.031153,0.124382,X,beta
2,-0.602619,0.517953,Y,alpha
3,0.675927,1.028757,Y,beta
4,-1.215498,0.013589,Z,alpha


In [9]:
# Group by the column name 'k1' and average the numeric columns.
# 'k2' holds strings, for which a mean is undefined; numeric_only=True excludes
# it explicitly (pandas >= 2.0 raises TypeError rather than dropping it silently).
dframe.groupby('k1').mean(numeric_only=True)

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,-0.538807,0.419084
Y,0.036654,0.773355
Z,-1.215498,0.013589


In [10]:
# Minimum of every column within each k1 group; unlike the mean, min is also
# defined for the string column 'k2', so it appears in the result.
dframe.groupby('k1').min()

Unnamed: 0_level_0,dataset1,dataset2,k2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
X,-1.031153,0.124382,alpha
Y,-0.602619,0.517953,alpha
Z,-1.215498,0.013589,alpha


In [11]:
# Back to the original DataFrame.
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.046462,0.713786,X,alpha
1,-1.031153,0.124382,X,beta
2,-0.602619,0.517953,Y,alpha
3,0.675927,1.028757,Y,beta
4,-1.215498,0.013589,Z,alpha


In [12]:
# Several column names can be given as keys.
# In this frame every (k1, k2) pair occurs exactly once, so each group mean
# equals the original row value.
dframe.groupby(['k1','k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,-0.046462,0.713786
X,beta,-1.031153,0.124382
Y,alpha,-0.602619,0.517953
Y,beta,0.675927,1.028757
Z,alpha,-1.215498,0.013589


In [13]:
# The aggregation can be restricted to selected columns.
grouped_by_keys = dframe.groupby(['k1', 'k2'])
# Selecting with a list ([['dataset2']]) keeps the result a DataFrame;
# a single label would not.
dataset2_group = grouped_by_keys[['dataset2']]
dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,0.713786
X,beta,0.124382
Y,alpha,0.517953
Y,beta,1.028757
Z,alpha,0.013589


In [14]:
# Selecting the column with a single label (['dataset2']) makes the
# aggregated result a Series, not a DataFrame.
dataset2_group = dframe.groupby(['k1', 'k2'])['dataset2']

# Compute the mean once and reuse it for both the type check and the display.
dataset2_mean = dataset2_group.mean()
print(type(dataset2_mean))
dataset2_mean

<class 'pandas.core.series.Series'>


k1  k2   
X   alpha    0.713786
    beta     0.124382
Y   alpha    0.517953
    beta     1.028757
Z   alpha    0.013589
Name: dataset2, dtype: float64

In [15]:
# size() is handy in combination with groupby.
dframe.groupby(['k1']).size()  # number of rows in each k1 group

k1
X    2
Y    2
Z    1
dtype: int64

In [16]:
# Iterating over a GroupBy yields (key, sub-frame) pairs; only the sub-frame
# is printed here.
for name,group in  dframe.groupby('k1'):
    print(group)

   dataset1  dataset2 k1     k2
0 -0.046462  0.713786  X  alpha
1 -1.031153  0.124382  X   beta
   dataset1  dataset2 k1     k2
2 -0.602619  0.517953  Y  alpha
3  0.675927  1.028757  Y   beta
   dataset1  dataset2 k1     k2
4 -1.215498  0.013589  Z  alpha


In [17]:
# Display the original frame again for reference.
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.046462,0.713786,X,alpha
1,-1.031153,0.124382,X,beta
2,-0.602619,0.517953,Y,alpha
3,0.675927,1.028757,Y,beta
4,-1.215498,0.013589,Z,alpha


In [18]:
# A GroupBy object can be iterated: each step yields the group key together
# with the rows belonging to that key.
for name, group in dframe.groupby('k1'):
    header = 'This is the {} group'.format(name)
    print(header)
    print(group)
    print('\n')

This is the X group
   dataset1  dataset2 k1     k2
0 -0.046462  0.713786  X  alpha
1 -1.031153  0.124382  X   beta


This is the Y group
   dataset1  dataset2 k1     k2
2 -0.602619  0.517953  Y  alpha
3  0.675927  1.028757  Y   beta


This is the Z group
   dataset1  dataset2 k1     k2
4 -1.215498  0.013589  Z  alpha




In [19]:
# The same iteration works with several keys; the group key is then a tuple
# holding one value per key column.
for keys, group in dframe.groupby(['k1', 'k2']):
    k1, k2 = keys
    print('Key1 = {} Key2 = {}'.format(k1, k2))
    print(group)
    print('\n')

Key1 = X Key2 = alpha
   dataset1  dataset2 k1     k2
0 -0.046462  0.713786  X  alpha


Key1 = X Key2 = beta
   dataset1  dataset2 k1    k2
1 -1.031153  0.124382  X  beta


Key1 = Y Key2 = alpha
   dataset1  dataset2 k1     k2
2 -0.602619  0.517953  Y  alpha


Key1 = Y Key2 = beta
   dataset1  dataset2 k1    k2
3  0.675927  1.028757  Y  beta


Key1 = Z Key2 = alpha
   dataset1  dataset2 k1     k2
4 -1.215498  0.013589  Z  alpha




In [20]:
# Group by 'k1' and pull out a single group by its key.
gr = dframe.groupby('k1')
gr.get_group('X')  # DataFrame containing only the rows whose k1 is 'X'

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.046462,0.713786,X,alpha
1,-1.031153,0.124382,X,beta


In [21]:
# The (key, sub-frame) pairs produced by iterating a GroupBy can be collected
# into a dictionary keyed by group.
group_dict = {key: frame for key, frame in dframe.groupby('k1')}
group_dict['X']

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.046462,0.713786,X,alpha
1,-1.031153,0.124382,X,beta


In [22]:
# The same kind of grouping can be done along the columns: here the columns
# are bucketed by their dtype.
# groupby(..., axis=1) is deprecated since pandas 2.1, so the mapping
# dtype -> column names is built explicitly and each bucket is then selected
# from the frame.
columns_by_dtype = {}
for column, column_dtype in dframe.dtypes.items():
    columns_by_dtype.setdefault(column_dtype, []).append(column)
group_dict_axis1 = {column_dtype: dframe[columns]
                    for column_dtype, columns in columns_by_dtype.items()}
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0 -0.046462  0.713786
 1 -1.031153  0.124382
 2 -0.602619  0.517953
 3  0.675927  1.028757
 4 -1.215498  0.013589, dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [23]:
# Per-column dtypes of the frame.
dframe.dtypes

dataset1    float64
dataset2    float64
k1           object
k2           object
dtype: object

In [24]:
# Display the original frame again for reference.
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.046462,0.713786,X,alpha
1,-1.031153,0.124382,X,beta
2,-0.602619,0.517953,Y,alpha
3,0.675927,1.028757,Y,beta
4,-1.215498,0.013589,Z,alpha
