# GroupBy机制

In [2]:
import pandas as pd
import numpy as np
# Demo frame: two string key columns plus two columns of standard-normal noise.
# data1 is drawn before data2, so the random draws happen in column order.
df = pd.DataFrame(dict(
    key1=['a', 'a', 'b', 'b', 'a'],
    key2=['one', 'two', 'one', 'two', 'one'],
    data1=np.random.randn(5),
    data2=np.random.randn(5),
))
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.21776,0.650552
1,a,two,-0.588297,0.070366
2,b,one,0.96271,-0.868605
3,b,two,0.211041,-1.399342
4,a,one,0.88461,0.663995


In [5]:
# Group the data1 column by the values of key1.
# This only builds the SeriesGroupBy object (see the repr below); nothing is
# aggregated until a method such as mean() is called on it.
grouped = df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x00000158A05FE208>

In [7]:
# Compute the mean of data1 within each key1 group
grouped.mean()

key1
a    0.171358
b    0.586875
Name: data1, dtype: float64

In [9]:
# Pass a list of key arrays to group on several keys at once;
# the result is a Series with a two-level (key1, key2) index.
means = df['data1'].groupby([df['key1'],df['key2']]).mean()
means

key1  key2
a     one     0.551185
      two    -0.588297
b     one     0.962710
      two     0.211041
Name: data1, dtype: float64

In [10]:
# Pivot the inner index level (key2) into columns, giving a key1-by-key2 table
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.551185,-0.588297
b,0.96271,0.211041


In [11]:
# Group keys do not have to be columns of df: any arrays with one entry per row
# work. Here each of the 5 rows is labelled with a state and a year.
states = np.array(['Ohio', 'California', 'California','Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()

California  2005   -0.588297
            2006    0.962710
Ohio        2005    0.214400
            2006    0.884610
Name: data1, dtype: float64

In [12]:
# Group the whole frame by the key1 column name and average the numeric columns.
# key2 holds strings and is not a grouping key here; numeric_only=True excludes
# it explicitly. pandas >= 2.0 no longer drops such columns silently and raises
# TypeError instead, so the flag is required for this cell to run there.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.171358,0.461638
b,0.586875,-1.133974


In [13]:
# Group by both key columns; the result is indexed by (key1, key2) pairs
df.groupby(['key1','key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.551185,0.657274
a,two,-0.588297,0.070366
b,one,0.96271,-0.868605
b,two,0.211041,-1.399342


In [14]:
# size() returns a Series holding the number of rows in each group
df.groupby(['key1','key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

# 对分组进行迭代

In [17]:
# Iterating a GroupBy yields (group label, sub-frame) pairs.
# Print the label and its rows on separate lines.
for name, group in df.groupby('key1'):
    print(name, group, sep='\n')

a
  key1 key2     data1     data2
0    a  one  0.217760  0.650552
1    a  two -0.588297  0.070366
4    a  one  0.884610  0.663995
b
  key1 key2     data1     data2
2    b  one  0.962710 -0.868605
3    b  two  0.211041 -1.399342


In [18]:
# With several keys, the first element of each yielded pair is itself a tuple
# of key values, which can be unpacked directly in the loop header.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print(f'{k1} {k2}')
    print(group)

a one
  key1 key2    data1     data2
0    a  one  0.21776  0.650552
4    a  one  0.88461  0.663995
a two
  key1 key2     data1     data2
1    a  two -0.588297  0.070366
b one
  key1 key2    data1     data2
2    b  one  0.96271 -0.868605
b two
  key1 key2     data1     data2
3    b  two  0.211041 -1.399342


In [20]:
# Materialise the groups as a dict: key1 label -> sub-frame for that label
pieces = {label: frame for label, frame in df.groupby('key1')}
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,0.96271,-0.868605
3,b,two,0.211041,-1.399342


In [23]:
# Show the dtype of each column (the key columns are object, the data columns float64)
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [25]:
# `grouped` is the SeriesGroupBy built earlier from df['data1'] keyed by
# df['key1'], so each iteration yields a key1 label and that group's data1 values.
# NOTE(review): this cell follows df.dtypes, which suggests grouping the columns
# by dtype may have been intended instead - confirm.
for label, values in grouped:
    print(label)
    print(values)

a
0    0.217760
1   -0.588297
4    0.884610
Name: data1, dtype: float64
b
2    0.962710
3    0.211041
Name: data1, dtype: float64


# 选取一列或列的子集

In [26]:
# Indexing the GroupBy with a list of column names restricts the aggregation
# to those columns and keeps the result a DataFrame
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.657274
a,two,0.070366
b,one,-0.868605
b,two,-1.399342


In [28]:
# Indexing the GroupBy with a single column name gives a SeriesGroupBy
# (see the repr below) rather than a grouped DataFrame
s_grouped = df.groupby(['key1', 'key2'])['data2']
s_grouped

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x00000158A92DBB00>

In [29]:
# Mean of data2 for each (key1, key2) pair, returned as a Series
s_grouped.mean()

key1  key2
a     one     0.657274
      two     0.070366
b     one    -0.868605
      two    -1.399342
Name: data2, dtype: float64

# 通过字典或Series进行分组

In [30]:
# 5x5 frame of standard-normal noise: one row per person, columns labelled a-e
people = pd.DataFrame(
    np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)
people

Unnamed: 0,a,b,c,d,e
Joe,-0.22908,0.753119,-1.537064,-0.486319,0.759713
Steve,-0.059265,1.18051,-0.424677,-0.390812,-0.972683
Wes,-1.462562,0.432191,0.988734,0.40391,0.748574
Jim,-0.024812,0.533575,-0.936013,-0.848656,1.566048
Travis,-1.355622,0.636246,0.678161,0.796273,-2.008008


In [33]:
# Blank out columns b, c and d (positions 1-3) in the row at position 2 (Wes).
# All three positions are listed so that a single run from a fresh kernel
# reproduces the outputs recorded in this notebook: Wes shows NaN in b, c and d,
# and his 'blue' group (c, d) has a count of 0 further down.
people.iloc[2:3, [1, 2, 3]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-0.22908,0.753119,-1.537064,-0.486319,0.759713
Steve,-0.059265,1.18051,-0.424677,-0.390812,-0.972683
Wes,-1.462562,,,,0.748574
Jim,-0.024812,0.533575,-0.936013,-0.848656,1.566048
Travis,-1.355622,0.636246,0.678161,0.796273,-2.008008


In [35]:
# Column label -> group name. 'f' has no matching column in `people`
# (its columns are a-e); the unused key is left in on purpose.
mapping = dict(a='red', b='red', c='blue', d='blue', e='red', f='orange')

In [37]:
# Group the *columns* of people by colour using the mapping dict.
# groupby(..., axis=1) is deprecated since pandas 2.1; the documented replacement
# is to group the transposed frame by its index and transpose the result back.
# NaN cells are skipped by sum(), so an all-NaN group sums to 0.0.
by_column = people.T.groupby(mapping)
by_column.sum().T

Unnamed: 0,blue,red
Joe,-2.023383,1.283752
Steve,-0.815489,0.148563
Wes,0.0,-0.713988
Jim,-1.784669,2.074812
Travis,1.474434,-2.727384


In [38]:
# The same column -> colour mapping as a Series
# (index: column label, value: group name)
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [39]:
# Count the non-null cells per colour group for every person.
# groupby(..., axis=1) is deprecated since pandas 2.1, so the columns are grouped
# by transposing, grouping on the index, and transposing the result back.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,0,2
Jim,2,3
Travis,2,3


# 通过函数进行分组

In [40]:
# A function used as the key is called once per index label and its return
# value becomes the group name; len therefore groups people by name length
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-1.716454,1.286694,-2.473077,-1.334975,3.074335
5,-0.059265,1.18051,-0.424677,-0.390812,-0.972683
6,-1.355622,0.636246,0.678161,0.796273,-2.008008
