# Data Aggregation and Group Operations

## GroupBy Mechanics

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Build the small example frame used throughout this section: two string key
# columns and two columns of standard-normal noise.
# A dedicated, seeded generator keeps the random columns identical on every
# "Restart & Run All" without touching NumPy's global random state.
rng = np.random.RandomState(12345)
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1' : rng.randn(5),
                   'data2' : rng.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.704497,1.07552
1,a,two,-0.209005,0.009511
2,b,one,2.029897,-0.215239
3,b,two,-0.406385,1.138256
4,a,one,-0.007652,-0.827098


In [4]:
# Split the data1 column into groups, using the values of key1 as labels.
# Nothing is aggregated at this point; the result is only a GroupBy object.
grouped = df['data1'].groupby(by=df['key1'])
grouped

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x000001B9F5BA1400>

grouped变量现在是一个GroupBy对象。除了一些关于分组键df['key1']的中间数据之外，它实际上还没有进行任何计算

In [8]:
grouped.mean()  # compute the mean of data1 within each key1 group

key1
a    0.162614
b    0.811756
Name: data1, dtype: float64

In [9]:
# Group data1 by two key Series at once; the result is a Series whose
# hierarchical index holds the (key1, key2) pairs that occur in the data.
means = (
    df['data1']
    .groupby(by=[df['key1'], df['key2']])
    .mean()
)
means

key1  key2
a     one     0.348423
      two    -0.209005
b     one     2.029897
      two    -0.406385
Name: data1, dtype: float64

In [10]:
means.unstack()
# Unstack: pivot the inner index level (key2) from the rows into the columns

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.348423,-0.209005
b,2.029897,-0.406385


In [11]:
# The group keys do not have to come from the frame itself: here two plain
# arrays, each with one label per row of df, serve as the keys.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby(by=[states, years]).mean()

California  2005   -0.209005
            2006    2.029897
Ohio        2005    0.149056
            2006   -0.007652
Name: data1, dtype: float64

In [12]:
# Mean of the numeric columns for each key1 group.
# numeric_only=True excludes the string column key2 explicitly: pandas >= 2.0
# no longer drops non-numeric columns silently and raises a TypeError when
# asked to average them.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.162614,0.085978
b,0.811756,0.461509


In [13]:
# Group by both key columns (referenced by name) and average what is left,
# i.e. data1 and data2.
df.groupby(by=['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.348423,0.124211
a,two,-0.209005,0.009511
b,one,2.029897,-0.215239
b,two,-0.406385,1.138256


默认情况下，所有的数值列都会被聚合

In [16]:
# Number of rows in each (key1, key2) group.
df.groupby(by=['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

#### 注意：分组键中的任何缺失值都将被排除在结果之外

### Iterating Over Groups

GroupBy对象支持迭代，会生成一个由（组名, 数据块）二元组构成的序列

In [18]:
# Walk over the groups: each iteration yields the group label together with
# the rows of df that belong to it.
for name, group in df.groupby(by='key1'):
    # One call prints the label and the sub-frame on separate lines.
    print(name, group, sep='\n')

a
  key1 key2     data1     data2
0    a  one  0.704497  1.075520
1    a  two -0.209005  0.009511
4    a  one -0.007652 -0.827098
b
  key1 key2     data1     data2
2    b  one  2.029897 -0.215239
3    b  two -0.406385  1.138256


In [20]:
# With several group keys, the first element of each yielded pair is itself
# a tuple holding one value per key.
for (k1, k2), group in df.groupby(by=['key1', 'key2']):
    # One call prints the key tuple and the sub-frame on separate lines.
    print((k1, k2), group, sep='\n')

('a', 'one')
  key1 key2     data1     data2
0    a  one  0.704497  1.075520
4    a  one -0.007652 -0.827098
('a', 'two')
  key1 key2     data1     data2
1    a  two -0.209005  0.009511
('b', 'one')
  key1 key2     data1     data2
2    b  one  2.029897 -0.215239
('b', 'two')
  key1 key2     data1     data2
3    b  two -0.406385  1.138256


In [21]:
# Materialise the groups as a dict that maps each key1 value to its sub-frame.
pieces = {label: chunk for label, chunk in df.groupby('key1')}
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,2.029897,-0.215239
3,b,two,-0.406385,1.138256


In [22]:
# Show the dtype of every column of df.
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [24]:
# Split the columns of df by dtype.
# The original df.groupby(df.dtypes, axis=1) form is deprecated since
# pandas 2.1 and removed in pandas 3.0. Instead, group the column labels
# (the index of df.dtypes) by their dtype and select each sub-frame by label.
grouped = df.dtypes.groupby(df.dtypes)
for dtype, columns in grouped:
    # `columns` is the part of df.dtypes with this dtype; its index holds the
    # names of the matching columns.
    group = df[columns.index]
    print(dtype)
    print(group)

float64
      data1     data2
0  0.704497  1.075520
1 -0.209005  0.009511
2  2.029897 -0.215239
3 -0.406385  1.138256
4 -0.007652 -0.827098
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one
