In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the final
# one (IPython's default for this option is "last_expr").
InteractiveShell.ast_node_interactivity = "all"

# Data Aggregation and Group Operations

pandas provides a flexible `groupby` interface that lets you:

- Split a pandas object into pieces using one or more keys (in the form of functions,
arrays, or DataFrame column names)
- Calculate group summary statistics, like count, mean, or standard deviation, or a
user-defined function
- Apply within-group transformations or other manipulations, like normalization, linear regression, rank, or subset selection
- Compute pivot tables and cross-tabulations
- Perform quantile analysis and other statistical group analyses

###  GroupBy Mechanics

![](http://oydgk2hgw.bkt.clouddn.com/pydata-book/ikthz.png)

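###  GroupBy Mechanics: grouping keys

Each grouping key can take one of several forms, and the keys do not all have to be of the same type: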

- A list or array of values that is the same length as the axis being grouped
- A value indicating a column name in a DataFrame
- A dict or Series giving a correspondence between the values on the axis being
grouped and the group names
- A function to be invoked on the axis index or the individual labels in the index

###  GroupBy Mechanics: data generation

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Demo frame used by the following cells: two label columns to group on and
# two columns of random normal draws to aggregate.
df = pd.DataFrame(
    {
        'key1': ['a', 'a', 'b', 'b', 'a'],
        'key2': ['one', 'two', 'one', 'two', 'one'],
        'data1': np.random.randn(5),
        'data2': np.random.randn(5),
    }
)
df

In [None]:
# Same data kept in a named dict first.
dict1 = {
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
}
# `columns=` states the column order of the frame explicitly.
df = pd.DataFrame(data=dict1, columns=['key1', 'key2', 'data1', 'data2'])
df

In [None]:
from collections import OrderedDict

# NOTE: the name `dict` shadows the builtin until the later `del dict` cell
# runs; the name is kept because that cell expects it to exist.
# The OrderedDict is built from (key, value) pairs: wrapping a `{...}` literal
# only keeps the written order on interpreters whose plain dicts are already
# ordered, which defeats the purpose of using OrderedDict.
dict = OrderedDict([
    ('key1', ['a', 'a', 'b', 'b', 'a']),
    ('key2', ['one', 'two', 'one', 'two', 'one']),
    ('data1', np.random.randn(5)),
    ('data2', np.random.randn(5)),
])
df = pd.DataFrame(dict)
df

###  GroupBy Mechanics: `groupby`

In [None]:
df
# Evaluates to a GroupBy object keyed on the `key1` values; no aggregation is
# requested here, so no result table is shown.
df.groupby(df['key1'])

In [None]:
# Walk the groups: each step gives the group key and the matching rows.
for name, group in df.groupby('key1'):
    print(name, group, sep='\n')

In [None]:
# Group a single column (a Series) by the `key1` column.
df['data1'].groupby(df['key1'])
# Each step gives the group key and the matching slice of `data1`.
for name, group in df['data1'].groupby(df['key1']):
    print(name, group, sep='\n')

In [None]:
# Group only the two numeric columns by the `key1` column.
df[['data1','data2']].groupby(df['key1'])
# Each step gives the group key and the matching rows of data1/data2.
for name, group in df[['data1','data2']].groupby(df['key1']):
    print(name, group, sep='\n')
    

In [None]:
# Mean of data1 and data2 within each key1 group.
df[['data1','data2']].groupby(df['key1']).mean()

###  GroupBy Mechanics: `groupby`: multiple keys

In [None]:
# Grouping by two keys: the result is indexed by (key1, key2) pairs.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

# The same two keys applied to the whole frame.
df.groupby([df['key1'], df['key2']]).mean()

In [None]:
# Pivot the innermost index level of `means` into columns.
means.unstack()

###  GroupBy Mechanics: `groupby`: `np.array`

In [None]:
# Five labels, one per row of df; used below as a grouping key that is not a df column.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])

In [None]:
# Five year values, one per row of df; used below as a second grouping key.
years = np.array([2005, 2005, 2006, 2005, 2006])

In [None]:
df
# The keys here are the standalone arrays `states` and `years`, not columns of df.
df['data1'].groupby([states, years]).mean()

In [None]:
df
# size() reports the number of rows in each (key1, key2) group.
df.groupby(['key1', 'key2']).size()



### Iterating Over Groups

In [None]:
# The GroupBy object itself ...
df.groupby('key1')
# ... and what iterating over it produces: (group key, rows of that group).
for name, group in df.groupby('key1'):
    print(name, group, sep='\n')

In [None]:
# Group on two column names at once; no aggregation is requested here.
df.groupby(['key1', 'key2'])

In [None]:
# With two keys, the group key unpacks into a (key1, key2) pair.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2), group, sep='\n')

In [None]:
# Remove the notebook-level variable named `dict` so the builtin `dict` is
# visible again. Guarded so the cell can be re-run, or run on a kernel where
# the shadowing variable was never created, without raising NameError.
try:
    del dict
except NameError:
    # Nothing is shadowing the builtin; there is nothing to undo.
    pass

In [None]:
df
# Map each key1 value to the sub-frame holding that group's rows.
pieces = {key: rows for key, rows in df.groupby('key1')}

In [None]:
# The mapping built in the previous cell: key1 value -> rows of that group.
pieces

In [None]:
# Rows of the group whose key1 value is 'b'.
pieces['b']

In [None]:
df
# One dtype per column; the following cell groups the columns by these dtypes.
df.dtypes

In [None]:
# Group the COLUMNS of df by dtype. `DataFrame.groupby(..., axis=1)` is
# deprecated in recent pandas releases, so the grouping is done explicitly:
# collect the column names for each dtype, then slice df once per dtype.
# `grouped` is a list of (dtype, sub-frame) pairs, in order of each dtype's
# first appearance among the columns, ready to be iterated.
columns_by_dtype = {}
for column_name, column_dtype in df.dtypes.items():
    columns_by_dtype.setdefault(column_dtype, []).append(column_name)
grouped = [
    (column_dtype, df[column_names])
    for column_dtype, column_names in columns_by_dtype.items()
]

In [None]:
# Show each dtype followed by the columns that share it.
for dtype, group in grouped:
    print(dtype, group, sep='\n')

###  Selecting a Column or Subset of Columns 


In [None]:
df
# Long form: select the `data2` column first, then group it by the two key columns.
df['data2'].groupby([df['key1'],df['key2']])

In [None]:
# Shorthand ("syntactic sugar", a term attributed to Peter J. Landin): group
# first, then index the GroupBy with a list of column names ...
df.groupby(['key1', 'key2'])[['data2']]
# ... or with a single column name.
df.groupby(['key1', 'key2'])['data2']

In [None]:
# Group means of data2, selected with a list of names (table with a `data2` column) ...
df.groupby(['key1', 'key2'])[['data2']].mean()
# ... and with a single name (one-dimensional result).
df.groupby(['key1', 'key2'])['data2'].mean()

![](http://oydgk2hgw.bkt.clouddn.com/pydata-book/ikthz.png)