### 第10章 数据聚合与分组操作

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Demo frame: two string key columns plus two columns of random normal draws.
df = pd.DataFrame(
    dict(
        key1=["a", "a", "b", "b", "a"],
        key2=["one", "two", "one", "two", "one"],
        data1=np.random.randn(5),
        data2=np.random.randn(5),
    )
)
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.251237,-1.208609
1,a,two,-1.537341,0.448268
2,b,one,-1.036146,-1.196194
3,b,two,-0.179119,-1.409893
4,a,one,-0.293362,1.499399


In [4]:
# Pick the column *before* aggregating. Averaging every column first would also try to
# average the string column key2, which recent pandas versions reject with a TypeError,
# and it computes means that are thrown away immediately.
df.groupby("key1")["data1"].mean()

key1
a   -1.027313
b   -0.607632
Name: data1, dtype: float64

In [5]:
# Group the data1 values using the key1 column as the grouping key.
grouped = df["data1"].groupby(by=df["key1"])

In [6]:
# Average of data1 inside each key1 group; the result is a Series indexed by key1.
grouped.mean()

key1
a   -1.027313
b   -0.607632
Name: data1, dtype: float64

In [7]:
# Keep only the value column and its key so the result is a one-column DataFrame.
df.loc[:, ["data1", "key1"]].groupby(by="key1").mean()

Unnamed: 0_level_0,data1
key1,Unnamed: 1_level_1
a,-1.027313
b,-0.607632


In [8]:
# Two grouping keys produce a result indexed by the (key1, key2) pairs that occur.
df.loc[:, ["data1", "key1", "key2"]].groupby(by=["key1", "key2"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1
key1,key2,Unnamed: 2_level_1
a,one,-0.772299
a,two,-1.537341
b,one,-1.036146
b,two,-0.179119


In [9]:
# Grouping keys do not have to be columns of the frame: any arrays with one entry
# per row work.
states = np.array(["Ohio", "California", "California", "Ohio", "Ohio"])
years = np.array([2005, 2005, 2006, 2005, 2006])
df["data1"].groupby(by=[states, years]).mean()

California  2005   -1.537341
            2006   -1.036146
Ohio        2005   -0.715178
            2006   -0.293362
Name: data1, dtype: float64

In [10]:
# key2 holds strings and cannot be averaged. Older pandas silently dropped such columns;
# newer versions raise instead, so ask explicitly for the numeric columns only.
df.groupby("key1").mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-1.027313,0.246353
b,-0.607632,-1.303044


In [11]:
# With both string columns used as keys, only data1 and data2 are left to average.
df.groupby(by=["key1", "key2"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.772299,0.145395
a,two,-1.537341,0.448268
b,one,-1.036146,-1.196194
b,two,-0.179119,-1.409893


In [12]:
# Number of rows that fall into each (key1, key2) group.
df.groupby(by=["key1", "key2"]).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [13]:
# Iterating over a groupby yields (group key, sub-frame) pairs.
for name, group in df.groupby("key1"):
    print(name, group, sep="\n")

a
  key1 key2     data1     data2
0    a  one -1.251237 -1.208609
1    a  two -1.537341  0.448268
4    a  one -0.293362  1.499399
b
  key1 key2     data1     data2
2    b  one -1.036146 -1.196194
3    b  two -0.179119 -1.409893


In [14]:
# With several keys, the first element of each pair is a tuple of key values.
for (k1, k2), group in df.groupby(["key1", "key2"]):
    print((k1, k2), group, sep="\n")

('a', 'one')
  key1 key2     data1     data2
0    a  one -1.251237 -1.208609
4    a  one -0.293362  1.499399
('a', 'two')
  key1 key2     data1     data2
1    a  two -1.537341  0.448268
('b', 'one')
  key1 key2     data1     data2
2    b  one -1.036146 -1.196194
('b', 'two')
  key1 key2     data1     data2
3    b  two -0.179119 -1.409893


In [15]:
# Materialise the groups as a plain dict: group key -> sub-frame.
pieces = {key: part for key, part in df.groupby("key1")}
pieces

{'a':   key1 key2     data1     data2
 0    a  one -1.251237 -1.208609
 1    a  two -1.537341  0.448268
 4    a  one -0.293362  1.499399,
 'b':   key1 key2     data1     data2
 2    b  one -1.036146 -1.196194
 3    b  two -0.179119 -1.409893}

In [16]:
pieces["b"]

Unnamed: 0,key1,key2,data1,data2
2,b,one,-1.036146,-1.196194
3,b,two,-0.179119,-1.409893


In [17]:
# Group the *columns* (axis=1) by their dtype rather than grouping rows.
# NOTE(review): groupby(..., axis=1) is deprecated in pandas >= 2.1 — confirm the target
# pandas version before relying on this cell.
grouped = df.groupby(df.dtypes, axis=1)

In [18]:
# Each iteration gives a dtype and the sub-frame made of the columns with that dtype.
for dtype, group in grouped:
    print(dtype, group, sep="\n")

float64
      data1     data2
0 -1.251237 -1.208609
1 -1.537341  0.448268
2 -1.036146 -1.196194
3 -0.179119 -1.409893
4 -0.293362  1.499399
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


In [19]:
# Indexing the groupby with a *list* of column names keeps the result a DataFrame.
df.groupby(by=["key1", "key2"])[["data2"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.145395
a,two,0.448268
b,one,-1.196194
b,two,-1.409893


In [20]:
# Indexing the groupby with a single column name gives a Series result instead.
df.groupby(by=["key1", "key2"])["data2"].mean()

key1  key2
a     one     0.145395
      two     0.448268
b     one    -1.196194
      two    -1.409893
Name: data2, dtype: float64

In [21]:
# 5x5 frame of random normal values: one row per person, columns a-e.
people = pd.DataFrame(
    data=np.random.randn(5, 5),
    index=["Joe", "Steve", "Wes", "Jim", "Travis"],
    columns=["a", "b", "c", "d", "e"],
)
people

Unnamed: 0,a,b,c,d,e
Joe,0.355705,-0.211367,1.194198,0.327536,0.164548
Steve,1.360309,-0.597234,0.288239,-1.569766,-1.429067
Wes,-2.095056,-1.347036,0.302078,0.779054,1.659323
Jim,0.121122,2.16983,-0.084619,-0.127617,-1.22473
Travis,0.535225,0.4589,0.024262,-0.756233,-0.387592


In [22]:
# Blank out columns b and c for Wes (row position 2) to introduce missing values.
people.loc[["Wes"], ["b", "c"]] = np.nan

In [23]:
people

Unnamed: 0,a,b,c,d,e
Joe,0.355705,-0.211367,1.194198,0.327536,0.164548
Steve,1.360309,-0.597234,0.288239,-1.569766,-1.429067
Wes,-2.095056,,,0.779054,1.659323
Jim,0.121122,2.16983,-0.084619,-0.127617,-1.22473
Travis,0.535225,0.4589,0.024262,-0.756233,-0.387592


In [24]:
# Column -> colour group. The key "f" matches no column of `people`; it is simply unused.
mapping = {"a": "red", "b": "red", "c": "blue", "d": "blue", "e": "red", "f": "orange"}
# groupby(..., axis=1) is deprecated in recent pandas. Grouping the transposed frame by
# its row labels and transposing the aggregate back gives the same per-person sums and
# works on old and new versions alike.
by_column = people.T.groupby(mapping)
by_column.sum().T

Unnamed: 0,blue,red
Joe,1.521734,0.308885
Steve,-1.281527,-0.665991
Wes,0.779054,-0.435733
Jim,-0.212236,1.066222
Travis,-0.731971,0.606532


In [25]:
# Cross-check: row-wise sum of the three columns that were mapped to "red".
people.loc[:, ["a", "b", "e"]].sum(axis="columns")

Joe       0.308885
Steve    -0.665991
Wes      -0.435733
Jim       1.066222
Travis    0.606532
dtype: float64

In [26]:
# Cross-check: row-wise sum of the two columns that were mapped to "blue".
people.loc[:, ["c", "d"]].sum(axis="columns")

Joe       1.521734
Steve    -1.281527
Wes       0.779054
Jim      -0.212236
Travis   -0.731971
dtype: float64

In [27]:
people

Unnamed: 0,a,b,c,d,e
Joe,0.355705,-0.211367,1.194198,0.327536,0.164548
Steve,1.360309,-0.597234,0.288239,-1.569766,-1.429067
Wes,-2.095056,,,0.779054,1.659323
Jim,0.121122,2.16983,-0.084619,-0.127617,-1.22473
Travis,0.535225,0.4589,0.024262,-0.756233,-0.387592


#### 作为分组键传递的函数将会按照每个索引值调用一次

In [28]:
# len() is applied to every index label, so rows are grouped by the length of the name.
people.groupby(by=len).sum()

Unnamed: 0,a,b,c,d,e
3,-1.618229,1.958463,1.109579,0.978974,0.59914
5,1.360309,-0.597234,0.288239,-1.569766,-1.429067
6,0.535225,0.4589,0.024262,-0.756233,-0.387592


In [29]:
# Two-level column index: country code ("cty") on the outside, tenor on the inside.
columns = pd.MultiIndex.from_tuples(
    [("US", 1), ("US", 3), ("US", 5), ("JP", 1), ("JP", 3)],
    names=["cty", "tenor"],
)
hier_df = pd.DataFrame(data=np.random.randn(4, 5), columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.156745,-1.031241,-0.203553,-0.057705,1.018399
1,-1.6685,-0.466027,-1.679498,-0.466487,-1.735326
2,-0.986787,-0.493242,-0.411344,0.922328,-1.199064
3,-0.161357,0.707019,1.621828,1.533917,0.972897


In [30]:
# Count the columns per country for every row. groupby(..., axis=1) is deprecated in
# recent pandas, so group the transposed frame on its "cty" index level and transpose
# the counts back; the result has the same rows and the same "cty" column labels.
hier_df.T.groupby(level="cty").count().T

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


In [31]:
# 90th-percentile value of data1 within each key1 group.
df.groupby(by="key1")["data1"].quantile(q=0.9)

key1
a   -0.484937
b   -0.264822
Name: data1, dtype: float64

In [32]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` only needs to provide ``max()`` and ``min()`` methods whose results
    can be subtracted.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [33]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.251237,-1.208609
1,a,two,-1.537341,0.448268
2,b,one,-1.036146,-1.196194
3,b,two,-0.179119,-1.409893
4,a,one,-0.293362,1.499399


In [34]:
# peak_to_peak subtracts min from max, which is undefined for the string column key2.
# Older pandas silently skipped such columns; newer versions raise, so aggregate only
# the two numeric columns.
df.groupby("key1")[["data1", "data2"]].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.243979,2.708007
b,0.857027,0.213699


In [35]:
# 1000 rows of random normal data in two columns.
frame = pd.DataFrame(
    dict(
        data1=np.random.randn(1000),
        data2=np.random.randn(1000),
    )
)
# Split data1 into 4 bins. Despite the variable name, pd.cut with an integer yields
# equal-width intervals over the value range, not equal-sized (quantile) buckets.
quartiles = pd.cut(frame["data1"], bins=4)
quartiles.head(10)

0    (-0.333, 1.464]
1    (-2.13, -0.333]
2    (-0.333, 1.464]
3    (-2.13, -0.333]
4    (-0.333, 1.464]
5    (-2.13, -0.333]
6    (-0.333, 1.464]
7    (-0.333, 1.464]
8    (-2.13, -0.333]
9    (-0.333, 1.464]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.935, -2.13] < (-2.13, -0.333] < (-0.333, 1.464] < (1.464, 3.261]]

In [36]:
def get_stats(group):
    """Summarise ``group`` as a dict with keys "min", "max", "count" and "mean".

    Each value is the result of calling the method of the same name on ``group``;
    the keys are produced in the order listed above.
    """
    stat_names = ("min", "max", "count", "mean")
    return {stat: getattr(group, stat)() for stat in stat_names}

In [37]:
# Bucket data2 by the bins computed from data1, then summarise every bucket.
grouped = frame["data2"].groupby(by=quartiles)
# unstack() moves the statistic names from the inner index level into columns.
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.935, -2.13]",-1.679227,1.01098,16.0,-0.48057
"(-2.13, -0.333]",-2.592704,2.853411,367.0,0.038712
"(-0.333, 1.464]",-2.943826,2.931508,541.0,0.025181
"(1.464, 3.261]",-2.289574,2.547624,76.0,0.12111


In [38]:
# Six random values with every other entry (positions 0, 2, 4) replaced by NaN.
s = pd.Series(np.random.randn(6))
s.iloc[::2] = np.nan
s

0         NaN
1    0.036799
2         NaN
3   -0.881619
4         NaN
5   -0.362674
dtype: float64

In [39]:
# Replace the missing entries with the mean of the values that are present.
s.fillna(value=s.mean())

0   -0.402498
1    0.036799
2   -0.402498
3   -0.881619
4   -0.402498
5   -0.362674
dtype: float64

In [40]:
# Build a 52-card deck as a Series: the index is the card name (rank followed by the
# suit letter) and the value is the card's points (ace 1, number cards face value,
# J/Q/K worth 10 each).
suits = ["H", "S", "C", "D"]
# One run of 13 values per suit; tied to len(suits) instead of a hard-coded 4.
card_val = (list(range(1, 11)) + [10] * 3) * len(suits)
base_names = ["A"] + list(range(2, 11)) + ["J", "Q", "K"]
cards = []
# Iterate over `suits` itself (it was previously defined but unused) so that names and
# values cannot drift apart if the suit list changes.
for suit in suits:
    cards.extend(str(num) + suit for num in base_names)
deck = pd.Series(card_val, index=cards)
deck[:13]

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
QH     10
KH     10
dtype: int64

In [41]:
def draw(deck, n=5):
    """Return ``n`` entries picked from ``deck`` by its ``sample`` method (5 by default)."""
    hand = deck.sample(n=n)
    return hand

In [42]:
draw(deck)

6D     6
5C     5
7S     7
AS     1
JD    10
dtype: int64

In [43]:
def get_suit(card):
    """Return the suit of a card name, i.e. its last character."""
    return card[-1]


# Draw two cards per suit: get_suit is called on every index label to form the groups.
deck.groupby(get_suit).apply(draw, n=2)

C  3C      3
   8C      8
D  9D      9
   8D      8
H  5H      5
   10H    10
S  4S      4
   2S      2
dtype: int64

In [44]:
# Two categories of four rows each, with a random value and a random weight per row.
# Note that this rebinds the name `df` used by earlier cells.
df = pd.DataFrame(
    dict(
        category=["a", "a", "a", "a", "b", "b", "b", "b"],
        data=np.random.randn(8),
        weights=np.random.rand(8),
    )
)
df

Unnamed: 0,category,data,weights
0,a,-0.235304,0.25516
1,a,0.500191,0.009074
2,a,0.648482,0.346287
3,a,0.377568,0.053035
4,b,-0.757522,0.063279
5,b,-0.744632,0.593314
6,b,0.607651,0.221431
7,b,-1.233,0.684613


In [45]:
# Weighted average of "data" per category. Selecting the two needed columns keeps the
# grouping column out of the frames handed to apply(), which recent pandas versions
# warn about; the function only reads "data" and "weights" anyway.
df.groupby("category")[["data", "weights"]].apply(
    lambda g: np.average(g["data"], weights=g["weights"])
)

category
a    0.284955
b   -0.767492
dtype: float64