# Chapter 10: Data Aggregation and Group Operations

In [1]:
import numpy as np
import pandas as pd

## 10.1: How to Think About Group Operations

In [2]:
# Build the small example frame used throughout this section.
# key1 and key2 each contain missing labels (None) so that the way groupby
# treats missing keys can be demonstrated; key2 uses the nullable "Int64"
# dtype so it can hold a missing value while staying integer-typed.
#
# The random columns come from a locally seeded Generator so that data1/data2
# are identical on every Restart & Run All. (Outputs saved from an earlier,
# unseeded run will show different numbers until the notebook is re-executed.)
rng = np.random.default_rng(seed=12345)
df = pd.DataFrame(
    {
        "key1": ["a", "a", None, "b", "b", "a", None],
        "key2": pd.Series([1, 2, 1, 2, 1, None, 1], dtype="Int64"),
        "data1": rng.standard_normal(7),  # first 7 draws
        "data2": rng.standard_normal(7),  # next 7 draws
    }
)
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,0.208484,-0.214905
1,a,2.0,-1.676413,0.661477
2,,1.0,-0.326257,0.373352
3,b,2.0,0.879961,0.501058
4,b,1.0,-0.85,-0.572475
5,a,,0.875807,0.49789
6,,1.0,-0.755993,-0.47719


In [3]:
# Split the data1 values using the labels in key1. The result is a
# SeriesGroupBy object; aggregations are requested on it in the next cells.
grouped = df["data1"].groupby(by=df["key1"])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001ACF58F0090>

In [4]:
# Mean of data1 within each key1 group. Rows whose key1 is missing do not
# appear in the result (only "a" and "b" are listed).
grouped.mean()

key1
a   -0.197374
b    0.014980
Name: data1, dtype: float64

In [5]:
# Frequency of each distinct data1 value inside each key1 group. Because the
# values are continuous random draws, every count is 1 here.
grouped.value_counts()

key1  data1    
a     -1.676413    1
       0.208484    1
       0.875807    1
b     -0.850000    1
       0.879961    1
Name: count, dtype: int64

In [6]:
# Passing a list of two key Series groups by every observed (key1, key2)
# pair; the result is a Series with a two-level index named key1/key2.
means = (
    df["data1"]
    .groupby([df["key1"], df["key2"]])
    .mean()
)
means

key1  key2
a     1       0.208484
      2      -1.676413
b     1      -0.850000
      2       0.879961
Name: data1, dtype: float64

In [7]:
# Pivot the inner index level (key2) into columns: rows are key1, columns
# are the key2 values.
means.unstack(level="key2")

key2,1,2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.208484,-1.676413
b,-0.85,0.879961


In [9]:
# Two stand-alone key sequences (one NumPy array, one plain list), each with
# one entry per row of df. They are not columns of df at this point.
states = np.array(
    ["OH", "CA", "CA", "OH", "OH", "CA", "OH"]
)
years = [
    2005, 2005, 2006, 2005, 2006, 2005, 2006,
]
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,0.208484,-0.214905
1,a,2.0,-1.676413,0.661477
2,,1.0,-0.326257,0.373352
3,b,2.0,0.879961,0.501058
4,b,1.0,-0.85,-0.572475
5,a,,0.875807,0.49789
6,,1.0,-0.755993,-0.47719


In [11]:
# Temporarily attach the external keys as columns so they can be viewed next
# to the data. NOTE: this mutates df in place; a later cell drops the two
# columns again.
df["states"] = states
df["years"] = years

df

Unnamed: 0,key1,key2,data1,data2,states,years
0,a,1.0,0.208484,-0.214905,OH,2005
1,a,2.0,-1.676413,0.661477,CA,2005
2,,1.0,-0.326257,0.373352,CA,2006
3,b,2.0,0.879961,0.501058,OH,2005
4,b,1.0,-0.85,-0.572475,OH,2006
5,a,,0.875807,0.49789,CA,2005
6,,1.0,-0.755993,-0.47719,OH,2006


In [13]:
# Pull out the data1 column on its own. Despite the "df" prefix in its name,
# df1 is a Series (see the Name/dtype footer in the output).
df1 = df["data1"]

df1

0    0.208484
1   -1.676413
2   -0.326257
3    0.879961
4   -0.850000
5    0.875807
6   -0.755993
Name: data1, dtype: float64

In [15]:
# Grouping keys do not have to be columns: the states array and the years
# list, matched to the rows by position, serve as the two key levels.
df1.groupby(by=[states, years]).mean()

CA  2005   -0.400303
    2006   -0.326257
OH  2005    0.544223
    2006   -0.802997
Name: data1, dtype: float64

In [21]:
# Remove the temporary grouping columns so df is back to its original four
# columns (key1, key2, data1, data2).
# errors="ignore" keeps this cell idempotent: running it a second time, when
# the columns are already gone, is a no-op instead of raising KeyError.
df = df.drop(columns=["states", "years"], errors="ignore")

In [22]:
# Confirm that the states/years helper columns are gone again.
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,0.208484,-0.214905
1,a,2.0,-1.676413,0.661477
2,,1.0,-0.326257,0.373352
3,b,2.0,0.879961,0.501058
4,b,1.0,-0.85,-0.572475
5,a,,0.875807,0.49789
6,,1.0,-0.755993,-0.47719


In [23]:
# Group by a column name instead of a Series. Every remaining column is
# numeric, so key2 is averaged as well (its missing value is skipped).
df.groupby(by="key1").mean()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.5,-0.197374,0.314821
b,1.5,0.01498,-0.035709


In [25]:
# key1 holds strings, so restrict the aggregation to the numeric columns
# when grouping by key2.
df.groupby(by="key2").mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-0.430942,-0.222804
2,-0.398226,0.581267


In [26]:
# A list of column names groups by each observed (key1, key2) combination.
df.groupby(by=["key1", "key2"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0.208484,-0.214905
a,2,-1.676413,0.661477
b,1,-0.85,-0.572475
b,2,0.879961,0.501058


In [28]:
# Show the frame again for reference: rows 2 and 6 have no key1 and row 5 has
# no key2, which matters for the size()/count() results that follow.
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,0.208484,-0.214905
1,a,2.0,-1.676413,0.661477
2,,1.0,-0.326257,0.373352
3,b,2.0,0.879961,0.501058
4,b,1.0,-0.85,-0.572475
5,a,,0.875807,0.49789
6,,1.0,-0.755993,-0.47719


In [27]:
# size() reports the number of rows in each group. Rows with a missing value
# in either key are left out by default, so only four groups are listed.
df.groupby(by=["key1", "key2"]).size()

key1  key2
a     1       1
      2       1
b     1       1
      2       1
dtype: int64

In [29]:
# dropna=False keeps the groups whose key is missing, so the
# (a, <NA>) and (NaN, 1) combinations are counted too.
df.groupby(by=["key1", "key2"], dropna=False).size()

key1  key2
a     1       1
      2       1
      <NA>    1
b     1       1
      2       1
NaN   1       2
dtype: int64

In [30]:
# count() tallies the non-null values per column in each group, so key2
# shows 2 for group "a" even though that group has 3 rows.
df.groupby(by="key1").count()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,2,3,3
b,2,2,2


In [31]:
# Row count per key1 group, regardless of nulls in the other columns
# (compare with count() above).
df.groupby(by="key1").size()

key1
a    3
b    2
dtype: int64

### Iterating over Groups

In [32]:
# Iterating over a GroupBy yields (group label, sub-frame) pairs.
# A single print with a newline separator emits the label and then the
# sub-frame, each on its own line.
for name, group in df.groupby(by="key1"):
    print(name, group, sep="\n")

a
  key1  key2     data1     data2
0    a     1  0.208484 -0.214905
1    a     2 -1.676413  0.661477
5    a  <NA>  0.875807  0.497890
b
  key1  key2     data1     data2
3    b     2  0.879961  0.501058
4    b     1 -0.850000 -0.572475


In [36]:
# With several keys the group label is a tuple, unpacked here into k1 and k2
# before printing each label component and the matching sub-frame.
for (k1, k2), group in df.groupby(by=["key1", "key2"]):
    print(k1, k2, group, sep="\n")

a
1
  key1  key2     data1     data2
0    a     1  0.208484 -0.214905
a
2
  key1  key2     data1     data2
1    a     2 -1.676413  0.661477
b
1
  key1  key2  data1     data2
4    b     1  -0.85 -0.572475
b
2
  key1  key2     data1     data2
3    b     2  0.879961  0.501058


In [37]:
# Materialize the (label, sub-frame) pairs and turn them into a dict so each
# group can be looked up by its key1 label.
pieces = dict(tuple(df.groupby(by="key1")))
pieces

{'a':   key1  key2     data1     data2
 0    a     1  0.208484 -0.214905
 1    a     2 -1.676413  0.661477
 5    a  <NA>  0.875807  0.497890,
 'b':   key1  key2     data1     data2
 3    b     2  0.879961  0.501058
 4    b     1 -0.850000 -0.572475}

In [38]:
# Look up the sub-frame holding only the rows where key1 == "b".
pieces["b"]

Unnamed: 0,key1,key2,data1,data2
3,b,2,0.879961,0.501058
4,b,1,-0.85,-0.572475
