In [1]:
import pandas as pd

# Use 3 decimal places in output display
pd.set_option("display.precision", 3)

# Don't wrap repr(DataFrame) across additional lines
pd.set_option("display.expand_frame_repr", False)

# Set max rows displayed in output to 25
pd.set_option("display.max_rows", 25)

## Example 1: U.S. Congress Dataset

The dataset contains members’ first and last names, birth date, gender, type ("rep" for House of Representatives or "sen" for Senate), U.S. state, and political party. You can use df.tail() to vie the last few rows of the dataset:

In [4]:
dtypes = {
    "first_name": "category",
    "gender": "category",
    "type": "category",
    "state": "category",
    "party": "category",
}

In [5]:
df = pd.read_csv(
    "groupby-data/legislators-historical.csv",
    dtype=dtypes,
    usecols=list(dtypes) + ["birthday", "last_name"],
    parse_dates=["birthday"],
)

In [7]:
df.tail()

Unnamed: 0,last_name,first_name,birthday,gender,type,state,party
11970,Garrett,Thomas,1972-03-27,M,rep,VA,Republican
11971,Handel,Karen,1962-04-18,F,rep,GA,Republican
11972,Jones,Brenda,1959-10-24,F,rep,MI,Democrat
11973,Marino,Tom,1952-08-15,M,rep,PA,Republican
11974,Jones,Walter,1943-02-10,M,rep,NC,Republican


In [8]:
df.dtypes

last_name             object
first_name          category
birthday      datetime64[ns]
gender              category
type                category
state               category
party               category
dtype: object

In [32]:
n_by_state = df.groupby("state", sort=False)["last_name"].count()

In [33]:
n_by_state

state
DE      97
VA     432
SC     251
MD     305
PA    1053
      ... 
AK      16
PI      13
VI       4
GU       4
AS       2
Name: last_name, Length: 58, dtype: int64

In [18]:
n_by_state.sort_values(ascending=False)

state
NY    1461
PA    1053
OH     674
IL     486
VA     432
      ... 
GU       4
VI       4
DC       2
OL       2
AS       2
Name: last_name, Length: 58, dtype: int64

You can pass a lot more than just a single column name to .groupby() as the first argument. You can also specify any of the following:

* A list of multiple column names
* A dict or Pandas Series
* A NumPy array or Pandas Index, or an array-like iterable of these

In [21]:
n_by_state_gender = df.groupby(["state", "gender"])["last_name"].count()
type(n_by_state_gender)

pandas.core.series.Series

In [22]:
n_by_state_gender.index[:5]

MultiIndex([('AK', 'M'),
            ('AL', 'F'),
            ('AL', 'M'),
            ('AR', 'F'),
            ('AR', 'M')],
           names=['state', 'gender'])

In [23]:
n_by_state_gender

state  gender
AK     M          16
AL     F           3
       M         203
AR     F           5
       M         112
                ... 
WI     M         196
WV     F           1
       M         119
WY     F           2
       M          38
Name: last_name, Length: 104, dtype: int64

In [24]:
df.groupby(["state", "gender"], as_index=False)["last_name"].count()

Unnamed: 0,state,gender,last_name
0,AK,F,
1,AK,M,16.0
2,AL,F,3.0
3,AL,M,203.0
4,AR,F,5.0
...,...,...,...
111,WI,M,196.0
112,WV,F,1.0
113,WV,M,119.0
114,WY,F,2.0


In [31]:
df.groupby("state", sort=False)["last_name"].count()

state
DE      97
VA     432
SC     251
MD     305
PA    1053
      ... 
AK      16
PI      13
VI       4
GU       4
AS       2
Name: last_name, Length: 58, dtype: int64

What is that DataFrameGroupBy thing? Its .__str__() doesn’t give you much information into what it actually is or how it works. The reason that a DataFrameGroupBy object can be difficult to wrap your head around is that it’s lazy in nature. It doesn’t really do any operations to produce a useful result until you say so.

One term that’s frequently used alongside .groupby() is split-apply-combine. This refers to a chain of three steps:

* Split a table into groups
* Apply some operations to each of those smaller tables
* Combine the results

It can be difficult to inspect df.groupby("state") because it does virtually none of these things until you do something with the resulting object. Again, a Pandas GroupBy object is lazy. It delays virtually every part of the split-apply-combine process until you invoke a method on it.

So, how can you mentally separate the split, apply, and combine stages if you can’t see any of them happening in isolation? One useful way to inspect a Pandas GroupBy object and see the splitting in action is to iterate over it. This is implemented in DataFrameGroupBy.__iter__() and produces an iterator of (group, DataFrame) pairs for DataFrames:

In [35]:
by_state = df.groupby("state")

In [38]:
for state, frame in by_state:
    print(f"First 2 entries for {state!r}")
    print("------------------------")
    print(frame.head(2), end="\n\n")

First 2 entries for 'AK'
------------------------
     last_name first_name   birthday gender type state        party
6619    Waskey      Frank 1875-04-20      M  rep    AK     Democrat
6647      Cale     Thomas 1848-09-17      M  rep    AK  Independent

First 2 entries for 'AL'
------------------------
    last_name first_name   birthday gender type state       party
912   Crowell       John 1780-09-18      M  rep    AL  Republican
991    Walker       John 1783-08-12      M  sen    AL  Republican

First 2 entries for 'AR'
------------------------
     last_name first_name   birthday gender type state party
1001     Bates      James 1788-08-25      M  rep    AR   NaN
1279    Conway      Henry 1793-03-18      M  rep    AR   NaN

First 2 entries for 'AS'
------------------------
          last_name first_name   birthday gender type state     party
10797         Sunia       Fofó 1937-03-13      M  rep    AS  Democrat
11755  Faleomavaega        Eni 1943-08-15      M  rep    AS  Democrat

F

19    Maclay    William 1737-07-20      M  sen    PA  Anti-Administration

First 2 entries for 'PI'
------------------------
             last_name first_name   birthday gender type state party
6835            Ocampo      Pablo 1853-01-25      M  rep    PI   NaN
6939  Legarda Y Tuason     Benito 1853-09-27      M  rep    PI   NaN

First 2 entries for 'PR'
------------------------
      last_name first_name   birthday gender type state       party
6426    Degetau   Federico 1862-12-05      M  rep    PR  Republican
6811  Larrinaga      Tulio 1847-01-15      M  rep    PR    Unionist

First 2 entries for 'RI'
------------------------
    last_name first_name   birthday gender type state       party
61   Bradford    William 1729-11-04      M  sen    RI  Federalist
105    Bourne   Benjamin 1755-09-09      M  rep    RI  Federalist

First 2 entries for 'SC'
------------------------
   last_name first_name   birthday gender type state               party
2      Burke    Aedanus 1743-06-16      

In [40]:
by_state.groups

{'AK': Int64Index([ 6619,  6647,  7442,  7501,  8039,  8236,  8877,  9819,  9951,
              9985, 10082, 10108, 10325, 11262, 11386, 11734],
            dtype='int64'),
 'AL': Int64Index([  912,   991,  1079,  1180,  1252,  1307,  1320,  1360,  1533,
              1621,
             ...
             11321, 11463, 11473, 11536, 11545, 11557, 11725, 11745, 11861,
             11870],
            dtype='int64', length=206),
 'AR': Int64Index([ 1001,  1279,  2096,  2254,  2318,  2421,  2466,  2472,  2598,
              3103,
             ...
             11156, 11188, 11239, 11307, 11429, 11531, 11602, 11704, 11741,
             11758],
            dtype='int64', length=117),
 'AS': Int64Index([10797, 11755], dtype='int64'),
 'AZ': Int64Index([ 3674,  3725,  3801,  4657,  4700,  5044,  5110,  5986,  6505,
              7244,  7541,  8103,  8386,  8431,  9132,  9163,  9355,  9441,
              9775,  9821,  9959, 10132, 10315, 10374, 10603, 10636, 10682,
             10744, 10817, 1096

In [43]:
by_state.groups["PA"]

Int64Index([    4,    19,    21,    27,    38,    57,    69,    76,    84,
               88,
            ...
            11842, 11866, 11875, 11877, 11887, 11891, 11932, 11945, 11959,
            11973],
           dtype='int64', length=1053)

This is virtually equivalent to using .loc[]. You could get the same output with something like df.loc[df["state"] == "PA"]

In [42]:
by_state.get_group("PA")

Unnamed: 0,last_name,first_name,birthday,gender,type,state,party
4,Clymer,George,1739-03-16,M,rep,PA,
19,Maclay,William,1737-07-20,M,sen,PA,Anti-Administration
21,Morris,Robert,1734-01-20,M,sen,PA,Pro-Administration
27,Wynkoop,Henry,1737-03-02,M,rep,PA,
38,Jacobs,Israel,1726-06-09,M,rep,PA,
...,...,...,...,...,...,...,...
11891,Brady,Robert,1945-04-07,M,rep,PA,Democrat
11932,Shuster,Bill,1961-01-10,M,rep,PA,Republican
11945,Rothfus,Keith,1962-04-25,M,rep,PA,Republican
11959,Costello,Ryan,1976-09-07,M,rep,PA,Republican
