In [1]:
import pandas as pd
import numpy as np
%config Completer.use_jedi=False

## Object creation
---

In [2]:
# Passing dtype="category" to the constructor builds a categorical Series directly.
s = pd.Series(["a", "b", "c", "a"], dtype="category")
s

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [13]:
df = pd.DataFrame({"A": ["a", "b", "c", "a"]})
df.dtypes

A    object
dtype: object

In [14]:
df['B'] = df['A'].astype('category')
df.dtypes

A      object
B    category
dtype: object

In [15]:
# Draw from a seeded generator so the sample (and everything derived from it)
# is identical on every Restart & Run All.
rng = np.random.default_rng(0)
df = pd.DataFrame({"value": rng.integers(0, 100, 20)})
df

Unnamed: 0,value
0,7
1,47
2,45
3,26
4,0
5,92
6,47
7,81
8,59
9,98


In [16]:
# One label per decade-wide bin: "0 - 9", "10 - 19", ..., "90 - 99".
labels = [f"{start} - {start + 9}" for start in range(0, 100, 10)]
labels

['0 - 9',
 '10 - 19',
 '20 - 29',
 '30 - 39',
 '40 - 49',
 '50 - 59',
 '60 - 69',
 '70 - 79',
 '80 - 89',
 '90 - 99']

In [22]:
# right=False makes each bin closed on the left, i.e. [0, 10), [10, 20), ...,
# which lines up with the "0 - 9", "10 - 19", ... labels.
df["group"] = pd.cut(
    df["value"],
    bins=range(0, 105, 10),
    right=False,
    labels=labels,
)
df.head(10)

Unnamed: 0,value,group
0,7,0 - 9
1,47,40 - 49
2,45,40 - 49
3,26,20 - 29
4,0,0 - 9
5,92,90 - 99
6,47,40 - 49
7,81,80 - 89
8,59,50 - 59
9,98,90 - 99


## DataFrame creation
---

In [23]:
df = pd.DataFrame({"A": list("abca"), "B": list("bccd")}, dtype="category")
df.dtypes

A    category
B    category
dtype: object

In [24]:
df['A']

0    a
1    b
2    c
3    a
Name: A, dtype: category
Categories (3, object): ['a', 'b', 'c']

## Controlling behavior
---

In [26]:
from pandas.api.types import CategoricalDtype

# Explicit dtype: fixed categories b < d < c. "a" is not among them, so the
# cast below turns every "a" into NaN.
cat_type = CategoricalDtype(categories=["b", "d", "c"], ordered=True)
s = pd.Series(["a", "b", "c", "a"])
s_cat = s.astype(cat_type)
s_cat

0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (3, object): ['b' < 'd' < 'c']

## Regaining original data
---

In [27]:
s = pd.Series(["a", "b", "c", "a"])
s

0    a
1    b
2    c
3    a
dtype: object

In [28]:
s2 = s.astype('category')
s2

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [32]:
s2.astype(str)

0    a
1    b
2    c
3    a
dtype: object

In [33]:
np.asarray(s2)

array(['a', 'b', 'c', 'a'], dtype=object)

## CategoricalDtype
---

In [35]:
from pandas.api.types import CategoricalDtype
CategoricalDtype(list('abc'))

CategoricalDtype(categories=['a', 'b', 'c'], ordered=False)

In [36]:
CategoricalDtype(["a", "b", "c"], ordered=True)

CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)

In [37]:
CategoricalDtype()

CategoricalDtype(categories=None, ordered=False)

## Equality semantics
---

In [38]:
c1 = CategoricalDtype(["a", "b", "c"], ordered=False)
c1 == CategoricalDtype(["b", "c", "a"], ordered=False)

True

In [39]:
c1 == CategoricalDtype(list('bca'), ordered=True)

False

## Description
---

In [40]:
cat = pd.Categorical(["a", "c", "c", np.nan], categories=["b", "a", "c"])
cat

['a', 'c', 'c', NaN]
Categories (3, object): ['b', 'a', 'c']

In [41]:
df = pd.DataFrame({"cat": cat, "s": ["a", "c", "c", np.nan]})
df

Unnamed: 0,cat,s
0,a,a
1,c,c
2,c,c
3,,


In [44]:
df["cat"].describe()

count     3
unique    2
top       c
freq      2
Name: cat, dtype: object

In [46]:
df.describe()

Unnamed: 0,cat,s
count,3,3
unique,2,2
top,c,c
freq,2,2


## Working with categories
---

In [48]:
s = pd.Series(list('abca'), dtype='category')
s.cat.categories

Index(['a', 'b', 'c'], dtype='object')

In [49]:
s.cat.ordered

False

In [50]:
s = pd.Series(pd.Categorical(["a", "b", "c", "a"], categories=["c", "b", "a"]))
s.cat.categories

Index(['c', 'b', 'a'], dtype='object')

In [51]:
s.cat.ordered

False

In [52]:
s = pd.Series(list("babc")).astype(CategoricalDtype(list("abcd")))
s

0    b
1    a
2    b
3    c
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [53]:
s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [54]:
s.unique()

['b', 'a', 'c']
Categories (3, object): ['b', 'a', 'c']

## Renaming categories
---

In [59]:
s = pd.Series(list('abca'), dtype='category')
s

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [60]:
# Rename through rename_categories, which returns a new Series. Assigning to
# `.cat.categories` was deprecated and then removed in pandas 2.0.
s = s.cat.rename_categories([f"Group {g}" for g in s.cat.categories])
s

0    Group a
1    Group b
2    Group c
3    Group a
dtype: category
Categories (3, object): ['Group a', 'Group b', 'Group c']

In [61]:
s = s.cat.rename_categories([1,2,3])
s

0    1
1    2
2    3
3    1
dtype: category
Categories (3, int64): [1, 2, 3]

## Appending new categories
---

In [62]:
s = s.cat.add_categories([4])
s.cat.categories

Int64Index([1, 2, 3, 4], dtype='int64')

## Removing categories
---

In [64]:
s = s.cat.remove_categories([4])
s

0    1
1    2
2    3
3    1
dtype: category
Categories (3, int64): [1, 2, 3]

## Removing unused categories
---

In [65]:
# "c" and "d" are declared but never used, so they are dropped from the categories.
s = pd.Series(
    pd.Categorical(["a", "b", "a"], categories=list("abcd"))
).cat.remove_unused_categories()
s

0    a
1    b
2    a
dtype: category
Categories (2, object): ['a', 'b']

## Setting categories
---

In [66]:
s = pd.Series(["one", "two", "four", "-"], dtype="category")
s

0     one
1     two
2    four
3       -
dtype: category
Categories (4, object): ['-', 'four', 'one', 'two']

In [67]:
s = s.cat.set_categories(["one", "two", "three", "four"])
s

0     one
1     two
2    four
3     NaN
dtype: category
Categories (4, object): ['one', 'two', 'three', 'four']

## Sorting and order
---

In [68]:
s = pd.Series(pd.Categorical(["a", "b", "c", "a"], ordered=False))
# Rebind to the sorted result instead of mutating with inplace=True; the cell
# stays idempotent and the call can be chained.
s = s.sort_values()
s

0    a
3    a
1    b
2    c
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [69]:
s = pd.Series(["a", "b", "c", "a"]).astype(CategoricalDtype(ordered=True))
s

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a' < 'b' < 'c']

In [70]:
s.min(), s.max()

('a', 'c')

In [71]:
s.cat.as_ordered()

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a' < 'b' < 'c']

In [72]:
s.cat.as_unordered()

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

## Reordering
---

In [73]:
# Same three categories, but with the explicit order 2 < 3 < 1.
s = (
    pd.Series([1, 2, 3, 1], dtype="category")
    .cat.reorder_categories([2, 3, 1], ordered=True)
)
s

0    1
1    2
2    3
3    1
dtype: category
Categories (3, int64): [2 < 3 < 1]

In [75]:
# Sorting follows the category order (2 < 3 < 1), not numeric order.
# Rebind instead of using inplace=True so the cell can be re-run safely.
s = s.sort_values()
s

1    2
2    3
0    1
3    1
dtype: category
Categories (3, int64): [2 < 3 < 1]

In [76]:
s.min(), s.max()

(2, 1)

## Multi column sorting
---

In [78]:
# Column "A" is an ordered categorical with e < a < b; column "B" is plain integers.
dfs = pd.DataFrame(
    {
        "A": pd.Categorical(
            ["b", "b", "e", "e", "b", "b", "a", "a"],
            categories=list("eab"),
            ordered=True,
        ),
        "B": [1, 2, 1, 2, 2, 1, 2, 1],
    }
)
dfs

Unnamed: 0,A,B
0,b,1
1,b,2
2,e,1
3,e,2
4,b,2
5,b,1
6,a,2
7,a,1


In [79]:
dfs.sort_values(by=['A', 'B'])

Unnamed: 0,A,B
2,e,1
3,e,2
7,a,1
6,a,2
0,b,1
5,b,1
1,b,2
4,b,2


## Comparisons
---

In [80]:
# cat and cat_base share the explicit ordering 3 < 2 < 1.
cat = pd.Series([1, 2, 3]).astype(
    CategoricalDtype(categories=[3, 2, 1], ordered=True)
)
cat_base = pd.Series([2, 2, 2]).astype(
    CategoricalDtype(categories=[3, 2, 1], ordered=True)
)
# No categories given here, so they are taken from the data.
cat_base2 = pd.Series([2, 2, 2]).astype(CategoricalDtype(ordered=True))
cat

0    1
1    2
2    3
dtype: category
Categories (3, int64): [3 < 2 < 1]

In [81]:
cat_base

0    2
1    2
2    2
dtype: category
Categories (3, int64): [3 < 2 < 1]

In [82]:
cat_base2

0    2
1    2
2    2
dtype: category
Categories (1, int64): [2]

In [83]:
cat > cat_base

0     True
1    False
2    False
dtype: bool

In [84]:
cat == cat_base

0    False
1     True
2    False
dtype: bool

In [85]:
cat == np.array([1, 2, 3])

0    True
1    True
2    True
dtype: bool

In [86]:
cat == 2

0    False
1     True
2    False
dtype: bool

## Operations
---

In [87]:
# "d" is a declared category with no occurrences; value_counts still reports it.
s = pd.Series(pd.Categorical(list("abcc"), categories=list("cabd")))
s.value_counts()

c    2
b    1
a    1
d    0
dtype: int64

In [88]:
# Second column level is an ordered categorical that declares "Three" but never uses it.
columns = pd.Categorical(
    ["One", "One", "Two"],
    categories=["One", "Two", "Three"],
    ordered=True,
)
column_index = pd.MultiIndex.from_arrays([["A", "B", "B"], columns])
df = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6]], columns=column_index)
df

Unnamed: 0_level_0,A,B,B
Unnamed: 0_level_1,One,One,Two
0,1,2,3
1,4,5,6


In [89]:
# Sum columns that share the same second-level label. The `level=` argument of
# DataFrame.sum was removed in pandas 2.0, so group the transposed frame by
# level 1 instead. observed=False keeps the unused "Three" category (all zeros).
df.T.groupby(level=1, observed=False).sum().T

Unnamed: 0,One,Two,Three
0,3,3,0
1,9,6,0


## Data munging
### Getting

In [96]:
# Small frame indexed by the letters h..n: one categorical column, one integer column.
idx = pd.Index(list("hijklmn"))
cats = pd.Series(
    ["a", "b", "b", "b", "c", "c", "c"],
    index=idx,
    dtype="category",
)
values = [1, 2, 2, 2, 3, 4, 5]
df = pd.DataFrame({"cats": cats, "values": values}, index=idx)
df

Unnamed: 0,cats,values
h,a,1
i,b,2
j,b,2
k,b,2
l,c,3
m,c,4
n,c,5


In [97]:
df.iloc[2:4, :]

Unnamed: 0,cats,values
j,b,2
k,b,2


In [98]:
df.iloc[2:4, :].dtypes

cats      category
values       int64
dtype: object

In [99]:
df.loc["h":"j", "cats"]

h    a
i    b
j    b
Name: cats, dtype: category
Categories (3, object): ['a', 'b', 'c']

In [100]:
df.loc["h", :]

cats      a
values    1
Name: h, dtype: object

In [101]:
df.iat[0, 0]

'a'

In [102]:
# Rename the categories and assign the result back to the column. Setting
# `.cat.categories` on the temporary `df["cats"]` relied on a setter that was
# removed in pandas 2.0, and on the temporary sharing data with the frame.
df["cats"] = df["cats"].cat.rename_categories(["x", "y", "z"])

In [103]:
df.at["h", "cats"]  # returns a string

'x'

In [104]:
df.loc[["h"], "cats"]

h    x
Name: cats, dtype: category
Categories (3, object): ['x', 'y', 'z']

## String and datetime accessors
---

In [107]:
str_s = pd.Series(list('aabb'))
str_cat = str_s.astype('category')
str_cat

0    a
1    a
2    b
3    b
dtype: category
Categories (2, object): ['a', 'b']

In [108]:
str_cat.str.contains("a")

0     True
1     True
2    False
3    False
dtype: bool

In [109]:
date_s = pd.Series(pd.date_range("1/1/2015", periods=5))
date_cat = date_s.astype("category")
date_cat

0   2015-01-01
1   2015-01-02
2   2015-01-03
3   2015-01-04
4   2015-01-05
dtype: category
Categories (5, datetime64[ns]): [2015-01-01, 2015-01-02, 2015-01-03, 2015-01-04, 2015-01-05]

In [110]:
date_cat.dt.day

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [113]:
# Run the same .str call on the object Series and on its categorical twin,
# then compare result dtypes and values.
ret_s = str_s.str.contains("a")
ret_cat = str_cat.str.contains("a")
print(ret_s.dtype, ret_cat.dtype, sep="\n")
ret_s == ret_cat

bool
bool


0    True
1    True
2    True
3    True
dtype: bool

## Setting
---

In [114]:
df = pd.DataFrame({"a": [1, 1, 1, 1, 1], "b": ["a", "a", "a", "a", "a"]})
df

Unnamed: 0,a,b
0,1,a
1,1,a
2,1,a
3,1,a
4,1,a


In [115]:
# Column "a" holds int64, so writing strings into it needs a wider dtype.
# Do the cast explicitly: implicit upcasting on setitem is deprecated since
# pandas 2.1 and is slated to become an error.
df["a"] = df["a"].astype(object)
# Only the values are written; the column does not become categorical.
df.loc[1:2, "a"] = pd.Categorical(["b", "b"], categories=["a", "b"])
df

Unnamed: 0,a,b
0,1,a
1,b,a
2,b,a
3,1,a
4,1,a


In [116]:
df.loc[2:3, "b"] = pd.Categorical(["b", "b"], categories=["a", "b"])
df

Unnamed: 0,a,b
0,1,a
1,b,a
2,b,b
3,1,b
4,1,a


## Merging / concatenation
---

In [118]:
from pandas.api.types import union_categoricals
# same categories
s1 = pd.Series(["a", "b"], dtype='category')
s2 = pd.Series(["a", "b", "a"], dtype='category')
pd.concat([s1, s2])

0    a
1    b
0    a
1    b
2    a
dtype: category
Categories (2, object): ['a', 'b']

In [119]:
# different categories
s3 = pd.Series(["b", "c"], dtype="category")
pd.concat([s1, s3])

0    a
1    b
0    b
1    c
dtype: object

In [120]:
# Output dtype is inferred based on categories values
int_cats = pd.Series([1, 2], dtype="category")
float_cats = pd.Series([3.0, 4.0], dtype="category")
pd.concat([int_cats, float_cats])

0    1.0
1    2.0
0    3.0
1    4.0
dtype: float64

In [121]:
pd.concat([s1, s3]).astype("category")

0    a
1    b
0    b
1    c
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [122]:
union_categoricals([s1.array, s3.array])

['a', 'b', 'b', 'c']
Categories (3, object): ['a', 'b', 'c']

## Unioning
---

In [123]:
from pandas.api.types import union_categoricals
a = pd.Categorical(["b", "c"])
b = pd.Categorical(["a", "b"])
union_categoricals([a, b])

['b', 'c', 'a', 'b']
Categories (3, object): ['b', 'c', 'a']

In [124]:
union_categoricals([a, b], sort_categories=True)

['b', 'c', 'a', 'b']
Categories (3, object): ['a', 'b', 'c']

In [128]:
a = pd.Categorical(["a", "b"], ordered=True)
b = pd.Categorical(["a", "b", "a"], ordered=True)
union_categoricals([a, b])

['a', 'b', 'a', 'b', 'a']
Categories (2, object): ['a' < 'b']

In [129]:
a = pd.Categorical(["a", "b", "c"], ordered=True)
b = pd.Categorical(["c", "b", "a"], ordered=True)
print(union_categoricals([a, b]))
union_categoricals([a, b], ignore_order=True)

['a', 'b', 'c', 'c', 'b', 'a']
Categories (3, object): ['a' < 'b' < 'c']


['a', 'b', 'c', 'c', 'b', 'a']
Categories (3, object): ['a', 'b', 'c']

## Getting data in/out
---

In [130]:
import io  # used by the CSV round-trip cell further down

s = pd.Series(pd.Categorical(["a", "b", "b", "a", "a", "d"]))
print(s)
# rename categories
# rename_categories returns a new Series; assigning to `.cat.categories`
# directly was removed in pandas 2.0.
s = s.cat.rename_categories(["very good", "good", "bad"])
print(s)

0    a
1    b
2    b
3    a
4    a
5    d
dtype: category
Categories (3, object): ['a', 'b', 'd']
0    very good
1         good
2         good
3    very good
4    very good
5          bad
dtype: category
Categories (3, object): ['very good', 'good', 'bad']


In [131]:
# reorder the categories and add missing categories
s = s.cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
s

0    very good
1         good
2         good
3    very good
4    very good
5          bad
dtype: category
Categories (5, object): ['very bad', 'bad', 'medium', 'good', 'very good']

In [132]:
# Round-trip the frame through an in-memory CSV to see which dtypes come back.
df = pd.DataFrame({"cats": s, "vals": [1, 2, 3, 4, 5, 6]})
print(df.dtypes)
csv = io.StringIO()
df.to_csv(csv)
# Rewind the buffer and read the text that was just written.
csv.seek(0)
df2 = pd.read_csv(csv)
df2.dtypes

cats    category
vals       int64
dtype: object


Unnamed: 0     int64
cats          object
vals           int64
dtype: object

In [133]:
df2["cats"]

0    very good
1         good
2         good
3    very good
4    very good
5          bad
Name: cats, dtype: object

In [134]:
df2["cats"] = df2["cats"].astype("category")
df2['cats'].dtypes

CategoricalDtype(categories=['bad', 'good', 'very good'], ordered=False)

In [135]:
# Restore the full category list after the CSV round trip. set_categories
# returns a new Series, so assign it back to the column; the `inplace=`
# argument was removed in pandas 2.0.
df2["cats"] = df2["cats"].cat.set_categories(
    ["very bad", "bad", "medium", "good", "very good"]
)
df2["cats"].dtype

CategoricalDtype(categories=['very bad', 'bad', 'medium', 'good', 'very good'], ordered=False)

In [136]:
df2.dtypes

Unnamed: 0       int64
cats          category
vals             int64
dtype: object

In [137]:
df2["cats"]

0    very good
1         good
2         good
3    very good
4    very good
5          bad
Name: cats, dtype: category
Categories (5, object): ['very bad', 'bad', 'medium', 'good', 'very good']

## Missing data
---

In [138]:
s = pd.Series(["a", "b", np.nan, "a"], dtype="category")
s

0      a
1      b
2    NaN
3      a
dtype: category
Categories (2, object): ['a', 'b']

In [139]:
s.cat.codes

0    0
1    1
2   -1
3    0
dtype: int8

In [140]:
s = pd.Series(["a", "b", np.nan], dtype="category")
s

0      a
1      b
2    NaN
dtype: category
Categories (2, object): ['a', 'b']

In [141]:
pd.isna(s)

0    False
1    False
2     True
dtype: bool

In [142]:
s.fillna("a")

0    a
1    b
2    a
dtype: category
Categories (2, object): ['a', 'b']

In [145]:
s = pd.Series(["foo", "bar"] * 1000)
s.nbytes

16000

In [147]:
s.astype('category').nbytes

2016