In [1]:
# https://www.ritchieng.com/pandas-selecting-multiple-rows-and-columns/
import pandas as pd
url = 'http://bit.ly/uforeports'
ufo = pd.read_csv(url)
# show first 3 shows
ufo.head(3)

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00


In [2]:
# .loc DataFrame method
# filtering rows and selecting columns by label

# format
# ufo.loc[rows, columns]

# row 0, all columns
ufo.loc[0, :]

City                       Ithaca
Colors Reported               NaN
Shape Reported           TRIANGLE
State                          NY
Time               6/1/1930 22:00
Name: 0, dtype: object

In [3]:
# rows 0, 1, 2
# all columns

ufo.loc[[0, 1, 2], :]

# more efficient code
ufo.loc[0:2, :]

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00


In [4]:
# if you leave off ", :" pandas would assume it's there
# but you should leave it there to improve code readability
ufo.loc[0:2]

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00


In [5]:
# all rows
# column: City
ufo.loc[:, 'City']

0                      Ithaca
1                 Willingboro
2                     Holyoke
3                     Abilene
4        New York Worlds Fair
                 ...         
18236              Grant Park
18237             Spirit Lake
18238             Eagle River
18239             Eagle River
18240                    Ybor
Name: City, Length: 18241, dtype: object

In [6]:
# all rows
# column: City, State
ufo.loc[:, ['City', 'State']]

# similar code for City through State
ufo.loc[:, 'City':'State']

Unnamed: 0,City,Colors Reported,Shape Reported,State
0,Ithaca,,TRIANGLE,NY
1,Willingboro,,OTHER,NJ
2,Holyoke,,OVAL,CO
3,Abilene,,DISK,KS
4,New York Worlds Fair,,LIGHT,NY
...,...,...,...,...
18236,Grant Park,,TRIANGLE,IL
18237,Spirit Lake,,DISK,IA
18238,Eagle River,,,WI
18239,Eagle River,RED,LIGHT,WI


In [7]:
# multiple rows and multiple columns
ufo.loc[0:2, 'City':'State']

Unnamed: 0,City,Colors Reported,Shape Reported,State
0,Ithaca,,TRIANGLE,NY
1,Willingboro,,OTHER,NJ
2,Holyoke,,OVAL,CO


In [8]:
# filter using City=='Oakland'
ufo[ufo.City=='Oakland']

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
1694,Oakland,,CIGAR,CA,7/21/1968 14:00
2144,Oakland,,DISK,CA,8/19/1971 0:00
4686,Oakland,,LIGHT,MD,6/1/1982 0:00
7293,Oakland,,LIGHT,CA,3/28/1994 17:00
8488,Oakland,,,CA,8/10/1995 21:45
8768,Oakland,,,CA,10/10/1995 22:40
10816,Oakland,,LIGHT,OR,10/1/1997 21:30
10948,Oakland,,DISK,CA,11/14/1997 19:55
11045,Oakland,,TRIANGLE,CA,12/10/1997 1:30
12322,Oakland,,FIREBALL,CA,10/9/1998 19:40


In [9]:
# easier-to-read code
# here you specify the rows and columns you want
# ufo.loc[rows, columns]

ufo.loc[ufo.City=='Oakland', :]

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
1694,Oakland,,CIGAR,CA,7/21/1968 14:00
2144,Oakland,,DISK,CA,8/19/1971 0:00
4686,Oakland,,LIGHT,MD,6/1/1982 0:00
7293,Oakland,,LIGHT,CA,3/28/1994 17:00
8488,Oakland,,,CA,8/10/1995 21:45
8768,Oakland,,,CA,10/10/1995 22:40
10816,Oakland,,LIGHT,OR,10/1/1997 21:30
10948,Oakland,,DISK,CA,11/14/1997 19:55
11045,Oakland,,TRIANGLE,CA,12/10/1997 1:30
12322,Oakland,,FIREBALL,CA,10/9/1998 19:40


In [10]:
# again, specifying the rows and columns you want
# this would be the best way to do it compared to chain indexing 
ufo.loc[ufo.City=='Oakland', 'State']

1694     CA
2144     CA
4686     MD
7293     CA
8488     CA
8768     CA
10816    OR
10948    CA
11045    CA
12322    CA
12941    CA
16803    MD
17322    CA
Name: State, dtype: object

In [11]:
# chain indexing 
# there may be issues in some cases
# try not to use this
ufo[ufo.City=='Oakland'].State

1694     CA
2144     CA
4686     MD
7293     CA
8488     CA
8768     CA
10816    OR
10948    CA
11045    CA
12322    CA
12941    CA
16803    MD
17322    CA
Name: State, dtype: object

In [12]:
ufo.iloc[:, [0, 3]]

Unnamed: 0,City,State
0,Ithaca,NY
1,Willingboro,NJ
2,Holyoke,CO
3,Abilene,KS
4,New York Worlds Fair,NY
...,...,...
18236,Grant Park,IL
18237,Spirit Lake,IA
18238,Eagle River,WI
18239,Eagle River,WI


In [13]:
# iloc excludes 4 (compared to loc where it includes 4)
# iloc includes 0
ufo.iloc[:, 0:4]

Unnamed: 0,City,Colors Reported,Shape Reported,State
0,Ithaca,,TRIANGLE,NY
1,Willingboro,,OTHER,NJ
2,Holyoke,,OVAL,CO
3,Abilene,,DISK,KS
4,New York Worlds Fair,,LIGHT,NY
...,...,...,...,...
18236,Grant Park,,TRIANGLE,IL
18237,Spirit Lake,,DISK,IA
18238,Eagle River,,,WI
18239,Eagle River,RED,LIGHT,WI


In [14]:
# this is the major difference
# exclusive of 3
ufo.iloc[0:3, :]

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00


In [15]:
# non-explicit code
ufo[['City', 'State']]

# explicit code
ufo.loc[:, ['City', 'State']]

Unnamed: 0,City,State
0,Ithaca,NY
1,Willingboro,NJ
2,Holyoke,CO
3,Abilene,KS
4,New York Worlds Fair,NY
...,...,...
18236,Grant Park,IL
18237,Spirit Lake,IA
18238,Eagle River,WI
18239,Eagle River,WI


In [16]:
# ambiguous code again, are we referring to rows or columns?
ufo[0:2]

# use iloc!
ufo.iloc[0:2, :]

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00


In [18]:
drinks_url = 'http://bit.ly/drinksbycountry'
drinks = pd.read_csv(drinks_url, index_col='country')
drinks.head()

Unnamed: 0_level_0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,0,0,0,0.0,Asia
Albania,89,132,54,4.9,Europe
Algeria,25,0,14,0.7,Africa
Andorra,245,138,312,12.4,Europe
Angola,217,57,45,5.9,Africa


In [19]:
drinks.ix['Albania', 0]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  return getattr(section, self.name)[new_key]


89

In [20]:
drinks.ix[1, 'beer_servings']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


89

In [21]:
# for rows, .ix is inclusive from start to end
# for columns, .ix is exclusive of end but inclusive of start
ufo.ix[0:2, 0:2]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  retval = getattr(retval, self.name)._getitem_axis(key, axis=i)


Unnamed: 0,City,Colors Reported
0,Ithaca,
1,Willingboro,
2,Holyoke,


In [28]:
#https://www.ritchieng.com/pandas-inplace-parameter/
import pandas as pd
url = 'http://bit.ly/uforeports'
ufo = pd.read_csv(url)
ufo.shape

(18241, 5)

In [29]:
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [30]:
# dropping City column
ufo.drop('City', axis=1).head()

Unnamed: 0,Colors Reported,Shape Reported,State,Time
0,,TRIANGLE,NY,6/1/1930 22:00
1,,OTHER,NJ,6/30/1930 20:00
2,,OVAL,CO,2/15/1931 14:00
3,,DISK,KS,6/1/1931 13:00
4,,LIGHT,NY,4/18/1933 19:00


In [31]:
# you can see that the City column is not gone 
# drop() method has inplace=False as default
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [32]:
# you want to change to inplace=True to affect the underlying data
ufo.drop('City', axis=1, inplace=True)
ufo.head()

Unnamed: 0,Colors Reported,Shape Reported,State,Time
0,,TRIANGLE,NY,6/1/1930 22:00
1,,OTHER,NJ,6/30/1930 20:00
2,,OVAL,CO,2/15/1931 14:00
3,,DISK,KS,6/1/1931 13:00
4,,LIGHT,NY,4/18/1933 19:00


In [33]:
# dropna with how='any' would drop any row with 'NaN'
ufo.dropna(how='any').shape

(2490, 4)

In [34]:
ufo.shape
# as you can see, we lose a lot of rows because of dropna
# but the underlying data has not been affected because inplace=False for .dropna()

(18241, 4)

In [35]:
# some examples with inplace=False
# most are set to False

# ufo.set_index()
# ufo.rename()
# you can not use inplace=True and use an assignment instead
ufo = ufo.set_index('Time')
ufo.tail()

Unnamed: 0_level_0,Colors Reported,Shape Reported,State
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12/31/2000 23:00,,TRIANGLE,IL
12/31/2000 23:00,,DISK,IA
12/31/2000 23:45,,,WI
12/31/2000 23:45,RED,LIGHT,WI
12/31/2000 23:59,,OVAL,FL


In [36]:
ufo.fillna(method='bfill').tail()

Unnamed: 0_level_0,Colors Reported,Shape Reported,State
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12/31/2000 23:00,RED,TRIANGLE,IL
12/31/2000 23:00,RED,DISK,IA
12/31/2000 23:45,RED,LIGHT,WI
12/31/2000 23:45,RED,LIGHT,WI
12/31/2000 23:59,,OVAL,FL


In [37]:
ufo.fillna(method='ffill').tail()

Unnamed: 0_level_0,Colors Reported,Shape Reported,State
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12/31/2000 23:00,RED,TRIANGLE,IL
12/31/2000 23:00,RED,DISK,IA
12/31/2000 23:45,RED,DISK,WI
12/31/2000 23:45,RED,LIGHT,WI
12/31/2000 23:59,RED,OVAL,FL


In [38]:
# https://www.ritchieng.com/pandas-making-dataframe-smaller-faster/
import pandas as pd
url = 'http://bit.ly/drinksbycountry'
drinks = pd.read_csv(url)
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [39]:
drinks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
country                         193 non-null object
beer_servings                   193 non-null int64
spirit_servings                 193 non-null int64
wine_servings                   193 non-null int64
total_litres_of_pure_alcohol    193 non-null float64
continent                       193 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 7.6+ KB


In [40]:
# we can count the actual memory usage using the following command
drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
country                         193 non-null object
beer_servings                   193 non-null int64
spirit_servings                 193 non-null int64
wine_servings                   193 non-null int64
total_litres_of_pure_alcohol    193 non-null float64
continent                       193 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 19.9 KB


In [41]:
# we can check how much space each column is actually taking
# the numbers are in bytes, not kilobytes
drinks.memory_usage(deep=True)

Index                             64
country                         7184
beer_servings                   1544
spirit_servings                 1544
wine_servings                   1544
total_litres_of_pure_alcohol    1544
continent                       6928
dtype: int64

In [42]:
type(drinks.memory_usage(deep=True))

pandas.core.series.Series

In [43]:
# since it is a series, we can use .sum()
drinks.memory_usage(deep=True).sum()

20352

In [44]:
# there are only 6 unique values of continent
# we can replace strings with digits to save space
sorted(drinks.continent.unique())

['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

In [45]:
drinks.continent.head()

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: object

In [47]:
# converting continent from object to category 
# it stores the strings as integers
drinks['continent'] = drinks.continent.astype('category')
drinks.dtypes

country                           object
beer_servings                      int64
spirit_servings                    int64
wine_servings                      int64
total_litres_of_pure_alcohol     float64
continent                       category
dtype: object

In [48]:
drinks.continent.head()

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: category
Categories (6, object): [Africa, Asia, Europe, North America, Oceania, South America]

In [49]:
# .cat is similar to .str
# we can do more stuff after .cat
# we can see here how pandas represents the continents as integers
drinks.continent.cat.codes.head()

0    1
1    2
2    0
3    2
4    0
dtype: int8

In [50]:
# before this conversion, it was over 12332 bytes
# now it is 584 bytes
drinks.memory_usage(deep=True)

Index                             64
country                         7184
beer_servings                   1544
spirit_servings                 1544
wine_servings                   1544
total_litres_of_pure_alcohol    1544
continent                        512
dtype: int64

In [51]:
# we can convert country to a category too
drinks.dtypes

country                           object
beer_servings                      int64
spirit_servings                    int64
wine_servings                      int64
total_litres_of_pure_alcohol     float64
continent                       category
dtype: object

In [52]:
drinks['country'] = drinks.country.astype('category')
# this is larger! 
# this is because we've too many categories
drinks.memory_usage(deep=True)

Index                              64
country                         10642
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         512
dtype: int64

In [53]:
# now we've 193 digits
# it points to a lookup table with 193 strings!
drinks.country.cat.categories

Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       'United Arab Emirates', 'United Kingdom', 'Uruguay', 'Uzbekistan',
       'Vanuatu', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],
      dtype='object', length=193)

In [54]:
#The key to converting to category is to ensure that there are few categories to save memory usage. 
#If there are too many, we should not convert.

# passing a dictionary {} to the DataFrame method = 
id_list =[100, 101, 102, 103]
quality_list = ['good', 'very good', 'good', 'excellent']
df = pd.DataFrame({'ID': id_list, 'quality': quality_list })
df

Unnamed: 0,ID,quality
0,100,good
1,101,very good
2,102,good
3,103,excellent


In [55]:
# this sorts using alphabetical order
# but there is a logical ordering to these categories, we need to tell pandas there is a logical ordering
df.sort_values('quality')

Unnamed: 0,ID,quality
3,103,excellent
0,100,good
2,102,good
1,101,very good


In [63]:
# how do we tell pandas there is a logical order?
quality_list_ordered = ['good', 'very good', 'excellent']
#df['quality'] = df.quality.astype('category', categories = quality_list_ordered, ordered=True)


In [64]:
cat_dtype = pd.api.types.CategoricalDtype(categories=quality_list_ordered, ordered=True)
df['quality'] = df.quality.astype(cat_dtype)

In [65]:
# here we have good < very good < excellent
df.quality

0         good
1    very good
2         good
3    excellent
Name: quality, dtype: category
Categories (3, object): [good < very good < excellent]

In [67]:
s = pd.Series(["a", "b", "c", "a"], dtype="category")
s

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]

In [68]:
df = pd.DataFrame({"A": ["a", "b", "c", "a"]})
df["B"] = df["A"].astype('category')
df

Unnamed: 0,A,B
0,a,a
1,b,b
2,c,c
3,a,a


In [70]:
import numpy as np
df = pd.DataFrame({'value': np.random.randint(0, 100, 20)})
labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)]
df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels)
df.head(10)

Unnamed: 0,value,group
0,2,0 - 9
1,13,10 - 19
2,83,80 - 89
3,38,30 - 39
4,8,0 - 9
5,87,80 - 89
6,77,70 - 79
7,9,0 - 9
8,80,80 - 89
9,12,10 - 19


In [71]:
df = pd.DataFrame({'value': np.random.randint(0, 100, 20)})
labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)]
df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels)
df.head(10)

Unnamed: 0,value,group
0,44,40 - 49
1,81,80 - 89
2,42,40 - 49
3,72,70 - 79
4,80,80 - 89
5,81,80 - 89
6,80,80 - 89
7,76,70 - 79
8,29,20 - 29
9,15,10 - 19


In [73]:
raw_cat = pd.Categorical(["a", "b", "c", "a"], categories=["b", "c", "d"],ordered=False)
s = pd.Series(raw_cat)
s

0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (3, object): [b, c, d]

In [74]:
df = pd.DataFrame({"A": ["a", "b", "c", "a"]})
df["B"] = raw_cat
df

Unnamed: 0,A,B
0,a,
1,b,b
2,c,c
3,a,


In [75]:
df.dtypes

A      object
B    category
dtype: object

In [76]:
from pandas.api.types import CategoricalDtype

s = pd.Series(["a", "b", "c", "a"])

cat_type = CategoricalDtype(categories=["b", "c", "d"], ordered=True)
s_cat = s.astype(cat_type)
s_cat

0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (3, object): [b < c < d]

In [77]:
df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')})
cat_type = CategoricalDtype(categories=list('abcd'), ordered=True)
df_cat = df.astype(cat_type)
df_cat['A']

0    a
1    b
2    c
3    a
Name: A, dtype: category
Categories (4, object): [a < b < c < d]

In [78]:
df_cat['B']

0    b
1    c
2    c
3    d
Name: B, dtype: category
Categories (4, object): [a < b < c < d]

In [79]:
splitter = np.random.choice([0, 1], 5, p=[0.5, 0.5])

s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"]))
s

0     test
1     test
2     test
3    train
4     test
dtype: category
Categories (2, object): [train, test]

In [122]:
s = pd.Series(["a", "b", "c", "a"])
s

0    a
1    b
2    c
3    a
dtype: object

In [123]:
original_dtype = s.dtype
original_dtype

dtype('O')

In [124]:
s2 = s.astype('category')
s2

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]

In [125]:
str

str

In [129]:
s2 = s2.astype(str)
s2

0    a
1    b
2    c
3    a
dtype: object

In [130]:
s2 = s.astype('category')
s2

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]

In [131]:
np.asarray(s2)
ar

array(['a', 'b', 'c', 'a'], dtype=object)

In [132]:
CategoricalDtype(['a', 'b', 'c'])

CategoricalDtype(categories=['a', 'b', 'c'], ordered=None)

In [133]:
CategoricalDtype(['a', 'b', 'c'], ordered=True)

CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)

In [134]:
CategoricalDtype()

CategoricalDtype(categories=None, ordered=None)

In [135]:
c1 = CategoricalDtype(['a', 'b', 'c'], ordered=False)

In [136]:
# Equal, since order is not considered when ordered=False
c1 == CategoricalDtype(['b', 'c', 'a'], ordered=False)

True

In [137]:
# Unequal, since the second CategoricalDtype is ordered
c1 == CategoricalDtype(['a', 'b', 'c'], ordered=True)

False

In [138]:
cat = pd.Categorical(["a", "c", "c", np.nan], categories=["b", "a", "c"])

df = pd.DataFrame({"cat": cat, "s": ["a", "c", "c", np.nan]})

df.describe()

Unnamed: 0,cat,s
count,3,3
unique,2,2
top,c,c
freq,2,2


In [139]:
df["cat"].describe()

count     3
unique    2
top       c
freq      2
Name: cat, dtype: object

In [140]:
s = pd.Series(["a", "b", "c", "a"], dtype="category")
s.cat.categories

Index(['a', 'b', 'c'], dtype='object')

In [141]:
s.cat.ordered

False

In [142]:
s = pd.Series(pd.Categorical(["a", "b", "c", "a"], categories=["c", "b", "a"]))
s.cat.categories

Index(['c', 'b', 'a'], dtype='object')

In [143]:
s.cat.ordered

False

In [144]:
s = pd.Series(list('babc')).astype(CategoricalDtype(list('abcd')))
s

0    b
1    a
2    b
3    c
dtype: category
Categories (4, object): [a, b, c, d]

In [145]:
# categories
s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [146]:
# uniques
s.unique()

[b, a, c]
Categories (3, object): [b, a, c]

In [147]:
s = pd.Series(["a", "b", "c", "a"], dtype="category")
s

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]

In [148]:
s.cat.categories = ["Group %s" % g for g in s.cat.categories]
s

0    Group a
1    Group b
2    Group c
3    Group a
dtype: category
Categories (3, object): [Group a, Group b, Group c]

In [149]:
s = s.cat.rename_categories([1, 2, 3])
s

0    1
1    2
2    3
3    1
dtype: category
Categories (3, int64): [1, 2, 3]

In [150]:
# You can also pass a dict-like object to map the renaming
s = s.cat.rename_categories({1: 'x', 2: 'y', 3: 'z'})
s

0    x
1    y
2    z
3    x
dtype: category
Categories (3, object): [x, y, z]

In [151]:
try:
    s.cat.categories = [1, 1, 1]
except ValueError as e:
    print("ValueError:", str(e))

ValueError: Categorical categories must be unique


In [152]:
try:
    s.cat.categories = [1, 2, np.nan]
except ValueError as e:
    print("ValueError:", str(e))

ValueError: Categorial categories cannot be null
