In [1]:
import pandas as pd

In [4]:
# Letter grades listed from best to worst; the index labels each grade
# with a subjective judgment of how well the student is doing
df = pd.DataFrame(
    {'grades': ['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D']},
    index=['excellent'] * 3 + ['good'] * 3 + ['ok'] * 3 + ['poor'] * 2,
)

df


Unnamed: 0,grades
excellent,A+
excellent,A
excellent,A-
good,B+
good,B
good,B-
ok,C+
ok,C
ok,C-
poor,D+


In [6]:
# Inspect the column dtypes: 'grades' is reported as plain `object`
# because the column was populated with Python strings
df.dtypes

grades    object
dtype: object

In [7]:
# astype('category') converts the column to pandas' categorical dtype.
# The result is a new Series (df itself is not reassigned here) and
# head() shows its first five rows. The categories are unordered at
# this point.
df['grades'].astype('category').head()

excellent    A+
excellent     A
excellent    A-
good         B+
good          B
Name: grades, dtype: category
Categories (11, object): ['A', 'A+', 'A-', 'B', ..., 'C+', 'C-', 'D', 'D+']

In [16]:
# astype('category') found the 11 distinct grades, but it treats them as
# unordered labels. Grades are really ordinal, e.g. A- ranks above B+.

# To encode that ranking, build a CategoricalDtype whose categories are
# listed from lowest to highest and flag it with ordered=True
my_categories = pd.CategoricalDtype(
    categories=[
        'D', 'D+',
        'C-', 'C', 'C+',
        'B-', 'B', 'B+',
        'A-', 'A', 'A+',
    ],
    ordered=True,
)

# Hand this dtype object to astype() in place of the 'category' string
grades = df['grades'].astype(my_categories)

grades.head()

excellent    A+
excellent     A
excellent    A-
good         B+
good          B
Name: grades, dtype: category
Categories (11, object): ['D' < 'D+' < 'C-' < 'C' ... 'B+' < 'A-' < 'A' < 'A+']

In [14]:
# Ordered categories support comparisons, and therefore boolean masking

# For contrast, first try the comparison on the raw string column
df.loc[df['grades'] > 'C']

Unnamed: 0,grades
ok,C+
ok,C-
poor,D+
poor,D


In [17]:
# That result doesn't make sense for grades: the plain strings were
# compared lexicographically

# The same comparison on the ordered categorical Series follows the
# declared grade ranking instead
grades.loc[grades > 'C']

excellent    A+
excellent     A
excellent    A-
good         B+
good          B
good         B-
ok           C+
Name: grades, dtype: category
Categories (11, object): ['D' < 'D+' < 'C-' < 'C' ... 'B+' < 'A-' < 'A' < 'A+']

In [19]:
# Another useful scale transformation is converting data on
# the ratio or interval scale to categorical data

# This is useful for visualizing frequencies of categories using,
# e.g., histograms. Pandas has a function called cut() to help with
# this, which also takes the number of bins to be used

# For an example, remember that we could group the census data
# by state and then aggregate to get the avg county pop. size 
# by state

# We could further apply cut to this with e.g ten bins and see
# the states being listed as categorical using the avg county size
import numpy as np

In [21]:
# Compute the average county population per state in a single chain.
# The CSV is re-read on every run, so the cell can be re-executed safely.
# NOTE(review): this rebinds `df`, replacing the grades DataFrame used by
# the earlier cells; the name is kept because the pd.cut() cell reads it.
df = (
    pd.read_csv('datasets/census.csv')
    # SUMLEV == 50 keeps the county-level rows only
    .loc[lambda census: census['SUMLEV'] == 50]
    # group directly on the state column instead of indexing by it first
    .groupby('STNAME')['CENSUS2010POP']
    # built-in groupby mean instead of agg(np.average); unlike np.average
    # it skips missing values rather than propagating NaN
    .mean()
)

df.head()

STNAME
Alabama        71339.343284
Alaska         24490.724138
Arizona       426134.466667
Arkansas       38878.906667
California    642309.586207
Name: CENSUS2010POP, dtype: float64

In [22]:
# pd.cut() bins the values: with an integer `bins` it splits the observed
# range into that many equal-width intervals and returns, for each state,
# the interval its average county population falls into
pd.cut(df, bins = 10)

STNAME
Alabama                   (11706.087, 75333.413]
Alaska                    (11706.087, 75333.413]
Arizona                 (390320.176, 453317.529]
Arkansas                  (11706.087, 75333.413]
California              (579312.234, 642309.586]
Colorado                 (75333.413, 138330.766]
Connecticut             (390320.176, 453317.529]
Delaware                (264325.471, 327322.823]
District of Columbia    (579312.234, 642309.586]
Florida                 (264325.471, 327322.823]
Georgia                   (11706.087, 75333.413]
Hawaii                  (264325.471, 327322.823]
Idaho                     (11706.087, 75333.413]
Illinois                 (75333.413, 138330.766]
Indiana                   (11706.087, 75333.413]
Iowa                      (11706.087, 75333.413]
Kansas                    (11706.087, 75333.413]
Kentucky                  (11706.087, 75333.413]
Louisiana                 (11706.087, 75333.413]
Maine                    (75333.413, 138330.766]
Maryland     