# Categorical Data

In [None]:
# import numpy and pandas
import numpy as np
import pandas as pd

## Creating Categoricals

In [None]:
# build a categorical straight from a plain Python list;
# no categories are supplied, so pandas derives them from the values
lmh_values = [
    "low",
    "high",
    "medium",
    "medium",
    "high",
]
lmh_cat = pd.Categorical(values=lmh_values)
lmh_cat

In [None]:
# examine the categories: the distinct labels the categorical can hold
# (none were given when lmh_cat was created, so pandas inferred them)
lmh_cat.categories

In [None]:
# retrieve the values, converted to plain strings
lmh_cat.astype(str)

In [None]:
# .codes shows the integer mapping for each value of the categorical:
# one integer per item, identifying that item's category
lmh_cat.codes

In [None]:
# rebuild the categorical from the same list, this time spelling out
# the categories so their order is low -> medium -> high
lmh_cat = pd.Categorical(
    values=lmh_values,
    categories=["low", "medium", "high"],
)
lmh_cat

In [None]:
# sorting is done using the codes underlying each value, so the result
# follows the declared category order (low, medium, high), not alphabetical order
lmh_cat.sort_values()

In [None]:
# a categorical can also live inside a Series:
# pass the raw values and request the "category" dtype
cat_series = pd.Series(data=lmh_values, dtype="category")
cat_series

In [None]:
# build an ordered categorical of precious metals;
# the category order (bronze < silver < gold) encodes relative value
metal_values = ["bronze", "gold", "silver", "bronze"]
metal_categories = ["bronze", "silver", "gold"]
metals = pd.Categorical(
    values=metal_values,
    ordered=True,
    categories=metal_categories,
)
metals

In [None]:
# build a second ordered categorical holding the same items as metals
# but in reverse order, sharing metals' categories
metals_reversed_values = pd.Categorical(
    values=metals.astype(str)[::-1],
    ordered=True,
    categories=metals.categories,
)
metals_reversed_values

In [None]:
# compare the two categoricals element by element
# (both are ordered and share the same categories)
metals <= metals_reversed_values

In [None]:
# codes are the integer values associated with each item
metals.codes

In [None]:
# and the codes for metals_reversed_values
metals_reversed_values.codes

In [None]:
# creating a categorical with a value ("copper") that is not one of the
# declared categories; pandas cannot map it to a category and shows it as NaN
pd.Categorical(["bronze", "copper"], categories=metal_categories)

# Renaming Categories

In [None]:
# a small categorical with three explicitly declared categories: a, b and c
cat = pd.Categorical(
    values=["a", "b", "c", "a"],
    categories=["a", "b", "c"],
)
cat

In [None]:
# rename the categories (and therefore the value shown for each item).
# Assigning to `cat.categories` directly was deprecated in pandas 1.5 and
# removed in pandas 2.0, so rebind `cat` to the result of
# rename_categories, which returns a new categorical with the new labels.
cat = cat.rename_categories(["bronze", "silver", "gold"])
cat

In [None]:
# rename_categories also renames: it produces a categorical
# whose categories are relabelled to x, y and z
cat.rename_categories(["x", "y", "z"])

In [None]:
# the rename is not done in-place: the result of rename_categories
# was not assigned back, so cat still has its previous categories
cat

# Appending new categories

In [None]:
# add a new platinum category; the result is kept in with_platinum
# and metals itself is not reassigned
with_platinum = metals.add_categories(["platinum"])
with_platinum

# Removing Categories

In [None]:
# remove the bronze category; the result is kept in no_bronze
# and metals itself is not reassigned
# NOTE(review): items that were "bronze" presumably show as NaN - confirm in the output
no_bronze = metals.remove_categories(["bronze"])
no_bronze

# Removing unused categories

In [None]:
# remove any unused categories (in this case, platinum, which was added
# as a category but is not used by any value); the result is only displayed
with_platinum.remove_unused_categories()

# Setting categories

In [None]:
# a sample Series of number words stored with the "category" dtype
s = pd.Series(
    data=["one", "two", "four", "five"],
    dtype="category",
)
s

In [None]:
# keep only the "one" and "four" categories; the values "two" and "five"
# no longer have a category and are replaced with NaN
s = s.cat.set_categories(["one","four"])
s

# Describe

In [None]:
# get descriptive info on the metals categorical
# (pandas reports counts and frequencies per category)
metals.describe()

# Value counts

In [None]:
# count how many times each value occurs in the categorical
metals.value_counts()

# Minimum, maximum and mode

In [None]:
# find the min, max and mode of the metals categorical.
# min/max are meaningful because metals is ordered (bronze < silver < gold).
# Categorical.mode() was deprecated in pandas 1.5 and removed in 2.0, so the
# mode is computed through a Series wrapper, which works on old and new pandas.
(metals.min(), metals.max(), pd.Series(metals).mode())

# Munging school grades

In [None]:
# ten students, each given a random integer grade from 50 to 100 inclusive;
# the seed is fixed so the same grades are generated on every run
np.random.seed(123456)
names = [
    "Ivana", "Norris", "Ruth", "Lane", "Skye",
    "Sol", "Dylan", "Katina", "Alissa", "Marc",
]
grades = np.random.randint(low=50, high=101, size=len(names))
scores = pd.DataFrame({"Name": names, "Grade": grades})
scores

In [None]:
# bin edges for the numeric grades and the letter grade assigned to each bin;
# there is one more edge than there are labels
score_bins = [
    0, 59, 62, 66, 69, 72, 76,
    79, 82, 86, 89, 92, 99, 100,
]
letter_grades = [
    "F", "D-", "D", "D+", "C-", "C", "C+",
    "B-", "B", "B+", "A-", "A", "A+",
]

In [None]:
# place every numeric grade into its bin and label it with the matching
# letter grade, then store the result as a new Letter column
letter_cats = pd.cut(
    scores.Grade,
    bins=score_bins,
    labels=letter_grades,
)
scores["Letter"] = letter_cats
scores

In [None]:
# examine the underlying categorical: the letter grades returned by pd.cut
letter_cats

In [None]:
# how many of each grade occurred?
# (counts the values in the Letter column)
scores.Letter.value_counts()

In [None]:
# order the rows by letter grade rather than by numeric grade, in descending order
scores.sort_values(by="Letter", ascending=False)