# Working with categorical data

Categorical data can be split into two kinds:
- Ordinal: the categories have a natural order
- Nominal: the categories have no natural order

In [None]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the adult dataset from the local data directory
df = pd.read_csv('../data/adult.csv')

In [None]:
# Summary of the columns: dtype and non-null count of each one
df.info()

The object variables contain strings.
Pandas tries to infer the type of each column. Categorical columns are not automatically flagged as categorical data; they are read as strings.


In [None]:
# Descriptive statistics of the dataframe columns
df.describe()

In [None]:
# How many distinct values does each variable have?
df.nunique()

In [None]:
# What are the different values of a specific variable, and how often does each occur?
df.workclass.value_counts()

In [None]:
# And their relative frequencies
df.workclass.value_counts(normalize=True)

Object types can be converted into categorical ones by using the astype method:

In [None]:
# Convert the marital.status column to the category dtype in place
df['marital.status'] = df['marital.status'].astype('category')

In [None]:
# Inspect the dtype of the converted column
df['marital.status'].dtype

In [None]:
# Building a categorical Series from a plain list of labels

my_data = ['A', 'A', 'B', 'A', 'C', 'B']

# No order is imposed on the categories
my_series1 = pd.Series(data=my_data, dtype=pd.CategoricalDtype())
print(my_series1)

In [None]:
# Ordered: the categories are ranked "C" < "B" < "A".
# pd.Categorical alone returns a bare Categorical array; wrapping it in a
# Series makes my_series2 an actual Series, like my_series1, so the .cat
# accessor and the rest of the Series API are available.
my_series2 = pd.Series(
    pd.Categorical(my_data, categories=["C", "B", "A"], ordered=True)
)
print(my_series2)

Categorical data can reduce the memory footprint considerably.

To benefit from the lower memory requirements right from the start, the dtypes of the variables can be passed when reading from the source:

In [None]:
# Declare the dtype up front so the column is read directly as a category
df_dtypes = {"marital.status": "category"}

df = pd.read_csv('../data/adult.csv', dtype=df_dtypes)

df.dtypes


## Grouping data by categories



In [None]:
df = pd.read_csv('../data/adult.csv')

# Manual approach: one boolean filter per income bracket
df1 = df.loc[df["income"] == "<=50K"]
df2 = df.loc[df["income"] == ">50K"]
# The same split can be obtained in one step by grouping on the income column
groupby_object = df.groupby(by=["income"])

In [None]:
# We can now use functions like count, sum, mean... or our own custom functions
groupby_object.size()

In [None]:
# Sum education.num and age within each group
groupby_object[["education.num", "age"]].sum()

In [None]:
# Group by two keys: one group per (income, marital.status) combination
groupby_object = df.groupby(by=["income", "marital.status"])

In [None]:
# Sum education.num and age within each group
groupby_object[["education.num", "age"]].sum()

## Setting categorical variables

In this section we are going to see how we can manipulate categories. For the most part we are going to use the *.cat* accessor.

In [None]:
# Load the shelter dogs dataset from the local data directory
dogs = pd.read_csv('../data/ShelterDogs.csv')

In [None]:
# Summary of the columns: dtype and non-null count of each one
dogs.info()

In [None]:
# Convert the coat column to the category dtype so the .cat accessor can be used on it
dogs["coat"] = dogs["coat"].astype("category")

In [None]:
# Number of rows per coat category
dogs["coat"].value_counts()

In [None]:
# Setting the categories turns into null any value whose previous category is not included in the new list
dogs["coat"] = dogs["coat"].cat.set_categories(new_categories=["short", "medium", "long"])

In [None]:
# Count rows per coat category, including missing values
dogs["coat"].value_counts(dropna=False)

The wirehaired category is now gone: the dogs that had it now have a null coat.

In [None]:
# Same category list as before, now declared as ordered: short < medium < long
dogs["coat"] = dogs["coat"].cat.set_categories(["short", "medium", "long"], ordered=True)
dogs["coat"].head(3)

In [None]:
# In the case of likes_people, there are a lot of null values

dogs['likes_people'].value_counts(dropna=False)

In [None]:
# Make the column categorical and register two extra labels.
# Adding categories only extends the list of allowed values; no row is changed.
dogs["likes_people"] = (
    dogs["likes_people"]
    .astype("category")
    .cat.add_categories(["did not check", "could not tell"])
)

In [None]:
# Count rows per likes_people category, including missing values
dogs["likes_people"].value_counts(dropna=False)

In [None]:
# We can remove categories as well.
# remove_categories raises a ValueError when asked to remove a label that is
# not a current category, and 'wirehaired' has already been dropped by the
# earlier set_categories call, so only labels that are still present are removed.
coat_removals = [label for label in ['wirehaired'] if label in dogs['coat'].cat.categories]
dogs['coat'] = dogs['coat'].cat.remove_categories(removals=coat_removals)

In [None]:
# The original statement stopped after ".cat." and was a syntax error.
# Completed with remove_unused_categories, which drops the categories that
# no longer appear in any row.
dogs['coat'] = dogs['coat'].cat.remove_unused_categories()

## Updating and collapsing categories

In [None]:
# Convert the breed column to the category dtype
dogs['breed'] = dogs['breed'].astype('category')

In [None]:
# Number of rows per breed before renaming
dogs['breed'].value_counts()

In [None]:
# One or more categories can be renamed with an {old_name: new_name} mapping
dogs['breed'] = dogs['breed'].cat.rename_categories({'Unknown Mix': 'Unknown'})

In [None]:
# Number of rows per breed after renaming
dogs['breed'].value_counts()

In [None]:
# A callable works too: it is applied to every category label
dogs['sex'] = (
    dogs['sex']
    .astype('category')
    .cat.rename_categories(lambda label: label.title())
)

In [None]:
# Number of rows per sex category after renaming
dogs['sex'].value_counts()

In [None]:
# Make color categorical, then list its categories (read from the column dtype)
dogs['color'] = dogs['color'].astype("category")
print(dogs['color'].dtype.categories)

In [None]:
# Mapping used to collapse every "black and ..." color into plain "black"
update_colors = dict.fromkeys(
    ["black and brown", "black and tan", "black and white"],
    "black",
)

In [None]:
# Build a new column where the colors listed in update_colors are collapsed into "black"
dogs["main_color"] = dogs["color"].replace(update_colors)

In [None]:
# Number of rows per collapsed color
dogs["main_color"].value_counts()

In [None]:
# Check the dtype of the new column
# NOTE(review): whether replace() keeps the category dtype depends on the pandas version; confirm
dogs["main_color"].dtype