# Categorical pandas Series

## Imports

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import pandas as pd
import numpy as np

In [25]:
dogs = pd.read_csv('../data/ShelterDogs.csv')
dogs.head()

Unnamed: 0,ID,name,age,sex,breed,date_found,adoptable_from,posted,color,coat,size,neutered,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,keep_in
0,23807,Gida,0.25,female,Unknown Mix,12/10/19,12/11/19,12/11/19,red,short,small,no,,,,,,,
1,533,Frida És Ricsi,0.17,female,Unknown Mix,12/1/19,12/1/19,12/9/19,black and white,short,small,no,,yes,yes,yes,yes,yes,
2,23793,,4.0,male,Unknown Mix,12/8/19,12/23/19,12/8/19,saddle back,short,medium,no,,,,,,,
3,23795,,1.0,male,Unknown Mix,12/8/19,12/23/19,12/8/19,yellow-brown,medium,medium,no,,,,,,,
4,23806,Amy,2.0,female,French Bulldog Mix,12/10/19,12/11/19,12/11/19,black,short,small,no,,,,,,,


## Setting category variables

### Adding categories

In [12]:
# Check frequency counts while also printing the NaN count
print(dogs['keep_in'].value_counts(dropna=False))

keep_in
both flat and garden    1224
NaN                     1021
garden                   510
flat                     182
Name: count, dtype: int64


In [13]:
# Switch to a categorical variable
dogs["keep_in"] = dogs['keep_in'].astype('category')

In [14]:
# Add new categories
new_categories = ["Unknown History", "Open Yard (Countryside)"]
dogs["keep_in"] = dogs['keep_in'].cat.add_categories(
    new_categories=new_categories
)

In [15]:
# Check frequency counts one more time
print(dogs['keep_in'].value_counts(dropna=False))

keep_in
both flat and garden       1224
NaN                        1021
garden                      510
flat                        182
Unknown History               0
Open Yard (Countryside)       0
Name: count, dtype: int64


### Removing categories

In [26]:
dogs["likes_children"] = dogs["likes_children"].fillna('maybe').astype('category')

In [27]:
# Set "maybe" to be "no"
dogs.loc[dogs["likes_children"] == "maybe", "likes_children"] = "no"

# Print out categories
print(dogs["likes_children"].cat.categories)

Index(['maybe', 'no', 'yes'], dtype='object')


In [28]:
# Print the frequency table
print(dogs['likes_children'].value_counts(dropna=False))

likes_children
no       1765
yes      1172
maybe       0
Name: count, dtype: int64


In [29]:
# Remove the "maybe" category
dogs["likes_children"] = dogs["likes_children"].cat.remove_categories(removals=['maybe'])
print(dogs["likes_children"].value_counts())

likes_children
no     1765
yes    1172
Name: count, dtype: int64


In [30]:
# Print the categories one more time
print(dogs["likes_children"].cat.categories)

Index(['no', 'yes'], dtype='object')


## Updating categories

### Renaming categories

In [34]:
dogs = pd.read_csv('../data/dogs.csv')
dogs['likes_children'] = dogs['likes_children'].astype('category')

In [36]:
# Create the my_changes dictionary
my_changes = {
    'Maybe?': 'Maybe'
}

# Rename the categories listed in the my_changes dictionary
dogs["likes_children"] = dogs["likes_children"].cat.rename_categories(my_changes)

# Use a lambda function to convert all categories to uppercase using upper()
dogs["likes_children"] =  dogs["likes_children"].cat.rename_categories(lambda c: c.upper())

# Print the list of categories
print(dogs["likes_children"].cat.categories)

Index(['MAYBE', 'NO', 'YES'], dtype='object')


### Collapsing categories

In [40]:
# Create the update_coats dictionary
update_coats = {
    'wirehaired': 'medium',
    'medium-long': 'medium',
}

# Create a new column, coat_collapsed
dogs["coat_collapsed"] = dogs['coat'].replace(update_coats)

# Convert the column to categorical
dogs["coat_collapsed"] = dogs["coat_collapsed"].astype('category')

# Print the frequency table
print(dogs["coat_collapsed"].value_counts())

coat_collapsed
short     1972
medium     785
long       180
Name: count, dtype: int64


## Reordering categories

### Reordering categories in a Series

In [41]:
dogs = pd.read_csv('../data/dogs.csv', dtype={'size': 'category'})

In [43]:
# Print out the current categories of the size variable
print(dogs['size'].cat.categories)

Index(['large', 'medium', 'small'], dtype='object')


In [46]:
# Reorder the categories using the list provided
dogs["size"] = dogs['size'].cat.reorder_categories(
    new_categories=['small', 'medium', 'large'],
    ordered=False
)

In [47]:
# Reorder the categories, specifying the Series is ordinal
dogs["size"] = dogs["size"].cat.reorder_categories(
  new_categories=["small", "medium", "large"],
  ordered=True
)

In [51]:
# Reorder the categories, specifying the Series is ordinal, and overwriting the original series
dogs["size"] = dogs["size"].cat.reorder_categories(
  new_categories=["small", "medium", "large"],
  ordered=True,
#   inplace=True
)

# reorder_categories no longer support 'inplace'

### Using .groupby() after reordering

In [53]:
# Previous code
dogs["size"] = dogs["size"].cat.reorder_categories(
  new_categories=["small", "medium", "large"],
  ordered=True,
#   inplace=True
)

# How many Male/Female dogs are available of each size?
print(dogs.groupby('size')['sex'].value_counts())

# Do larger dogs need more room to roam?
print(dogs.groupby('size')['keep_in'].value_counts())

size    sex   
small   male       260
        female     214
medium  male      1090
        female     854
large   male       331
        female     188
Name: count, dtype: int64
size    keep_in             
small   both flat and garden    238
        flat                     80
        garden                   21
medium  both flat and garden    795
        garden                  317
        flat                     97
large   both flat and garden    191
        garden                  172
        flat                      5
Name: count, dtype: int64


## Cleaning and accessing data

### Cleaning variables

In [57]:
# Fix the misspelled word
replace_map = {"Malez": "male"}

# Update the sex column using the created map
dogs["sex"] = dogs["sex"].replace(replace_map)

# Strip away leading whitespace
dogs["sex"] = dogs["sex"].str.strip()

# Make all responses lowercase
dogs["sex"] = dogs["sex"].str.lower()

# Convert to a categorical Series
dogs["sex"] = dogs['sex'].astype('category')

print(dogs["sex"].value_counts())

sex
male      1681
female    1256
Name: count, dtype: int64


### Accessing and filtering data

In [64]:
dogs = pd.read_csv('../data/dogs.csv', index_col='ID')

dogs['coat'] = dogs['coat'].astype('category')

In [65]:
# Print the category of the coat for ID 23807
print(dogs.loc[23807, 'coat'])

short


In [61]:
# Find the count of male and female dogs who have a "long" coat
print(dogs.loc[dogs['coat'] == 'long', 'sex'].value_counts())

sex
male      124
female     56
Name: count, dtype: int64


In [62]:
# Print the mean age of dogs with a breed of "English Cocker Spaniel"
print(dogs.loc[dogs['breed'] == "English Cocker Spaniel", 'age'].mean())

8.186153846153847


In [63]:
# Count the number of dogs that have "English" in their breed name
print(dogs[dogs["breed"].str.contains('English', regex=False)].shape[0])

35
