# Introduction to Categorical Data

In [1]:
import pandas as pd 
# Load the adult census CSV and preview its first rows
adult_csv_path = "datasets/adult.csv"
adult = pd.read_csv(adult_csv_path)
adult.head()

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Country,Above/Below 50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
# Pull out the income-bracket column once and reuse it below
income = adult["Above/Below 50k"]

# Descriptive summary (count, unique, top, freq)
print(income.describe())

# Frequency table: absolute counts per bracket
print(income.value_counts())

# Relative frequencies: share of rows per bracket
print(income.value_counts(normalize=True))

count      32561
unique         2
top        <=50K
freq       24720
Name: Above/Below 50k, dtype: object
 <=50K    24720
 >50K      7841
Name: Above/Below 50k, dtype: int64
 <=50K    0.75919
 >50K     0.24081
Name: Above/Below 50k, dtype: float64


In [3]:
# Show the dtype of every column as read by the plain read_csv call above
adult.dtypes

Age                 int64
Workclass          object
fnlgwt              int64
Education          object
Education Num       int64
Marital Status     object
Occupation         object
Relationship       object
Race               object
Sex                object
Capital Gain        int64
Capital Loss        int64
Hours/Week          int64
Country            object
Above/Below 50k    object
dtype: object

In [4]:
# Columns that should be parsed as categoricals
categorical_columns = ["Workclass", "Education", "Relationship", "Above/Below 50k"]

# Map every one of those column names to the "category" dtype
adult_dtypes = dict.fromkeys(categorical_columns, "category")

# Re-read the CSV, letting pandas apply the dtypes while parsing
adult2 = pd.read_csv("datasets/adult.csv", dtype=adult_dtypes)
print(adult2.dtypes)

Age                   int64
Workclass          category
fnlgwt                int64
Education          category
Education Num         int64
Marital Status       object
Occupation           object
Relationship       category
Race                 object
Sex                  object
Capital Gain          int64
Capital Loss          int64
Hours/Week            int64
Country              object
Above/Below 50k    category
dtype: object


### .groupby() : Grouping Data by Category

In [5]:
# Group the adult dataset by "Sex" and "Above/Below 50k"
gb = adult.groupby(by=['Sex', 'Above/Below 50k'])

# Print out how many rows are in each created group
print(gb.size())

# Print out the mean of each group for the numeric columns.
# numeric_only=True is needed on pandas >= 2.0: the default no longer drops
# the object (string) columns silently, so a bare gb.mean() raises a TypeError.
print(gb.mean(numeric_only=True))

Sex      Above/Below 50k
 Female   <=50K              9592
          >50K               1179
 Male     <=50K             15128
          >50K               6662
dtype: int64
                               Age         fnlgwt  ...  Capital Loss  Hours/Week
Sex     Above/Below 50k                            ...                          
 Female  <=50K           36.210801  185999.381359  ...     47.364470   35.916701
         >50K            42.125530  183687.406277  ...    173.648855   40.426633
 Male    <=50K           37.147012  193093.609268  ...     56.806782   40.693879
         >50K            44.625788  188769.101321  ...    198.780396   46.366106

[4 rows x 6 columns]


In [6]:
# Columns the user picked to group on
user_list = ['Education', 'Above/Below 50k']

# Build the GroupBy object from those columns
gb = adult.groupby(user_list)

# Select "Hours/Week" before aggregating so only that column is averaged
hours_by_group = gb['Hours/Week'].mean()
print(hours_by_group)

Education      Above/Below 50k
 10th           <=50K             36.574053
                >50K              43.774194
 11th           <=50K             33.322870
                >50K              45.133333
 12th           <=50K             35.035000
                >50K              44.818182
 1st-4th        <=50K             37.864198
                >50K              48.833333
 5th-6th        <=50K             38.539432
                >50K              46.000000
 7th-8th        <=50K             38.830033
                >50K              47.500000
 9th            <=50K             37.667351
                >50K              44.851852
 Assoc-acdm     <=50K             39.264339
                >50K              44.256604
 Assoc-voc      <=50K             40.817826
                >50K              43.853186
 Bachelors      <=50K             40.586152
                >50K              45.475462
 Doctorate      <=50K             45.429907
                >50K              47.513072
 

# Categorical pandas Series

In [7]:
# Load the shelter-dogs CSV and preview its first rows
dogs = pd.read_csv('datasets/ShelterDogs.csv')
dogs.head()

Unnamed: 0,ID,name,age,sex,breed,date_found,adoptable_from,posted,color,coat,size,neutered,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,keep_in
0,23807,Gida,0.25,female,Unknown Mix,12/10/19,12/11/19,12/11/19,red,short,small,no,,,,,,,
1,533,Frida És Ricsi,0.17,female,Unknown Mix,12/1/19,12/1/19,12/9/19,black and white,short,small,no,,yes,yes,yes,yes,yes,
2,23793,,4.0,male,Unknown Mix,12/8/19,12/23/19,12/8/19,saddle back,short,medium,no,,,,,,,
3,23795,,1.0,male,Unknown Mix,12/8/19,12/23/19,12/8/19,yellow-brown,medium,medium,no,,,,,,,
4,23806,Amy,2.0,female,French Bulldog Mix,12/10/19,12/11/19,12/11/19,black,short,small,no,,,,,,,


In [8]:
# Summarise each column's non-null count and dtype
dogs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2937 entries, 0 to 2936
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 2937 non-null   int64  
 1   name               2845 non-null   object 
 2   age                2937 non-null   float64
 3   sex                2937 non-null   object 
 4   breed              2937 non-null   object 
 5   date_found         2937 non-null   object 
 6   adoptable_from     2937 non-null   object 
 7   posted             2937 non-null   object 
 8   color              2937 non-null   object 
 9   coat               2937 non-null   object 
 10  size               2937 non-null   object 
 11  neutered           1852 non-null   object 
 12  housebroken        460 non-null    object 
 13  likes_people       1999 non-null   object 
 14  likes_children     1219 non-null   object 
 15  get_along_males    1633 non-null   object 
 16  get_along_females  1673 

In [9]:
# Store 'coat' as a categorical column
coat_categorical = dogs['coat'].astype('category')
dogs['coat'] = coat_categorical

# Count each coat type, including missing values if there are any
dogs['coat'].value_counts(dropna=False)

short         1972
medium         565
wirehaired     220
long           180
Name: coat, dtype: int64

## The .cat accessor object
`Series.cat.method_name`

Common parameters:
- `new_categories`: a list of categories
- `inplace`: Boolean – whether or not the update should overwrite the Series (deprecated in pandas 1.3 and removed in 2.0; assign the result back to the Series instead)
- `ordered`: Boolean – whether or not the categorical is treated as an ordered categorical

### 1. Setting Series categories

In [10]:
# Restrict 'coat' to exactly these three categories
kept_coats = ['short', 'medium', 'long']
dogs['coat'] = dogs['coat'].cat.set_categories(new_categories=kept_coats)

# Values that are not in the new list (here 'wirehaired') become NaN;
# the rows themselves are kept, as the NaN count below shows
dogs['coat'].value_counts(dropna=False)

short     1972
medium     565
NaN        220
long       180
Name: coat, dtype: int64

### 2. Setting order

In [11]:
# Declare the categories from shortest to longest and mark them as ordered
coat_order = ['short', 'medium', 'long']
dogs['coat'] = dogs['coat'].cat.set_categories(
    new_categories=coat_order,
    ordered=True,
)

dogs['coat'].head()

0     short
1     short
2     short
3    medium
4     short
Name: coat, dtype: category
Categories (3, object): ['short' < 'medium' < 'long']

### 3. Adding categories

In [12]:
# Convert to categorical, then register two extra labels.
# Adding a category only extends the set of allowed values; no rows are
# assigned to the new labels by this step.
extra_labels = ['did not check', 'could not tell']
dogs['likes_people'] = (
    dogs['likes_people']
    .astype('category')
    .cat.add_categories(new_categories=extra_labels)
)

# The new labels can later be used to clarify what a missing value actually means
dogs['likes_people'].cat.categories

Index(['no', 'yes', 'did not check', 'could not tell'], dtype='object')

### 4. Removing categories

In [13]:
# Start again from the raw file, casting 'coat' to categorical in the same step
dogs = pd.read_csv('datasets/ShelterDogs.csv').astype({'coat': 'category'})

# Inspect the categories pandas derived from the data
dogs['coat'].cat.categories

Index(['long', 'medium', 'short', 'wirehaired'], dtype='object')

In [14]:
# Drop 'wirehaired' from the set of allowed categories
unwanted_coats = ['wirehaired']
dogs['coat'] = dogs['coat'].cat.remove_categories(removals=unwanted_coats)

# Confirm which categories remain
dogs['coat'].cat.categories

Index(['long', 'medium', 'short'], dtype='object')

## Updating categories

In [15]:
# Cast 'breed' to categorical, then count how often each breed occurs
dogs = dogs.astype({'breed': 'category'})
dogs['breed'].value_counts()

Unknown Mix                                 1524
German Shepherd Dog Mix                      190
Dachshund Mix                                147
Labrador Retriever Mix                        83
Staffordshire Terrier Mix                     62
                                            ... 
English Cocker Spaniel, Vizsla Mix             1
English Greyhound Mix                          1
English Greyhound, Spanish Greyhound Mix       1
Fox Terrier, German Shepherd Dog Mix           1
Yorkshire Terrier                              1
Name: breed, Length: 277, dtype: int64

### 1. Renaming categories
The `rename_categories` method:

`Series.cat.rename_categories(new_categories=dict)`

In [16]:
# Old label -> new label
my_changes = {'Unknown Mix': 'Unknown'}

# Apply the renaming to the category labels and store the result
renamed_breed = dogs['breed'].cat.rename_categories(my_changes)
dogs['breed'] = renamed_breed

# Counts per breed after the rename
dogs['breed'].value_counts()

Unknown                                     1524
German Shepherd Dog Mix                      190
Dachshund Mix                                147
Labrador Retriever Mix                        83
Staffordshire Terrier Mix                     62
                                            ... 
English Cocker Spaniel, Vizsla Mix             1
English Greyhound Mix                          1
English Greyhound, Spanish Greyhound Mix       1
Fox Terrier, German Shepherd Dog Mix           1
Yorkshire Terrier                              1
Name: breed, Length: 277, dtype: int64

### 2. Collapsing categories setup

In [17]:
# Convert 'color' to categorical and list its categories
dogs['color'] = dogs['color'].astype('category')
dogs['color'].cat.categories

Index(['apricot', 'black', 'black and brown', 'black and tan',
       'black and white', 'brown', 'brown and white', 'dotted', 'golden',
       'gray', 'gray and black', 'gray and white', 'red', 'red and white',
       'sable', 'saddle back', 'spotty', 'striped', 'tricolor', 'white',
       'wild boar', 'yellow', 'yellow-brown'],
      dtype='object')

In [18]:
# Map each multi-colour "black and ..." label onto the single label 'black'
update_colors = {
    'black and brown': 'black',
    'black and tan': 'black',
    'black and white': 'black'
}

# Collapse the labels with .replace on a plain object column. Calling .replace
# directly on a categorical Series in a way that changes its categories is
# deprecated (FutureWarning since pandas 2.2), so cast to object first.
main_color = dogs['color'].astype(object).replace(update_colors)

# Convert back to categorical; the collapsed labels are no longer categories
dogs['main_color'] = main_color.astype('category')
dogs['main_color'].cat.categories

Index(['apricot', 'black', 'brown', 'brown and white', 'dotted', 'golden',
       'gray', 'gray and black', 'gray and white', 'red', 'red and white',
       'sable', 'saddle back', 'spotty', 'striped', 'tricolor', 'white',
       'wild boar', 'yellow', 'yellow-brown'],
      dtype='object')

## Reordering categories

In [19]:
# Reload the raw file with 'coat' cast to categorical, then list its categories
dogs = pd.read_csv('datasets/ShelterDogs.csv').astype({'coat': 'category'})
dogs['coat'].cat.categories

Index(['long', 'medium', 'short', 'wirehaired'], dtype='object')

In [20]:
# Reorder the categories and mark the categorical as ordered.
# The result is assigned back because the `inplace` argument of the .cat
# methods was deprecated in pandas 1.3 and removed in pandas 2.0.
dogs['coat'] = dogs['coat'].cat.reorder_categories(
    new_categories=['short', 'medium', 'wirehaired', 'long'],
    ordered=True
)

dogs['coat'].cat.categories

Index(['short', 'medium', 'wirehaired', 'long'], dtype='object')

In [21]:
# Mean age per coat type; groups are listed in the order of the 'coat' categories.
# observed=False is passed explicitly: it keeps the current behaviour (every
# category appears in the result) and avoids the FutureWarning pandas 2.1+ emits
# when grouping by a categorical without stating `observed`.
dogs.groupby(by=['coat'], observed=False)['age'].mean()

coat
short         8.364746
medium        9.027982
wirehaired    8.424136
long          9.552056
Name: age, dtype: float64

## Accessing and Filtering 

In [25]:
# Boolean mask selecting the dog whose ID is 23807
is_target_dog = dogs['ID'] == 23807

# Print that dog's coat category
print(dogs.loc[is_target_dog, 'coat'])

0    short
Name: coat, dtype: category
Categories (4, object): ['short' < 'medium' < 'wirehaired' < 'long']


In [26]:
# Boolean mask for dogs with a "long" coat
has_long_coat = dogs['coat'] == 'long'

# Count male and female dogs among them
print(dogs.loc[has_long_coat, 'sex'].value_counts())

male      124
female     56
Name: sex, dtype: int64


In [27]:
# Boolean mask for dogs whose breed is exactly "English Cocker Spaniel"
is_cocker_spaniel = dogs['breed'] == 'English Cocker Spaniel'

# Print the mean age of those dogs
print(dogs.loc[is_cocker_spaniel, 'age'].mean())

8.186153846153847


In [28]:
# Literal (non-regex) substring match on the breed name
mentions_english = dogs["breed"].str.contains('English', regex=False)

# Number of dogs whose breed contains "English"
print(len(dogs[mentions_english]))

35
