# Categorical Manipulation

#### Loading Libraries

In [1]:
# Standard Library
import os
# Numerical Computing
import numpy as np
# Data Manipulation
import pandas as pd
# Data Visualization
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

#### Categorical Data

In [2]:
# Build the dataset path from the current user's home directory instead of
# hardcoding one machine's absolute path, so the notebook also runs for anyone
# who keeps the same ~/desktop/effective_pandas layout. The result is still a
# plain str.
file_path = os.path.expanduser('~/desktop/effective_pandas/data/vehicles.csv')
# low_memory=False parses the file in one pass, avoiding the mixed-dtype
# inference that chunked parsing can produce.
df = pd.read_csv(file_path, low_memory=False)

In [4]:
# Pull the 'make' column out as a standalone Series and display it.
make = df['make']
make

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object

#### Frequency Counts

In [5]:
# Number of rows per make, most frequent first.
make.value_counts()

make
Chevrolet                           4003
Ford                                3371
Dodge                               2583
GMC                                 2494
Toyota                              2071
                                    ... 
E. P. Dutton, Inc.                     1
Fisker                                 1
Panoz Auto-Development                 1
Environmental Rsch and Devp Corp       1
Grumman Allied Industries              1
Name: count, Length: 136, dtype: int64

In [6]:
# Total number of rows next to the number of distinct makes.
make.shape, make.nunique()

((41144,), 136)

#### Benefits of Categories

In [7]:
# Convert the string column to a categorical; no order is requested here, so
# the result is an unordered categorical.
cat_make = make.astype('category')

In [8]:
# Bytes used by the object-dtype Series; deep=True makes pandas measure the
# referenced Python string objects as well, not only the array of pointers.
make.memory_usage(deep=True)

2606395

In [10]:
# Same measurement for the categorical version, for comparison.
cat_make.memory_usage(deep=True)

95888

In [11]:
%%timeit
# Time upper-casing through the .str accessor on the categorical Series.
cat_make.str.upper()

408 µs ± 7.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [13]:
%%timeit
# Time the same upper-casing on the plain object-dtype Series for comparison.
make.str.upper()

7.6 ms ± 898 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### Conversion to Ordinal Categories

In [14]:
# Ordered categorical dtype: the category order is the (case-sensitive)
# sorted list of the distinct makes.
make_type = pd.CategoricalDtype(
    ordered=True,
    categories=sorted(make.unique()),
)

In [15]:
# Cast the raw strings to the ordered categorical dtype held in make_type.
ordered_make = make.astype(make_type)

In [16]:
# Display the ordered categorical; the repr separates categories with '<'
# to show their ordering.
ordered_make

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['AM General' < 'ASC Incorporated' < 'Acura' < 'Alfa Romeo' ... 'Volvo' < 'Wallace Environmental' < 'Yugo' < 'smart']

In [17]:
# max() works because the categories are ordered: it returns the highest
# category (in the declared order) that occurs in the data.
ordered_make.max()

'smart'

In [19]:
# Left commented out on purpose: cat_make is an unordered categorical, and
# pandas raises TypeError for max() on a categorical that has no order.
# cat_make.max()

In [20]:
# Sorting follows the declared category order (here case-sensitive
# alphabetical, which is why 'smart' comes last).
ordered_make.sort_values()

20288    AM General
20289    AM General
369      AM General
358      AM General
19314    AM General
            ...    
31289         smart
31290         smart
29605         smart
22974         smart
26882         smart
Name: make, Length: 41144, dtype: category
Categories (136, object): ['AM General' < 'ASC Incorporated' < 'Acura' < 'Alfa Romeo' ... 'Volvo' < 'Wallace Environmental' < 'Yugo' < 'smart']

#### The .cat Accessor

In [21]:
# List form of rename_categories: the new labels replace the old ones
# position by position.
cat_make.cat.rename_categories(
    list(map(str.lower, cat_make.cat.categories)))

0        alfa romeo
1           ferrari
2             dodge
3             dodge
4            subaru
            ...    
41139        subaru
41140        subaru
41141        subaru
41142        subaru
41143        subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['am general', 'asc incorporated', 'acura', 'alfa romeo', ..., 'volvo', 'wallace environmental', 'yugo', 'smart']

In [22]:
# Dict form of rename_categories: each old label is mapped explicitly to its
# lower-cased replacement.
ordered_make.cat.rename_categories(
    dict(zip(ordered_make.cat.categories,
             map(str.lower, ordered_make.cat.categories))))

0        alfa romeo
1           ferrari
2             dodge
3             dodge
4            subaru
            ...    
41139        subaru
41140        subaru
41141        subaru
41142        subaru
41143        subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['am general' < 'asc incorporated' < 'acura' < 'alfa romeo' ... 'volvo' < 'wallace environmental' < 'yugo' < 'smart']

In [23]:
# Re-rank the categories case-insensitively. The new order is derived from
# ordered_make's own categories, because reorder_categories must receive
# exactly the labels the Series already has (anything else raises ValueError).
ordered_make.cat.reorder_categories(
    sorted(ordered_make.cat.categories, key=str.lower))

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['Acura' < 'Alfa Romeo' < 'AM General' < 'American Motors Corporation' ... 'Volvo' < 'VPG' < 'Wallace Environmental' < 'Yugo']

#### Category Gotchas

In [24]:
# Gotcha: value_counts() on a categorical lists every category, including
# those that never occur in the 100-row slice (reported with a count of 0).
ordered_make.iloc[:100].value_counts()

make
Dodge                  17
Oldsmobile              8
Ford                    8
Buick                   7
Plymouth                5
                       ..
Tecstar, LP             0
Tesla                   0
Texas Coach Company     0
Vector                  0
VPG                     0
Name: count, Length: 136, dtype: int64

In [25]:
# Gotcha: grouping by a categorical with observed=False yields one group per
# category, so makes absent from the 100-row slice show up as NaN.
# observed=False is spelled out because relying on the default emits a
# FutureWarning on recent pandas and the default is changing to True, which
# would silently hide the behaviour this cell demonstrates.
(cat_make
 .iloc[:100]
 .groupby(cat_make.iloc[:100], observed=False)
 .first()
)

  (cat_make


make
AM General                            NaN
ASC Incorporated                      NaN
Acura                                 NaN
Alfa Romeo                     Alfa Romeo
American Motors Corporation           NaN
                                  ...    
Volkswagen                     Volkswagen
Volvo                               Volvo
Wallace Environmental                 NaN
Yugo                                  NaN
smart                                 NaN
Name: make, Length: 136, dtype: category
Categories (136, object): ['AM General', 'ASC Incorporated', 'Acura', 'Alfa Romeo', ..., 'Volvo', 'Wallace Environmental', 'Yugo', 'smart']

In [26]:
# Same grouping on the plain string Series: only makes that actually occur
# in the first 100 rows produce a group.
make.head(100).groupby(make.head(100)).first()

make
Alfa Romeo          Alfa Romeo
Audi                      Audi
BMW                        BMW
Buick                    Buick
CX Automotive    CX Automotive
Cadillac              Cadillac
Chevrolet            Chevrolet
Chrysler              Chrysler
Dodge                    Dodge
Ferrari                Ferrari
Ford                      Ford
Hyundai                Hyundai
Infiniti              Infiniti
Lexus                    Lexus
Mazda                    Mazda
Mercury                Mercury
Nissan                  Nissan
Oldsmobile          Oldsmobile
Plymouth              Plymouth
Pontiac                Pontiac
Rolls-Royce        Rolls-Royce
Subaru                  Subaru
Toyota                  Toyota
Volkswagen          Volkswagen
Volvo                    Volvo
Name: make, dtype: object

In [31]:
# observed=True restricts the groups to categories that are present in the
# first 100 rows, matching the result for the plain string Series.
cat_make.head(100).groupby(cat_make.head(100), observed=True).first()

make
Alfa Romeo          Alfa Romeo
Audi                      Audi
BMW                        BMW
Buick                    Buick
CX Automotive    CX Automotive
Cadillac              Cadillac
Chevrolet            Chevrolet
Chrysler              Chrysler
Dodge                    Dodge
Ferrari                Ferrari
Ford                      Ford
Hyundai                Hyundai
Infiniti              Infiniti
Lexus                    Lexus
Mazda                    Mazda
Mercury                Mercury
Nissan                  Nissan
Oldsmobile          Oldsmobile
Plymouth              Plymouth
Pontiac                Pontiac
Rolls-Royce        Rolls-Royce
Subaru                  Subaru
Toyota                  Toyota
Volkswagen          Volkswagen
Volvo                    Volvo
Name: make, dtype: category
Categories (136, object): ['AM General', 'ASC Incorporated', 'Acura', 'Alfa Romeo', ..., 'Volvo', 'Wallace Environmental', 'Yugo', 'smart']

In [32]:
# Gotcha: a scalar position returns the bare value (a plain str here), so the
# categorical information is gone.
ordered_make.iloc[0]

'Alfa Romeo'

In [33]:
# Indexing with a list of positions returns a Series instead, which keeps the
# ordered categorical dtype.
ordered_make.iloc[[0]]

0    Alfa Romeo
Name: make, dtype: category
Categories (136, object): ['AM General' < 'ASC Incorporated' < 'Acura' < 'Alfa Romeo' ... 'Volvo' < 'Wallace Environmental' < 'Yugo' < 'smart']

#### Generalization

In [34]:
def generalize_topn(ser, n=5, other='Other'):
    topn = ser.value_counts().index[:n]
    if isinstance(ser.dtype, pd.CategoricalDtype):
        ser = ser.cat.set_categories(
            topn.set_categories(list(topn)+[other]))
    return ser.where(ser.isin(topn), other)

In [35]:
# Keep the 20 most common makes and label everything else 'NA'.
generalize_topn(cat_make, n=20, other='NA')

0            NA
1            NA
2         Dodge
3         Dodge
4        Subaru
          ...  
41139    Subaru
41140    Subaru
41141    Subaru
41142    Subaru
41143    Subaru
Name: make, Length: 41144, dtype: category
Categories (21, object): ['Chevrolet', 'Ford', 'Dodge', 'GMC', ..., 'Volvo', 'Hyundai', 'Chrysler', 'NA']

In [38]:
def generalize_mapping(ser, mapping, default):
    """Replace values with coarser labels based on pattern matches.

    Parameters
    ----------
    ser : pd.Series
        String-like Series (object or categorical) to generalize.
    mapping : dict
        ``{pattern: label}``. Each pattern is looked up with
        ``Series.str.contains``, so it is interpreted as a regular
        expression (the pandas default). When several patterns match the
        same row, the one that comes last in ``mapping`` wins.
    default : scalar
        Label for rows that match none of the patterns. Missing values never
        match, so they receive ``default`` too.

    Returns
    -------
    pd.Series
        Categorical Series with the same index as ``ser``.
    """
    res = ser.astype(str)
    # Start from "nothing matched" so an empty mapping sends every row to
    # `default` instead of handing `where` a None condition.
    seen = pd.Series(False, index=ser.index)
    for old, new in mapping.items():
        # na=False keeps the mask strictly boolean when `ser` has missing
        # values; otherwise `~mask` fails on the NaN entries.
        mask = ser.str.contains(old, na=False)
        seen = seen | mask
        res = res.where(~mask, new)
    res = res.where(seen, default)
    return res.astype('category')

In [39]:
# Bucket makes by country of origin; anything unmatched becomes 'Other'.
generalize_mapping(
    cat_make,
    {
        **dict.fromkeys(
            ['Ford', 'Tesla', 'Chevrolet', 'Dodge', 'Oldsmobile', 'Plymouth'],
            'US'),
        'BMW': 'German',
    },
    'Other',
)

0        Other
1        Other
2           US
3           US
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: category
Categories (3, object): ['German', 'Other', 'US']