<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/effective_pandas_ch_15.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 15: Categorical Manipulation
Pandas categoricals are akin to factors in R.

In [1]:
# libraries needed
import numpy as np
import pandas as pd

# show plots inline
%matplotlib inline

In [2]:
# load the vehicles dataset (a zipped CSV hosted on GitHub) into a DataFrame
url = 'https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip'
# low_memory=False makes pandas read the whole file before inferring column dtypes;
# with the default chunked inference, columns whose chunks disagree on type trigger
# a DtypeWarning ("Columns ... have mixed types") and can end up with mixed dtypes
df = pd.read_csv(url, low_memory=False)

# view first few rows
df.head(n = 5)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [3]:
# pull the 'make' column out of df as its own Series
make = df.loc[:, 'make']
print(make)

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object


In [4]:
# make has dtype object; convert it to an (unordered) categorical Series
make_cat = make.astype(pd.CategoricalDtype())
print(make_cat)

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['AM General', 'ASC Incorporated', 'Acura', 'Alfa Romeo', ..., 'Volvo',
                           'Wallace Environmental', 'Yugo', 'smart']


In [5]:
# count the distinct values in make; there are 136 here, the same as the number
# of categories reported for make_cat
make.nunique()

136

In [7]:
# print the deep memory footprint of the object Series, then of the categorical one;
# the categorical version is much smaller
for series in (make, make_cat):
    print(series.memory_usage(deep=True))

2606395
95888


In [19]:
# turn make into an ordered category: the category order is the sorted order
# of the distinct makes
sorted_makes = sorted(make.unique())
make_ordered_cat = make.astype(
    pd.CategoricalDtype(categories=sorted_makes, ordered=True)
)

print(make_ordered_cat)

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['AM General' < 'ASC Incorporated' < 'Acura' < 'Alfa Romeo' ... 'Volvo' <
                           'Wallace Environmental' < 'Yugo' < 'smart']


In [20]:
# because make_ordered_cat is an ordered categorical, order-based reductions work on it:
# .min() gives the value that comes first in the category order ('AM General' here)
print(make_ordered_cat.min())
# .max() can be used the same way

AM General


In [22]:
# list the current category labels of make_cat (displayed as a pandas Index)
make_cat.cat.categories

Index(['AM General', 'ASC Incorporated', 'Acura', 'Alfa Romeo',
       'American Motors Corporation', 'Aston Martin', 'Audi',
       'Aurora Cars Ltd', 'Autokraft Limited', 'Avanti Motor Corporation',
       ...
       'Toyota', 'VPG', 'Vector', 'Vixen Motor Company',
       'Volga Associated Automobile', 'Volkswagen', 'Volvo',
       'Wallace Environmental', 'Yugo', 'smart'],
      dtype='object', length=136)

In [26]:
# relabel every category with its lower-case form; this returns a new Series
# and leaves make_cat itself unchanged
lowercase_labels = [label.lower() for label in make_cat.cat.categories]
make_cat.cat.rename_categories(lowercase_labels)

0        alfa romeo
1           ferrari
2             dodge
3             dodge
4            subaru
            ...    
41139        subaru
41140        subaru
41141        subaru
41142        subaru
41143        subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['am general', 'asc incorporated', 'acura', 'alfa romeo', ..., 'volvo',
                           'wallace environmental', 'yugo', 'smart']

In [33]:
 # list comprehension: the square brackets collect each lower-cased label into a list
 [label.lower() for label in make_cat.cat.categories]

['am general',
 'asc incorporated',
 'acura',
 'alfa romeo',
 'american motors corporation',
 'aston martin',
 'audi',
 'aurora cars ltd',
 'autokraft limited',
 'avanti motor corporation',
 'azure dynamics',
 'bmw',
 'bmw alpina',
 'byd',
 'bentley',
 'bertone',
 'bill dovell motor car company',
 'bitter gmbh and co. kg',
 'bugatti',
 'buick',
 'ccc engineering',
 'coda automotive',
 'cx automotive',
 'cadillac',
 'chevrolet',
 'chrysler',
 'consulier industries inc',
 'dabryan coach builders inc',
 'dacia',
 'daewoo',
 'daihatsu',
 'dodge',
 'e. p. dutton, inc.',
 'eagle',
 'environmental rsch and devp corp',
 'evans automobiles',
 'excalibur autos',
 'federal coach',
 'ferrari',
 'fiat',
 'fisker',
 'ford',
 'gmc',
 'general motors',
 'genesis',
 'geo',
 'goldacre',
 'grumman allied industries',
 'grumman olson',
 'honda',
 'hummer',
 'hyundai',
 'import foreign auto sales inc',
 'import trade services',
 'infiniti',
 'isis imports ltd',
 'isuzu',
 'j.k. motors',
 'jba motorcars, in

In [41]:
# for first one hundred elements in make_cat, get counts for each make
(
    make_cat
      .iloc[0:100]           # first 100, 0 thru 99
      .value_counts()        # get counts; includes all categories, not just the ones in the first 100
      .loc[lambda x: x > 0]  # only observed; remove when count isn't greater than 0
      #.sum()                 # check to make sure it's 100; comment out when done
)

Dodge            17
Oldsmobile        8
Ford              8
Buick             7
Chevrolet         5
Plymouth          5
Mazda             5
Cadillac          4
Volkswagen        4
Toyota            4
Mercury           4
Pontiac           4
Hyundai           3
Subaru            3
BMW               3
Nissan            3
Infiniti          2
CX Automotive     2
Audi              2
Volvo             2
Ferrari           1
Rolls-Royce       1
Lexus             1
Chrysler          1
Alfa Romeo        1
Name: make, dtype: int64

In [None]:
# TODO: find out what .first() does

# TODO: come back to the part on generalizing, which was skipped for now