In [1]:
import pandas as pd
import numpy as np

# Categorical Data

## Background and Motivation

In [2]:
values = pd.Series(['apple', 'orange', 'apple', 'apple'] * 2)

In [3]:
values

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

In [4]:
values.unique()

array(['apple', 'orange'], dtype=object)

In [5]:
values.value_counts()

apple     6
orange    2
dtype: int64

In [6]:
values = pd.Series([0, 1, 0, 0] * 2)

In [10]:
values

0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64

In [7]:
dim = pd.Series(['apple', 'orange'])

In [9]:
dim

0     apple
1    orange
dtype: object

> [pandas.Series.take](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.take.html)

In [11]:
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

## Categorical Type in pandas

In [12]:
fruits = ['apple', 'orange', 'apple', 'apple'] * 2

In [14]:
N = len(fruits)
N

8

In [34]:
# Build the demo table: one row per entry of `fruits`, with a sequential
# basket id, a random integer count drawn from [3, 15) and a random weight
# drawn from [0, 4). `columns=` fixes the left-to-right column order.
df = pd.DataFrame(
    dict(
        fruit=fruits,
        basket_id=np.arange(N),
        count=np.random.randint(3, 15, size=N),
        weight=np.random.uniform(0, 4, size=N),
    ),
    columns=['basket_id', 'fruit', 'count', 'weight'],
)

In [35]:
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,13,3.497052
1,1,orange,10,0.518632
2,2,apple,4,1.954672
3,3,apple,10,2.214954
4,4,apple,3,2.240879
5,5,orange,10,3.436351
6,6,apple,5,0.839761
7,7,apple,7,3.731763


In [36]:
type(df["fruit"].iloc[0])

str

In [37]:
df["fruit"]

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: object

In [38]:
fruit_cat = df["fruit"].astype("category")

In [39]:
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [40]:
c = fruit_cat.values

In [41]:
print(c)

['apple', 'orange', 'apple', 'apple', 'apple', 'orange', 'apple', 'apple']
Categories (2, object): ['apple', 'orange']


In [42]:
c.categories

Index(['apple', 'orange'], dtype='object')

In [43]:
c.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

> Notice that the dtype is NumPy’s int8, an 8-bit signed integer that can take on values from -128 to 127. (Only a single byte is needed to represent a value in memory. 64-bit signed ints would be overkill in terms of memory usage.) Our rough-hewn example resulted in int64 data by default, whereas pandas is smart enough to downcast categorical data to the smallest numerical dtype possible.

In [44]:
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,13,3.497052
1,1,orange,10,0.518632
2,2,apple,4,1.954672
3,3,apple,10,2.214954
4,4,apple,3,2.240879
5,5,orange,10,3.436351
6,6,apple,5,0.839761
7,7,apple,7,3.731763


In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   basket_id  8 non-null      int32  
 1   fruit      8 non-null      object 
 2   count      8 non-null      int32  
 3   weight     8 non-null      float64
dtypes: float64(1), int32(2), object(1)
memory usage: 320.0+ bytes


In [30]:
df["fruit"] = df["fruit"].astype("category")

In [32]:
df["fruit"]

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   basket_id  8 non-null      int32   
 1   fruit      8 non-null      category
 2   count      8 non-null      int32   
 3   weight     8 non-null      float64 
dtypes: category(1), float64(1), int32(2)
memory usage: 388.0 bytes


In [47]:
my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])

In [48]:
my_categories

['foo', 'bar', 'baz', 'foo', 'bar']
Categories (3, object): ['bar', 'baz', 'foo']

In [49]:
my_categories.codes

array([2, 0, 1, 2, 0], dtype=int8)

## Better performance with categoricals

In [51]:
# Benchmark inputs: ten million standard-normal draws and an equally long
# label column that cycles through four distinct strings.
N = 10_000_000
draws = pd.Series(np.random.standard_normal(size=N))
labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))

In [55]:
labels.value_counts()

foo    2500000
bar    2500000
baz    2500000
qux    2500000
dtype: int64

In [58]:
labels.head(10)

0    foo
1    bar
2    baz
3    qux
4    foo
5    bar
6    baz
7    qux
8    foo
9    bar
dtype: object

In [56]:
categories = labels.astype("category")

In [None]:
categories.head(10)

In [60]:
labels.memory_usage(deep=True)

600000128

In [61]:
categories.memory_usage(deep=True)

10000540

In [62]:
600000128/10000540

59.99677297425939

In [63]:
%timeit _ = labels.astype("category")

759 ms ± 51.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# GroupBy is considerably faster

## Categorical Methods

In [64]:
s = pd.Series(['a', 'b', 'c', 'd'] * 2)

In [65]:
cat_s = s.astype("category")

In [66]:
cat_s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [67]:
cat_s.cat.codes

0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8

In [68]:
cat_s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [69]:
cat_s3 = cat_s[cat_s.isin(["a","b"])]

In [70]:
cat_s3

0    a
1    b
4    a
5    b
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [71]:
cat_s3.cat.remove_unused_categories()

0    a
1    b
4    a
5    b
dtype: category
Categories (2, object): ['a', 'b']

In [72]:
# Ten colour names (with repeats) stored as plain Python strings ...
colors = pd.Series([
    'periwinkle',
    'mint green',
    'burnt orange',
    'periwinkle',
    'burnt orange',
    'rose',
    'rose',
    'mint green',
    'rose',
    'navy',
])

# ... and the same values re-encoded with the category dtype.
ccolors = colors.astype(dtype='category')

In [73]:
colors

0      periwinkle
1      mint green
2    burnt orange
3      periwinkle
4    burnt orange
5            rose
6            rose
7      mint green
8            rose
9            navy
dtype: object

In [74]:
ccolors

0      periwinkle
1      mint green
2    burnt orange
3      periwinkle
4    burnt orange
5            rose
6            rose
7      mint green
8            rose
9            navy
dtype: category
Categories (5, object): ['burnt orange', 'mint green', 'navy', 'periwinkle', 'rose']

In [75]:
# Assigning a label that is not one of the existing categories is rejected.
# The pandas release that produced the output below raises ValueError;
# newer releases raise TypeError for the same operation. Catch both so the
# cell prints the message instead of aborting the notebook.
try:
    ccolors.iloc[5] = 'a new color'
except (TypeError, ValueError) as e:
    print(e)

Cannot setitem on a Categorical with a new category, set the categories first


In [76]:
ccolors = ccolors.cat.add_categories(["a new color"])

In [78]:
ccolors

0      periwinkle
1      mint green
2    burnt orange
3      periwinkle
4    burnt orange
5            rose
6            rose
7      mint green
8            rose
9            navy
dtype: category
Categories (6, object): ['burnt orange', 'mint green', 'navy', 'periwinkle', 'rose', 'a new color']

In [80]:
ccolors.iloc[5] = "a new color"

In [81]:
ccolors.cat.codes

0    3
1    1
2    0
3    3
4    0
5    5
6    4
7    1
8    4
9    2
dtype: int8

## Example: Using The Pandas Category Data Type

### Data Preparation

First, set up imports and read in all the data:

In [82]:
df_raw = pd.read_csv('data/category_example_data.csv', low_memory=False)

In [83]:
df_raw.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Columns: 176 entries, Change_Type to Context_of_Research
dtypes: float64(43), int64(3), object(130)
memory usage: 565.7 MB


In [84]:
df_raw.head(5)

Unnamed: 0,Change_Type,Covered_Recipient_Type,Noncovered_Recipient_Entity_Name,Teaching_Hospital_CCN,Teaching_Hospital_ID,Teaching_Hospital_Name,Physician_Profile_ID,Physician_First_Name,Physician_Middle_Name,Physician_Last_Name,...,Preclinical_Research_Indicator,Delay_in_Publication_Indicator,Name_of_Study,Dispute_Status_for_Publication,Record_ID,Program_Year,Payment_Publication_Date,ClinicalTrials_Gov_Identifier,Research_Information_Link,Context_of_Research
0,UNCHANGED,Covered Recipient Teaching Hospital,,110079.0,5085.0,GRADY MEMORIAL HOSPITAL,,,,,...,No,No,PCYC-1134M-CA,No,493381041,2017,06/28/2019,,,informCLL A Disease Registry for Patients with...
1,UNCHANGED,Covered Recipient Teaching Hospital,,520078.0,5350.0,ST. FRANCIS HOSPITAL,,,,,...,No,No,Dimethyl Fumarate (DMF) Observational Study,No,455805444,2017,06/28/2019,,,
2,UNCHANGED,Covered Recipient Physician,,,,,296787.0,BERNARD,N,STULBERG,...,No,No,COC,No,501931627,2017,06/28/2019,,,
3,UNCHANGED,Covered Recipient Teaching Hospital,,460009.0,5865.0,U OF U HOSPITALS & CLINICS,,,,,...,No,No,PALLASPALBOCICLIB COLLABORATIVE ADJUVANT STUDY...,No,501847091,2017,06/28/2019,,,
4,UNCHANGED,Covered Recipient Teaching Hospital,,70033.0,4822.0,DANBURY HOSPITAL,,,,,...,No,No,LSS OF 4 SITE,No,446594329,2017,06/28/2019,NCT02097290,,


In [86]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Columns: 176 entries, Change_Type to Context_of_Research
dtypes: float64(43), int64(3), object(130)
memory usage: 134.3+ MB


In [88]:
drop_thresh = df_raw.shape[0]*0.9

In [89]:
drop_thresh

89999.1

In [90]:
# Keep only the columns that have at least `drop_thresh` non-null values.
# `how` is intentionally not passed: it cannot be combined with `thresh`
# (recent pandas raises TypeError when both are given).
# .copy() makes `df` an independent frame, so the column assignments made
# further down do not write back into `df_raw`.
df = df_raw.dropna(thresh=drop_thresh, axis="columns").copy()

In [93]:
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Data columns (total 23 columns):
 #   Column                                                            Non-Null Count  Dtype  
---  ------                                                            --------------  -----  
 0   Change_Type                                                       99999 non-null  object 
 1   Covered_Recipient_Type                                            99999 non-null  object 
 2   Recipient_Primary_Business_Street_Address_Line1                   99999 non-null  object 
 3   Recipient_City                                                    99999 non-null  object 
 4   Recipient_State                                                   99999 non-null  object 
 5   Recipient_Zip_Code                                                99999 non-null  object 
 6   Recipient_Country                                                 99999 non-null  object 
 7   Submitting_Applicable_Manufactu

In [94]:
temp = [ (col, df[col].nunique()) for col in df.columns]

In [95]:
temp

[('Change_Type', 4),
 ('Covered_Recipient_Type', 2),
 ('Recipient_Primary_Business_Street_Address_Line1', 6501),
 ('Recipient_City', 1874),
 ('Recipient_State', 52),
 ('Recipient_Zip_Code', 3983),
 ('Recipient_Country', 1),
 ('Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name', 465),
 ('Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID', 520),
 ('Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name', 525),
 ('Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_State', 34),
 ('Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Country', 18),
 ('Related_Product_Indicator', 2),
 ('Total_Amount_of_Payment_USDollars', 34695),
 ('Date_of_Payment', 361),
 ('Form_of_Payment_or_Transfer_of_Value', 3),
 ('Preclinical_Research_Indicator', 2),
 ('Delay_in_Publication_Indicator', 1),
 ('Name_of_Study', 7123),
 ('Dispute_Status_for_Publication', 2),
 ('Record_ID', 99999),
 ('Program_Year', 1),
 ('Payment_Publication_Date', 1)]

In [96]:
sorted(temp,key=lambda x: x[1])

[('Recipient_Country', 1),
 ('Delay_in_Publication_Indicator', 1),
 ('Program_Year', 1),
 ('Payment_Publication_Date', 1),
 ('Covered_Recipient_Type', 2),
 ('Related_Product_Indicator', 2),
 ('Preclinical_Research_Indicator', 2),
 ('Dispute_Status_for_Publication', 2),
 ('Form_of_Payment_or_Transfer_of_Value', 3),
 ('Change_Type', 4),
 ('Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Country', 18),
 ('Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_State', 34),
 ('Recipient_State', 52),
 ('Date_of_Payment', 361),
 ('Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name', 465),
 ('Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID', 520),
 ('Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name', 525),
 ('Recipient_City', 1874),
 ('Recipient_Zip_Code', 3983),
 ('Recipient_Primary_Business_Street_Address_Line1', 6501),
 ('Name_of_Study', 7123),
 ('Total_Amount_of_Payment_USDollars', 34695),
 ('Record_ID', 99999)]

> [pandas.DataFrame.from_records](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.from_records.html)

In [101]:
# One (column name, number of distinct values) record per column of `df`,
# collected into a two-column summary table.
unique_counts = pd.DataFrame.from_records(
    [(name, df[name].nunique()) for name in df.columns],
    columns=['Column_Name', 'Num_Unique'],
)

In [100]:
unique_counts

Unnamed: 0,Column_Name,Num_Unique
0,Change_Type,4
1,Covered_Recipient_Type,2
2,Recipient_Primary_Business_Street_Address_Line1,6501
3,Recipient_City,1874
4,Recipient_State,52
5,Recipient_Zip_Code,3983
6,Recipient_Country,1
7,Submitting_Applicable_Manufacturer_or_Applicab...,465
8,Applicable_Manufacturer_or_Applicable_GPO_Maki...,520
9,Applicable_Manufacturer_or_Applicable_GPO_Maki...,525


In [102]:
# Order the summary by ascending distinct-value count. The sorted frame is
# rebound to the same name rather than sorted with inplace=True, which
# keeps the statement chainable and avoids mutating the object in place.
unique_counts = unique_counts.sort_values(by=["Num_Unique"])

In [103]:
unique_counts

Unnamed: 0,Column_Name,Num_Unique
22,Payment_Publication_Date,1
17,Delay_in_Publication_Indicator,1
6,Recipient_Country,1
21,Program_Year,1
1,Covered_Recipient_Type,2
19,Dispute_Status_for_Publication,2
16,Preclinical_Research_Indicator,2
12,Related_Product_Indicator,2
15,Form_of_Payment_or_Transfer_of_Value,3
0,Change_Type,4


In [104]:
cols_to_exclude = ['Program_Year', 'Payment_Publication_Date', 'Date_of_Payment']

In [105]:
# Pick the columns worth re-encoding: fewer than 600 distinct values and
# not explicitly excluded (the excluded ones are listed in cols_to_exclude).
low_cardinality_cols = [
    name for name in df.columns
    if name not in cols_to_exclude and df[name].nunique() < 600
]

# Convert each selected column to the category dtype, one at a time.
for col in low_cardinality_cols:
    df[col] = df[col].astype('category')

In [106]:
df.memory_usage(deep=True).sum() / (1024*1024)

51.826568603515625

### Performance

Perform the analysis on the original input dataframe.

In [107]:
%%timeit
df_raw.groupby('Covered_Recipient_Type')['Total_Amount_of_Payment_USDollars'].sum().to_frame()

19.2 ms ± 1.38 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


Now, on the dataframe with categorical data:

In [108]:
%%timeit
df.groupby('Covered_Recipient_Type')['Total_Amount_of_Payment_USDollars'].sum().to_frame()

2.2 ms ± 119 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [109]:
19.2/2.2

8.727272727272727

### Watch Outs

> The real problem is that programmers have spent far too much time worrying about efficiency in the wrong places and at the wrong times; premature optimization is the root of all evil (or at least most of it) in programming.
>
> — Donald Knuth

### General Guidelines


1. Do not assume you need to convert all categorical data to the pandas category data type.
2. If the data set starts to approach an appreciable percentage of your useable memory, then consider using categorical data types.
3. If you have very significant performance concerns with operations that are executed frequently, look at using categorical data.
4. If you are using categorical data, add some checks to make sure the data is clean and complete before converting to the pandas category type. Additionally, check for NaN values after combining or converting dataframes.
