In [2]:
import pandas as pd
import numpy as np

# Categorical Data

## Background and Motivation

In [3]:
values = pd.Series(['apple', 'orange', 'apple', 'apple'] * 2)

In [4]:
values

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

In [6]:
values.unique()

array(['apple', 'orange'], dtype=object)

In [7]:
values.value_counts()

apple     6
orange    2
dtype: int64

In [8]:
values = pd.Series([0, 1, 0, 0] * 2)

In [9]:
dim = pd.Series(['apple', 'orange'])

In [10]:
values

0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64

In [11]:
dim

0     apple
1    orange
dtype: object

In [12]:
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

> [pandas.Series.take](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.take.html)

## Categorical Type in pandas

In [13]:
fruits = ['apple', 'orange', 'apple', 'apple'] * 2

In [14]:
N = len(fruits)

In [15]:
# Build one row per entry of `fruits`: a running basket id, the fruit label,
# a random integer count drawn from [3, 15) and a random float weight drawn
# from [0, 4).
# NOTE(review): the random columns come from NumPy's global RNG and no seed is
# set in the visible cells, so the table shown below will differ on every run.
# `columns=` only fixes the display order of the four columns.
df = pd.DataFrame({'fruit': fruits,
    'basket_id': np.arange(N),
    'count': np.random.randint(3, 15, size=N),
    'weight': np.random.uniform(0, 4, size=N)},
    columns=['basket_id', 'fruit', 'count', 'weight'])

In [16]:
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,4,1.309839
1,1,orange,14,1.098704
2,2,apple,13,3.677736
3,3,apple,5,0.440409
4,4,apple,14,0.424254
5,5,orange,13,0.436886
6,6,apple,11,0.489769
7,7,apple,6,0.059229


In [17]:
fruit_cat = df["fruit"].astype("category")

In [18]:
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [20]:
c = fruit_cat.values

In [21]:
c

['apple', 'orange', 'apple', 'apple', 'apple', 'orange', 'apple', 'apple']
Categories (2, object): ['apple', 'orange']

In [22]:
type(c)

pandas.core.arrays.categorical.Categorical

In [23]:
c.categories

Index(['apple', 'orange'], dtype='object')

In [24]:
c.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

> Notice that the dtype is NumPy’s int8, an 8-bit signed integer that can take on values from -128 to 127. (Only a single byte is needed to represent a value in memory. 64-bit signed ints would be overkill in terms of memory usage.) Our rough-hewn example resulted in int64 data by default, whereas Pandas is smart enough to downcast categorical data to the smallest numerical dtype possible.

In [25]:
df["fruit"] = fruit_cat = df["fruit"].astype("category")

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   basket_id  8 non-null      int64   
 1   fruit      8 non-null      category
 2   count      8 non-null      int64   
 3   weight     8 non-null      float64 
dtypes: category(1), float64(1), int64(2)
memory usage: 452.0 bytes


In [27]:
my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])

In [28]:
my_categories

['foo', 'bar', 'baz', 'foo', 'bar']
Categories (3, object): ['bar', 'baz', 'foo']

## Better performance with categoricals

In [29]:
N = 10_000_000
draws = pd.Series(np.random.randn(N))
labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))

In [32]:
labels.head(10)

0    foo
1    bar
2    baz
3    qux
4    foo
5    bar
6    baz
7    qux
8    foo
9    bar
dtype: object

In [34]:
labels.memory_usage(deep=True) / 1024 /1024

572.2047119140625

In [35]:
categories = labels.astype("category")

In [36]:
categories.memory_usage(deep=True) / 1024 /1024

9.53725814819336

In [37]:
%timeit  _ = labels.astype("category")

1.02 s ± 296 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Categorical Methods

In [41]:
s = pd.Series(['a', 'b', 'c', 'd'] * 2)

In [42]:
cat_s = s.astype("category")

In [43]:
cat_s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [44]:
cat_s.cat.codes

0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8

In [45]:
cat_s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [46]:
cat_s2 = cat_s[cat_s.isin(["a", "b"])]

In [47]:
cat_s2

0    a
1    b
4    a
5    b
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [48]:
cat_s2.cat.remove_unused_categories()

0    a
1    b
4    a
5    b
dtype: category
Categories (2, object): ['a', 'b']

In [49]:
colors = pd.Series(['periwinkle', 'mint green', 'burnt orange',
                     'periwinkle', 'burnt orange', 'rose', 
                     'rose', 'mint green', 'rose', 'navy'])

ccolors = colors.astype('category')

In [50]:
ccolors

0      periwinkle
1      mint green
2    burnt orange
3      periwinkle
4    burnt orange
5            rose
6            rose
7      mint green
8            rose
9            navy
dtype: category
Categories (5, object): ['burnt orange', 'mint green', 'navy', 'periwinkle', 'rose']

In [51]:
# Demonstrate that a value outside the declared categories cannot be assigned.
# Older pandas releases signal this with ValueError, current ones with
# TypeError, so both are caught to keep the cell runnable across versions.
try:
    ccolors.iloc[5] = 'a new color'
except (TypeError, ValueError) as e:
    print(e)

Cannot setitem on a Categorical with a new category, set the categories first


In [52]:
ccolors = ccolors.cat.add_categories(['a new color'])

In [54]:
ccolors

0      periwinkle
1      mint green
2    burnt orange
3      periwinkle
4    burnt orange
5            rose
6            rose
7      mint green
8            rose
9            navy
dtype: category
Categories (6, object): ['burnt orange', 'mint green', 'navy', 'periwinkle', 'rose', 'a new color']

In [55]:
ccolors.iloc[5] = 'a new color'

In [56]:
ccolors

0      periwinkle
1      mint green
2    burnt orange
3      periwinkle
4    burnt orange
5     a new color
6            rose
7      mint green
8            rose
9            navy
dtype: category
Categories (6, object): ['burnt orange', 'mint green', 'navy', 'periwinkle', 'rose', 'a new color']

## Example: Using The Pandas Category Data Type

### Data Preparation

First, set up imports and read in all the data:

In [58]:
df_raw = pd.read_csv('data/category_example_data.csv', low_memory=False)

In [58]:
df_raw.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Columns: 176 entries, Change_Type to Context_of_Research
dtypes: float64(43), int64(3), object(130)
memory usage: 566.0 MB


In [60]:
df_raw.head(3)

Unnamed: 0,Change_Type,Covered_Recipient_Type,Noncovered_Recipient_Entity_Name,Teaching_Hospital_CCN,Teaching_Hospital_ID,Teaching_Hospital_Name,Physician_Profile_ID,Physician_First_Name,Physician_Middle_Name,Physician_Last_Name,...,Preclinical_Research_Indicator,Delay_in_Publication_Indicator,Name_of_Study,Dispute_Status_for_Publication,Record_ID,Program_Year,Payment_Publication_Date,ClinicalTrials_Gov_Identifier,Research_Information_Link,Context_of_Research
0,UNCHANGED,Covered Recipient Teaching Hospital,,110079.0,5085.0,GRADY MEMORIAL HOSPITAL,,,,,...,No,No,PCYC-1134M-CA,No,493381041,2017,06/28/2019,,,informCLL A Disease Registry for Patients with...
1,UNCHANGED,Covered Recipient Teaching Hospital,,520078.0,5350.0,ST. FRANCIS HOSPITAL,,,,,...,No,No,Dimethyl Fumarate (DMF) Observational Study,No,455805444,2017,06/28/2019,,,
2,UNCHANGED,Covered Recipient Physician,,,,,296787.0,BERNARD,N,STULBERG,...,No,No,COC,No,501931627,2017,06/28/2019,,,


In [61]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Columns: 176 entries, Change_Type to Context_of_Research
dtypes: float64(43), int64(3), object(130)
memory usage: 134.3+ MB


In [66]:
drop_tresh = int(df_raw.shape[0]*0.9)

In [67]:
# Keep only the columns that have at least `drop_tresh` non-null values.
# `how` is intentionally not passed: pandas >= 2.0 raises a TypeError when both
# `how` and `thresh` are given, and in older releases `thresh` took precedence
# anyway, so the selected columns are unchanged.
df = df_raw.dropna(thresh=drop_tresh, axis="columns").copy()

In [68]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Data columns (total 23 columns):
 #   Column                                                            Non-Null Count  Dtype  
---  ------                                                            --------------  -----  
 0   Change_Type                                                       99999 non-null  object 
 1   Covered_Recipient_Type                                            99999 non-null  object 
 2   Recipient_Primary_Business_Street_Address_Line1                   99999 non-null  object 
 3   Recipient_City                                                    99999 non-null  object 
 4   Recipient_State                                                   99999 non-null  object 
 5   Recipient_Zip_Code                                                99999 non-null  object 
 6   Recipient_Country                                                 99999 non-null  object 
 7   Submitting_Applicable_Manufactu

> [pandas.DataFrame.from_records](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.from_records.html)

In [69]:
unique_counts = pd.DataFrame.from_records([(col, df[col].nunique()) for col in df.columns],
                                         columns=['Column_Name', 'Num_Unique'])

In [75]:
#[(col, df[col].nunique()) for col in df.columns]

In [76]:
# Order the per-column cardinalities from lowest to highest and display them.
unique_counts = unique_counts.sort_values(by=["Num_Unique"])
unique_counts

Unnamed: 0,Column_Name,Num_Unique
22,Payment_Publication_Date,1
17,Delay_in_Publication_Indicator,1
6,Recipient_Country,1
21,Program_Year,1
1,Covered_Recipient_Type,2
19,Dispute_Status_for_Publication,2
16,Preclinical_Research_Indicator,2
12,Related_Product_Indicator,2
15,Form_of_Payment_or_Transfer_of_Value,3
0,Change_Type,4


In [78]:
#df.head(3)

In [79]:
cols_to_exclude = ['Program_Year', 'Payment_Publication_Date', 'Date_of_Payment']

In [80]:
for col in df.columns:
    if df[col].nunique() < 600 and col not in cols_to_exclude:
        df[col] = df[col].astype('category')

In [85]:
df.memory_usage(deep=True).sum() / (1024*1024)

51.826568603515625

#### Function for converting columns to the category type

In [86]:
# Reload the raw data and rebuild the trimmed frame so the conversion helper
# below starts from object-dtype columns again.
df_raw = pd.read_csv('data/category_example_data.csv', low_memory=False)
# `how` is intentionally not passed: pandas >= 2.0 raises a TypeError when both
# `how` and `thresh` are given; `thresh` alone keeps the columns that have at
# least `drop_tresh` non-null values.
df = df_raw.dropna(thresh=drop_tresh, axis="columns").copy()

In [96]:
from typing import List

def transform_df_to_category(df, max_unique_procent: int, cols_to_exclude: List[str]):
    """Cast low-cardinality ``object`` columns of ``df`` to the ``category`` dtype.

    A column is converted when its number of distinct values (as reported by
    ``Series.nunique``) is strictly below ``max_unique_procent`` percent of the
    row count and its name is not listed in ``cols_to_exclude``.

    Args:
        df: Frame to convert. It is modified in place.
        max_unique_procent: Cardinality limit, as a percentage of the row count.
        cols_to_exclude: Column names that must keep their current dtype.

    Returns:
        The same ``df`` object that was passed in.
    """
    row_count = df.shape[0]
    unique_limit = row_count * (max_unique_procent / 100)

    for name in df.select_dtypes(include="object").columns:
        # Skip columns that are too varied or explicitly protected.
        if not (df[name].nunique() < unique_limit) or name in cols_to_exclude:
            continue
        df[name] = df[name].astype('category')

    return df

In [97]:
cols_to_exclude = ['Payment_Publication_Date', 'Date_of_Payment']
data = transform_df_to_category(df, 20, cols_to_exclude)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Data columns (total 23 columns):
 #   Column                                                            Non-Null Count  Dtype   
---  ------                                                            --------------  -----   
 0   Change_Type                                                       99999 non-null  category
 1   Covered_Recipient_Type                                            99999 non-null  category
 2   Recipient_Primary_Business_Street_Address_Line1                   99999 non-null  category
 3   Recipient_City                                                    99999 non-null  category
 4   Recipient_State                                                   99999 non-null  category
 5   Recipient_Zip_Code                                                99999 non-null  category
 6   Recipient_Country                                                 99999 non-null  category
 7   Submitting_Applicable_

### Performance

Perform the analysis on the original input dataframe.

In [98]:
%%timeit
df_raw.groupby('Covered_Recipient_Type')['Total_Amount_of_Payment_USDollars'].sum().to_frame()

25.1 ms ± 61.2 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Now, on the dataframe with categorical data:

In [99]:
%%timeit
df.groupby('Covered_Recipient_Type')['Total_Amount_of_Payment_USDollars'].sum().to_frame()

1.48 ms ± 8.97 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### Watch Outs

> The real problem is that programmers have spent far too much time worrying about efficiency in the wrong places and at the wrong times; premature optimization is the root of all evil (or at least most of it) in programming. — Donald Knuth

### General Guidelines


1. Do not assume you need to convert all categorical data to the pandas category data type.
2. If the data set starts to approach an appreciable percentage of your useable memory, then consider using categorical data types.
3. If you have very significant performance concerns with operations that are executed frequently, look at using categorical data.
4. If you are using categorical data, add some checks to make sure the data is clean and complete before converting to the pandas category type. Additionally, check for NaN values after combining or converting dataframes.
