In [1]:
import pandas as pd
import numpy as np

# Categorical Data

Prepare the datasets (unpack the data archive):

In [2]:
# Shell escape: extract (-x) the xz-compressed (-J) archive into ./data/ (-C).
# Presumably this provides data/category_example_data.csv read further down — confirm.
!tar -xJf data/data_del_02.tar.xz -C ./data/

- [Categorical data](https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html)
- [Using The Pandas Category Data Type](https://pbpython.com/pandas_dtypes_cat.html)
- [Use Categorical Data to Save on Time and Space](https://realpython.com/python-pandas-tricks/#5-use-categorical-data-to-save-on-time-and-space)

## Background and Motivation

In [3]:
values = pd.Series(['apple', 'orange', 'apple', 'apple'] * 2)

In [4]:
values

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

In [5]:
values.unique()

array(['apple', 'orange'], dtype=object)

In [6]:
values.value_counts()

apple     6
orange    2
dtype: int64

In [7]:
# The same eight observations stored as integer codes: 0 and 1 are positions in the
# `dim` lookup Series defined below.
# NOTE: this rebinds `values`, replacing the string Series shown above.
values = pd.Series([0, 1, 0, 0] * 2)

In [8]:
values

0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64

In [9]:
dim = pd.Series(['apple', 'orange'])

In [10]:
dim

0     apple
1    orange
dtype: object

> [pandas.Series.take](https://pandas.pydata.org/pandas-docs/version/0.25/reference/api/pandas.Series.take.html)

In [11]:
# Positional lookup: each integer in `values` picks the entry of `dim` at that
# position, which rebuilds the label for every row.
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

## Categorical Type in pandas

In [12]:
fruits = ['apple', 'orange', 'apple', 'apple'] * 2

In [13]:
N = len(fruits)

In [14]:
# Demo frame with one row per entry in `fruits`. `count` and `weight` come from
# NumPy's global generator, which no visible cell seeds, so they change per run.
basket_columns = ['basket_id', 'fruit', 'count', 'weight']
basket_data = {
    'basket_id': np.arange(N),
    'fruit': fruits,
    'count': np.random.randint(3, 15, size=N),    # integers in [3, 15)
    'weight': np.random.uniform(0, 4, size=N),    # floats in [0, 4)
}
df = pd.DataFrame(basket_data, columns=basket_columns)

In [15]:
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,10,2.036838
1,1,orange,4,3.643356
2,2,apple,6,1.619392
3,3,apple,3,1.497276
4,4,apple,14,0.615497
5,5,orange,8,2.880261
6,6,apple,10,2.069376
7,7,apple,3,2.893858


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
basket_id    8 non-null int64
fruit        8 non-null object
count        8 non-null int64
weight       8 non-null float64
dtypes: float64(1), int64(2), object(1)
memory usage: 384.0+ bytes


In [17]:
# Convert only the `fruit` column to the category dtype; the other columns keep their dtypes.
df = df.astype({'fruit': 'category'})

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
basket_id    8 non-null int64
fruit        8 non-null category
count        8 non-null int64
weight       8 non-null float64
dtypes: category(1), float64(1), int64(2)
memory usage: 424.0 bytes


In [19]:
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,10,2.036838
1,1,orange,4,3.643356
2,2,apple,6,1.619392
3,3,apple,3,1.497276
4,4,apple,14,0.615497
5,5,orange,8,2.880261
6,6,apple,10,2.069376
7,7,apple,3,2.893858


In [20]:
fruit_cat = df['fruit']

In [21]:
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): [apple, orange]

In [22]:
fruit_cat.values

[apple, orange, apple, apple, apple, orange, apple, apple]
Categories (2, object): [apple, orange]

In [23]:
c = fruit_cat.values
type(c)

pandas.core.arrays.categorical.Categorical

In [24]:
c.categories

Index(['apple', 'orange'], dtype='object')

In [25]:
c.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

> Notice that the dtype is NumPy’s int8, an 8-bit signed integer that can take on values from -128 to 127. (Only a single byte is needed to represent a value in memory. 64-bit signed ints would be overkill in terms of memory usage.) Our rough-hewn example resulted in int64 data by default, whereas Pandas is smart enough to downcast categorical data to the smallest numerical dtype possible.

In [26]:
my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])

In [27]:
my_categories

[foo, bar, baz, foo, bar]
Categories (3, object): [bar, baz, foo]

## Better performance with categoricals

In [32]:
# Benchmark inputs: ten million rows that contain only four distinct labels.
N = 10_000_000  # NOTE: rebinds `N`, which earlier held len(fruits)
draws = pd.Series(np.random.randn(N))  # NOTE(review): not referenced again in the cells visible here
labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))  # the four labels repeated N // 4 times

In [36]:
labels.memory_usage(deep=True)/1024/1024

686.6456298828125

In [41]:
%%timeit
categories = labels.astype('category')

1.5 s ± 22.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [38]:
categories.head()

0    foo
1    bar
2    baz
3    qux
4    foo
dtype: category
Categories (4, object): [bar, baz, foo, qux]

In [39]:
categories.memory_usage(deep=True)/1024/1024

9.53729248046875

## Categorical Methods

In [28]:
s = pd.Series(['a', 'b', 'c', 'd'] * 2)

In [29]:
s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: object

In [34]:
cat_s = s.astype('category')

In [35]:
cat_s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): [a, b, c, d]

In [36]:
cat_s.cat.codes

0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8

In [33]:
cat_s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [37]:
cat_s3 = cat_s[cat_s.isin(['a', 'b'])]

In [38]:
cat_s3

0    a
1    b
4    a
5    b
dtype: category
Categories (4, object): [a, b, c, d]

In [39]:
# Drops the categories that no longer occur in the filtered data ('c' and 'd').
# The result is only displayed, not assigned, so `cat_s3` itself still lists all four.
cat_s3.cat.remove_unused_categories()

0    a
1    b
4    a
5    b
dtype: category
Categories (2, object): [a, b]

In [40]:
# Ten color labels drawn from five distinct names.
color_names = [
    'periwinkle', 'mint green', 'burnt orange', 'periwinkle', 'burnt orange',
    'rose', 'rose', 'mint green', 'rose', 'navy',
]
colors = pd.Series(color_names)

# Categorical version of the same data; `colors` itself is left untouched.
ccolors = colors.astype('category')

In [41]:
ccolors

0      periwinkle
1      mint green
2    burnt orange
3      periwinkle
4    burnt orange
5            rose
6            rose
7      mint green
8            rose
9            navy
dtype: category
Categories (5, object): [burnt orange, mint green, navy, periwinkle, rose]

In [43]:
# Left commented out on purpose: 'nova barva' is not yet one of the categories, so
# this assignment would be rejected. It is run for real after add_categories() below.
#ccolors.iloc[5] = 'nova barva'

In [45]:
# Assigning a label that is not among the existing categories is rejected.
# Older pandas raises ValueError here while newer releases raise TypeError,
# so catch both to keep the demonstration running on either.
try:
    ccolors.iloc[5] = 'a new color'
except (TypeError, ValueError) as e:
    print(e)

Cannot setitem on a Categorical with a new category, set the categories first


In [46]:
ccolors = ccolors.cat.add_categories(['nova barva'])

In [47]:
ccolors

0      periwinkle
1      mint green
2    burnt orange
3      periwinkle
4    burnt orange
5            rose
6            rose
7      mint green
8            rose
9            navy
dtype: category
Categories (6, object): [burnt orange, mint green, navy, periwinkle, rose, nova barva]

In [48]:
ccolors.iloc[5] = 'nova barva'

In [49]:
ccolors

0      periwinkle
1      mint green
2    burnt orange
3      periwinkle
4    burnt orange
5      nova barva
6            rose
7      mint green
8            rose
9            navy
dtype: category
Categories (6, object): [burnt orange, mint green, navy, periwinkle, rose, nova barva]

## Example: Using The Pandas Category Data Type

### Data Preparation

First, set up imports and read in all the data:

In [51]:
df_raw = pd.read_csv('data/category_example_data.csv', low_memory=False)

In [52]:
df_raw.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Columns: 176 entries, Change_Type to Context_of_Research
dtypes: float64(43), int64(3), object(130)
memory usage: 566.0 MB


In [54]:
df_raw.head(3)

Unnamed: 0,Change_Type,Covered_Recipient_Type,Noncovered_Recipient_Entity_Name,Teaching_Hospital_CCN,Teaching_Hospital_ID,Teaching_Hospital_Name,Physician_Profile_ID,Physician_First_Name,Physician_Middle_Name,Physician_Last_Name,...,Preclinical_Research_Indicator,Delay_in_Publication_Indicator,Name_of_Study,Dispute_Status_for_Publication,Record_ID,Program_Year,Payment_Publication_Date,ClinicalTrials_Gov_Identifier,Research_Information_Link,Context_of_Research
0,UNCHANGED,Covered Recipient Teaching Hospital,,110079.0,5085.0,GRADY MEMORIAL HOSPITAL,,,,,...,No,No,PCYC-1134M-CA,No,493381041,2017,06/28/2019,,,informCLL A Disease Registry for Patients with...
1,UNCHANGED,Covered Recipient Teaching Hospital,,520078.0,5350.0,ST. FRANCIS HOSPITAL,,,,,...,No,No,Dimethyl Fumarate (DMF) Observational Study,No,455805444,2017,06/28/2019,,,
2,UNCHANGED,Covered Recipient Physician,,,,,296787.0,BERNARD,N,STULBERG,...,No,No,COC,No,501931627,2017,06/28/2019,,,


In [55]:
drop_tresh = df_raw.shape[0]*0.9

In [56]:
drop_tresh

89999.1

In [73]:
# Keep only the columns that have at least `drop_tresh` non-null values.
# `how` is deliberately omitted: older pandas ignores it once `thresh` is given,
# and newer pandas raises TypeError when both are passed.
df = df_raw.dropna(axis='columns', thresh=drop_tresh).copy()

In [64]:
#df.nunique()

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.from_records.html

In [74]:
# One (column_name, num_unique) record per column of `df`.
unique_records = [(name, df[name].nunique()) for name in df.columns]
unique_counts = pd.DataFrame.from_records(
    unique_records,
    columns=['column_name', 'num_unique'],
)

In [75]:
# Order the columns from fewest to most distinct values (rebinding instead of mutating in place).
unique_counts = unique_counts.sort_values(by=['num_unique'])

In [76]:
unique_counts

Unnamed: 0,column_name,num_unique
22,Payment_Publication_Date,1
17,Delay_in_Publication_Indicator,1
6,Recipient_Country,1
21,Program_Year,1
1,Covered_Recipient_Type,2
19,Dispute_Status_for_Publication,2
16,Preclinical_Research_Indicator,2
12,Related_Product_Indicator,2
15,Form_of_Payment_or_Transfer_of_Value,3
0,Change_Type,4


> [pandas.DataFrame.from_records](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.from_records.html)

In [77]:
cols_to_exclude = ['Program_Year', 'Payment_Publication_Date', 'Date_of_Payment']

In [78]:
# Pick every non-excluded column with fewer than 600 distinct values ...
low_cardinality_cols = [
    name for name in df.columns
    if name not in cols_to_exclude and df[name].nunique() < 600
]
# ... and convert them all to the category dtype in a single astype call.
df = df.astype({name: 'category' for name in low_cardinality_cols})

In [79]:
df.memory_usage(deep=True).sum() / (1024*1024) # in MB (bytes / 1024**2)

51.838321685791016

### Performance

Perform the analysis on the original input dataframe.

In [80]:
%%timeit
df_raw.groupby('Covered_Recipient_Type')['Total_Amount_of_Payment_USDollars'].sum().to_frame()

31.9 ms ± 837 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Now, on the dataframe with categorical data:

In [81]:
%%timeit
df.groupby('Covered_Recipient_Type')['Total_Amount_of_Payment_USDollars'].sum().to_frame()

4.19 ms ± 545 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Watch Outs

> The real problem is that programmers have spent far too much time worrying about efficiency in the wrong places and at the wrong times; premature optimization is the root of all evil (or at least most of it) in programming. — Donald Knuth

### General Guidelines


1. Do not assume you need to convert all categorical data to the pandas category data type.
2. If the data set starts to approach an appreciable percentage of your useable memory, then consider using categorical data types.
3. If you have very significant performance concerns with operations that are executed frequently, look at using categorical data.
4. If you are using categorical data, add some checks to make sure the data is clean and complete before converting to the pandas category type. Additionally, check for NaN values after combining or converting dataframes.
