In [3]:
import pandas as pd
import numpy as np

# Categorical Data

Pripravimo datasete:

In [1]:
# Unpack the xz-compressed tar archive (-x extract, -J xz, -f archive file)
# into the ./data/ directory (-C) so later cells can read files from there.
!tar -xJf data/data_del_02.tar.xz -C ./data/

- [Categorical data](https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html)
- [Using The Pandas Category Data Type](https://pbpython.com/pandas_dtypes_cat.html)
- [Use Categorical Data to Save on Time and Space](https://realpython.com/python-pandas-tricks/#5-use-categorical-data-to-save-on-time-and-space)

## Background and Motivation

In [4]:
values = pd.Series(['apple', 'orange', 'apple', 'apple'] * 2)

In [5]:
values

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

In [6]:
values.unique()

array(['apple', 'orange'], dtype=object)

In [7]:
values.value_counts()

apple     6
orange    2
dtype: int64

In [8]:
values = pd.Series([0, 1, 0, 0] * 2)

In [9]:
dim = pd.Series(['apple', 'orange'])

In [10]:
dim

0     apple
1    orange
dtype: object

In [11]:
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

> [pandas.Series.take](https://pandas.pydata.org/pandas-docs/version/0.25/reference/api/pandas.Series.take.html)

## Categorical Type in pandas

In [12]:
fruits = ['apple', 'orange', 'apple', 'apple'] * 2

In [13]:
N = len(fruits)

In [14]:
# Small demo frame with one row per basket. The dict keys are listed in the
# desired column order (basket_id, fruit, count, weight), so no separate
# `columns=` argument is needed. 'count' is drawn before 'weight', which keeps
# the sequence of random draws unchanged.
df = pd.DataFrame({
    'basket_id': np.arange(N),                   # 0 .. N-1
    'fruit': fruits,
    'count': np.random.randint(3, 15, size=N),   # integers in [3, 15)
    'weight': np.random.uniform(0, 4, size=N),   # floats in [0, 4)
})

In [15]:
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,12,0.924198
1,1,orange,6,3.873906
2,2,apple,8,3.491774
3,3,apple,11,1.593882
4,4,apple,4,1.728148
5,5,orange,10,0.028533
6,6,apple,7,0.18878
7,7,apple,13,0.405279


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   basket_id  8 non-null      int64  
 1   fruit      8 non-null      object 
 2   count      8 non-null      int64  
 3   weight     8 non-null      float64
dtypes: float64(1), int64(2), object(1)
memory usage: 384.0+ bytes


In [18]:
# Replace the 'fruit' column with a categorical version of itself; the
# following df.info() cell shows the resulting dtype.
df['fruit'] = df['fruit'].astype('category')

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   basket_id  8 non-null      int64   
 1   fruit      8 non-null      category
 2   count      8 non-null      int64   
 3   weight     8 non-null      float64 
dtypes: category(1), float64(1), int64(2)
memory usage: 424.0 bytes


In [21]:
fruit_cat = df['fruit']

In [22]:
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): [apple, orange]

In [24]:
fruit_cat.values

[apple, orange, apple, apple, apple, orange, apple, apple]
Categories (2, object): [apple, orange]

In [25]:
c = fruit_cat.values
type(c)

pandas.core.arrays.categorical.Categorical

In [26]:
c.categories

Index(['apple', 'orange'], dtype='object')

In [27]:
c.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

> Notice that the dtype is NumPy’s int8, an 8-bit signed integer that can take on values from -128 to 127. (Only a single byte is needed to represent a value in memory. 64-bit signed ints would be overkill in terms of memory usage.) Our rough-hewn example resulted in int64 data by default, whereas Pandas is smart enough to downcast categorical data to the smallest numerical dtype possible.

In [28]:
my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])

In [29]:
my_categories

[foo, bar, baz, foo, bar]
Categories (3, object): [bar, baz, foo]

## Better performance with categoricals

In [30]:
# Ten million rows, so the memory comparison in the next cells is measured
# on a non-trivial amount of data.
N = 10_000_000

# N samples from the standard normal distribution.
draws = pd.Series(np.random.standard_normal(N))

# The four labels repeated in a fixed cycle, N // 4 times each.
labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))

In [32]:
labels.memory_usage(deep = True)/1024/1024

686.6456298828125

In [33]:
categories = labels.astype('category')

In [34]:
categories.memory_usage(deep = True)/1024/1024

9.53729248046875

## Categorical Methods

In [36]:
s = pd.Series(['a', 'b', 'c', 'd'] * 2)

In [44]:
# Ten colour labels drawn from only five distinct values.
colors = pd.Series([
    'periwinkle', 'mint green', 'burnt orange', 'periwinkle', 'burnt orange',
    'rose', 'rose', 'mint green', 'rose', 'navy',
])

# The same data stored with the category dtype.
ccolors = colors.astype('category')

In [None]:
# Assigning a value that is not one of the existing categories is rejected.
# The exception is the point of this cell, so it is caught and its message
# printed instead of leaving a traceback in the notebook.
# Older pandas releases raise ValueError here while newer ones raise
# TypeError, so both are caught to keep the cell working on either.
try:
    ccolors.iloc[5] = 'a new color'
except (TypeError, ValueError) as e:
    print(e)

## Example: Using The Pandas Category Data Type

### Data Preparation

First, set up imports and read in all the data:

In [56]:
df_raw = pd.read_csv('data/category_example_data.csv')

In [58]:
df_raw.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Columns: 176 entries, Change_Type to Context_of_Research
dtypes: float64(43), int64(3), object(130)
memory usage: 566.0 MB


> [pandas.DataFrame.from_records](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.from_records.html)

In [66]:
# One (column name, number of distinct values) record per column of the raw
# frame loaded from the CSV. This reads `df_raw` explicitly: on a fresh
# top-to-bottom run `df` is still the small fruit example frame at this point.
unique_counts = pd.DataFrame.from_records(
    [(col, df_raw[col].nunique()) for col in df_raw.columns],
    columns=['Column_Name', 'Num_Unique'],
)

In [72]:
cols_to_exclude = ['Program_Year', 'Payment_Publication_Date', 'Date_of_Payment']

In [73]:
# Columns to store as categoricals: fewer than 600 distinct values and not
# explicitly excluded in `cols_to_exclude`.
category_cols = [
    col for col in df_raw.columns
    if col not in cols_to_exclude and df_raw[col].nunique() < 600
]

# Derive `df` from `df_raw` in one step instead of mutating whatever `df`
# happens to be (on a fresh run it is still the fruit example frame).
# `astype` returns a new frame, so `df_raw` stays unchanged for the
# before/after comparison below and this cell can be re-run safely.
df = df_raw.astype({col: 'category' for col in category_cols})

In [79]:
df.memory_usage(deep=True).sum() / (1024*1024)

51.83827590942383

### Performance

Perform the analysis on the original input dataframe.

In [81]:
%%timeit
df_raw.groupby('Covered_Recipient_Type')['Total_Amount_of_Payment_USDollars'].sum().to_frame()

9.91 ms ± 439 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Now, on the dataframe with categorical data:

In [82]:
%%timeit
df.groupby('Covered_Recipient_Type')['Total_Amount_of_Payment_USDollars'].sum().to_frame()

1.91 ms ± 147 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Watch Outs

> The real problem is that programmers have spent far too much time worrying about efficiency in the wrong places and at the wrong times; premature optimization is the root of all evil (or at least most of it) in programming.

### General Guidelines


1. Do not assume you need to convert all categorical data to the pandas category data type.
2. If the data set starts to approach an appreciable percentage of your useable memory, then consider using categorical data types.
3. If you have very significant performance concerns with operations that are executed frequently, look at using categorical data.
4. If you are using categorical data, add some checks to make sure the data is clean and complete before converting to the pandas category type. Additionally, check for NaN values after combining or converting dataframes.
