# using categoricals

In [1]:
# No import is visible in this notebook, so bring pandas into scope here;
# re-importing is harmless if it is already loaded.
import pandas as pd

# Use the fully qualified option name. The abbreviated 'max_rows' is treated
# as a pattern and is ambiguous on pandas versions that also define
# 'display.min_rows'.
pd.set_option('display.max_rows', 8)

# in memory

In [2]:
# Build two frames holding the same 500,000 strings: one as plain Python
# objects (df_object) and one as a categorical column (df_cat).
N_REPEATS = 100000
base_values = ['a', 'foo', 'bar', 'a really long string', 'baz']

# Repeat the five values up front rather than concatenating 100,000 five-row
# frames: the rows, their order and the 0..N-1 index come out the same, but
# the per-frame concat overhead is avoided.
df_object = pd.DataFrame({'B': base_values * N_REPEATS})

# The categorical copy is derived from the object frame, so both frames are
# guaranteed to contain identical values.
df_cat = df_object.copy()
df_cat['B'] = df_cat['B'].astype('category')
df_object

Unnamed: 0,B
0,a
1,foo
2,bar
3,a really long string
...,...
499996,foo
499997,bar
499998,a really long string
499999,baz


In [3]:
# Summary of the object-dtype frame: entry count, column dtype and the
# memory figure pandas itself reports.
df_object.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 0 to 499999
Data columns (total 1 columns):
B    500000 non-null object
dtypes: object(1)
memory usage: 7.6+ MB


In [4]:
def as_mb(v):
    """Format a byte count as a string in megabytes, e.g. ``'3.8 MB'``.

    ``v`` is divided by 1024 * 1024 and rendered with one decimal place.
    """
    bytes_per_mb = 1024.0 * 1024
    megabytes = v / bytes_per_mb
    return "%.1f MB" % megabytes

In [5]:
# The figure pandas itself reports for column B
# (3.8 MB for 500,000 rows, i.e. 8 bytes per row).
as_mb(df_object.memory_usage()['B'])

'3.8 MB'

In [6]:
# Footprint of the string objects themselves: add up the size Python
# reports for every element of the column.
import sys
as_mb(sum(sys.getsizeof(s) for s in df_object['B'].values))

'20.5 MB'

In [7]:
# Combined cost: the string objects plus what pandas reports for the column.
as_mb(sum(sys.getsizeof(s) for s in df_object['B'].values)
      + df_object.memory_usage()['B'])

'24.3 MB'

In [8]:
# Per-object size of the first three strings in the column.
# A single pre-formatted argument keeps print() valid, with identical
# output, on both Python 2 and Python 3.
for i in range(3):
    print("%d %d" % (i, sys.getsizeof(df_object['B'].values[i])))

0 38
1 40
2 40


In [9]:
# approx fixed-len string storage: cast the column to a NumPy string array
# and report the size of that array.
# NOTE(review): the 9.5 MB shown works out to 20 bytes per row (the longest
# value has 20 characters); presumably this was run on Python 2, where `str`
# is a byte string -- on Python 3 the figure would likely be larger. Confirm
# when re-running.
as_mb(df_object['B'].values.astype(str).nbytes)

'9.5 MB'

In [10]:
# Same pandas-reported measurement, now for the categorical column.
as_mb(df_cat.memory_usage()['B'])

'0.5 MB'

In [11]:
# Size in bytes of the categories index (the five distinct labels).
df_cat['B'].cat.categories.nbytes

40

In [12]:
# The distinct labels backing the categorical column.
df_cat['B'].cat.categories

Index([u'a', u'a really long string', u'bar', u'baz', u'foo'], dtype='object')

In [13]:
# Integer code stored for each row; each code is a position in the
# categories index.
df_cat['B'].cat.codes

0         0
1         4
2         2
3         1
         ..
499996    4
499997    2
499998    1
499999    3
dtype: int8

# on disk

In [14]:
df_object.to_hdf('data/test_object.h5','df',mode='w',data_columns=True,format='table')
df_cat.to_hdf('data/test_cat.h5','df',mode='w',data_columns=True,format='table')
!ls -ltr data/*.h5


-rw-rw-r--  1 jreback  staff  14740709 Sep 29 06:42 data/test_object.h5
-rw-rw-r--  1 jreback  staff   5281722 Sep 29 06:42 data/test_cat.h5


In [15]:
# Inspect the layout of the categorical store.
# pd.get_store is deprecated (and removed in newer pandas); HDFStore is
# itself a context manager. Open read-only because the file is only inspected.
# NOTE(review): on recent pandas the per-node listing may come from
# store.info() rather than the repr -- confirm for the version in use.
with pd.HDFStore('data/test_cat.h5', mode='r') as store:
    print(store)

<class 'pandas.io.pytables.HDFStore'>
File path: data/test_cat.h5
/df                        frame_table  (typ->appendable,nrows->500000,ncols->1,indexers->[index],dc->[B])
/df/meta/B/meta            series_table (typ->appendable,nrows->5,ncols->1,indexers->[index],dc->[values])
