In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

From the book Python for Data Analysis by Wes McKinney

# Categorical Data

In [3]:
!pip install nb_black
%load_ext nb_black

Collecting nb_black
  Downloading nb_black-1.0.7.tar.gz (4.8 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: nb_black
  Building wheel for nb_black (setup.py) ... [?25ldone
[?25h  Created wheel for nb_black: filename=nb_black-1.0.7-py3-none-any.whl size=5297 sha256=711ad917cda9a0d75515b20ebcd9747a10feb7adb542fa071e63bf86629dd4d1
  Stored in directory: /root/.cache/pip/wheels/1e/b2/88/51c66d23ea5fd0d40ed50997555e15d981d92671376a9a412a
Successfully built nb_black
Installing collected packages: nb_black
Successfully installed nb_black-1.0.7
[0m

<IPython.core.display.Javascript object>

In [4]:
values = pd.Series(['apple', 'orange', 'apple', 'apple'] * 2)

<IPython.core.display.Javascript object>

In [5]:
values

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

<IPython.core.display.Javascript object>

In [6]:
pd.unique(values)

array(['apple', 'orange'], dtype=object)

<IPython.core.display.Javascript object>

In [7]:
pd.value_counts(values)

apple     6
orange    2
dtype: int64

<IPython.core.display.Javascript object>

In [8]:
values = pd.Series([0, 1, 0, 0] * 2)
dim = pd.Series(['apple', 'orange'])
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

<IPython.core.display.Javascript object>

`take()` allows us to represent data as numbers. 

> This representation as integers is called the **categorical** or <mark>**dictionary-encoded** representation.</mark>
> - The array of distinct values can be called the *categories*, *dictionary*, or *levels* of the data.
> - The integer values that reference the categories are called the *category codes* or simply *codes*.

You use it basically to <mark>reverse the OneHotEncoding!</mark>

In [9]:
index = pd.Index(np.random.randint(0, 1000, 10))
positions = [0, 9, 3]
index[positions]

Int64Index([446, 109, 487], dtype='int64')

<IPython.core.display.Javascript object>

In [10]:
index.take(positions)

Int64Index([446, 109, 487], dtype='int64')

<IPython.core.display.Javascript object>

In [11]:
ser = pd.Series(np.random.randn(10))
ser.iloc[positions]

0   -1.006410
9    0.529015
3    0.074626
dtype: float64

<IPython.core.display.Javascript object>

In [12]:
ser.take(positions)

0   -1.006410
9    0.529015
3    0.074626
dtype: float64

<IPython.core.display.Javascript object>

In [13]:
frm = pd.DataFrame(np.random.randn(5, 3))
frm.take([1, 4, 3])

Unnamed: 0,0,1,2
1,0.63314,-1.039391,1.053186
4,0.297873,0.385345,0.336166
3,1.212665,0.057515,-0.390696


<IPython.core.display.Javascript object>

> Takes axis = 0 by default

In [14]:
frm.take([0, 2], axis=1)

Unnamed: 0,0,2
0,-0.769637,-0.093117
1,0.63314,1.053186
2,0.676914,-0.145641
3,1.212665,-0.390696
4,0.297873,0.336166


<IPython.core.display.Javascript object>

 > take method on pandas objects are not intended to work on boolean indices

> because the take method handles a narrower range of inputs, it can offer performance that is a good deal <mark>faster than fancy indexing.</mark>

In [15]:
arr = np.random.randn(10000, 5)
ser = pd.Series(arr[:, 0])
indexer = np.arange(10000)

%timeit arr[indexer]
%timeit arr.take(indexer, axis=0)

242 µs ± 1.06 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
56.7 µs ± 1.04 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


<IPython.core.display.Javascript object>

In [16]:
%timeit ser.iloc[indexer]
%timeit ser.take(indexer)

200 µs ± 2.78 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
188 µs ± 2.21 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


<IPython.core.display.Javascript object>

## Categorical Type

Pandas has a special *Categorical* type for holding data that uses the integer-based categorical representation or encoding.

In [17]:
fruits = ['apple', 'orange', 'apple', 'apple'] * 2
N = len(fruits)
df = pd.DataFrame({'fruit': fruits,
                    'basket_id': np.arange(N),
                    'count': np.random.randint(3, 15, size=N),
                    'weight': np.random.uniform(0, 4, size=N)},
                    columns=['basket_id', 'fruit', 'count', 'weight'])
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,4,0.989799
1,1,orange,3,3.342612
2,2,apple,8,1.15035
3,3,apple,13,1.453112
4,4,apple,5,2.381374
5,5,orange,13,2.305634
6,6,apple,4,3.037967
7,7,apple,4,1.071959


<IPython.core.display.Javascript object>

In [18]:
type(fruit_cat), type(fruit_cat.values)

NameError: name 'fruit_cat' is not defined

<IPython.core.display.Javascript object>

In [None]:
fruit_cat = df['fruit'].astype('category')
fruit_cat

The values for `fruit_cat` are not a `pandas.Series`, but an instance of `pandas.Categorical`!!!

In [None]:
c = fruit_cat.values
c.categories, c.codes

In [None]:
fruit_cat.values

In [None]:
fruit_cat.values.categories, fruit_cat.values.codes

In [None]:
list(fruit_cat.values), list(fruit_cat.values.categories), list(fruit_cat.values.codes)

NOT a dict 

In [None]:
# Results in an error
# dict(fruit_cat.values)

In [None]:
my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])
type(my_categories)

In [None]:
my_categories.categories, my_categories.codes

In [None]:
my_categories.categories.values, my_categories.codes.values

In [None]:
categories = ['foo', 'bar', 'baz']

codes = [0, 1, 2, 0, 0, 1]

my_cats_2 = pd.Categorical.from_codes(codes, categories)

my_cats_2