In [19]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk("/kaggle/input"):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<IPython.core.display.Javascript object>

From the book Python for Data Analysis by Wes McKinney

# Categorical Data

In [20]:
!pip install nb_black
# %load_ext nb_black
%load_ext lab_black

[0mThe nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [21]:
# Eight fruit labels: a four-item pattern repeated twice.
fruit_pattern = ["apple", "orange", "apple", "apple"]
values = pd.Series(fruit_pattern + fruit_pattern)

<IPython.core.display.Javascript object>

In [22]:
# Display the Series of fruit labels.
values

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

<IPython.core.display.Javascript object>

In [23]:
# Distinct labels present in `values`.
pd.unique(values)

array(['apple', 'orange'], dtype=object)

<IPython.core.display.Javascript object>

In [24]:
# Frequency of each distinct label. The Series method is used because the
# top-level pd.value_counts() function is deprecated in recent pandas releases.
values.value_counts()

apple     6
orange    2
dtype: int64

<IPython.core.display.Javascript object>

In [25]:
# Integer codes that point into `dim`, the table of distinct labels.
# Note: this rebinds `values` from the string Series to a Series of integer codes.
values = pd.Series([0, 1, 0, 0, 0, 1, 0, 0])
dim = pd.Series(["apple", "orange"])
# Look each code up by position to recover the original labels.
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

<IPython.core.display.Javascript object>

`take()` allows us to represent data as numbers. 

> This representation as integers is called the **categorical** or <mark>**dictionary-encoded** representation.</mark>
> - The array of distinct values can be called the *categories*, *dictionary*, or *levels* of the data.
> - The integer values that reference the categories are called the *category codes* or simply *codes*.

You use it basically to <mark>decode integer category codes back into their labels</mark> (this reverses a label/integer encoding, not a one-hot encoding).

In [26]:
# Ten random integers drawn from [0, 1000), wrapped in an Index.
# No seed is set here, so the values differ from run to run.
random_labels = np.random.randint(low=0, high=1000, size=10)
index = pd.Index(random_labels)
positions = [0, 9, 3]
# Plain positional indexing with a list of integer positions.
index[positions]

Int64Index([434, 190, 556], dtype='int64')

<IPython.core.display.Javascript object>

In [27]:
# Same positional selection as `index[positions]`, expressed with `take`.
index.take(positions)

Int64Index([434, 190, 556], dtype='int64')

<IPython.core.display.Javascript object>

In [28]:
# Ten standard-normal draws in a Series with the default integer index.
noise = np.random.randn(10)
ser = pd.Series(noise)
# Positional selection through the .iloc indexer.
ser.iloc[positions]

0   -0.975574
9   -1.821779
3    1.342212
dtype: float64

<IPython.core.display.Javascript object>

In [29]:
# Same positional selection as `ser.iloc[positions]`, expressed with `take`.
ser.take(positions)

0   -0.975574
9   -1.821779
3    1.342212
dtype: float64

<IPython.core.display.Javascript object>

In [30]:
# 5x3 frame of standard-normal draws with default integer row and column labels.
frm = pd.DataFrame(data=np.random.randn(5, 3))
row_positions = [1, 4, 3]
# Select rows by position, in the order given.
frm.take(row_positions)

Unnamed: 0,0,1,2
1,-0.590949,0.080178,-0.46946
4,-0.226309,-0.074113,-0.906428
3,-2.110028,-0.049818,1.185263


<IPython.core.display.Javascript object>

> `take` uses `axis=0` (rows) by default.

In [31]:
# Select the first and third columns by position (axis=1 addresses columns).
frm.take(indices=[0, 2], axis=1)

Unnamed: 0,0,2
0,0.271451,1.417847
1,-0.590949,-0.46946
2,0.47917,1.553735
3,-2.110028,1.185263
4,-0.226309,-0.906428


<IPython.core.display.Javascript object>

> The `take` method on pandas objects is not intended to work on boolean indices.

> Because the `take` method handles a narrower range of inputs, it can offer performance that is a good deal <mark>faster than fancy indexing.</mark>

In [32]:
# Benchmark fixtures: a 10000x5 array of normal draws, a Series built from its
# first column, and an indexer that selects every row in order.
n_rows = 10000
arr = np.random.randn(n_rows, 5)
first_column = arr[:, 0]
ser = pd.Series(first_column)
indexer = np.arange(n_rows)

# Time fancy indexing against `take` on the same NumPy array; both select rows.
%timeit arr[indexer]
%timeit arr.take(indexer, axis=0)

240 µs ± 909 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
57 µs ± 465 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


<IPython.core.display.Javascript object>

In [33]:
# Same comparison on the pandas Series: `.iloc` with an integer array vs `take`.
%timeit ser.iloc[indexer]
%timeit ser.take(indexer)

206 µs ± 2.87 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
190 µs ± 2.88 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


<IPython.core.display.Javascript object>

## Categorical Type

Pandas has a special *Categorical* type for holding data that uses the integer-based categorical representation or encoding.

In [88]:
# Toy basket data: one row per basket with a fruit label, a random integer count
# in [3, 15) and a random weight in [0, 4).
fruits = ["apple", "orange", "apple", "apple"] * 2
N = len(fruits)
basket_ids = np.arange(N)
counts = np.random.randint(3, 15, size=N)
weights = np.random.uniform(0, 4, size=N)
# The dict is written in the desired column order, so no `columns=` reordering is needed.
df = pd.DataFrame(
    {
        "basket_id": basket_ids,
        "fruit": fruits,
        "count": counts,
        "weight": weights,
    }
)
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,4,2.674358
1,1,orange,8,1.8572
2,2,apple,14,2.161995
3,3,apple,3,1.276314
4,4,apple,7,2.957047
5,5,orange,11,0.626831
6,6,apple,8,3.124508
7,7,apple,13,2.866171


<IPython.core.display.Javascript object>

In [36]:
# Convert the string column to the categorical dtype.
fruit_column = df["fruit"]
fruit_cat = fruit_column.astype("category")
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

<IPython.core.display.Javascript object>

In [37]:
# Compare the container type with the type of the underlying values.
series_type = type(fruit_cat)
values_type = type(fruit_cat.values)
series_type, values_type

(pandas.core.series.Series, pandas.core.arrays.categorical.Categorical)

<IPython.core.display.Javascript object>

`fruit_cat` itself is a `pandas.Series`, but its `.values` are not a plain NumPy array: they are an instance of `pandas.Categorical`!!!

In [38]:
# Grab the underlying Categorical and show its two parts:
# the distinct categories and the integer code assigned to each element.
c = fruit_cat.values
c.categories, c.codes

(Index(['apple', 'orange'], dtype='object'),
 array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8))

<IPython.core.display.Javascript object>

In [39]:
# Display the Categorical that backs the Series.
fruit_cat.values

['apple', 'orange', 'apple', 'apple', 'apple', 'orange', 'apple', 'apple']
Categories (2, object): ['apple', 'orange']

<IPython.core.display.Javascript object>

In [40]:
# Categories and codes accessed directly, without the intermediate variable.
fruit_cat.values.categories, fruit_cat.values.codes

(Index(['apple', 'orange'], dtype='object'),
 array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8))

<IPython.core.display.Javascript object>

In [89]:
# Materialise the values, the categories and the codes as plain Python lists.
fruit_values = fruit_cat.values
tuple(
    list(part)
    for part in (fruit_values, fruit_values.categories, fruit_values.codes)
)

(['apple', 'orange', 'apple', 'apple', 'apple', 'orange', 'apple', 'apple'],
 ['apple', 'orange'],
 [0, 1, 0, 0, 0, 1, 0, 0])

<IPython.core.display.Javascript object>

A `Categorical` is NOT a dict: passing it to `dict()` results in an error.

In [42]:
# A Categorical cannot be converted with dict(); uncommenting the line below
# results in an error.
# dict(fruit_cat.values)

<IPython.core.display.Javascript object>

In [90]:
# Build a Categorical directly from a list of labels; the categories are inferred.
raw_labels = ["foo", "bar", "baz", "foo", "bar"]
my_categories = pd.Categorical(raw_labels)
type(my_categories)

pandas.core.arrays.categorical.Categorical

<IPython.core.display.Javascript object>

In [44]:
# Inferred categories and the integer code of each element.
my_categories.categories, my_categories.codes

(Index(['bar', 'baz', 'foo'], dtype='object'),
 array([2, 0, 1, 2, 0], dtype=int8))

<IPython.core.display.Javascript object>

In [45]:
# Left commented out because it raises AttributeError: `codes` is a plain NumPy
# array, which has no `.values` attribute.
# my_categories.categories.values, my_categories.codes.values

AttributeError: 'numpy.ndarray' object has no attribute 'values'

<IPython.core.display.Javascript object>

In [91]:
# Build a Categorical from pre-computed integer codes plus the category table
# they index into.
categories = ["foo", "bar", "baz"]
codes = [0, 1, 2, 0, 0, 1]
my_cats_2 = pd.Categorical.from_codes(codes=codes, categories=categories)
my_cats_2

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo', 'bar', 'baz']

<IPython.core.display.Javascript object>

In [47]:
# Seed the global NumPy generator so the binning examples are reproducible.
np.random.seed(12345)
n_draws = 1000
draws = np.random.randn(n_draws)
# The same draws wrapped in a Series, for the Series-based examples.
sr = pd.Series(data=draws)
draws[:5]

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057])

<IPython.core.display.Javascript object>

`pd.cut()` splits the values into equal-width bins and returns the bin assignments as a `Categorical`.

In [61]:
# Split the draws into four equal-width bins; each value is labelled with its interval.
cutting = pd.cut(draws, bins=4)
cutting

[(-1.23, 0.489], (-1.23, 0.489], (-1.23, 0.489], (-1.23, 0.489], (0.489, 2.208], ..., (-1.23, 0.489], (-1.23, 0.489], (-1.23, 0.489], (0.489, 2.208], (0.489, 2.208]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.956, -1.23] < (-1.23, 0.489] < (0.489, 2.208] < (2.208, 3.928]]

<IPython.core.display.Javascript object>

In [62]:
# Same four equal-width bins, now with explicit names instead of interval labels.
# The names are attached to equal-width bins here, not to equal-sized groups.
bin_names = ["Q1", "Q2", "Q3", "Q4"]
cutting = pd.cut(draws, bins=4, labels=bin_names)
cutting

['Q2', 'Q2', 'Q2', 'Q2', 'Q3', ..., 'Q2', 'Q2', 'Q2', 'Q3', 'Q3']
Length: 1000
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

<IPython.core.display.Javascript object>

In [55]:
# Cutting a Series instead of an array returns a categorical Series.
cutting2 = pd.cut(sr, bins=4)
cutting2

0      (-1.23, 0.489]
1      (-1.23, 0.489]
2      (-1.23, 0.489]
3      (-1.23, 0.489]
4      (0.489, 2.208]
            ...      
995    (-1.23, 0.489]
996    (-1.23, 0.489]
997    (-1.23, 0.489]
998    (0.489, 2.208]
999    (0.489, 2.208]
Length: 1000, dtype: category
Categories (4, interval[float64, right]): [(-2.956, -1.23] < (-1.23, 0.489] < (0.489, 2.208] < (2.208, 3.928]]

<IPython.core.display.Javascript object>

In [63]:
# Series input with named equal-width bins.
bin_names = ["Q1", "Q2", "Q3", "Q4"]
cutting2 = pd.cut(sr, bins=4, labels=bin_names)
cutting2

0      Q2
1      Q2
2      Q2
3      Q2
4      Q3
       ..
995    Q2
996    Q2
997    Q2
998    Q3
999    Q3
Length: 1000, dtype: category
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

<IPython.core.display.Javascript object>

In [56]:
# Quantile-based binning: four bins chosen from the sample quantiles.
bins = pd.qcut(draws, q=4)
bins

[(-0.684, -0.0101], (-0.0101, 0.63], (-0.684, -0.0101], (-0.684, -0.0101], (0.63, 3.928], ..., (-0.0101, 0.63], (-0.684, -0.0101], (-2.9499999999999997, -0.684], (-0.0101, 0.63], (0.63, 3.928]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.9499999999999997, -0.684] < (-0.684, -0.0101] < (-0.0101, 0.63] < (0.63, 3.928]]

<IPython.core.display.Javascript object>

In [57]:
# Quartile bins with explicit names instead of interval labels.
quartile_names = ["Q1", "Q2", "Q3", "Q4"]
bins = pd.qcut(draws, q=4, labels=quartile_names)
bins

['Q2', 'Q3', 'Q2', 'Q2', 'Q4', ..., 'Q3', 'Q2', 'Q1', 'Q3', 'Q4']
Length: 1000
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

<IPython.core.display.Javascript object>

In [64]:
# Integer codes of the first ten bin assignments (requires `bins` to still be a
# Categorical, i.e. before it is wrapped in a Series further down).
bins.codes[:10]

array([1, 2, 1, 1, 3, 3, 2, 2, 3, 3], dtype=int8)

<IPython.core.display.Javascript object>

In [58]:
# Quantile binning of the Series version of the draws; returns a categorical Series.
bins2 = pd.qcut(sr, q=4)
bins2

0                  (-0.684, -0.0101]
1                    (-0.0101, 0.63]
2                  (-0.684, -0.0101]
3                  (-0.684, -0.0101]
4                      (0.63, 3.928]
                   ...              
995                  (-0.0101, 0.63]
996                (-0.684, -0.0101]
997    (-2.9499999999999997, -0.684]
998                  (-0.0101, 0.63]
999                    (0.63, 3.928]
Length: 1000, dtype: category
Categories (4, interval[float64, right]): [(-2.9499999999999997, -0.684] < (-0.684, -0.0101] < (-0.0101, 0.63] < (0.63, 3.928]]

<IPython.core.display.Javascript object>

In [92]:
# Series input with named quartile bins.
quartile_names = ["Q1", "Q2", "Q3", "Q4"]
bins2 = pd.qcut(sr, q=4, labels=quartile_names)
bins2

0      Q2
1      Q3
2      Q2
3      Q2
4      Q4
       ..
995    Q3
996    Q2
997    Q1
998    Q3
999    Q4
Length: 1000, dtype: category
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

<IPython.core.display.Javascript object>

In [74]:
# Wrap the quartile labels in a named Series so the group key comes back as a
# "quartile" column after reset_index(). This rebinds `bins` from a Categorical
# to a Series.
bins = pd.Series(bins, name="quartile")
# observed=False is spelled out so that every category is kept in the result and
# the code does not rely on the implicit default for categorical group keys,
# which newer pandas releases deprecate.
results = (
    pd.Series(draws)
    .groupby(bins, observed=False)
    .agg(["count", "max", "min"])
    .reset_index()
)
results

Unnamed: 0,quartile,count,max,min
0,Q1,250,-0.685484,-2.949343
1,Q2,250,-0.010115,-0.683066
2,Q3,250,0.628894,-0.010032
3,Q4,250,3.927528,0.634238


<IPython.core.display.Javascript object>

In [78]:
# The group-key column keeps its ordered categorical dtype after the aggregation.
results['quartile'].dtype

CategoricalDtype(categories=['Q1', 'Q2', 'Q3', 'Q4'], ordered=True)

<IPython.core.display.Javascript object>

## Performance Benefits!!!

Analytics on categorical datasets can be <mark>much faster and use lower memory</mark>!

In [93]:
N = 10_000_000
# Ten million normally distributed random numbers.
# Note: this rebinds `draws` (to a Series) and `categories` (to a categorical Series).
draws = pd.Series(np.random.randn(N))
label_cycle = ["foo", "bar", "baz", "qux"]
labels = pd.Series(label_cycle * (N // 4))
categories = labels.astype("category")
# memory_usage() is called without deep=True, so this is the shallow measurement.
object_bytes = labels.memory_usage()
category_bytes = categories.memory_usage()
object_bytes / category_bytes

7.999747208392681

<IPython.core.display.Javascript object>

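> Categories use about 8x less memory in this case! (This is the default, shallow `memory_usage()` measurement; it does not count the string contents themselves, so the real saving is likely even larger.)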

# Categorical Methods

Special methods for categorical data

In [81]:
# Small categorical Series used to demonstrate the `.cat` accessor.
letters = ["a", "b", "c", "d"]
s = pd.Series(letters * 2)
cat_s = s.astype("category")
cat_s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

<IPython.core.display.Javascript object>

The special attribute `cat` provides access to these methods

In [82]:
# Codes through the `.cat` accessor: returned as a Series aligned with `cat_s`.
cat_s.cat.codes

0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8

<IPython.core.display.Javascript object>

In [85]:
# Codes through the underlying Categorical: returned as a NumPy array.
cat_s.values.codes

array([0, 1, 2, 3, 0, 1, 2, 3], dtype=int8)

<IPython.core.display.Javascript object>

> `.values.codes` returns a NumPy array, whereas the `.cat.codes` accessor returns another pandas Series.

In [86]:
# The distinct categories, exposed through the `.cat` accessor as an Index.
cat_s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

<IPython.core.display.Javascript object>