In [1]:
import numpy as np
import pandas as pd

import datetime
from datetime import datetime, date

# Notebook display settings: plain-text (non-HTML) reprs, at most 8 columns
# and 10 rows shown, and an 80-character wide output.
for option_name, option_value in {
    'display.notebook_repr_html': False,
    'display.max_columns': 8,
    'display.max_rows': 10,
    'display.width': 80,
}.items():
    pd.set_option(option_name, option_value)

import matplotlib.pyplot as plt
%matplotlib inline

# Creating Categoricals

In [2]:
# Build a categorical straight from a plain Python list of labels.
# No categories are given, so pandas infers them from the data and keeps
# them in alphabetical order: high, low, medium.

lmh_values = 'low high medium medium high'.split()
lmh_cat = pd.Categorical(values=lmh_values)
lmh_cat

[low, high, medium, medium, high]
Categories (3, object): [high, low, medium]

In [4]:
# return the categories

lmh_cat.categories
# The categories come back as a pandas Index, so they can be looked up by position.

Index(['high', 'low', 'medium'], dtype='object')

In [6]:
# .codes: shows the integer mapping for each value of the categorical,
# i.e. its numeric encoding.
# Each code is the position of that value's category within .categories.

lmh_cat.codes

array([1, 0, 2, 2, 0], dtype=int8)

In [9]:
# Create the categorical from the same list, but state the categories
# explicitly so they keep this order (low, medium, high) instead of the
# alphabetical order pandas infers on its own.

lmh_cat1 = pd.Categorical(
    values=lmh_values,
    categories=['low', 'medium', 'high'],
)
lmh_cat1

[low, high, medium, medium, high]
Categories (3, object): [low, medium, high]

In [10]:
# With the explicit category order the codes follow low=0, medium=1, high=2.
lmh_cat1.codes

array([0, 2, 1, 1, 2], dtype=int8)

In [11]:
# sorting is done using the codes underlying each value,
# so the result follows the category order (low, medium, high), not the alphabet
lmh_cat1.sort_values()

[low, medium, medium, high, high]
Categories (3, object): [low, medium, high]

In [12]:
# Here the categories were inferred alphabetically, so the values sort as high, low, medium.
lmh_cat.sort_values()

[high, high, low, medium, medium]
Categories (3, object): [high, low, medium]

In [13]:
# create a categorical using a Series and dtype
# (no categories are given, so they are inferred from the values)

cat_series = pd.Series(lmh_values, dtype='category')
cat_series

0       low
1      high
2    medium
3    medium
4      high
dtype: category
Categories (3, object): [high, low, medium]

In [14]:
# create a categorical using .astype()
# (build an ordinary Series of strings first, then convert it to the category dtype)

s = pd.Series(lmh_values)
as_cat = s.astype('category')
as_cat

0       low
1      high
2    medium
3    medium
4      high
dtype: category
Categories (3, object): [high, low, medium]

In [15]:
# On a Series, the categorical-specific properties are reached through the .cat accessor.
cat_series.cat

<pandas.core.arrays.categorical.CategoricalAccessor object at 0x0000025444B0B3D0>

In [16]:
# Ta-da! Through .cat the categories of the Series are available as an Index.
cat_series.cat.categories

Index(['high', 'low', 'medium'], dtype='object')

In [17]:
# The categories attribute cannot be used directly on a Series; it is only
# reachable through the .cat accessor. The AttributeError is the point of this
# cell, so it is caught and printed instead of being left to abort a
# "Restart & Run All" of the notebook.
try:
    cat_series.categories
except AttributeError as err:
    print('AttributeError:', err)

AttributeError: 'Series' object has no attribute 'categories'

In [18]:
# Build a small DataFrame of 5 random integers drawn from 0-99.
# The seed is fixed so the same values come out on every run.

np.random.seed(123456)
values = np.random.randint(low=0, high=100, size=5)
bins = pd.DataFrame(data={'Values': values})
bins

   Values
0      65
1      49
2      56
3      43
4      43

In [20]:
# cut the values into ten bins of width 10
# cut(): assigns each value to the interval (bin) it falls in; the bin edges
# 0, 10, ..., 100 fix the order of the resulting categories.
# Bins are right-closed, so a value of exactly 0 would fall outside (0, 10] and become NaN.
bins['Group'] = pd.cut(values, range(0, 101, 10))
bins

   Values     Group
0      65  (60, 70]
1      49  (40, 50]
2      56  (50, 60]
3      43  (40, 50]
4      43  (40, 50]

In [21]:
# examine the categorical that was created
# (the Group column holds ordered interval categories, one per bin)
bins.Group

0    (60, 70]
1    (40, 50]
2    (50, 60]
3    (40, 50]
4    (40, 50]
Name: Group, dtype: category
Categories (10, interval[int64]): [(0, 10] < (10, 20] < (20, 30] < (30, 40] ... (60, 70] < (70, 80] < (80, 90] < (90, 100]]

In [22]:
# Ordered categorical of precious metals.
# ordered=True turns the category list into a ranking (bronze < silver < gold),
# which is what allows the values to be compared with one another.

metal_values = ['bronze', 'gold', 'silver', 'bronze']
metal_categories = ['bronze', 'silver', 'gold']
metals = pd.Categorical(
    values=metal_values,
    categories=metal_categories,
    ordered=True,
)
metals

[bronze, gold, silver, bronze]
Categories (3, object): [bronze < silver < gold]

In [25]:
# A second ordered categorical with the same ranking but with silver and gold
# swapped in the values, so the underlying category codes differ.
metal_values1 = ['bronze', 'silver', 'gold', 'bronze']
metal_categories1 = ['bronze', 'silver', 'gold']
metals_r = pd.Categorical(
    values=metal_values1,
    categories=metal_categories1,
    ordered=True,
)
metals_r

[bronze, silver, gold, bronze]
Categories (3, object): [bronze < silver < gold]

In [26]:
# Element-wise comparison of the two ordered categoricals, using the ranking
# bronze < silver < gold (e.g. gold <= silver is False).
metals <= metals_r

array([ True, False,  True,  True])

In [27]:
# Integer codes behind metals (bronze=0, silver=1, gold=2).
metals.codes

array([0, 2, 1, 0], dtype=int8)

In [28]:
# Integer codes behind metals_r; same mapping, different value order.
metals_r.codes

array([0, 1, 2, 0], dtype=int8)

In [30]:
# creating a categorical with a non existent category
# A value that is not one of the given categories ('cooper' here) is stored as NaN.

pd.Categorical(['bronze','cooper'],
              categories=metal_categories)

[bronze, NaN]
Categories (3, object): [bronze, silver, gold]

# Renaming Categories

In [31]:
# Start from a categorical whose three categories are a, b and c.

cat = pd.Categorical(values=list('abca'), categories=list('abc'))
cat

[a, b, c, a]
Categories (3, object): [a, b, c]

In [32]:
# Rename the categories (and therefore the values that refer to them).
# Assigning to `cat.categories` directly was deprecated in pandas 1.5 and is
# no longer allowed in pandas 2.x. rename_categories() returns the renamed
# categorical, which is bound back to `cat`.

cat = cat.rename_categories(['bronze','silver','gold'])
cat

[bronze, silver, gold, bronze]
Categories (3, object): [bronze, silver, gold]

In [33]:
# cat now carries the new category names.
cat

[bronze, silver, gold, bronze]
Categories (3, object): [bronze, silver, gold]

In [34]:
# this also renames
# (rename_categories() returns a new categorical with the new names)

cat.rename_categories(['x','y','z'])

[x, y, z, x]
Categories (3, object): [x, y, z]

In [36]:
# The rename was not applied in place: cat still has its previous categories.
cat

[bronze, silver, gold, bronze]
Categories (3, object): [bronze, silver, gold]

In [37]:
# Make the rename stick on `cat`.
# The `inplace=True` argument of rename_categories() was deprecated in
# pandas 1.3 and removed in pandas 2.0; rebinding the returned categorical
# has the same effect on `cat` and works on every version.
cat = cat.rename_categories(['x','y','z'])

In [38]:
# cat now shows the renamed categories x, y, z.
cat

[x, y, z, x]
Categories (3, object): [x, y, z]

# Appending new categories

In [39]:
# add a new platinum category
# (it is appended at the end, so it ranks above gold; no value uses it yet)

with_platinum = metals.add_categories(['platinum'])
with_platinum

[bronze, gold, silver, bronze]
Categories (4, object): [bronze < silver < gold < platinum]

# Removing Categories

In [40]:
# remove bronze category
# (values that were bronze become NaN in the returned categorical)

no_bronze = metals.remove_categories(['bronze'])
no_bronze

[NaN, gold, silver, NaN]
Categories (2, object): [silver < gold]

In [41]:
# metals itself still contains the bronze values and category.
metals

[bronze, gold, silver, bronze]
Categories (3, object): [bronze < silver < gold]

In [48]:
# remove_categories() returns a new categorical; the original metals is not affected.
metals.remove_categories(['bronze'])

[NaN, gold, silver, NaN]
Categories (2, object): [silver < gold]

In [47]:
# The category list of metals is unchanged: bronze is still present.
metals.categories

Index(['bronze', 'silver', 'gold'], dtype='object')

# Removing unused categories

In [49]:
# remove any unused categories
# ('platinum' was added to with_platinum but no value uses it, so it is dropped)

with_platinum.remove_unused_categories()

[bronze, gold, silver, bronze]
Categories (3, object): [bronze < silver < gold]

# Setting categories

In [50]:
# Sample categorical Series; its categories are inferred from the values.

s = pd.Series(data=['one', 'two', 'four', 'five'], dtype='category')
s

0     one
1     two
2    four
3    five
dtype: category
Categories (4, object): [five, four, one, two]

In [51]:
# Keep only the categories 'one' and 'four'.
# Values whose category is dropped ('two' and 'five') become NaN.

s = s.cat.set_categories(['one','four'])
s

0     one
1     NaN
2    four
3     NaN
dtype: category
Categories (2, object): [one, four]

# Describe

In [52]:
# Per-category summary of metals: occurrence counts and relative frequencies.
metals.describe()

            counts  freqs
categories               
bronze           2   0.50
silver           1   0.25
gold             1   0.25

In [53]:
# Summary of the categorical Series (count, unique, top, freq); the NaN entries are not counted.
s.describe()

count        2
unique       2
top       four
freq         1
dtype: object

# Value counts

In [54]:
# Number of occurrences of each category in metals.
metals.value_counts()

bronze    2
silver    1
gold      1
dtype: int64

In [56]:
# Number of occurrences of each remaining category in s; NaN entries are left out.
s.value_counts()

four    1
one     1
dtype: int64

# Minimum, Maximum and mode

In [57]:
# Minimum, maximum and mode of the ordered categorical.
# min() and max() use the category ranking (bronze < silver < gold), not the alphabet.
# Categorical.mode() was deprecated in pandas 1.5 and is no longer available in
# pandas 2.x, so the mode is computed through a Series; .array turns the result
# back into a Categorical, which keeps the displayed value the same as before.
(metals.min(), metals.max(), pd.Series(metals).mode().array)

('bronze',
 'gold',
 [bronze]
 Categories (3, object): [bronze < silver < gold])

# Munging school grades

In [59]:
# Ten students, each given a random integer grade from 50 to 100 inclusive.
# The seed is fixed so the same grades are drawn on every run.
np.random.seed(123456)
names = [
    'Ivana', 'Norris', 'Ruth', 'Lane', 'Skye',
    'Sol', 'Dylan', 'Katina', 'Alissa', 'Marc',
]
grades = np.random.randint(low=50, high=101, size=len(names))
scores = pd.DataFrame(data={'Name': names, 'Grade': grades})
scores

     Name  Grade
0   Ivana     51
1  Norris     92
2    Ruth    100
3    Lane     99
4    Skye     93
5     Sol     97
6   Dylan     93
7  Katina     77
8  Alissa     82
9    Marc     73

In [60]:
# Letter-grade scale: each pair is (highest score in the band, letter awarded).
# The bin edges and the labels are both derived from this one table, with 0
# as the lower edge of the first band, so the two lists cannot drift apart.
grade_bands = [
    (59, 'F'),
    (62, 'D-'), (66, 'D'), (69, 'D+'),
    (72, 'C-'), (76, 'C'), (79, 'C+'),
    (82, 'B-'), (86, 'B'), (89, 'B+'),
    (92, 'A-'), (99, 'A'), (100, 'A+'),
]
score_bins = [0] + [band[0] for band in grade_bands]
letter_grades = [band[1] for band in grade_bands]

In [61]:
# Map each numeric grade to its letter grade and store it as a new column.
# Bins are right-closed, e.g. a grade in (89, 92] becomes 'A-'.
# include_lowest=True also closes the first bin on the left, so a grade of
# exactly 0 becomes 'F' instead of falling outside every bin and turning into NaN.
letter_cats = pd.cut(scores.Grade, score_bins, labels=letter_grades,
                     include_lowest=True)
scores['Letter'] = letter_cats
scores

     Name  Grade Letter
0   Ivana     51      F
1  Norris     92     A-
2    Ruth    100     A+
3    Lane     99      A
4    Skye     93      A
5     Sol     97      A
6   Dylan     93      A
7  Katina     77     C+
8  Alissa     82     B-
9    Marc     73      C

In [62]:
# The result of cut() is an ordered categorical Series (F < D- < ... < A+).
letter_cats

0     F
1    A-
2    A+
3     A
4     A
5     A
6     A
7    C+
8    B-
9     C
Name: Grade, dtype: category
Categories (13, object): [F < D- < D < D+ ... B+ < A- < A < A+]

In [63]:
# Students per letter grade; grades that nobody received still appear, with a count of 0.
letter_cats.value_counts()

A     4
A+    1
A-    1
B-    1
C+    1
     ..
B     0
C-    0
D+    0
D     0
D-    0
Name: Grade, Length: 13, dtype: int64

In [64]:
# Same counts, taken from the Letter column of the DataFrame.
scores.Letter.value_counts()

A     4
A+    1
A-    1
B-    1
C+    1
     ..
B     0
C-    0
D+    0
D     0
D-    0
Name: Letter, Length: 13, dtype: int64

In [65]:
# Sort from best to worst letter grade; the order follows the ordered
# categories (F < ... < A+), not the alphabet, so A+ comes first and F last.
scores.sort_values(by=['Letter'], ascending=False)

     Name  Grade Letter
2    Ruth    100     A+
3    Lane     99      A
4    Skye     93      A
5     Sol     97      A
6   Dylan     93      A
1  Norris     92     A-
8  Alissa     82     B-
7  Katina     77     C+
9    Marc     73      C
0   Ivana     51      F