In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression statement in a cell, not only
# the last one, so cells ending in e.g. `r1; r2` show both results.
InteractiveShell.ast_node_interactivity = "all"

# Render several DataFrames side by side (horizontally) using HTML.
class disp(object):
    """Show several named objects next to each other in a notebook output.

    Each positional argument is a *string* containing a Python expression
    (normally just a variable name such as ``'df'``). When the instance is
    displayed, every expression is evaluated and its HTML representation is
    wrapped in a left-floating ``<div>`` labelled with the expression text.

    Objects that have no ``_repr_html_`` method (or whose ``_repr_html_``
    returns ``None``) are shown as their escaped ``repr()`` inside ``<pre>``
    instead of raising ``AttributeError``.
    """
    template = '<div style="float: left;padding:10px;"> <b>[{0}]</b> {1}</div>'

    def __init__(self, *args):
        # args: expression strings to evaluate lazily at display time.
        self.args = args

    def _repr_html_(self):
        """Return the concatenated HTML panels, one per stored expression."""
        from html import escape  # local import: keeps the cell self-contained

        def render(obj):
            # Prefer the rich HTML repr; fall back to escaped plain text.
            to_html = getattr(obj, '_repr_html_', None)
            markup = to_html() if callable(to_html) else None
            if markup is None:
                markup = '<pre>{0}</pre>'.format(escape(repr(obj)))
            return markup

        # NOTE(review): eval() executes whatever expression string it is given.
        # That is acceptable for names typed by the notebook author, but never
        # pass untrusted text to disp().
        # The label is escaped so expressions containing '<' or '&' stay valid HTML.
        return '\n'.join(self.template.format(escape(a), render(eval(a)))
                         for a in self.args)

import pandas as pd
import seaborn as sns   

### [예제1] category Type의 이해

In [None]:
# [1] Cast a text column to the category dtype and look at its integer codes.

df = pd.DataFrame(
    {
        'major': ['kor', 'eng', 'eng', 'kor', 'math'],
        'exam': [90, 40, 100, 70, 60],
    },
    index=pd.Index(['kim', 'lee', 'park', 'song', 'lew'], name='names'),
)
df = df.astype({'major': 'category'})

r1 = df['major']      # the categorical column itself
r2 = r1.cat.codes     # integer code of each row's category

r1
r2

In [None]:
# [2] Compare the memory footprint of the original dtype vs. the category dtype.
df = pd.DataFrame(
    {
        'major': ['kor', 'eng', 'eng', 'kor', 'math'],
        'exam': [90, 40, 100, 70, 60],
    },
    index=pd.Index(['kim', 'lee', 'park', 'song', 'lew'], name='names'),
)

r3 = df['major'].repeat(10)     # every row repeated 10 times, dtype unchanged
r4 = r3.astype('category')      # the same values stored as categories

# One line per variant: "<dtype> <bytes>" (deep=True inspects object payloads).
for _series in (r3, r4):
    print(_series.dtype, _series.memory_usage(deep=True))

### [예제2] categorical 데이터 추가, 수정

In [None]:
# [1-1] Restrict the categories to a subset and inspect the resulting codes.

df = pd.DataFrame(
    {
        'major': ['kor', 'eng', 'eng', 'kor', 'math'],
        'exam': [90, 40, 100, 70, 60],
        'names': ['kim', 'lee', 'park', 'song', 'lew'],
    }
).set_index('names')
df = df.astype({'major': 'category'})

# 'math' is not in the new category list, so that row becomes missing.
r1 = df['major'].cat.set_categories(['eng', 'kor'])
r2 = r1.cat.codes

r1
r2

In [None]:
# [1-2] Extend the category list, and separately reorder the existing categories.

df = pd.DataFrame(
    {
        'major': ['kor', 'eng', 'eng', 'kor', 'math'],
        'exam': [90, 40, 100, 70, 60],
    },
    index=pd.Index(['kim', 'lee', 'park', 'song', 'lew'], name='names'),
)
df = df.astype({'major': 'category'})

major = df['major']
r3 = major.cat.set_categories(['eng', 'kor', 'math', 'art'])   # adds an unused 'art'
r4 = major.cat.reorder_categories(['math', 'kor', 'eng'])      # same set, new order

r3
r4

In [None]:
# [2] Rename the categories: every value is relabelled, the codes stay the same.

df = pd.DataFrame({'major':['kor', 'eng','eng','kor', 'math'],
                   'exam':[90,40,100,70,60],
                   'names':['kim','lee','park','song','lew']}).set_index('names')

df['major'] = df['major'].astype('category')
# Assigning to `.cat.categories` is deprecated and no longer supported in
# pandas 2.x. rename_categories() returns a new Series, so the column has to
# be reassigned for the change to show up in df.
df['major'] = df['major'].cat.rename_categories(['MJ1', 'MJ2', 'MJ3'])
r5 = df['major'].cat.categories

r5; df

### [예제3] categorical 데이터 재정렬 및 비교 연산

In [None]:
# Sorting and comparing categoricals: unordered (cat1) vs. ordered (cat2).
df = pd.DataFrame({'major':['kor', 'eng','eng','kor', 'math'],
                   'exam':[90,40,100,70,60],
                   'names':['kim','lee','park','song','lew']}).set_index('names')

li = ['math', 'eng','kor']
df['major'] = df['major'].astype('category')
cat1 = df['major'].cat.set_categories(li)                # unordered
cat2 = df['major'].cat.set_categories(li,ordered=True)   # ordered: math < eng < kor
r1 = cat1.sort_values()
# max() is not defined for an unordered categorical and raises TypeError.
# The error is the point of the demo, so catch and print it instead of letting
# it abort the cell before r3 and r4 are computed.
try:
    r2 = cat1.max()
except TypeError as err:
    r2 = None
    print('TypeError:', err)
r3 = cat2.max()
r4 = cat2 > 'eng'

cat1; cat2; r1; r3; r4

### [예제4] Categorical 객체 생성

In [None]:
# [1-1] Build a Series from an explicitly ordered Categorical (C < A < B).

data = ['A', 'B', 'C', 'B']
cat = pd.Categorical(values=data, categories=list('CAB'), ordered=True)
sr1 = pd.Series(cat)
r1 = sr1.sort_values()   # sorted by category order, not alphabetically

sr1
r1

In [None]:
# [1-2] Build the Series with dtype='category' instead of a Categorical object.

data = ['A', 'B', 'C', 'B']
# `cat` is created but not used below: sr2 is built from `data` directly.
cat = pd.Categorical(values=data, categories=list('CAB'), ordered=True)
sr2 = pd.Series(data).astype('category')
r2 = sr2.sort_values()

sr2
r2

### [예제5] cut()메서드

In [None]:
# Split the exam scores into three bins, once as intervals and once with labels.
df = pd.DataFrame(
    {'exam': [95, 87, 55, 77, 100]},
    index=pd.Index(['kim', 'lee', 'park', 'song', 'lew'], name='names'),
)

li = ['bad', 'good', 'best']
df = df.assign(
    bins=pd.cut(df['exam'], bins=3),              # interval categories
    eval=pd.cut(df['exam'], bins=3, labels=li),   # same bins, named by `li`
)
r1 = df['bins'].cat.codes
r2 = df['bins'].cat.categories

r1
r2
df

### [예제6] cut()메서드 활용 예

In [None]:
# [1-1] Grade the scores using explicit bin edges and letter labels.
df = pd.DataFrame(
    {'exam': [95, 87, 55, 77, 100]},
    index=pd.Index(['kim', 'lee', 'park', 'song', 'lew'], name='names'),
)

l = ['F', 'D', 'C', 'B', 'A']     # labels, lowest bin first
b = [-1, 59, 69, 79, 89, 100]     # bin edges
df = df.assign(eval=pd.cut(df['exam'], bins=b, labels=l))
r1 = df['eval']

r1
df

In [None]:
# [1-2] Sort the frame by the grade column and show it next to the original.
df = pd.DataFrame(
    {'exam': [95, 87, 55, 77, 100]},
    index=pd.Index(['kim', 'lee', 'park', 'song', 'lew'], name='names'),
)

l = ['F', 'D', 'C', 'B', 'A']
b = [-1, 59, 69, 79, 89, 100]
df = df.assign(eval=pd.cut(df['exam'], bins=b, labels=l))
r2 = df.sort_values(by='eval')
disp('df', 'r2')

In [None]:
# [1-3] Count rows per grade, including grades nobody received.
df = pd.DataFrame({'names':['kim','lee','park','song','lew'], 
                   'exam':[95,87,55,77,100]}).set_index('names')

l = list('FDCBA')
b = [-1,59,69,79,89,100]
df['eval'] = pd.cut(df['exam'], bins=b, labels=l)
# observed=False is passed explicitly: grouping by a categorical without it
# emits a FutureWarning in recent pandas, and the default is changing to True,
# which would drop the empty 'D' group from the result.
r3 = df.groupby('eval', observed=False).count()
disp('df', 'r3')

In [None]:
# [1-4] Reverse the category order (A first) before sorting by grade.

df = pd.DataFrame(
    {'exam': [95, 87, 55, 77, 100]},
    index=pd.Index(['kim', 'lee', 'park', 'song', 'lew'], name='names'),
)

l = ['F', 'D', 'C', 'B', 'A']
b = [-1, 59, 69, 79, 89, 100]
df = df.assign(eval=pd.cut(df['exam'], bins=b, labels=l))
# Same labels, listed in the opposite order.
df['eval'] = df['eval'].cat.set_categories(list(reversed(l)))
r4 = df.sort_values(by='eval')

disp('df', 'r4')

### [예제7] qcut() 메서드

In [None]:
# Split the exam scores into three quantile-based bins, unlabelled and labelled.
df = pd.DataFrame(
    {'exam': [95, 80, 55, 77, 100]},
    index=pd.Index(['kim', 'lee', 'park', 'song', 'lew'], name='names'),
)

li = ['bad', 'good', 'best']
df = df.assign(
    bins=pd.qcut(df['exam'], q=3),              # interval categories
    eval=pd.qcut(df['exam'], q=3, labels=li),   # same bins, named by `li`
)
r = df['bins'].cat.categories

r
df

### [예제8] crosstab() 메서드

In [None]:
# Frequency tables of grade against class (and class + sex), shown side by side.
df = pd.DataFrame(
    {
        'grade': [1, 2, 1, 2, 1],
        'class': ['A', 'A', 'B', 'A', 'A'],
        'sex': ['M', 'M', 'W', 'W', 'M'],
    },
    index=pd.Index(['kim', 'lee', 'park', 'song', 'lew'], name='names'),
)

r1 = pd.crosstab(df['grade'], df['class'])                                # grade x class
r2 = pd.crosstab(index=df['grade'], columns=[df['class'], df['sex']])     # two-level columns
r3 = pd.crosstab(index=df['grade'], columns=df['class'], margins=True)    # with totals

disp('df', 'r1', 'r2', 'r3')

### [예제 9] Category 활용 : titanic  

In [None]:
# Load the titanic sample data, keep three columns and drop rows with missing values.
# dropna() is chained instead of using inplace=True, so the derived column slice
# is never mutated in place and the cell is a single expression.
df = sns.load_dataset('titanic')[['class', 'survived', 'age']].dropna()
df.info()
df.head()

In [None]:
# [1] Bucket passengers into named age groups and summarise the age per group.

l = ['child', 'junior', 'adult', 'senior']
b = [0, 12, 18, 60, 100]
cut = pd.cut(df['age'], bins=b, labels=l)
# observed=False is passed explicitly: grouping by a categorical without it
# emits a FutureWarning in recent pandas, and the default is changing to True.
r1 = df['age'].groupby(cut, observed=False).agg(['count', 'min', 'max'])

r1

In [None]:
# [2] Count survivors / non-survivors per 10-year age band (bands as columns).

# Ten bins with edges 0, 10, ..., 100; each bin is labelled by its lower edge.
cut2 = pd.cut(df['age'], bins=range(0,101,10), labels=range(0,100,10))
# observed=False is passed explicitly: grouping by a categorical without it
# emits a FutureWarning in recent pandas, and the default is changing to True,
# which would drop age bands that contain no passengers.
r2 = df.groupby([cut2, 'survived'], observed=False)['survived'].count().unstack(0)

r2