In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression statement in a cell, not only the
# last one. Later cells rely on this when they end with lines such as
# `df; sr1; sr2`.
InteractiveShell.ast_node_interactivity = "all"

class disp(object):
    """Show several objects side by side in one notebook output.

    Each positional argument is a string containing a Python expression,
    normally just a variable name such as ``'df'``. When the instance is
    displayed, every expression is evaluated and rendered in its own
    left-floating ``<div>``, labelled with the expression text.

    Objects that provide ``_repr_html_`` (for example DataFrames) are shown
    with it; any other object falls back to its escaped ``repr`` inside a
    ``<pre>`` block instead of raising ``AttributeError``.

    NOTE: the expressions are run with ``eval``. Only pass strings written in
    the notebook itself, never text that comes from an untrusted source.
    """
    template = '<div style="float: left;padding:10px;"> <b>[{0}]</b> {1}</div>'

    def __init__(self, *args):
        # args: the expression strings to evaluate lazily at display time.
        self.args = args

    @staticmethod
    def _as_html(obj):
        """Return an HTML fragment for ``obj``, preferring its own rich repr."""
        import html  # local import: the helper needs nothing else from the file

        render = getattr(obj, '_repr_html_', None)
        rendered = render() if callable(render) else None
        if rendered is None:
            # No HTML repr available (or it declined to render): show the
            # plain-text repr, escaped so it cannot break the markup.
            rendered = '<pre>{0}</pre>'.format(html.escape(repr(obj)))
        return rendered

    def _repr_html_(self):
        """Evaluate each stored expression and join the rendered panels."""
        return '\n'.join(self.template.format(a, self._as_html(eval(a)))
                         for a in self.args)

import pandas as pd
import numpy as np

### [예제1] 조건에 맞는 데이터 indexing

In [None]:
# Five-row sample frame (default integer index) for the row-filtering examples.
df = pd.DataFrame(
    {
        'name': ['kim', 'lee', 'park', 'song', 'lew'],
        'sex': list('WMWMW'),
    }
)

In [None]:
# [1] Select rows with a plain Python list of booleans.

# One flag per row, in row order; the rows flagged True are kept.
df1 = df[[False, True, False, True, False]]
disp('df', 'df1')

In [None]:
# [2] Select rows with a boolean Series produced by a comparison.

# The comparison yields one True/False per row; indexing with it keeps the
# rows where it is True.
df2 = df[ df['sex'] == 'M' ]
# Print the mask itself so it can be compared with the filtered frame.
print(df['sex'] == 'M')
disp('df', 'df2')

In [None]:
# [3] Select rows with a callable passed to .loc.

# .loc calls the lambda with the frame being indexed (df) and uses the boolean
# Series it returns as the row mask.
df3 = df.loc[ lambda x:x['sex']=='M' ]
disp('df', 'df3')

### [예제2] Boolean vector : list, ndarray

In [None]:
# Four-row sample frame (default integer index) for the boolean list / ndarray
# examples; displayed once so the masks below can be checked against it.
df = pd.DataFrame(
    {
        'name': ['kim', 'lee', 'park', 'song'],
        'sex': list('WMWM'),
        'age': [20, 40, 35, 25],
    }
)
df

In [None]:
# [1] Boolean list with plain [] indexing.

row = [False, True, False, True]
df1 = df[row]            # on a DataFrame the list filters rows
sr1 = df['name'][row]    # the same list filters the elements of one column
disp('df', 'df1'); sr1

In [None]:
# [2] Boolean list with .loc and .iloc.

row = [False, True, False, True]
df2 = df.loc[row]            # row mask only
sr2 = df.loc[row, 'name']    # row mask plus a column label
sr3 = df.iloc[row, 0]        # row mask plus a column position (0 is 'name')
disp('df', 'df2'); sr2; sr3

In [None]:
# [3] Boolean masks on both axes: a list for rows, an ndarray for columns.

row = [False, True, False, True]
# One flag per column, in df's column order (name, sex, age).
col = np.array([True, False, True])
df3 = df.loc[:, col]      # every row, only the columns flagged True
df4 = df.loc[row, col]    # rows and columns filtered in one step
disp('df', 'df3', 'df4')

### [예제3] Boolean vector : Series, DataFrame 

In [None]:
# Three-row sample frame; 'name' is moved into the index so the Series and
# DataFrame masks below have string labels to be matched against.
df = pd.DataFrame(
    {
        'name': ['kim', 'lee', 'park'],
        'sex': list('WMW'),
        'age': [20, 40, 30],
    }
)
df = df.set_index('name')
df

In [None]:
# [1-1] Boolean Series whose index has the same labels, in the same order, as df.

barr = [True,False,True]
mask1 = pd.Series(barr, index=['kim','lee','park'])
df1 = df[mask1]           # filter the rows of the frame
sr1 = df['sex'][mask1]    # filter the elements of one column
mask1; disp('df', 'df1'); sr1

In [None]:
# [1-2] Boolean Series with the same labels as df's index but in another order.

barr = [True,False,True]
mask2 = pd.Series(barr, index=['park','lee','kim'])
# A boolean Series is matched to the rows by index label, not by position, so
# each flag applies to the row carrying the same label.
sr2 = df['sex'][mask2]
df; mask2; sr2

In [None]:
# [1-3] Boolean Series whose labels do not all exist in df's index.

barr = [True, False, True]
mask3 = pd.Series(barr, index = ['kim','lee','song'])
# df's row 'park' has no flag in mask3 (and 'song' is not a row of df), so the
# mask cannot be matched to the rows and pandas raises an indexing error.
# The failure is the point of this cell, so it is caught and printed instead of
# aborting a Restart & Run All.
try:
    sr3 = df['sex'][mask3]
except Exception as err:  # broad on purpose: the error class lives at different import paths across pandas versions
    print(type(err).__name__, err, sep=': ')

In [None]:
# [2] Boolean DataFrame used as a mask.

barr2 = [[True, False], [False, True], [True, True]]
idx, col = ['kim', 'lee', 'song'], ['sex','age']
mask4 = pd.DataFrame(barr2,index=idx, columns=col)
# Indexing with a boolean DataFrame keeps df's rows and columns and blanks out
# (NaN) every cell that is not flagged True. Flags are matched by row and column
# label; note that mask4 has a 'song' row but no 'park' row.
df2 = df[mask4]
disp('df', 'mask4', 'df2')

### [예제4] DataFrame / Series의 비교 및 논리연산

In [None]:
# Sample frame with one string column and two numeric score columns, indexed by
# student name; used for the comparison / logical-operator examples.
df = pd.DataFrame(
    {
        'sex': list('WMWM'),
        'kor': [70, 80, 100, 50],
        'eng': [80, 30, 60, 90],
    },
    index=['kim', 'lee', 'park', 'song'],
)
df

In [None]:
# [1-1] Comparing a Series: operator form and method form.

sr1 = df['kor'] >= 80                   # operator against a scalar
sr2 = df['kor'].ge(df['kor'].mean())    # .ge() is the method spelling of >=, here against the column mean
df; sr1; sr2

In [None]:
# [1-2] Equality test and element-wise negation.

sr3 = df['sex'].eq('W')      # .eq() is the method spelling of ==
sr4 = ~(df['sex']=='M')      # ~ flips every True/False in the mask
df; sr3; sr4

In [None]:
# [1-3] Comparing a whole DataFrame against a scalar.

# Restricted to the numeric columns, the comparison works element-wise.
df5 = df[['kor', 'eng']] >=80
# Including the string column 'sex' makes pandas compare str with int, which
# raises TypeError. It is caught and printed so that the display below still
# runs (previously the cell stopped here and never showed df5).
try:
    df6 = df >= 80
except TypeError as err:
    print(type(err).__name__, err, sep=': ')
disp('df', 'df5')

In [None]:
# [2] Combining conditions: use & (element-wise), not `and`.

sr7 = (df['sex']=='W') & (df['kor']>=80)
# `and` asks each Series for a single truth value, which pandas refuses with
# ValueError. It is caught and printed so that the display below still runs
# (previously the cell stopped here and never showed sr7).
try:
    sr8 = (df['sex']=='W') and (df['kor']>=80)
except ValueError as err:
    print(type(err).__name__, err, sep=': ')
df; sr7

### [예제5]  Boolean Vector : 조건식

In [None]:
# Four-row sample frame indexed by 'name' for the condition-expression examples.
df = pd.DataFrame(
    {
        'name': ['kim', 'lee', 'park', 'song'],
        'sex': list('WMWM'),
        'age': [20, 40, 35, 30],
    }
)
df = df.set_index('name')
df

In [None]:
# [1] The same boolean Series used with [], .loc and .iloc.

mask = df['sex']=='W'
df1 = df[mask]
df2 = df.loc[mask]
# .iloc is purely positional and rejects a boolean Series that carries an
# index. The error is caught and printed so the rest of the cell still runs
# (previously df4 was never assigned and nothing was displayed).
try:
    df3 = df.iloc[mask]
except (ValueError, NotImplementedError) as err:
    print(type(err).__name__, err, sep=': ')
# Converting the mask to a plain list drops the index, which .iloc accepts.
df4 = df.iloc[mask.to_list()]
mask; disp('df', 'df1', 'df2', 'df4')

In [None]:
# [2] Filtering on the index instead of a column.

# Comparing df.index with a scalar gives one True/False per row.
df5 = df[df.index=='park']
disp('df', 'df5')

In [None]:
# [3] Conditional assignment with .loc, then filtering on the new column.
# The frame is rebuilt first, so re-running this cell starts without 'age2'.
df = pd.DataFrame({'name':['kim','lee','park','song'], 'sex':list('WMWM'),
                  'age':[20, 40, 35, 30]}).set_index('name')

# The first assignment creates the 'age2' column and fills only the rows where
# age > 30; the second fills the remaining rows (age <= 30).
df.loc[df['age']>30, 'age2'] = 'high'
df.loc[df['age']<=30, 'age2'] = 'low'
# Keep only the rows labelled 'high'.
df6 = df.loc[df['age2']=='high']

disp('df', 'df6')

### [예제6] 사용자 함수에 의한 Indexing

In [None]:
# [1-1] A function used as the indexer of a Series.

df = pd.DataFrame({'name':['kim','lee','park','song'],'kor':[80,30,50,90],
                  'eng':[90,100,70,50]}).set_index('name')
df

def f1(x):
    """Print the type and value of the object handed in, then return the label 'kor'."""
    print(type(x))
    print(x)
    return 'kor'

# df.loc['kim'] is the row for 'kim' as a Series. Indexing it with f1 calls f1
# with that Series and then looks up the label f1 returns ('kor').
v1 = df.loc['kim'][f1]
v1

In [None]:
# [1-2] The same function used as the indexer of a DataFrame.

df = pd.DataFrame({'name':['kim','lee','park','song'],'kor':[80,30,50,90],
                  'eng':[90,100,70,50]}).set_index('name')
df

# Same definition as in cell [1-1], repeated here so this cell runs on its own.
def f1(x):
    """Print the type and value of the object handed in, then return the label 'kor'."""
    print(type(x))
    print(x)
    return 'kor'

# Indexing df with f1 calls f1 with df itself and then selects the column whose
# label f1 returns ('kor').
sr1 = df[f1]
sr1

In [None]:
# [2] A function that returns a boolean mask.

df = pd.DataFrame({'name':['kim','lee','park','song'],'kor':[80,30,50,90],
                  'eng':[90,100,70,50]}).set_index('name')

def f2(x) :
    """Add an 'avg' column (floor of the kor/eng mean) to x and return the mask avg >= 70."""
    x['avg'] = (x['kor'] + x['eng'] )//2
    return x['avg'] >= 70

# df[f2] calls f2 with df itself, so the 'avg' column written inside f2 is
# added to df as a side effect and therefore also appears in df1.
df1 = df[f2]
disp('df', 'df1')

### [예제7] Series의 .str accessor

In [None]:
# Sample frame for the string-method examples.
df = pd.DataFrame({'name':['kim','lee','ki','song'], 'sex':list('WMWM')})

# 'a|i' is interpreted as a regular expression by default: names containing
# an 'a' or an 'i'.
mask = df['name'].str.contains('a|i')
df1 = df[ mask ]
# A tuple asks for names ending with any one of the given suffixes.
df2 = df[df['name'].str.endswith(('m','i'))]
df3 = df[df['name'].str.startswith('k')]
mask; disp('df', 'df1', 'df2', 'df3')

### [예제8] 조건에 따른 필터링과 집계

In [None]:
# Raw columns for the filter-then-aggregate examples; the three lists are kept
# as separate names and then assembled into the frame.
exam = [80, 70, 90, 50, 100]
ban = list('AABBA')
name = ['kim', 'lee', 'park', 'song', 'lew']

df = pd.DataFrame(
    {
        'class': ban,
        'name': name,
        'exam': exam,
    }
)
df

In [None]:
# [1] How many students are in class 'A'.

# Select the 'name' values of the class-A rows, then count the non-missing ones.
r1 = df.loc[ df['class']=='A', 'name' ].count()
r1

In [None]:
# [2] Per-class number of students with exam >= 60.

# Keep the 'class' values of the rows meeting the condition, then tally them.
r2 = df.loc[df['exam']>=60,'class'].value_counts()
r2

In [None]:
# [3] The class-A row with the highest exam score.

# Filter to class 'A' first, then take the single row with the largest 'exam'.
r3 = df[df['class']=='A'].nlargest(1,['exam'])
r3

In [None]:
# [4] Filtering with a function that also rewrites a column.

def func(x):
    """Upper-case x['name'] in place and return the mask of names starting with 'K' or 'P'."""
    x['name'] = x['name'].str.upper()
    return x['name'].str.startswith(('K', 'P'))

# .loc calls func with df itself, so the upper-casing is written into df's
# 'name' column and stays there after this cell; r4 shows the upper-cased names.
r4 = df.loc[func, ['name','exam']]
r4

### [예제9] all(), any() 메서드

In [None]:
# [1] all(): True only where every value along the axis is True.

mask = [[True, True, False],[True, False, False],[True, False, False]]
df = pd.DataFrame(mask, columns=['A','B','C'])
sr1 = df.all(axis = 0)    # one result per column (reduces down the rows)
sr2 = df.all(axis = 1)    # one result per row (reduces across the columns)
df; sr1; sr2

In [None]:
# [2] any(): True where at least one value along the axis is True.

mask = [[True, True, False],[True, False, False],[True, False, False]]
df = pd.DataFrame(mask, columns=['A','B','C'])
sr3 = df.any(axis = 0)    # one result per column (reduces down the rows)
sr4 = df.any(axis = 1)    # one result per row (reduces across the columns)
df; sr3; sr4

### [예제10] isin() 메서드 1

In [None]:
# Three students with two numeric score columns, indexed by 'name'; used for
# the first set of isin() examples.
df = pd.DataFrame(
    {
        'name': ['kim', 'lee', 'park'],
        'kor': [100, 50, 80],
        'eng': [70, 100, 90],
    }
)
df = df.set_index('name')

In [None]:
# isin() with a list: True for every cell whose value is one of the listed values.
m1 = df.isin([50, 100])
# Indexing with the boolean frame keeps matching cells and shows NaN elsewhere.
df1 = df[m1]
disp('df', 'm1', 'df1')

In [None]:
# isin() needs a list-like (or dict / Series / DataFrame) argument. A bare
# scalar raises TypeError; it is caught and printed so that the valid calls
# below still run (previously the cell stopped here and m3 / m4 were never made).
try:
    m2 = df.isin(100)
except TypeError as err:
    print(type(err).__name__, err, sep=': ')
m3 = df.isin([100])           # wrap the single value in a list
m4 = df['kor'].isin([100])    # the same test on one column
m3; m4

### [예제11] isin() 메서드 2

In [None]:
# Three-row frame with one string and one numeric column, indexed by 'name';
# used for the second set of isin() examples.
df = pd.DataFrame(
    {
        'name': ['kim', 'lee', 'park'],
        'sex': list('MWM'),
        'age': [30, 40, 50],
    }
)
df = df.set_index('name')

In [None]:
# [1-1] isin() with a list versus a dict.

m1 = df.isin(['M', 30, 40])                   # list: the same values are tested in every column
m2 = df.isin({'sex':['M'],'age':[30,40]})     # dict: each column is tested against its own list
disp('df', 'm1', 'm2')

In [None]:
# [1-2] isin() with a Series or a DataFrame: labels must match as well as values.

# Series: a cell is True when it equals the Series value at the same index label.
m3 = df.isin(pd.Series([50,40,30], index=df.index))
# DataFrame: both the index label and the column label must match. The scalar
# values are broadcast to every row of the helper frame.
m4 = df.isin(pd.DataFrame({'sex':'M','age':30}, index=df.index))
disp('df', 'm3', 'm4')

In [None]:
# [2-1] Rows where at least one column matches.

m4 = df.isin(pd.DataFrame({'sex':'M','age':30}, index=df.index))
m5 = m4.any(axis=1)    # collapse each row of the boolean frame with OR
df1 = df[m5]
m4; m5; disp('df', 'df1')

In [None]:
# [2-2] Rows where every column matches.

m4 = df.isin(pd.DataFrame({'sex':'M','age':30}, index=df.index))
m6 = m4.all(axis=1)    # collapse each row of the boolean frame with AND
df2 = df[m6]
m4; m6; disp('df', 'df2')

### [예제12] isin() 메서드 활용

In [None]:
# Four-row frame indexed by 'name'. Besides 'sex' and 'age' it carries an 'id'
# column that the isin() filters below do not mention.
df = pd.DataFrame(
    {
        'name': ['kim', 'lee', 'park', 'song'],
        'sex': list('WMWM'),
        'age': [30, 40, 40, 50],
        'id': [1, 2, 3, 4],
    }
)
df = df.set_index('name')

In [None]:
# [1] Combining conditions with |, & and ~.

m1 = (df['sex']=='M') | (df['age']>=40)     # either condition holds
# ~ is applied to the first parenthesised mask only, so this reads
# "(sex is not 'W') and (age >= 40)".
m2 = ~(df['sex']=='W') & (df['age']>=40)
df1, df2 = df[m1], df[m2]
m1; m2; disp('df', 'df1', 'df2')

In [None]:
# [2-1] The OR filter written with isin() and any().

dic = {'sex':['M'],'age':[40]}
# Per-column membership test, then keep rows where at least one column matched.
m3 = df.isin(dic).any(axis=1)
df3 = df[m3]
m3; disp('df','df3')

In [None]:
# [2-2] Pitfall: all() over every column of df.

dic = {'sex':['M'],'age':[40]}
# df also has an 'id' column that dic does not list. isin() marks a column
# missing from the dict as False everywhere, so all(axis=1) can never be True
# and df4 comes out empty. The following cell restricts the test to the two
# listed columns.
m4 = df.isin(dic).all(axis=1)
df4 = df[m4]
m4; disp('df', 'df4')

In [None]:
# [2-3] all() over only the columns named in the dict.

dic = {'sex':['M'],'age':[40]}
# Select 'sex' and 'age' before isin() so that all(axis=1) only has to be
# satisfied by the columns the dict actually describes.
m5 = df[['sex','age']].isin(dic).all(axis=1)
df5 = df[m5]
m5; disp('df', 'df5')

### [예제13] duplicated() 메서드 이해 1

In [None]:
# Four-row frame in which the last row repeats the first one exactly.
df = pd.DataFrame(
    {
        'name': ['kim', 'lee', 'park', 'kim'],
        'id': [10, 15, 20, 10],
    }
)

In [None]:
# [1] Rows that repeat an earlier row.

# With the default settings the first occurrence is False and every later
# identical row is True.
m1 = df.duplicated()
df1 = df[m1]
m1; disp('df', 'df1')

In [None]:
# [2] Dropping the repeats by negating the mask.

m2 = ~df.duplicated()    # True for rows that are not a repeat of an earlier row
df2 = df[m2]
m2; disp('df', 'df2')

### [예제14] duplicated() 메서드 이해 2

In [None]:
# Same sample as the previous example: the last row repeats the first one.
df = pd.DataFrame(
    {
        'name': ['kim', 'lee', 'park', 'kim'],
        'id': [10, 15, 20, 10],
    }
)

In [None]:
# [1] keep='last': the last occurrence is the one treated as the original.

m1 = df.duplicated(keep = 'last')    # earlier copies are flagged True
df1 = df[~m1]                        # so the last copy is the one kept
m1; disp('df', 'df1')

In [None]:
# [2] keep=False: every member of a duplicated group is flagged.

m2 = df.duplicated(keep = False)
df2 = df[~m2]    # only rows that occur exactly once remain
m2; disp('df', 'df2')

In [None]:
# [3] Judging duplicates by one column only.

m3 = df.duplicated(['name'])     # frame method, limited to the 'name' column
m4 = df['name'].duplicated()     # the same test on the column itself
df3, df4 = df[~m3], df[~m4]
m3; m4; disp('df', 'df3', 'df4')

In [None]:
# [4] Duplicated index labels.
# The frame is rebuilt first so re-running the cell does not call set_index
# on a frame that is already indexed by 'name'.
df = pd.DataFrame({'name':['kim','lee','park','kim'],
                   'id':[10, 15, 20, 10]})

df = df.set_index('name')
m5 = df.index.duplicated()    # boolean array: True for a label seen earlier in the index
df5 = df[~m5]
m5; disp('df', 'df5')

### [예제15] 조건에 따른 필터링 활용

In [None]:
# Source lists for the final exercise; 'exam' has two missing scores (NaN).
names = ['kim', 'lee', 'park', 'song', 'lew']
grade = [1, 1, 2, 1, 2]
major = ['math', 'kor', 'com', 'kor', 'eng']
exam = [80, np.nan, 90, 100, np.nan]

# Column name -> values; the student names become the index.
d = {
    'grade': grade,
    'major': major,
    'exam': exam,
}
df = pd.DataFrame(d, index=names)
df

In [None]:
# [1] Students whose major is 'kor' or 'math'.

r1 = df[df['major'].isin(['kor', 'math'])]
r1

In [None]:
# [2] Students in grade 1 whose major is 'kor' or 'math'.

dic = {'grade':[1],'major':['kor', 'math']}
# Test only the two columns named in dic, then require both to match.
r2 = df[ df[['grade','major']].isin(dic).all(axis=1) ]
r2

In [None]:
# [3] Students with a missing exam score, selected two ways.

r3 = df[ df['exam'].isna()]             # explicit missing-value test
r4 = df[ df['exam'].isin([np.nan])]     # membership test against NaN; shown next to r3 for comparison
r3; r4

In [None]:
# [4] Add a 'YN' column recording whether an exam score is present.

# The two masks are complementary, so every row receives exactly one label.
df.loc[df['exam'].isna(), 'YN'] = 'NO'
df.loc[df['exam'].notna(), 'YN'] = 'YES'
df