# Ch.07 데이터 준비하기: 다듬기, 변형, 병합

# 7.1 데이터 합치기

- pd.merge(on = key name, how = 'outer'/'inner'/'left'/'right', suffixes = ['_from df1', '_from df2']) : 하나 이상의 키(공통되는 열의 이름)를 따라 행을 이어 붙인다.
    + outer join : 합집합
    + inner join : 교집합
    + left join : 왼쪽 로우를 포함하는 결과
    + right join : 오른쪽 로우를 포함하는 결과
- pd.join() : 색인으로 머지할 때 사용. 열이 겹치지 않으며 완전히 같거나 유사한 색인 구조를 가진 여러개의 df 객체 병합 시 사용
- pd.concat : 하나의 축을 따라 객체를 이어 붙인다.


# 7.1.1 데이터베이스 스타일로 DataFrame 합치기

In [1]:
import pandas as pd
import numpy as np

In [2]:
df1 = pd.DataFrame({'key1': list('bbacaab'), 'key2': list('ddefgge'), 'data1': range(7)})
df2 = pd.DataFrame({'key1': list('abc'), 'key2': list('ggf'), 'data2': range(3)})

In [3]:
df1

Unnamed: 0,key1,key2,data1
0,b,d,0
1,b,d,1
2,a,e,2
3,c,f,3
4,a,g,4
5,a,g,5
6,b,e,6


In [4]:
df2

Unnamed: 0,key1,key2,data2
0,a,g,0
1,b,g,1
2,c,f,2


df1과 df2의 공통되는 열은 key1, key2.

key1과 key2에서 공통 데이터는

| key1 | key2 |
| --- | --- |
| c | f |
| a | g |

In [5]:
# default : how = 'inner'
pd.merge(df1, df2)

Unnamed: 0,key1,key2,data1,data2
0,c,f,3,2
1,a,g,4,0
2,a,g,5,0


In [6]:
# how = 'outer'
pd.merge(df1, df2, how ='outer')

Unnamed: 0,key1,key2,data1,data2
0,b,d,0.0,
1,b,d,1.0,
2,a,e,2.0,
3,c,f,3.0,2.0
4,a,g,4.0,0.0
5,a,g,5.0,0.0
6,b,e,6.0,
7,b,g,,1.0


In [7]:
df1

Unnamed: 0,key1,key2,data1
0,b,d,0
1,b,d,1
2,a,e,2
3,c,f,3
4,a,g,4
5,a,g,5
6,b,e,6


In [8]:
df2

Unnamed: 0,key1,key2,data2
0,a,g,0
1,b,g,1
2,c,f,2


In [9]:
#key2_x from df1, key2_y from df2
#key2_x 는 df1의 key2 데이터, key2_y는 df2의 key2 데이터
pd.merge(df1, df2, on='key1')

Unnamed: 0,key1,key2_x,data1,key2_y,data2
0,b,d,0,g,1
1,b,d,1,g,1
2,b,e,6,g,1
3,a,e,2,g,0
4,a,g,4,g,0
5,a,g,5,g,0
6,c,f,3,f,2


In [10]:
#set name instead _x, _y
#겹치는 이름의 열이지만 join key로 지정되지 못한 경우 
##자동으로 _x, _y로 쓰이는 대신 이름을 지정

pd.merge(df1, df2, on='key1', suffixes=['_df1', '_df2'])

Unnamed: 0,key1,key2_df1,data1,key2_df2,data2
0,b,d,0,g,1
1,b,d,1,g,1
2,b,e,6,g,1
3,a,e,2,g,0
4,a,g,4,g,0
5,a,g,5,g,0
6,c,f,3,f,2


공통된 key가 없는 경우 직접 지정한다.
- left_on, right_on 옵션

In [11]:
df3 = pd.DataFrame({'Lkey': list('bbacaab'), 'data1': range(7)})
df4 = pd.DataFrame({'Rkey': list('abd'), 'data2': range(3)})

In [12]:
df3

Unnamed: 0,Lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [13]:
df4

Unnamed: 0,Rkey,data2
0,a,0
1,b,1
2,d,2


In [14]:
#there's no common column
#공통되는 컬럼이 없다.

pd.merge(df3, df4, left_on = 'Lkey', right_on = 'Rkey')

Unnamed: 0,Lkey,data1,Rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [15]:
# pd.merge(df3, df4, left_on = 'Rkey', right_on = 'Lkey') >> error

In [16]:
pd.merge(df4, df3, left_on = 'Rkey', right_on = 'Lkey') 

Unnamed: 0,Rkey,data2,Lkey,data1
0,a,0,a,2
1,a,0,a,4
2,a,0,a,5
3,b,1,b,0
4,b,1,b,1
5,b,1,b,6


In [17]:
df1

Unnamed: 0,key1,key2,data1
0,b,d,0
1,b,d,1
2,a,e,2
3,c,f,3
4,a,g,4
5,a,g,5
6,b,e,6


In [18]:
df2

Unnamed: 0,key1,key2,data2
0,a,g,0
1,b,g,1
2,c,f,2


In [19]:
pd.merge(df1, df2, how = 'left')

Unnamed: 0,key1,key2,data1,data2
0,b,d,0,
1,b,d,1,
2,a,e,2,
3,c,f,3,2.0
4,a,g,4,0.0
5,a,g,5,0.0
6,b,e,6,


In [20]:
pd.merge(df1, df2, how = 'right')

Unnamed: 0,key1,key2,data1,data2
0,c,f,3.0,2
1,a,g,4.0,0
2,a,g,5.0,0
3,b,g,,1


# 7.1.2 Index Merge
left_index, right_index 옵션 사용(True / False)

In [21]:
left1 = pd.DataFrame({'key': list('abaabc'), 'value': range(6)})
right1 = pd.DataFrame({'group_val':[3,7]}, index = ['a','b'])

In [22]:
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [23]:
right1

Unnamed: 0,group_val
a,3
b,7


In [24]:
#pd.merge(left1, right1, left_on ='key', right_on = index) >> error

In [25]:
pd.merge(left1, right1, left_on = 'key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3
2,a,2,3
3,a,3,3
1,b,1,7
4,b,4,7


데이터가 계층색인(중복 색인)인 경우

In [26]:
lefth = pd.DataFrame({'key1' : ['ohio', 'ohio', 'ohio', 'Nevada', 'Nevada'],
                     'key2' : [2001, 2001, 2002, 2001, 2002],
                     'data' : np.arange(5)})
righth = pd.DataFrame(np.arange(12).reshape(6,2),
                      index = [['Nevada', 'Nevada', 'ohio', 'ohio','ohio','ohio'],
                              [2001,2000, 2000, 2000,2001, 2002]],
                      columns = ['event1', 'event2'])

In [27]:
lefth

Unnamed: 0,key1,key2,data
0,ohio,2001,0
1,ohio,2001,1
2,ohio,2002,2
3,Nevada,2001,3
4,Nevada,2002,4


In [28]:
righth

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2001,0,1
Nevada,2000,2,3
ohio,2000,4,5
ohio,2000,6,7
ohio,2001,8,9
ohio,2002,10,11


In [29]:
lefth.index

RangeIndex(start=0, stop=5, step=1)

In [30]:
righth.index

MultiIndex([('Nevada', 2001),
            ('Nevada', 2000),
            (  'ohio', 2000),
            (  'ohio', 2000),
            (  'ohio', 2001),
            (  'ohio', 2002)],
           )

In [31]:
#['key1', 'key2'] is the key from 'lefth' dataframe
## indices are the key in 'righth' dataframe 
#좌측 데이터프레임에서는 left_on에 해당하는 열을 key로 사용하고
##우측 데이터프레임에서는 right_index로 인덱스를 key로 사용하여 둘을 병합한다.
pd.merge(lefth, righth, left_on = ['key1', 'key2'], right_index =True)

Unnamed: 0,key1,key2,data,event1,event2
0,ohio,2001,0,8,9
1,ohio,2001,1,8,9
2,ohio,2002,2,10,11
3,Nevada,2001,3,0,1


In [32]:
pd.merge(lefth, righth, left_on = ['key1', 'key2'], right_index =True, how ='outer')

Unnamed: 0,key1,key2,data,event1,event2
0,ohio,2001,0.0,8.0,9.0
1,ohio,2001,1.0,8.0,9.0
2,ohio,2002,2.0,10.0,11.0
3,Nevada,2001,3.0,0.0,1.0
4,Nevada,2002,4.0,,
4,Nevada,2000,,2.0,3.0
4,ohio,2000,,4.0,5.0
4,ohio,2000,,6.0,7.0


In [33]:
# index to column
# 인덱스를 열로 바꾸기
right_re = righth.reset_index(level=0)
right_re

Unnamed: 0,level_0,event1,event2
2001,Nevada,0,1
2000,Nevada,2,3
2000,ohio,4,5
2000,ohio,6,7
2001,ohio,8,9
2002,ohio,10,11


In [34]:
pd.merge(lefth, right_re, left_on = 'key2', right_index = True)

Unnamed: 0,key1,key2,data,level_0,event1,event2
0,ohio,2001,0,Nevada,0,1
0,ohio,2001,0,ohio,8,9
1,ohio,2001,1,Nevada,0,1
1,ohio,2001,1,ohio,8,9
3,Nevada,2001,3,Nevada,0,1
3,Nevada,2001,3,ohio,8,9
2,ohio,2002,2,ohio,10,11
4,Nevada,2002,4,ohio,10,11


In [35]:
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [36]:
right1

Unnamed: 0,group_val
a,3
b,7


In [37]:
left1.join(right1)

Unnamed: 0,key,value,group_val
0,a,0,
1,b,1,
2,a,2,
3,a,3,
4,b,4,
5,c,5,


In [38]:
left1.join(right1, how = 'outer')

Unnamed: 0,key,value,group_val
0,a,0.0,
1,b,1.0,
2,a,2.0,
3,a,3.0,
4,b,4.0,
5,c,5.0,
a,,,3.0
b,,,7.0


In [39]:
left1.join(right1, on='key')

Unnamed: 0,key,value,group_val
0,a,0,3.0
1,b,1,7.0
2,a,2,3.0
3,a,3,3.0
4,b,4,7.0
5,c,5,


# 7.1.3 축 따라 이어붙이기
- np.concatenate 와 유사
- pd.concat(): 색인 대 색인으로 데이터프레임을 간단히 병합
    + axis
    + join = 'inner'/'outer'/'left'/'right
    + reindex

In [40]:
arr = np.arange(12).reshape(6,2)
arr

array([[ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9],
       [10, 11]])

In [41]:
np.concatenate([arr,arr,arr], axis=1)

array([[ 0,  1,  0,  1,  0,  1],
       [ 2,  3,  2,  3,  2,  3],
       [ 4,  5,  4,  5,  4,  5],
       [ 6,  7,  6,  7,  6,  7],
       [ 8,  9,  8,  9,  8,  9],
       [10, 11, 10, 11, 10, 11]])

In [42]:
np.concatenate([arr,arr,arr], axis=0)

array([[ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9],
       [10, 11],
       [ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9],
       [10, 11],
       [ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9],
       [10, 11]])

In [43]:
s1 = pd.Series([0,1], index = ['a','b'])
s2 = pd.Series([2,3,4], index = ['c','d','e'])
s3 = pd.Series([5,6], index = ['f','g'])
s4 = pd.Series([0,5,5,6], index = ['a','b','f','g'])

In [44]:
s1

a    0
b    1
dtype: int64

In [45]:
s2

c    2
d    3
e    4
dtype: int64

In [46]:
s3

f    5
g    6
dtype: int64

In [47]:
s4

a    0
b    5
f    5
g    6
dtype: int64

In [48]:
pd.concat([s1,s2,s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [49]:
pd.concat([s1,s2,s3], axis=1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [50]:
pd.concat([s1,s2,s3], axis=1, sort=True)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [51]:
pd.concat([s1,s2,s3], join='inner')

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [52]:
s1

a    0
b    1
dtype: int64

In [53]:
s4

a    0
b    5
f    5
g    6
dtype: int64

In [54]:
pd.concat([s1,s4], axis=1, join='inner')

Unnamed: 0,0,1
a,0,0
b,1,5


In [55]:
d1 = pd.DataFrame({'col1' : [100,200,300]}, index = ['a','b','c'])
d2 = pd.DataFrame({'col2' : [2,3,1]}, index = ['b','c','a'])

In [56]:
d1

Unnamed: 0,col1
a,100
b,200
c,300


In [57]:
d2

Unnamed: 0,col2
b,2
c,3
a,1


In [58]:
pd.concat([d1,d2], axis=1, sort=False)

Unnamed: 0,col1,col2
a,100,1
b,200,2
c,300,3


In [59]:
pd.merge(d1,d2, left_index = True, right_index=True)

Unnamed: 0,col1,col2
a,100,1
b,200,2
c,300,3


In [60]:
s1

a    0
b    1
dtype: int64

In [61]:
s4

a    0
b    5
f    5
g    6
dtype: int64

In [62]:
pd.concat([s1,s4], axis=1, sort=False).reindex(['a','c','b','e'])

Unnamed: 0,0,1
a,0.0,0.0
c,,
b,1.0,5.0
e,,


pd.concat(keys = []) 

In [63]:
result = pd.concat([s1,s2,s3], axis=0, keys =['one', 'two', 'three'])
result

one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64

In [64]:
pd.concat([s1,s2,s3], axis=1, sort=False, keys=['one', 'two', 'three'])

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [65]:
daf1 = pd.DataFrame(np.arange(6).reshape(3,2), index = list('abc'), columns = 'one,two'.split(','))
daf2 = pd.DataFrame(np.arange(12,16).reshape(2,2), index = list('ac'), columns = 'three,four'.split(','))

In [66]:
daf1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [67]:
daf2

Unnamed: 0,three,four
a,12,13
c,14,15


In [68]:
pd.concat([daf1, daf2], axis = 1, keys = ['level1', 'level2'], sort=False)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,12.0,13.0
b,2,3,,
c,4,5,14.0,15.0


In [69]:
pd.concat({'level1' : daf1, 'level2' : daf2}, axis=1, sort=False )

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,12.0,13.0
b,2,3,,
c,4,5,14.0,15.0


- pd.concat([df1,df2], ignore_index =True)
기존 색인 값을 무시하고 숫자 색인으로 된 초기화 색인 부여
    + 각각의 색인이 합칠 때 불필요한 경우
    + 데이터 병합 후 중복 색인으로 문제가 되는 경우

In [70]:
data1 = pd.DataFrame(np.random.randn(3,4), index=list('abc'))
data2 = pd.DataFrame(np.random.randn(2,4), index=list('ab'))

In [71]:
data1

Unnamed: 0,0,1,2,3
a,1.619692,0.183272,-0.494509,-0.354736
b,-1.609587,-0.077786,0.183911,-1.000166
c,-1.135553,1.265423,-0.762658,0.68455


In [72]:
data2

Unnamed: 0,0,1,2,3
a,-0.065402,-1.754835,-0.754225,-1.687116
b,-0.933428,0.40298,-1.253601,0.651625


In [73]:
pd.concat([data1,data2])
# 'a'와 'b'가 중복 색인

Unnamed: 0,0,1,2,3
a,1.619692,0.183272,-0.494509,-0.354736
b,-1.609587,-0.077786,0.183911,-1.000166
c,-1.135553,1.265423,-0.762658,0.68455
a,-0.065402,-1.754835,-0.754225,-1.687116
b,-0.933428,0.40298,-1.253601,0.651625


In [74]:
pd.concat([data1,data2], ignore_index=True)

Unnamed: 0,0,1,2,3
0,1.619692,0.183272,-0.494509,-0.354736
1,-1.609587,-0.077786,0.183911,-1.000166
2,-1.135553,1.265423,-0.762658,0.68455
3,-0.065402,-1.754835,-0.754225,-1.687116
4,-0.933428,0.40298,-1.253601,0.651625


# 7.2 재형성과 피벗

# 7.2.1 계층적 색인으로 재형성하기
- unstack : DataFrame 형태로
- stack : Series 형태로(세로로 길게) / 색인이 계층적

In [75]:
data = pd.DataFrame(np.arange(6).reshape(2,3),
                   index = ['ohio', 'colorado'],
                    columns = ['one', 'two', 'three'])
#data.index.name = 'state'
data = data.rename_axis(index='state', columns="number")
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ohio,0,1,2
colorado,3,4,5


In [76]:
#columns -> rows
data_stk = data.stack()
data_stk

state     number
ohio      one       0
          two       1
          three     2
colorado  one       3
          two       4
          three     5
dtype: int32

In [77]:
#rows -> columns
data_stk.unstack()

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ohio,0,1,2
colorado,3,4,5


In [78]:
data_stk

state     number
ohio      one       0
          two       1
          three     2
colorado  one       3
          two       4
          three     5
dtype: int32

In [79]:
data_stk.unstack(0)

state,ohio,colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [80]:
data_stk.unstack(1)

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ohio,0,1,2
colorado,3,4,5


# 7.2.2 피버팅으로 데이터 나열방식 바꾸기
- dataframe객체.pivot(row로 보낼 열 이름, columns으로 보낼 열 이름, dataframe에 채워 넣으 value)
    + set_index로 행에 대한 index 설정 -> unstack을 사용하여 형태를 변경하는 것을 한번에 해줌
    
- dataframe객체.pivot(' ',' ')
    + 마지막 인자를 생략하여 계층적 색인을 할 수 있다. (df에 column 값이 많은 경우)

In [81]:
import sklearn.datasets as sd
import pandas as pd

In [82]:
irisdb = sd.load_iris()
iris  = pd.DataFrame(irisdb.data, columns = irisdb.feature_names)
iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [83]:
irisdb.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [84]:
target_dict = dict(zip([0,1,2], irisdb.target_names))
target_dict

{0: 'setosa', 1: 'versicolor', 2: 'virginica'}

In [85]:
target_data = [target_dict[tgdata] for tgdata in irisdb.target]
target_data

['setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',


In [86]:
iris['species'] = target_data

In [87]:
iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [88]:
#Split indices in 50, make it to 0~49, 0-49, 0~49 , ,,,  
# index를 50개씩 잘라서 0~49, 0~49, 0~49 , ,,, 으로 만든다.
iris['ind'] = [i %50 for i in iris.index]
iris[51:100]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,ind
51,6.4,3.2,4.5,1.5,versicolor,1
52,6.9,3.1,4.9,1.5,versicolor,2
53,5.5,2.3,4.0,1.3,versicolor,3
54,6.5,2.8,4.6,1.5,versicolor,4
55,5.7,2.8,4.5,1.3,versicolor,5
56,6.3,3.3,4.7,1.6,versicolor,6
57,4.9,2.4,3.3,1.0,versicolor,7
58,6.6,2.9,4.6,1.3,versicolor,8
59,5.2,2.7,3.9,1.4,versicolor,9
60,5.0,2.0,3.5,1.0,versicolor,10


In [89]:
#'ind' -> row, 'species' -> columns
iris.pivot('ind', 'species')

Unnamed: 0_level_0,sepal length (cm),sepal length (cm),sepal length (cm),sepal width (cm),sepal width (cm),sepal width (cm),petal length (cm),petal length (cm),petal length (cm),petal width (cm),petal width (cm),petal width (cm)
species,setosa,versicolor,virginica,setosa,versicolor,virginica,setosa,versicolor,virginica,setosa,versicolor,virginica
ind,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
0,5.1,7.0,6.3,3.5,3.2,3.3,1.4,4.7,6.0,0.2,1.4,2.5
1,4.9,6.4,5.8,3.0,3.2,2.7,1.4,4.5,5.1,0.2,1.5,1.9
2,4.7,6.9,7.1,3.2,3.1,3.0,1.3,4.9,5.9,0.2,1.5,2.1
3,4.6,5.5,6.3,3.1,2.3,2.9,1.5,4.0,5.6,0.2,1.3,1.8
4,5.0,6.5,6.5,3.6,2.8,3.0,1.4,4.6,5.8,0.2,1.5,2.2
5,5.4,5.7,7.6,3.9,2.8,3.0,1.7,4.5,6.6,0.4,1.3,2.1
6,4.6,6.3,4.9,3.4,3.3,2.5,1.4,4.7,4.5,0.3,1.6,1.7
7,5.0,4.9,7.3,3.4,2.4,2.9,1.5,3.3,6.3,0.2,1.0,1.8
8,4.4,6.6,6.7,2.9,2.9,2.5,1.4,4.6,5.8,0.2,1.3,1.8
9,4.9,5.2,7.2,3.1,2.7,3.6,1.5,3.9,6.1,0.1,1.4,2.5


# 7.3 데이터 변형

# 7.3.1 중복 제거하기
- dataframe객체.duplicated() : 각 row가 중복인지 True/False로 반환
- dataframe객체.drop_duplicates() : 겹치는 row를 하나만 살리고 나머지 삭제
- dataframe객체.drop_duplicates('column') : 특정 열에 기준하여 중복 삭제

default : 중복 값 중 가장 위에 있는 값을 남기고 나머지를 삭제한다.
keep = 'last' 옵션 : 가장 마지막 중복 값을 남기고 나머지를 삭제한다.

In [90]:
dudata = pd.DataFrame({'key1': ['one']*4+['two']*4,
                      'key2': [1,1,1,2,3,3,4,4]})
dudata

Unnamed: 0,key1,key2
0,one,1
1,one,1
2,one,1
3,one,2
4,two,3
5,two,3
6,two,4
7,two,4


In [91]:
dudata.duplicated()

0    False
1     True
2     True
3    False
4    False
5     True
6    False
7     True
dtype: bool

In [92]:
dudata.drop_duplicates()

Unnamed: 0,key1,key2
0,one,1
3,one,2
4,two,3
6,two,4


In [93]:
dudata

Unnamed: 0,key1,key2
0,one,1
1,one,1
2,one,1
3,one,2
4,two,3
5,two,3
6,two,4
7,two,4


In [94]:
dudata['v1'] = range(8)
dudata

Unnamed: 0,key1,key2,v1
0,one,1,0
1,one,1,1
2,one,1,2
3,one,2,3
4,two,3,4
5,two,3,5
6,two,4,6
7,two,4,7


In [95]:
dudata.drop_duplicates()

Unnamed: 0,key1,key2,v1
0,one,1,0
1,one,1,1
2,one,1,2
3,one,2,3
4,two,3,4
5,two,3,5
6,two,4,6
7,two,4,7


In [96]:
dudata.drop_duplicates(['key1', 'key2'], keep='first')

Unnamed: 0,key1,key2,v1
0,one,1,0
3,one,2,3
4,two,3,4
6,two,4,6


In [97]:
dudata

Unnamed: 0,key1,key2,v1
0,one,1,0
1,one,1,1
2,one,1,2
3,one,2,3
4,two,3,4
5,two,3,5
6,two,4,6
7,two,4,7


In [98]:
dudata.drop_duplicates(['key1','key2'],inplace=True)
dudata

Unnamed: 0,key1,key2,v1
0,one,1,0
3,one,2,3
4,two,3,4
6,two,4,6


# 7.3.2 함수나 매핑 이용해 데이터 변형하기
- `map` : 한 객체(주로 Series / 하나의 열) 안에서 값의 부분집합을 변경하는데 사용
- Series객체.map(값을 변경할 dict, 함수 등등)

In [99]:
dudata1 = pd.DataFrame({'key1': ['one']*4+['two']*4,
                      'key2': [1,1,1,2,3,3,4,4]})
dudata1

Unnamed: 0,key1,key2
0,one,1
1,one,1
2,one,1
3,one,2
4,two,3
5,two,3
6,two,4
7,two,4


In [100]:
dudata1['key2']

0    1
1    1
2    1
3    2
4    3
5    3
6    4
7    4
Name: key2, dtype: int64

In [101]:
dudata1['key2'].map({1: 100, 2:200})

0    100.0
1    100.0
2    100.0
3    200.0
4      NaN
5      NaN
6      NaN
7      NaN
Name: key2, dtype: float64

In [102]:
dudata1['key1'].map('I am {}'.format)

0    I am one
1    I am one
2    I am one
3    I am one
4    I am two
5    I am two
6    I am two
7    I am two
Name: key1, dtype: object

In [103]:
dudata1['key2'].map(lambda x: x if x < 4 else 1000)

0       1
1       1
2       1
3       2
4       3
5       3
6    1000
7    1000
Name: key2, dtype: int64

# 7.3.3 값 치환하기
- dataframe객체.replace(바꾸고 싶은 값, 바꿀 값) : 바꿀 값이 여러 개인 경우 인자에 리스트를 넘긴다.
    + 주로 df.replace(바꾸고 싶은 값, np.nan)으로 치환 후 df.fillna()를 사용하여 치환한다.

In [104]:
ovdata = pd.DataFrame({'a': [1,1,2,3,4,5,6,6,7],
                      'b': [3,4,4,5,2,1,4,6,3]})
ovdata

Unnamed: 0,a,b
0,1,3
1,1,4
2,2,4
3,3,5
4,4,2
5,5,1
6,6,4
7,6,6
8,7,3


In [105]:
ovdata.replace([1,6], ['new 1', 'new 6'])

Unnamed: 0,a,b
0,new 1,3
1,new 1,4
2,2,4
3,3,5
4,4,2
5,5,new 1
6,new 6,4
7,new 6,new 6
8,7,3


In [106]:
ovdata.replace({1: 'new 1 by dict', 6: 'new 6 by dict'})

Unnamed: 0,a,b
0,new 1 by dict,3
1,new 1 by dict,4
2,2,4
3,3,5
4,4,2
5,5,new 1 by dict
6,new 6 by dict,4
7,new 6 by dict,new 6 by dict
8,7,3


# 7.3.4 축 색인 이름 바꾸기
- df.rename(index = '' , columns = '', inplace=T/F)
    + 일부만 변경하고 싶을 때: df.rename(index = {변경하고자 하는 인덱스: 변경하려는 인덱스 이름} , columns = '' )

In [107]:
redata = pd.DataFrame(np.random.randint(0,10,(3,4)),
                     index = list('abc'),
                     columns = list('ABCD'))
redata

Unnamed: 0,A,B,C,D
a,7,8,2,0
b,7,5,4,6
c,0,0,1,4


In [108]:
redata.rename(columns = {'A' : 'A_re', 'B': 'B_res'},
             index = {'c': 'c_re'})

Unnamed: 0,A_re,B_res,C,D
a,7,8,2,0
b,7,5,4,6
c_re,0,0,1,4


In [109]:
redata

Unnamed: 0,A,B,C,D
a,7,8,2,0
b,7,5,4,6
c,0,0,1,4


In [110]:
redata.rename(columns = {'A': 'A_re'}, inplace = True)
redata

Unnamed: 0,A_re,B,C,D
a,7,8,2,0
b,7,5,4,6
c,0,0,1,4


# 7.3.5 개별화와 양자화
분류나 그룹화가 필요한 연속형 데이터를 다룰 때 적합
- pd.cut(데이터, 분할 기준, right = T/F, labels = ['분할 구간마다 부여할 이름'])

In [111]:
ages = np.random.randint(40, size =20)
ages

array([36,  3, 18, 24, 21,  0, 18, 15, 38,  8, 34, 35,  6, 24, 10, 39, 38,
        6, 21, 15])

In [112]:
classify = np.arange(0,50,10)
classify

array([ 0, 10, 20, 30, 40])

In [113]:
cats = pd.cut(ages, classify)
cats

[(30, 40], (0, 10], (10, 20], (20, 30], (20, 30], ..., (30, 40], (30, 40], (0, 10], (20, 30], (10, 20]]
Length: 20
Categories (4, interval[int64]): [(0, 10] < (10, 20] < (20, 30] < (30, 40]]

In [114]:
ages.min()

0

In [115]:
ages.max()

39

In [116]:
cats.codes

array([ 3,  0,  1,  2,  2, -1,  1,  1,  3,  0,  3,  3,  0,  2,  0,  3,  3,
        0,  2,  1], dtype=int8)

In [117]:
cats.categories

IntervalIndex([(0, 10], (10, 20], (20, 30], (30, 40]],
              closed='right',
              dtype='interval[int64]')

In [118]:
# ']'이 포함 ')' 포함하지 않음
# right = False : 각 구간의 오른쪽에 ']'을 사용하지 않고 ')'를 사용하겠다.
cats2 = pd.cut(ages, classify, right =False)
cats2.categories

IntervalIndex([[0, 10), [10, 20), [20, 30), [30, 40)],
              closed='left',
              dtype='interval[int64]')

In [119]:
ages

array([36,  3, 18, 24, 21,  0, 18, 15, 38,  8, 34, 35,  6, 24, 10, 39, 38,
        6, 21, 15])

In [120]:
cats2.codes

array([3, 0, 1, 2, 2, 0, 1, 1, 3, 0, 3, 3, 0, 2, 1, 3, 3, 0, 2, 1],
      dtype=int8)

In [121]:
cat_label_result = pd.cut(ages, classify, 
                          labels = ['youth', 'youngadult', 'middleaged', 'senior' ])

In [122]:
cat_label_result.categories

Index(['youth', 'youngadult', 'middleaged', 'senior'], dtype='object')

In [123]:
cat_label_result.codes

array([ 3,  0,  1,  2,  2, -1,  1,  1,  3,  0,  3,  3,  0,  2,  0,  3,  3,
        0,  2,  1], dtype=int8)

In [124]:
cat_label_result.categories[cat_label_result.codes]

Index(['senior', 'youth', 'youngadult', 'middleaged', 'middleaged', 'senior',
       'youngadult', 'youngadult', 'senior', 'youth', 'senior', 'senior',
       'youth', 'middleaged', 'youth', 'senior', 'senior', 'youth',
       'middleaged', 'youngadult'],
      dtype='object')

pd.cut() vs pd.qcut()
- pd.cut() : 사용자가 지정한 구간. 구간마다 들어간 데이터 크기가 다를 수 있다.
- pd.qcut() : 해당 데이터 집단의 quantile로 구간을 정한다. 구간마다 데이터 크기가 비슷

In [125]:
randata = np.random.randn(500)
randata

array([ 1.75200215e+00, -4.99188849e-01, -7.85117353e-01,  6.95346948e-01,
       -1.73783555e+00,  1.15937842e-01, -2.78431273e-01, -9.52648969e-01,
        1.81871948e-02,  6.43364384e-01,  2.88987252e-01,  8.54362103e-01,
        7.88241397e-01, -6.68806467e-01,  2.41658348e+00, -1.64292078e+00,
        1.01851430e+00, -2.76615099e+00, -2.26540133e-01, -3.25381747e-01,
        3.11876414e-01, -9.63040516e-01,  1.21957832e+00, -3.42965918e-01,
        3.78115332e-02, -1.22883074e+00, -1.12102367e-01, -1.61784945e+00,
       -6.19345906e-01, -6.38850855e-01, -5.01962903e-01, -1.24136489e+00,
        2.07144458e+00, -1.53726879e-01,  6.37192377e-01,  5.43418788e-01,
       -1.36126784e+00, -3.05260856e-01, -4.20725188e-01,  4.56524746e-01,
       -4.67763888e-01,  1.34368359e+00,  9.14300880e-01, -1.55424246e+00,
        5.52012853e-01,  7.69749476e-01,  5.16823674e-01,  2.42901178e-01,
        8.98826925e-01, -1.25926194e+00, -2.84998237e-01,  3.91187748e-01,
       -3.93445609e-01, -

In [126]:
qarr = pd.qcut(randata, 4)
qarr.categories

IntervalIndex([(-2.767, -0.7], (-0.7, -0.0641], (-0.0641, 0.552], (0.552, 3.25]],
              closed='right',
              dtype='interval[float64]')

In [127]:
qarr.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
"(-2.767, -0.7]",125,0.25
"(-0.7, -0.0641]",125,0.25
"(-0.0641, 0.552]",125,0.25
"(0.552, 3.25]",125,0.25


In [128]:
pd.value_counts(pd.qcut(randata, 4))

(0.552, 3.25]       125
(-0.0641, 0.552]    125
(-0.7, -0.0641]     125
(-2.767, -0.7]      125
dtype: int64

# 7.3.6 특이값 찾아내고 제외하기

In [129]:
np.random.seed(123)

In [130]:
dataf = pd.DataFrame(np.random.randn(1000, 4))
dataf

Unnamed: 0,0,1,2,3
0,-1.085631,0.997345,0.282978,-1.506295
1,-0.578600,1.651437,-2.426679,-0.428913
2,1.265936,-0.866740,-0.678886,-0.094709
3,1.491390,-0.638902,-0.443982,-0.434351
4,2.205930,2.186786,1.004054,0.386186
...,...,...,...,...
995,-0.499897,0.587647,-0.926542,1.736982
996,-0.459550,0.125822,-1.119947,-0.521887
997,-2.013430,-0.028708,-0.103142,-1.761313
998,-0.185167,0.504077,1.354567,-0.907952


In [131]:
dataf[3][np.abs(dataf[3]>2)]

25     2.598304
64     2.958625
182    3.571579
272    2.273105
287    2.555894
348    2.271034
358    2.079539
647    2.420233
670    2.355236
718    2.521371
768    2.717429
785    2.330464
797    2.104499
832    2.710260
856    2.393570
877    2.274703
924    2.018714
948    2.540514
Name: 3, dtype: float64

In [132]:
dataf.columns = ['a','b','c','d']
dataf

Unnamed: 0,a,b,c,d
0,-1.085631,0.997345,0.282978,-1.506295
1,-0.578600,1.651437,-2.426679,-0.428913
2,1.265936,-0.866740,-0.678886,-0.094709
3,1.491390,-0.638902,-0.443982,-0.434351
4,2.205930,2.186786,1.004054,0.386186
...,...,...,...,...
995,-0.499897,0.587647,-0.926542,1.736982
996,-0.459550,0.125822,-1.119947,-0.521887
997,-2.013430,-0.028708,-0.103142,-1.761313
998,-0.185167,0.504077,1.354567,-0.907952


In [133]:
np.abs(dataf) > 3

Unnamed: 0,a,b,c,d
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
995,False,False,False,False
996,False,False,False,False
997,False,False,False,False
998,False,False,False,False


In [134]:
# 각 열에 대해 해당 조건이 만족하는지
# default:axis=0
( np.abs(dataf) > 3).any()

a     True
b    False
c     True
d     True
dtype: bool

In [135]:
# 각 행에 대해 해당 조건이 만족하는지
(np.abs(dataf)>3).any(1)

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool

np.sign(dataframe객체) : 음수는 -1로 양수는 +1로 바꿔준다.

In [136]:
np.sign(dataf)

Unnamed: 0,a,b,c,d
0,-1.0,1.0,1.0,-1.0
1,-1.0,1.0,-1.0,-1.0
2,1.0,-1.0,-1.0,-1.0
3,1.0,-1.0,-1.0,-1.0
4,1.0,1.0,1.0,1.0
...,...,...,...,...
995,-1.0,1.0,-1.0,1.0
996,-1.0,1.0,-1.0,-1.0
997,-1.0,-1.0,-1.0,-1.0
998,-1.0,1.0,1.0,-1.0


In [137]:
np.sign(dataf)*3

Unnamed: 0,a,b,c,d
0,-3.0,3.0,3.0,-3.0
1,-3.0,3.0,-3.0,-3.0
2,3.0,-3.0,-3.0,-3.0
3,3.0,-3.0,-3.0,-3.0
4,3.0,3.0,3.0,3.0
...,...,...,...,...
995,-3.0,3.0,-3.0,3.0
996,-3.0,3.0,-3.0,-3.0
997,-3.0,-3.0,-3.0,-3.0
998,-3.0,3.0,3.0,-3.0


In [138]:
dataf[np.abs(dataf)>3] = np.sign(dataf)*3
dataf
# 최댓값을 3, 최솟값을 -3으로 변환하여 주는 것과 같다.

Unnamed: 0,a,b,c,d
0,-1.085631,0.997345,0.282978,-1.506295
1,-0.578600,1.651437,-2.426679,-0.428913
2,1.265936,-0.866740,-0.678886,-0.094709
3,1.491390,-0.638902,-0.443982,-0.434351
4,2.205930,2.186786,1.004054,0.386186
...,...,...,...,...
995,-0.499897,0.587647,-0.926542,1.736982
996,-0.459550,0.125822,-1.119947,-0.521887
997,-2.013430,-0.028708,-0.103142,-1.761313
998,-0.185167,0.504077,1.354567,-0.907952


# 7.3.7 치환과 임의 샘플링
np.random.permutation(바꾸고 싶은 만큼의 길이)

In [139]:
pdata = pd.DataFrame(np.arange(20*4).reshape(20,4))
pdata

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19
5,20,21,22,23
6,24,25,26,27
7,28,29,30,31
8,32,33,34,35
9,36,37,38,39


In [140]:
#0~3 사이에서 랜덤으로 순서를 매겨 반환
sampler = np.random.permutation(4)
sampler

array([3, 1, 2, 0])

In [141]:
#해당하는 row를 가져온다
pdata.take(sampler)

Unnamed: 0,0,1,2,3
3,12,13,14,15
1,4,5,6,7
2,8,9,10,11
0,0,1,2,3


In [142]:
#iloc 사용 가능
pdata.iloc[sampler]

Unnamed: 0,0,1,2,3
3,12,13,14,15
1,4,5,6,7
2,8,9,10,11
0,0,1,2,3


'임의로' 뽑아서 '일부분만' 재배치 하고 싶은 경우
1. np.random.permutation(len(df)): 행 전체에 대해 랜덤 순서를 매긴다
2. np.random.permutation(len(df))[:10] : 랜덤 순서에서 앞의 10개만 뽑는다.

= 즉, 전체에서 랜덤하게 10개를 뽑아 배치시키는 것과 같다.

In [143]:
pdata.take(np.random.permutation(len(pdata))[:10])

Unnamed: 0,0,1,2,3
17,68,69,70,71
19,76,77,78,79
10,40,41,42,43
3,12,13,14,15
16,64,65,66,67
15,60,61,62,63
8,32,33,34,35
18,72,73,74,75
7,28,29,30,31
12,48,49,50,51


# 7.3.8 표시자/더미 변수
데이터가 범주형인 경우: 해당 열에 k가지의 범주형 값이 있을 때 k개의 열이 있는 df를 만들고 0과 1fh codnsms rjt
- pd.get_dummies()

In [144]:
df = pd.DataFrame({'key' : list('bbacab'),
                  'data1': range(6,12)})
df

Unnamed: 0,key,data1
0,b,6
1,b,7
2,a,8
3,c,9
4,a,10
5,b,11


In [145]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
key      6 non-null object
data1    6 non-null int64
dtypes: int64(1), object(1)
memory usage: 136.0+ bytes


In [146]:
pd.get_dummies(df)

Unnamed: 0,data1,key_a,key_b,key_c
0,6,0,1,0
1,7,0,1,0
2,8,1,0,0
3,9,0,0,1
4,10,1,0,0
5,11,0,1,0


In [147]:
#열 하나만 지정하여 원핫인코딩 할 수도 있다.
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


여러 범주형 열을 넣는 경우

In [148]:
many_categories = pd.DataFrame({'col1': list('abccbaab'),
                               'col2': list('ABBBCAAB')})
many_categories

Unnamed: 0,col1,col2
0,a,A
1,b,B
2,c,B
3,c,B
4,b,C
5,a,A
6,a,A
7,b,B


In [149]:
pd.get_dummies(many_categories)

Unnamed: 0,col1_a,col1_b,col1_c,col2_A,col2_B,col2_C
0,1,0,0,1,0,0
1,0,1,0,0,1,0
2,0,0,1,0,1,0
3,0,0,1,0,1,0
4,0,1,0,0,0,1
5,1,0,0,1,0,0
6,1,0,0,1,0,0
7,0,1,0,0,1,0


get_dummies와 cut 같이 사용하기

In [150]:
values = np.random.randint(10, size=10)
values

array([8, 0, 9, 2, 2, 2, 9, 3, 3, 3])

In [151]:
bins = [i for i in range(0,11,2)]
bins

[0, 2, 4, 6, 8, 10]

In [154]:
pd.cut(values, bins)

[(6.0, 8.0], NaN, (8.0, 10.0], (0.0, 2.0], (0.0, 2.0], (0.0, 2.0], (8.0, 10.0], (2.0, 4.0], (2.0, 4.0], (2.0, 4.0]]
Categories (5, interval[int64]): [(0, 2] < (2, 4] < (4, 6] < (6, 8] < (8, 10]]

In [155]:
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0, 2]","(2, 4]","(4, 6]","(6, 8]","(8, 10]"
0,0,0,0,1,0
1,0,0,0,0,0
2,0,0,0,0,1
3,1,0,0,0,0
4,1,0,0,0,0
5,1,0,0,0,0
6,0,0,0,0,1
7,0,1,0,0,0
8,0,1,0,0,0
9,0,1,0,0,0


# 7.4.1 문자열 다루기
- split()
- strip() : 공백문자를 제거
- "".join(list) : 리스트 안의 문자열 요소끼리 사이를 " "으로 채워 연결하여 문자열 반환

In [156]:
a = 'python is so easy'
a.split(' ')

['python', 'is', 'so', 'easy']

내부에서 문자열 찾아 인덱스 반환하기
- 문자열.index('찾으려는 문자') : 문자열의 인덱스 반환. 없는 경우 error 발생
- 문자열.find('찾으려는 문자') : 문자열의 인덱스 반환. 없는 경우 -1 반환

In [157]:
a.index('o')

4

In [158]:
a.find('o')

4

In [159]:
a.find('Z')

-1

In [160]:
split_a = a.split(' ')
split_a

['python', 'is', 'so', 'easy']

In [162]:
adot = '::'.join(split_a)
adot

'python::is::so::easy'

In [163]:
a.count('o')

2

In [164]:
a[:4].upper() + a[4:]

'PYTHon is so easy'

In [165]:
a.lower()

'python is so easy'

# 7.4.2 정규표현식
패턴(정규표현식)을 지정하여 텍스트를 걸러내는 작업

regex : 단일표현식/정규표현언어로 구성된 문자열

re모듈함수
1. 패턴 매칭
2. 치환 분리 : regex.sub('대체할 텍스트', '가공할 문자열') '가공할 문자열'에서 regex(패턴)에 해당하는 텍스트를 '대체할 텍스트'로 치환
3. 분리

정규표현식 메서드
- 정규표현식 객체 이름(regex) = re.compile(패턴 문자열, flags=)
- regex.split(가공할 문자열) : 패턴과 일치하는 것으로 분리
- regex.findall(가공할 문자열) : 패턴과 일치하는 것 모두 찾기
- regex.search(가공할 문자열) : 패턴과 일치하는 첫 번째 요소 찾기
- regex.match(가공할 문자열) : 문자 처음 부분부터 찾는다. 만약 첫 부분부터 정확히 일치하지 않는다면 None 반환
- regex.sub('대체할 문자열', '가공할 문자열') : 패턴에 일치하는 것들을 대체할 문자열로 모두 치환

In [166]:
import re

In [169]:
texts = 'foo    bar\t baz   \tqux'
print(texts)

foo    bar	 baz   	qux


In [170]:
re.split('\s+', texts)

['foo', 'bar', 'baz', 'qux']

정규표현식을 객체로 저장하여 사용 가능

In [171]:
regex = re.compile('\s+')

In [173]:
regex.split(texts)

['foo', 'bar', 'baz', 'qux']

In [174]:
regex.findall(texts)

['    ', '\t ', '   \t']

In [176]:
regex.search(texts)

<re.Match object; span=(3, 7), match='    '>

In [177]:
txt = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gamil.com"""

In [178]:
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

In [179]:
regex2 = re.compile(pattern, flags = re.IGNORECASE)
# flags = re.IGNORECASE 대소문자 구분없이

In [180]:
regex2.split(txt)

['Dave ', '\nSteve ', '\nRob ', '']

In [181]:
regex2.findall(txt)

['dave@google.com', 'steve@gmail.com', 'rob@gamil.com']

In [182]:
regex2.search(txt)

<re.Match object; span=(5, 20), match='dave@google.com'>

In [183]:
m = regex2.search(txt)
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [184]:
txt[m.start():m.end()]

'dave@google.com'

In [186]:
print(regex2.match(txt))

None


패턴에 해당하는 문자열 치환하기

regex.sub('치환할 문자열', '문자열')

In [187]:
print(txt)

Dave dave@google.com
Steve steve@gmail.com
Rob rob@gamil.com


In [188]:
print(regex2.sub('치환하고 붙일 요소', txt))

Dave 치환하고 붙일 요소
Steve 치환하고 붙일 요소
Rob 치환하고 붙일 요소


In [190]:
pattern2 = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [191]:
regex3 = re.compile(pattern2, flags = re.IGNORECASE)

In [193]:
m2 = regex3.match('lhj950211@naver.com')
m2

<re.Match object; span=(0, 19), match='lhj950211@naver.com'>

In [194]:
m2.groups()

('lhj950211', 'naver', 'com')

In [195]:
regex3.split('lhj950211@naver.com')

['', 'lhj950211', 'naver', 'com', '']

In [196]:
regex3.findall(txt)

[('dave', 'google', 'com'), ('steve', 'gmail', 'com'), ('rob', 'gamil', 'com')]

In [198]:
print(regex3.sub(r'유저이름 : \1 , 도메인 : \2, suffix : \3 ', txt))

Dave 유저이름 : dave , 도메인 : google, suffix : com 
Steve 유저이름 : steve , 도메인 : gmail, suffix : com 
Rob 유저이름 : rob , 도메인 : gamil, suffix : com 
