# 7장 데이터 준비하기: 다듬기, 변형, 병합
## 데이터 변형

In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame, Series

___
## 1. 중복 제거하기

In [2]:
data = DataFrame({'k1': ['one']*3 + ['two']*4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [3]:
data.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [4]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


- 상기 메서드는 모든 칼럼에 적용

In [7]:
data['v1'] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [10]:
data.drop_duplicates()

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [8]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
3,two,3,3


In [12]:
data.drop_duplicates(['k1', 'k2'])

Unnamed: 0,k1,k2,v1
0,one,1,0
2,one,2,2
3,two,3,3
5,two,4,5


In [11]:
data.drop_duplicates(['k1', 'k2'], keep = 'last')

Unnamed: 0,k1,k2,v1
1,one,1,1
2,one,2,2
4,two,3,4
6,two,4,6


- keep = 'last'로 처음 값 대신 마지막 값 유지
___
## 2. 함수나 매핑 이용해 데이터 변형하기

In [13]:
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'],
                 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [20]:
meat_to_animal = {'bacon': 'pig',
                 'pulled pork': 'pig',
                 'pastrami': 'cow',
                 'corned beef': 'cow',
                 'honey ham': 'pig',
                 'nova lox': 'salmon'}
meat_to_animal

{'bacon': 'pig',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon',
 'pastrami': 'cow',
 'pulled pork': 'pig'}

In [21]:
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


- Series의 map 메서드는 사전류의 객체나 함수를 받을 수 있음
- 소문자로 변경 함수 > meat_to_animal과 매핑

In [25]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

- 위와 같이 lambda 함수를 사용해도 결과는 같음
___
## 3. 값 치환하기

In [27]:
data = Series([1, -999, 2, -999, -1000, 3])
data

0       1
1    -999
2       2
3    -999
4   -1000
5       3
dtype: int64

In [28]:
data.replace(-999,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

- replace 메서드로 특정 값을 원하는 값으로 치환

In [29]:
data.replace([-999,-1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [30]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [31]:
data.replace({-999: np.nan, -10000: 0})

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

- 복수의 값 치환도 가능
- 리스트나 딕셔너리 형대로 복수의 원하는 값 지정도 가능
___
## 4. 축 색인 이름 바꾸기

In [34]:
data = DataFrame(np.arange(12).reshape(3,4),
                index = ['Ohio', 'Colorado', 'New York'],
                columns = ['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [38]:
data.index = data.index.map(str.upper)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


- Series와 마찬가지로 축 색인에도 map 메서드 사용 가능

In [39]:
data.rename(index = str.title, columns = str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


- str.title은 대문자/소문자 혼용해서 단어 표기

In [42]:
data.rename(index = {'Ohio': 'Indiana'},
            columns = {'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


- 딕셔너리 활용해서 원하는 값만 수정하는 것도 가능

In [44]:
_ = data.rename(index = {'OHIO': 'Indiana'}, inplace = True)
data

Unnamed: 0,one,two,three,four
Indiana,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


- inplace = True 옵션으로 바로 원본 데이터를 수정
___
## 5. 개별화와 양자화
- 연속성 데이터는 분석을 위해 그룹별로 나누기도 하며, pandas에서 해당 기능 제공

In [46]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25 ,35 ,60 ,100]

In [48]:
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

- Pandas의 categorical 객체를 반환

In [50]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [51]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

In [53]:
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

- 상기 객체는 그룹 이름이 담긴 배열과 유사
- R의 factor와 유사

In [54]:
pd.cut(ages, bins, right = False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

- right = False 옵션으로 대괄호와 괄호 위치 변경 가능
- 대괄호는 포함, 괄호는 포함하지 않음

In [55]:
group_names = ['Youth', 'Young_Adult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels= group_names)

[Youth, Youth, Youth, Young_Adult, Youth, ..., Young_Adult, Senior, MiddleAged, MiddleAged, Young_Adult]
Length: 12
Categories (4, object): [Youth < Young_Adult < MiddleAged < Senior]

- Category 이름 지정도 가능 **(R의 Factor와 유사!)**

In [58]:
pd.cut(ages, 4, precision= 0)

[(20.0, 30.0], (20.0, 30.0], (20.0, 30.0], (20.0, 30.0], (20.0, 30.0], ..., (30.0, 40.0], (51.0, 61.0], (40.0, 51.0], (40.0, 51.0], (30.0, 40.0]]
Length: 12
Categories (4, interval[float64]): [(20.0, 30.0] < (30.0, 40.0] < (40.0, 51.0] < (51.0, 61.0]]

- 범위를 명시하지 않고 구간 수만 지정해 주면 알아서 균등분배 수행

In [59]:
data = np.random.randn(1000)
data

array([ 1.30855094, -0.1795363 ,  0.43819842,  0.62439128,  0.20066995,
       -1.06400002, -0.21456048,  0.15783572, -0.69425238, -0.43162392,
        0.88354497, -0.44411045,  1.70309434,  0.91975934,  1.64717624,
        0.34083714,  0.05174917, -1.54058968, -0.4464384 ,  0.56840779,
       -0.54552677, -0.1077738 , -1.56803082, -0.73145512, -0.74489842,
        0.0945617 ,  0.45501831, -0.63333548, -0.69232011,  0.35704707,
       -0.48210368, -0.26597348, -0.51377582, -1.17146603,  0.86739283,
       -1.89892834, -0.47809189, -0.29748159,  0.07631498, -0.08699996,
        1.0890907 , -0.05199775, -0.4533215 ,  0.26553623,  0.42380786,
       -1.65260683, -0.63009022,  0.40389784,  0.71426838,  1.46561554,
       -0.18361714, -0.82268165, -0.48244584,  1.56814687, -1.00472165,
        1.02534212,  0.1913962 , -0.65464182,  0.40219685,  0.12756573,
       -0.3091932 , -1.6395116 ,  0.38344744, -0.23214321, -1.72781736,
       -0.50237515, -0.51393403,  1.55004991,  1.18525831,  0.20

In [60]:
pd.qcut(data, 4)

[(0.668, 3.112], (-0.631, 0.015], (0.015, 0.668], (0.015, 0.668], (0.015, 0.668], ..., (-0.631, 0.015], (-2.705, -0.631], (-2.705, -0.631], (-2.705, -0.631], (0.668, 3.112]]
Length: 1000
Categories (4, interval[float64]): [(-2.705, -0.631] < (-0.631, 0.015] < (0.015, 0.668] < (0.668, 3.112]]

In [64]:
pd.value_counts(pd.qcut(data, 4))

(0.668, 3.112]      250
(0.015, 0.668]      250
(-0.631, 0.015]     250
(-2.705, -0.631]    250
dtype: int64

- pd.cut 대신 pd.qcut이란 메서드도 있음
- pd.cut은 값 기준으로 데이터를 나누는 반면, pd.qcut은 수량 기준으로 데이터를 나눔
- pd.cut은 *표본* 변위치 기반, pd.qcut은 *표준* 변위치 사용

In [65]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1])

[(1.246, 3.112], (-1.194, 0.015], (0.015, 1.246], (0.015, 1.246], (0.015, 1.246], ..., (-1.194, 0.015], (-1.194, 0.015], (-1.194, 0.015], (-2.705, -1.194], (0.015, 1.246]]
Length: 1000
Categories (4, interval[float64]): [(-2.705, -1.194] < (-1.194, 0.015] < (0.015, 1.246] < (1.246, 3.112]]

- qcut에서 0~1까지 변위치 지정 가능
___
## 6. 특이값 찾아내고 제외하기

In [68]:
np.random.seed(12345)
data = DataFrame(np.random.randn(1000,4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.366626,2.653656,3.260383,3.927528


In [70]:
col = data[3]
col[np.abs(col)>3]

97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64

- N(0,1)에서 3이 넘는 값은 이상치로 판정

In [76]:
data[(np.abs(data)>3).any(1)]

Unnamed: 0,0,1,2,3
5,-0.539741,0.476985,3.248944,-1.021228
97,-0.774363,0.552936,0.106061,3.927528
102,-0.655054,-0.56523,3.176873,0.959533
305,-2.315555,0.457246,-0.025907,-3.399312
324,0.050188,1.951312,3.260383,0.963301
400,0.146326,0.508391,-0.196713,-3.745356
499,-0.293333,-0.242459,-3.05699,1.918403
523,-3.428254,-0.296336,-0.439938,-0.867165
586,0.275144,1.179227,-3.184377,1.369891
808,-0.362528,-3.548824,1.553205,-2.186301


- 조건에 부합하는 로우(값) 찾는데 any 메서드 사용

In [80]:
data[np.abs(data)>3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067623,0.068473,0.025153,-0.002081
std,0.995485,0.990253,1.003977,0.989736
min,-3.0,-3.0,-3.0,-3.0
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.0,2.653656,3.0,3.0


- np.sign 함수는 특정 값을 1 또는 -1로 반환
- np.sign에 3을 곱해 abs(3) 초과 값들을 abs(3)으로 상한치 부여
___
## 7. 치환과 임의 샘플링

In [81]:
df = DataFrame(np.arange(5*4).reshape(5,4))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [82]:
sampler = np.random.permutation(5)
sampler

array([1, 0, 2, 3, 4])

In [83]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [95]:
df.take(sampler[:3], axis=1)

Unnamed: 0,1,0,2
0,1,0,2
1,5,4,6
2,9,8,10
3,13,12,14
4,17,16,18


- np.random.permutation은 지정 값 만큼의 길이로 이루어진 랜덤 정수 배열을 생성
- 이를 활용해서 take 함수에서 사용 가능

In [97]:
df.take(np.random.permutation(len(df)))

Unnamed: 0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
4,16,17,18,19
0,0,1,2,3
2,8,9,10,11


In [98]:
bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size = 10)
sampler

array([4, 4, 2, 2, 2, 0, 3, 0, 4, 1])

In [100]:
draws = bag.take(sampler)
draws

array([ 4,  4, -1, -1, -1,  5,  6,  5,  4,  7])

- 표본생성에 randint, randn 등의 함수 사용 가능
___
## 8. 표시자/더미 변수

In [101]:
df = DataFrame({'key': list('bbacab'),
               'data1': range(6)})
df

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,b


In [102]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


- get_dummies 함수로 factor를 더미변수화함

In [103]:
dummies = pd.get_dummies(df['key'], prefix= 'key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [116]:
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


- prefix 인자로 접두어 추가 가능
- join 함수로 열 병합

___
### MovieLens 데이터로 복수의 카테고리 더미변수화 연습

In [126]:
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('d:/git/Repository/Jupyter/Python_for_DA/data/movielens/movies.dat', sep = '::',
                      header = None, names = mnames)
movies[:10]

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [127]:
genre_iter = (set(x.split('|')) for x in movies.genres)
genre_iter

<generator object <genexpr> at 0x0000021EF3664990>

In [129]:
genres = sorted(set.union(*genre_iter))
genres

['Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [134]:
dummies = DataFrame(np.zeros((len(movies), len(genres))), columns = genres)
dummies.head()

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [135]:
for i, gen in enumerate(movies.genres):
    dummies.ix[i, gen.split('|')] = 1

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


In [138]:
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.head()

Unnamed: 0,movie_id,title,genres,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Children's,Genre_Comedy,Genre_Crime,Genre_Documentary,...,Genre_Fantasy,Genre_Film-Noir,Genre_Horror,Genre_Musical,Genre_Mystery,Genre_Romance,Genre_Sci-Fi,Genre_Thriller,Genre_War,Genre_Western
0,1,Toy Story (1995),Animation|Children's|Comedy,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
