# 9. 데이터 수집과 그룹 연산
> ## 그룹별 연산과 변형

In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame, Series

In [2]:
# Sample frame: two grouping-key columns (key1, key2) and two columns of
# random normal values (data1, data2).
df = DataFrame({'key1': list('aabba'),
               'key2': ['one', 'two', 'one', 'two', 'one'],
               'data1': np.random.randn(5),
               'data2': np.random.randn(5)})

df

Unnamed: 0,data1,data2,key1,key2
0,1.264161,-2.500765,a,one
1,1.483911,-0.252438,a,two
2,-1.162303,-0.605459,b,one
3,0.824166,-0.486711,b,two
4,0.911748,-0.58048,a,one


In [7]:
# 5x5 frame of random normal values: one row per person, columns 'a'..'e'.
people = DataFrame(np.random.randn(5, 5),
                  columns = list('abcde'),
                  index = ['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Blank out the columns at positions 1 and 2 ('b', 'c') for the row at
# position 2 ('Wes') so the frame contains missing values.
people.iloc[2:3, [1,2]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,0.161403,1.171053,-0.120861,1.222924,-1.889697
Steve,0.92656,0.411587,1.463412,-0.681698,-0.607678
Wes,0.261349,,,-1.47076,0.830559
Jim,0.935802,1.426548,-1.630077,-0.128581,0.527958
Travis,-0.213895,0.664486,1.493975,-1.165562,-0.748953


In [3]:
# Per-key1 group means; add_prefix renames the result columns to 'mean_<column>'.
k1_means = df.groupby('key1').mean().add_prefix('mean_')
k1_means

Unnamed: 0_level_0,mean_data1,mean_data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.21994,-1.111228
b,-0.169069,-0.546085


In [5]:
pd.merge(df, k1_means, left_on = 'key1', right_index= True)

Unnamed: 0,data1,data2,key1,key2,mean_data1,mean_data2
0,1.264161,-2.500765,a,one,1.21994,-1.111228
1,1.483911,-0.252438,a,two,1.21994,-1.111228
4,0.911748,-0.58048,a,one,1.21994,-1.111228
2,-1.162303,-0.605459,b,one,-0.169069,-0.546085
3,0.824166,-0.486711,b,two,-0.169069,-0.546085


- 집계 함수 칼럼 병합을 위해 먼저 집계 후 병합하는 과정이 일반적임

In [11]:
# Group labels supplied as a plain list: one label per row of `people`, in row order.
key = ['one', 'two', 'one', 'two', 'one']
people.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,0.069619,0.917769,0.686557,-0.471132,-0.602697
two,0.931181,0.919068,-0.083332,-0.40514,-0.03986


___
## 1. apply: 분리 - 적용 - 병합

- apply 메서드는 객체를 여러 조각으로 나누어 전달된 함수를 각 조각에 일괄적으로 적용 후 병합

In [13]:
# Load the tipping data set from a path relative to the working directory.
tips = pd.read_csv('data/tips.csv')
# Tip expressed as a fraction of the total bill.
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


In [14]:
def top(df, n=5, column = 'tip_pct'):
    """Return the ``n`` rows of ``df`` holding the largest values of ``column``.

    The rows are sorted ascending by ``column`` and the last ``n`` are kept,
    so the row with the largest value comes last in the result.

    Parameters
    ----------
    df : DataFrame
        Frame (or one group of a groupby) to select rows from.
    n : int, default 5
        Number of rows to keep. ``0`` yields an empty frame.
    column : str, default 'tip_pct'
        Name of the column to rank by.

    Returns
    -------
    DataFrame
        At most ``n`` rows of ``df``, with the original index labels.
    """
    # tail(n) instead of [-n:]: for n == 0 the slice [-0:] is [0:] and would
    # return every row rather than none. For any other n the two are the same.
    return df.sort_values(by=column).tail(n)

In [15]:
top(tips, n = 6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [20]:
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


- top 함수가 그룹 별로 적용 후 pandas.concat을 이용해 하나로 합쳐짐

In [25]:
tips.groupby(['smoker','day']).apply(top, n = 1, column = 'total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


- apply 메서드에 넘길 함수가 추가 인자를 요구한다면 함수 이름 뒤에 인자로 넘겨주면 됨 (예: `apply(top, n = 1, column = 'total_bill')`)

In [26]:
tips.groupby('smoker', group_keys= False).apply(top)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
88,24.71,5.85,No,Thur,Lunch,2,0.236746
185,20.69,5.0,No,Sun,Dinner,5,0.241663
51,10.29,2.6,No,Sun,Dinner,2,0.252672
149,7.51,2.0,No,Thur,Lunch,2,0.266312
232,11.61,3.39,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [27]:
tips.groupby('smoker', as_index= False).apply(top)

Unnamed: 0,Unnamed: 1,total_bill,tip,smoker,day,time,size,tip_pct
0,88,24.71,5.85,No,Thur,Lunch,2,0.236746
0,185,20.69,5.0,No,Sun,Dinner,5,0.241663
0,51,10.29,2.6,No,Sun,Dinner,2,0.252672
0,149,7.51,2.0,No,Thur,Lunch,2,0.266312
0,232,11.61,3.39,No,Sat,Dinner,2,0.29199
1,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
1,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
1,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
1,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
1,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


- group_keys = False로 계층 인덱스 방지 가능 (as_index와 유사한 기능)
___
## 2. 변위치 분석과 버킷 분석

In [30]:
# 1000 rows of random normal values in two columns.
frame = DataFrame({'data1': np.random.randn(1000),
                  'data2': np.random.randn(1000)})
# Bin data1 into 4 equal-width intervals; the result is a categorical Series.
factor = pd.cut(frame.data1, 4)
factor.head()

0    (-1.774, -0.0915]
1    (-1.774, -0.0915]
2    (-1.774, -0.0915]
3    (-1.774, -0.0915]
4     (-3.463, -1.774]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.463, -1.774] < (-1.774, -0.0915] < (-0.0915, 1.591] < (1.591, 3.273]]

- 7장에서 살펴본 cut, qcut으로 데이터 분할 가능

In [34]:
def get_stats(group):
    """Summarise ``group`` as a dict holding its min, max, count and mean."""
    stat_names = ('min', 'max', 'count', 'mean')
    # Each name serves as both the dictionary key and the method called on the group.
    return {stat: getattr(group, stat)() for stat in stat_names}
# Group data2 by the interval that the matching data1 value falls into,
# then compute the summary dict for each interval.
grouped = frame.data2.groupby(factor)
grouped.apply(get_stats)

data1                   
(-3.463, -1.774]   count     47.000000
                   max        2.063019
                   mean      -0.199567
                   min       -2.149625
(-1.774, -0.0915]  count    408.000000
                   max        2.890597
                   mean       0.031216
                   min       -2.477092
(-0.0915, 1.591]   count    492.000000
                   max        3.326183
                   mean       0.074016
                   min       -2.953025
(1.591, 3.273]     count     53.000000
                   max        2.570126
                   mean       0.304267
                   min       -1.985738
Name: data2, dtype: float64

- 구간(버킷)별 통계 계산 (cut은 등간격 구간으로 나누며, 분위 기준으로 나누려면 qcut 사용)
___
## 3. 예제: 그룹에 국한된 값으로 누락된 값 채우기

In [36]:
s = Series(np.random.randn(6))
# Set every second element (positions 0, 2, 4) to NaN.
s[::2] = np.nan
s

0         NaN
1   -1.372429
2         NaN
3    0.594530
4         NaN
5    2.909907
dtype: float64

In [37]:
s.fillna(s.mean())

0    0.710669
1   -1.372429
2    0.710669
3    0.594530
4    0.710669
5    2.909907
dtype: float64

- .fillna로 na 값 대입 가능

In [41]:
states = ['Ohio', 'New York', 'Vermont', 'Florida',
         'Oregon', 'Nevada', 'California', 'Idaho']

# One label per state, in the same order as `states`: first four 'East', last four 'West'.
group_key = ['East'] * 4 + ['West'] * 4

data = Series(np.random.randn(8), index = states)

# Mark three states as missing so they can be filled per group later.
data[['Vermont', 'Nevada', 'Idaho']] = np.nan

data

Ohio         -0.396740
New York     -0.055863
Vermont            NaN
Florida      -0.114082
Oregon       -0.542745
Nevada             NaN
California    2.039731
Idaho              NaN
dtype: float64

In [42]:
data.groupby(group_key).mean()

East   -0.188895
West    0.748493
dtype: float64

In [45]:
def fill_mean(g):
    """Return ``g`` with its missing values replaced by the mean of ``g``."""
    return g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)

Ohio         -0.396740
New York     -0.055863
Vermont      -0.188895
Florida      -0.114082
Oregon       -0.542745
Nevada        0.748493
California    2.039731
Idaho         0.748493
dtype: float64

- 그룹 별 특성값을 na에 대입 가능
___
## 4. 예제: 랜덤 표본과 순열

In [48]:
# Deck of playing cards as a Series: index = card name, value = card points.
suits = ['H', 'S', 'C', 'D']
# Per suit: the first ten cards are worth 1-10 and the last three 10 each.
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
# Iterate over `suits` itself (not a second copy of the literal) so the suit
# list lives in one place. Suit-major order keeps names aligned with card_val.
cards = [str(name) + suit for suit in suits for name in base_names]

deck = Series(card_val, index = cards)
deck

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
AS      1
2S      2
3S      3
4S      4
5S      5
6S      6
7S      7
8S      8
9S      9
10S    10
JS     10
KS     10
QS     10
AC      1
2C      2
3C      3
4C      4
5C      5
6C      6
7C      7
8C      8
9C      9
10C    10
JC     10
KC     10
QC     10
AD      1
2D      2
3D      3
4D      4
5D      5
6D      6
7D      7
8D      8
9D      9
10D    10
JD     10
KD     10
QD     10
dtype: int64

In [49]:
def draw(deck, n = 5):
    """Return ``n`` entries of ``deck`` chosen at random without repetition."""
    # Shuffle all positions, then keep only the first ``n`` of them.
    shuffled_positions = np.random.permutation(len(deck))
    chosen = shuffled_positions[:n]
    return deck.take(chosen)

draw(deck)

10S    10
5C      5
8C      8
10H    10
3S      3
dtype: int64

In [51]:
def get_suit(card):
    """Return the last character of the card name ``card``."""
    return card[-1]  # last character
deck.groupby(get_suit).apply(draw, n = 2)

C  AC      1
   7C      7
D  6D      6
   JD     10
H  6H      6
   JH     10
S  10S    10
   QS     10
dtype: int64

In [52]:
deck.groupby(get_suit, group_keys = False).apply(draw, n=2)

6C     6
2C     2
5D     5
7D     7
6H     6
3H     3
KS    10
8S     8
dtype: int64