# Split - Apply - Combine strategy

![](https://img1.daumcdn.net/thumb/R800x0/?scode=mtistory2&fname=https%3A%2F%2Ft1.daumcdn.net%2Fcfile%2Ftistory%2F9978503F5B8264490F)

In [1]:
# '데이터' 관련 하여 어떠한 문제 해결에 있어서

# split - apply - combine  전략 사용.
# (분할 - 적용 - 병합)

#  'R' 에서도 이와 같이 구현되었는데:
#      그룹으로 나누고 (split)
#      동일한 연산 수행후에 (apply)
#      결과를 합치는 과정 (combine)

# 원래 R 에서 구현하던 방식. 이걸 고안하신 분(Hadley Wickham)의 논문 "The Split-Apply-Combine Strategy for Data Analysis" 에서 주창
# 꽤 많은 문제들이 Split-Apply-Combine 으로 '문제해결' 을 할 수 있음을 주장

# 엑셀등에서도 많이 하는 작업

# 가령 
#     ex) '일별'로 묶어서 일별 매출 구하기
#     ex) '상품 카테고리별'로 묶어서 특정 기간의 상품 매출 구하기
#     ex) '유저' 별 매출


# 여기서 복잡해 지는 상황은 바로 'apply' 과정입니다

# apply 과정
#    - aggregate() agg() : N => 1    집계하는 연산, 'N개를 입력' 받아서 '1개의 결과'를 만들어 내기    ex)합계
#    - transform() : N => N'       'N개를 입력'받아서  '전체' 를 바꿈
#    - apply() : N => 1, N, M...   'N개를 입력'받아서  '1개' 'N개' 혹은 'M개' 의 결과 를 만들어 냄.
#    - filter() : N => N 이하       'N개를 입력'받아서  조건에 맞는 'N개 이하' 의 결과만 남김


# ※ 나중에 보면, apply() 만으로도 모~든 것들이 가능해짐을 알게 된다.

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Absolute Windows path to the country CSV file; adjust it for your environment.
filepath = r"D:\DevRoot\DataSet\mysql\world\country.csv"

In [4]:
# Load the country table from the CSV file into a DataFrame and display it.
# NOTE(review): in the gc.count() output further down, Code2 has one fewer
# non-null value than Code for Africa; possibly the country code "NA" is
# being parsed as missing by read_csv's default NA handling — confirm, and
# consider keep_default_na=False if so.
df = pd.read_csv(filepath)
df

Unnamed: 0,Code,Name,Continent,Region,SurfaceArea,IndepYear,Population,LifeExpectancy,GNP,GNPOld,LocalName,GovernmentForm,HeadOfState,Capital,Code2
0,ABW,Aruba,North America,Caribbean,193.0,,103000,78.4,828.0,793.0,Aruba,Nonmetropolitan Territory of The Netherlands,Beatrix,129.0,AW
1,AFG,Afghanistan,Asia,Southern and Central Asia,652090.0,1919.0,22720000,45.9,5976.0,,Afganistan/Afqanestan,Islamic Emirate,Mohammad Omar,1.0,AF
2,AGO,Angola,Africa,Central Africa,1246700.0,1975.0,12878000,38.3,6648.0,7984.0,Angola,Republic,JosÃ© Eduardo dos Santos,56.0,AO
3,AIA,Anguilla,North America,Caribbean,96.0,,8000,76.1,63.2,,Anguilla,Dependent Territory of the UK,Elisabeth II,62.0,AI
4,ALB,Albania,Europe,Southern Europe,28748.0,1912.0,3401200,71.6,3205.0,2500.0,ShqipÃ«ria,Republic,Rexhep Mejdani,34.0,AL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,YEM,Yemen,Asia,Middle East,527968.0,1918.0,18112000,59.8,6041.0,5729.0,Al-Yaman,Republic,Ali Abdallah Salih,1780.0,YE
235,YUG,Yugoslavia,Europe,Southern Europe,102173.0,1918.0,10640000,72.4,17000.0,,Jugoslavija,Federal Republic,Vojislav KoÂštunica,1792.0,YU
236,ZAF,South Africa,Africa,Southern Africa,1221037.0,1910.0,40377000,51.1,116729.0,129092.0,South Africa,Republic,Thabo Mbeki,716.0,ZA
237,ZMB,Zambia,Africa,Eastern Africa,752618.0,1964.0,9169000,37.2,3377.0,3922.0,Zambia,Republic,Frederick Chiluba,3162.0,ZM


## aggregate(집계) 함수 사용하기
 - groupby 결과에 '집계함수'를 적용하여 그룹별 데이터 확인 가능
 - ex) sum(), count(), mean(), max() ...

In [5]:
# Mean of every numeric column.
# numeric_only=True is passed explicitly: the frame also holds string columns,
# and recent pandas versions no longer drop them silently before averaging.
df.mean(numeric_only=True)

SurfaceArea       6.232481e+05
IndepYear         1.847260e+03
Population        2.543410e+07
LifeExpectancy    6.648604e+01
GNP               1.228239e+05
GNPOld            1.655343e+05
Capital           2.071306e+03
dtype: float64

In [6]:
# Column-wise sum. As the output shows, string columns are concatenated
# rather than added numerically.
# NOTE(review): HeadOfState and Code2 are absent from the output, presumably
# because they contain missing values; newer pandas versions may raise here
# instead — confirm, and consider numeric_only=True.
df.sum()

Code              ABWAFGAGOAIAALBANDANTAREARGARMASMATAATFATGAUSA...
Name              ArubaAfghanistanAngolaAnguillaAlbaniaAndorraNe...
Continent         North AmericaAsiaAfricaNorth AmericaEuropeEuro...
Region            CaribbeanSouthern and Central AsiaCentral Afri...
SurfaceArea                                             1.48956e+08
IndepYear                                                    354674
Population                                               6078749450
LifeExpectancy                                              14759.9
GNP                                                     2.93549e+07
GNPOld                                                  2.94651e+07
LocalName         ArubaAfganistan/AfqanestanAngolaAnguillaShqipÃ...
GovernmentForm    Nonmetropolitan Territory of The NetherlandsIs...
Capital                                                      480543
dtype: object

### 대륙별 grouping
split!

In [7]:
# Select the Continent column with bracket (key) access.
df['Continent']

0      North America
1               Asia
2             Africa
3      North America
4             Europe
           ...      
234             Asia
235           Europe
236           Africa
237           Africa
238           Africa
Name: Continent, Length: 239, dtype: object

In [8]:
# Same column selected with attribute access.
df.Continent

0      North America
1               Asia
2             Africa
3      North America
4             Europe
           ...      
234             Asia
235           Europe
236           Africa
237           Africa
238           Africa
Name: Continent, Length: 239, dtype: object

In [9]:
# How many distinct continents are there?
df.Continent.unique()  # 7 of them

array(['North America', 'Asia', 'Africa', 'Europe', 'South America',
       'Oceania', 'Antarctica'], dtype=object)

In [11]:
# Partition the rows by continent (grouping) --> this is the "split" step.
gc = df.groupby("Continent")
gc.size()

Continent
Africa           58
Antarctica        5
Asia             51
Europe           46
North America    37
Oceania          28
South America    14
dtype: int64

In [12]:
# Same grouping, this time passing the column itself (a Series) as the key.
gc = df.groupby(df.Continent)
gc.size()

Continent
Africa           58
Antarctica        5
Asia             51
Europe           46
North America    37
Oceania          28
South America    14
dtype: int64

In [13]:
# Number of groups in the GroupBy object.
len(gc)

7

In [14]:
gc.groups # dict  group name: [index ...]

{'Africa': [2, 17, 19, 20, 35, 36, 42, 43, 44, 45, 48, 49, 57, 61, 63, 64, 65, 68, 75, 78, 80, 82, 83, 84, 100, 111, 120, 121, 125, 130, 133, 138, 143, 144, 147, 148, 150, 151, 153, 155, 179, 182, 184, 185, 188, 191, 194, 196, 201, 202, 205, 206, 214, 218, 219, 236, 237, 238], 'Antarctica': [11, 12, 34, 93, 187], 'Asia': [1, 7, 9, 16, 21, 23, 32, 33, 41, 54, 77, 92, 98, 99, 102, 103, 105, 108, 109, 110, 112, 113, 116, 117, 118, 119, 124, 129, 134, 140, 141, 149, 160, 163, 164, 168, 173, 176, 178, 183, 186, 203, 207, 208, 210, 211, 215, 217, 224, 230, 234], 'Europe': [4, 5, 15, 18, 22, 25, 26, 39, 55, 56, 59, 66, 67, 69, 72, 73, 76, 79, 85, 95, 97, 101, 104, 106, 123, 126, 127, 128, 131, 132, 137, 139, 158, 159, 171, 174, 180, 181, 189, 193, 198, 199, 200, 220, 225, 235], 'North America': [0, 3, 6, 13, 24, 27, 28, 31, 37, 50, 51, 53, 58, 60, 81, 86, 87, 88, 94, 96, 107, 115, 122, 135, 145, 146, 156, 165, 172, 192, 195, 204, 213, 223, 226, 228, 229], 'Oceania': [10, 14, 38, 46, 52, 70, 7

In [16]:
# First rows of the 'Asia' group.
gc.get_group('Asia').head()

Unnamed: 0,Code,Name,Continent,Region,SurfaceArea,IndepYear,Population,LifeExpectancy,GNP,GNPOld,LocalName,GovernmentForm,HeadOfState,Capital,Code2
1,AFG,Afghanistan,Asia,Southern and Central Asia,652090.0,1919.0,22720000,45.9,5976.0,,Afganistan/Afqanestan,Islamic Emirate,Mohammad Omar,1.0,AF
7,ARE,United Arab Emirates,Asia,Middle East,83600.0,1971.0,2441000,74.1,37966.0,36846.0,Al-Imarat al-Â´Arabiya al-Muttahida,Emirate Federation,Zayid bin Sultan al-Nahayan,65.0,AE
9,ARM,Armenia,Asia,Middle East,29800.0,1991.0,3520000,66.4,1813.0,1627.0,Hajastan,Republic,Robert KotÂšarjan,126.0,AM
16,AZE,Azerbaijan,Asia,Middle East,86600.0,1991.0,7734000,62.9,4127.0,4100.0,AzÃ¤rbaycan,Federal Republic,HeydÃ¤r Ã„liyev,144.0,AZ
21,BGD,Bangladesh,Asia,Southern and Central Asia,143998.0,1971.0,129155000,60.2,32852.0,31966.0,Bangladesh,Republic,Shahabuddin Ahmad,150.0,BD


In [18]:
# First rows of the "Europe" group.
gc.get_group("Europe").head()

Unnamed: 0,Code,Name,Continent,Region,SurfaceArea,IndepYear,Population,LifeExpectancy,GNP,GNPOld,LocalName,GovernmentForm,HeadOfState,Capital,Code2
4,ALB,Albania,Europe,Southern Europe,28748.0,1912.0,3401200,71.6,3205.0,2500.0,ShqipÃ«ria,Republic,Rexhep Mejdani,34.0,AL
5,AND,Andorra,Europe,Southern Europe,468.0,1278.0,78000,83.5,1630.0,,Andorra,Parliamentary Coprincipality,,55.0,AD
15,AUT,Austria,Europe,Western Europe,83859.0,1918.0,8091800,77.7,211860.0,206025.0,Ã–sterreich,Federal Republic,Thomas Klestil,1523.0,AT
18,BEL,Belgium,Europe,Western Europe,30518.0,1830.0,10239000,77.8,249704.0,243948.0,BelgiÃ«/Belgique,"Constitutional Monarchy, Federation",Albert II,179.0,BE
22,BGR,Bulgaria,Europe,Eastern Europe,110994.0,1908.0,8190900,70.9,12178.0,10169.0,Balgarija,Republic,Petar Stojanov,539.0,BG


#### groupby 객체 도 iterable 하다

In [19]:
# Iterating a GroupBy object yields one item per group.
for group in gc:
    # Each item is a tuple made of two values:
    # [0] : the group name
    # [1] : the group data, as a DataFrame
    print(group[0])

Africa
Antarctica
Asia
Europe
North America
Oceania
South America


In [20]:
# Same iteration, unpacking each (name, DataFrame) tuple directly.
for groupid, group_df in gc:
    print(groupid)

Africa
Antarctica
Asia
Europe
North America
Oceania
South America


## 대륙 x 지역별 그룹핑
복잡한 그룹핑

In [21]:
# Group by two keys at once and count the rows in each (Continent, Region) pair.
df.groupby(['Continent', 'Region']).size()

Continent      Region                   
Africa         Central Africa                9
               Eastern Africa               20
               Northern Africa               7
               Southern Africa               5
               Western Africa               17
Antarctica     Antarctica                    5
Asia           Eastern Asia                  8
               Middle East                  18
               Southeast Asia               11
               Southern and Central Asia    14
Europe         Baltic Countries              3
               British Islands               2
               Eastern Europe               10
               Nordic Countries              7
               Southern Europe              15
               Western Europe                9
North America  Caribbean                    24
               Central America               8
               North America                 5
Oceania        Australia and New Zealand     5
               Mela

In [23]:
# With several grouping keys, each group name is a tuple of key values.
for group_name, group_df in df.groupby(['Continent', 'Region']):
    print(group_name)
    print(group_df.shape)

('Africa', 'Central Africa')
(9, 15)
('Africa', 'Eastern Africa')
(20, 15)
('Africa', 'Northern Africa')
(7, 15)
('Africa', 'Southern Africa')
(5, 15)
('Africa', 'Western Africa')
(17, 15)
('Antarctica', 'Antarctica')
(5, 15)
('Asia', 'Eastern Asia')
(8, 15)
('Asia', 'Middle East')
(18, 15)
('Asia', 'Southeast Asia')
(11, 15)
('Asia', 'Southern and Central Asia')
(14, 15)
('Europe', 'Baltic Countries')
(3, 15)
('Europe', 'British Islands')
(2, 15)
('Europe', 'Eastern Europe')
(10, 15)
('Europe', 'Nordic Countries')
(7, 15)
('Europe', 'Southern Europe')
(15, 15)
('Europe', 'Western Europe')
(9, 15)
('North America', 'Caribbean')
(24, 15)
('North America', 'Central America')
(8, 15)
('North America', 'North America')
(5, 15)
('Oceania', 'Australia and New Zealand')
(5, 15)
('Oceania', 'Melanesia')
(5, 15)
('Oceania', 'Micronesia')
(7, 15)
('Oceania', 'Micronesia/Caribbean')
(1, 15)
('Oceania', 'Polynesia')
(10, 15)
('South America', 'South America')
(14, 15)


### GroupBy 에 적용 가능한 집계함수(aggregate func)들
sum(), count(), mean() 등 적용 가능

- 이 과정 또한 split-apply-combine

In [24]:
# Per-continent mean of every numeric column.
# numeric_only=True keeps the string columns out of the aggregation; recent
# pandas versions raise instead of silently dropping them.
gc.mean(numeric_only=True)

Unnamed: 0_level_0,SurfaceArea,IndepYear,Population,LifeExpectancy,GNP,GNPOld,Capital
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Africa,521558.2,1904.90566,13525430.0,52.57193,10006.465517,11065.28,2162.368421
Antarctica,2626420.0,,0.0,,0.0,,
Asia,625117.7,1808.531915,72647560.0,67.441176,150105.72549,183377.088889,2038.803922
Europe,501068.1,1750.488372,15871190.0,75.147727,206497.065217,260611.138889,2223.782609
North America,654445.1,1903.608696,13053860.0,72.991892,261854.789189,371527.0,1592.810811
Oceania,305867.6,1967.285714,1085755.0,69.715,14991.953571,46905.8,2498.074074
South America,1276066.0,1843.083333,24698570.0,70.946154,107991.0,126720.083333,1759.5


In [25]:
# Per-continent count of non-missing values in each column.
gc.count()

Unnamed: 0_level_0,Code,Name,Region,SurfaceArea,IndepYear,Population,LifeExpectancy,GNP,GNPOld,LocalName,GovernmentForm,HeadOfState,Capital,Code2
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Africa,58,58,58,58,53,58,57,58,50,58,58,58,57,57
Antarctica,5,5,5,5,0,5,0,5,0,5,5,4,0,5
Asia,51,51,51,51,47,51,51,51,45,51,51,51,51,51
Europe,46,46,46,46,43,46,44,46,36,46,46,44,46,46
North America,37,37,37,37,23,37,37,37,25,37,37,37,37,37
Oceania,28,28,28,28,14,28,20,28,10,28,28,28,27,28
South America,14,14,14,14,12,14,13,14,12,14,14,14,14,14


In [26]:
# Per-continent sum of every numeric column.
# numeric_only=True restricts the sum to numeric columns, so string columns
# are neither concatenated nor cause an error on recent pandas versions.
gc.sum(numeric_only=True)

Unnamed: 0_level_0,SurfaceArea,IndepYear,Population,LifeExpectancy,GNP,GNPOld,Capital
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Africa,30250377.0,100960.0,784475000,2996.6,580375.0,553264.0,123255.0
Antarctica,13132101.0,0.0,0,0.0,0.0,0.0,0.0
Asia,31881005.0,85001.0,3705025700,3439.5,7655392.0,8251969.0,103979.0
Europe,23049133.9,75271.0,730074600,3306.5,9498865.0,9382001.0,102294.0
North America,24214470.0,43783.0,482993000,2700.7,9688627.2,9288175.0,58934.0
Oceania,8564294.0,27542.0,30401150,1394.3,419774.7,469058.0,67448.0
South America,17864926.0,22117.0,345780000,922.3,1511874.0,1520641.0,24633.0


In [28]:
# Per-continent standard deviation of every numeric column.
# numeric_only=True keeps the string columns out of the aggregation; recent
# pandas versions raise instead of silently dropping them.
gc.std(numeric_only=True)

Unnamed: 0_level_0,SurfaceArea,IndepYear,Population,LifeExpectancy,GNP,GNPOld,Capital
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Africa,628863.2,407.199217,20071920.0,10.751939,21436.51,22997.91,1089.66862
Antarctica,5866090.0,,0.0,,0.0,,
Asia,1466192.0,631.614937,224850400.0,8.396527,543118.1,632797.4,1133.096342
Europe,2503180.0,370.226917,27773960.0,4.484884,440929.0,473988.3,1169.124761
North America,2235040.0,74.378479,47864300.0,5.536263,1398694.0,1618781.0,1297.572538
Oceania,1460543.0,28.290875,3665679.0,5.974091,66685.18,123218.0,1070.36284
South America,2217233.0,59.865696,44121300.0,4.688393,212284.3,231247.7,1385.014954


## 대륙별 면적 평균

In [31]:
# Mean surface area per continent.
# split   : the rows are partitioned by Continent
# apply   : the mean() aggregate function is applied to each group
# combine : the per-group results are combined into a single DataFrame
#
# The column is selected before aggregating, so only SurfaceArea is averaged
# and the non-numeric columns never reach mean().
df.groupby('Continent')[['SurfaceArea']].mean()

Unnamed: 0_level_0,SurfaceArea
Continent,Unnamed: 1_level_1
Africa,521558.2
Antarctica,2626420.0
Asia,625117.7
Europe,501068.1
North America,654445.1
Oceania,305867.6
South America,1276066.0


## 대륙별 인구수 합계 (방법1)
- 집계함수 사용
- 인구수 내림차순 정렬

In [32]:
# Displaying the GroupBy object itself only shows its repr, not the grouped data.
gc

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002C21FF6AD60>

In [36]:
# Total population per continent, sorted in descending order.
# Population is selected before summing, so only that column is aggregated
# and the string columns are never passed to sum().
aaa = gc[['Population']].sum().sort_values('Population', ascending=False)
aaa

Unnamed: 0_level_0,Population
Continent,Unnamed: 1_level_1
Asia,3705025700
Africa,784475000
Europe,730074600
North America,482993000
South America,345780000
Oceania,30401150
Antarctica,0


### 대륙별 국가수 x 인구총수
- 컬럼명을 '국가수' 로 바꾸기

In [39]:
# Number of countries per continent: count the Name column within each group,
# then relabel the single resulting column.
ddd = gc[['Name']].count()
ddd.columns = ['국가수']
ddd

Unnamed: 0_level_0,국가수
Continent,Unnamed: 1_level_1
Africa,58
Antarctica,5
Asia,51
Europe,46
North America,37
Oceania,28
South America,14


In [40]:
# Combine the population totals and the country counts, matching rows on the
# shared Continent index level.
df_merge = aaa.merge(ddd, on="Continent")
df_merge

Unnamed: 0_level_0,Population,국가수
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1
Asia,3705025700,51
Africa,784475000,58
Europe,730074600,46
North America,482993000,37
South America,345780000,14
Oceania,30401150,28
Antarctica,0,5


## 대륙별 인구수 합계 (방법2)
- agg() + 집계함수 사용

In [41]:
# groupby 객체에 agg() 적용하면 간단하게 결과 나옴!

In [42]:
# Sum Population per continent by passing a {column: function} dict to agg().
gc.agg({"Population": "sum"})

# This, too, is split-apply-combine.

Unnamed: 0_level_0,Population
Continent,Unnamed: 1_level_1
Africa,784475000
Antarctica,0
Asia,3705025700
Europe,730074600
North America,482993000
Oceania,30401150
South America,345780000


In [43]:
# Same aggregation, passing the NumPy function instead of its name.
# NOTE(review): recent pandas releases reportedly warn when NumPy callables
# are passed to agg(); confirm against the installed version.
gc.agg({"Population": np.sum})

Unnamed: 0_level_0,Population
Continent,Unnamed: 1_level_1
Africa,784475000
Antarctica,0
Asia,3705025700
Europe,730074600
North America,482993000
Oceania,30401150
South America,345780000


In [44]:
# aggregate() is the long spelling of agg(); the output is the same.
gc.aggregate({"Population": np.sum})

Unnamed: 0_level_0,Population
Continent,Unnamed: 1_level_1
Africa,784475000
Antarctica,0
Asia,3705025700
Europe,730074600
North America,482993000
Oceania,30401150
South America,345780000


In [45]:
# agg() can apply several aggregate functions to a single column.
gc.agg({"Population" : ["sum", "mean"]})

# The result has multi-level columns.

Unnamed: 0_level_0,Population,Population
Unnamed: 0_level_1,sum,mean
Continent,Unnamed: 1_level_2,Unnamed: 2_level_2
Africa,784475000,13525430.0
Antarctica,0,0.0
Asia,3705025700,72647560.0
Europe,730074600,15871190.0
North America,482993000,13053860.0
Oceania,30401150,1085755.0
South America,345780000,24698570.0


In [46]:
# Same as the previous cell, using NumPy functions instead of function names.
gc.agg({"Population" : [np.sum, np.mean]})

Unnamed: 0_level_0,Population,Population
Unnamed: 0_level_1,sum,mean
Continent,Unnamed: 1_level_2,Unnamed: 2_level_2
Africa,784475000,13525430.0
Antarctica,0,0.0
Asia,3705025700,72647560.0
Europe,730074600,15871190.0
North America,482993000,13053860.0
Oceania,30401150,1085755.0
South America,345780000,24698570.0


In [None]:
# groupby(...) <-- 여기에는 함수, 리스트 등 여러가지 형태가 들어갈 수 있다.

In [None]:
# 그러나
# 쇼핑몰 일별 매출 같은 것을 하면 위와 같은 groupby() 로는 힘들다
# 왜냐하면

# 타임스탬프가 아래와 같이 찍혀 있을테니..

# 2017-06-24-12:22:00:00
# 2017-06-24-12:22:00:00
# 2017-06-24-12:22:00:05
# 2017-06-24-12:22:00:04
# 2017-06-24-12:22:00:03
# 2017-06-24-12:22:00:02
# 2017-06-24-12:22:00:01

# 일별 집계를 하려면,  앞의 년-월-일 만 으로 그룹한다든지 해야 한다

# 각각의 알파벳으로 시작하는 국가들은 얼마나 될까?

In [47]:
# 각각의 알파벳으로 시작하는 국가들이 얼마나 될까?
# a ... 100
# b ... ?
# c ... ?

In [48]:
# The country-name column.
df.Name

0             Aruba
1       Afghanistan
2            Angola
3          Anguilla
4           Albania
           ...     
234           Yemen
235      Yugoslavia
236    South Africa
237          Zambia
238        Zimbabwe
Name: Name, Length: 239, dtype: object

In [49]:
# Series.apply() ==> map() 함수와 유사

In [50]:
# df.Name.apply(function): here the function returns the first character of each name.
df.Name.apply(lambda name : name[0])

0      A
1      A
2      A
3      A
4      A
      ..
234    Y
235    Y
236    S
237    Z
238    Z
Name: Name, Length: 239, dtype: object

In [51]:
# groupby() also accepts a Series as its argument.
# Passing a Series produced by another computation allows many kinds of grouping.

grouped = df.groupby(df.Name.apply(lambda name : name[0]))

grouped.size()

Name
A    15
B    20
C    22
D     4
E     8
F     8
G    15
H     6
I     8
J     3
K     5
L     9
M    22
N    15
O     1
P    12
Q     1
R     4
S    29
T    13
U     8
V     5
W     2
Y     2
Z     2
dtype: int64

In [53]:
# Countries whose name starts with "A".
grouped.get_group("A").head()

Unnamed: 0,Code,Name,Continent,Region,SurfaceArea,IndepYear,Population,LifeExpectancy,GNP,GNPOld,LocalName,GovernmentForm,HeadOfState,Capital,Code2
0,ABW,Aruba,North America,Caribbean,193.0,,103000,78.4,828.0,793.0,Aruba,Nonmetropolitan Territory of The Netherlands,Beatrix,129.0,AW
1,AFG,Afghanistan,Asia,Southern and Central Asia,652090.0,1919.0,22720000,45.9,5976.0,,Afganistan/Afqanestan,Islamic Emirate,Mohammad Omar,1.0,AF
2,AGO,Angola,Africa,Central Africa,1246700.0,1975.0,12878000,38.3,6648.0,7984.0,Angola,Republic,JosÃ© Eduardo dos Santos,56.0,AO
3,AIA,Anguilla,North America,Caribbean,96.0,,8000,76.1,63.2,,Anguilla,Dependent Territory of the UK,Elisabeth II,62.0,AI
4,ALB,Albania,Europe,Southern Europe,28748.0,1912.0,3401200,71.6,3205.0,2500.0,ShqipÃ«ria,Republic,Rexhep Mejdani,34.0,AL


In [54]:
#groupby() 에 column 이름을 넣기도 하지만,
#대부분의 경우는 '추가적인 연산'을 수행해서  이 연산된 결과를 바탕으로 해서 하는 경우가 많습니다.
#(즉, 위와 같이 어떤 알파벳으로 시작하는지에 대한 연산)

# 대륙별 인구가 가장 많은 국가 1개씩

In [55]:
# Number of rows in each continent group.
gc.size()

Continent
Africa           58
Antarctica        5
Asia             51
Europe           46
North America    37
Oceania          28
South America    14
dtype: int64

In [57]:
# First five rows of the whole DataFrame, for comparison with GroupBy.head().
df.head()

Unnamed: 0,Code,Name,Continent,Region,SurfaceArea,IndepYear,Population,LifeExpectancy,GNP,GNPOld,LocalName,GovernmentForm,HeadOfState,Capital,Code2
0,ABW,Aruba,North America,Caribbean,193.0,,103000,78.4,828.0,793.0,Aruba,Nonmetropolitan Territory of The Netherlands,Beatrix,129.0,AW
1,AFG,Afghanistan,Asia,Southern and Central Asia,652090.0,1919.0,22720000,45.9,5976.0,,Afganistan/Afqanestan,Islamic Emirate,Mohammad Omar,1.0,AF
2,AGO,Angola,Africa,Central Africa,1246700.0,1975.0,12878000,38.3,6648.0,7984.0,Angola,Republic,JosÃ© Eduardo dos Santos,56.0,AO
3,AIA,Anguilla,North America,Caribbean,96.0,,8000,76.1,63.2,,Anguilla,Dependent Territory of the UK,Elisabeth II,62.0,AI
4,ALB,Albania,Europe,Southern Europe,28748.0,1912.0,3401200,71.6,3205.0,2500.0,ShqipÃ«ria,Republic,Rexhep Mejdani,34.0,AL


In [56]:
# DataFrame.head() shows the first 5 rows of the frame, whereas
# GroupBy.head() takes the first 5 rows from each group.
gc.head()

Unnamed: 0,Code,Name,Continent,Region,SurfaceArea,IndepYear,Population,LifeExpectancy,GNP,GNPOld,LocalName,GovernmentForm,HeadOfState,Capital,Code2
0,ABW,Aruba,North America,Caribbean,193.0,,103000,78.4,828.0,793.0,Aruba,Nonmetropolitan Territory of The Netherlands,Beatrix,129.0,AW
1,AFG,Afghanistan,Asia,Southern and Central Asia,652090.0,1919.0,22720000,45.9,5976.0,,Afganistan/Afqanestan,Islamic Emirate,Mohammad Omar,1.0,AF
2,AGO,Angola,Africa,Central Africa,1246700.0,1975.0,12878000,38.3,6648.0,7984.0,Angola,Republic,JosÃ© Eduardo dos Santos,56.0,AO
3,AIA,Anguilla,North America,Caribbean,96.0,,8000,76.1,63.2,,Anguilla,Dependent Territory of the UK,Elisabeth II,62.0,AI
4,ALB,Albania,Europe,Southern Europe,28748.0,1912.0,3401200,71.6,3205.0,2500.0,ShqipÃ«ria,Republic,Rexhep Mejdani,34.0,AL
5,AND,Andorra,Europe,Southern Europe,468.0,1278.0,78000,83.5,1630.0,,Andorra,Parliamentary Coprincipality,,55.0,AD
6,ANT,Netherlands Antilles,North America,Caribbean,800.0,,217000,74.7,1941.0,,Nederlandse Antillen,Nonmetropolitan Territory of The Netherlands,Beatrix,33.0,AN
7,ARE,United Arab Emirates,Asia,Middle East,83600.0,1971.0,2441000,74.1,37966.0,36846.0,Al-Imarat al-Â´Arabiya al-Muttahida,Emirate Federation,Zayid bin Sultan al-Nahayan,65.0,AE
8,ARG,Argentina,South America,South America,2780400.0,1816.0,37032000,75.1,340238.0,323310.0,Argentina,Federal Republic,Fernando de la RÃºa,69.0,AR
9,ARM,Armenia,Asia,Middle East,29800.0,1991.0,3520000,66.4,1813.0,1627.0,Hajastan,Republic,Robert KotÂšarjan,126.0,AM


In [58]:
gc.head(1) # the first row of each group

Unnamed: 0,Code,Name,Continent,Region,SurfaceArea,IndepYear,Population,LifeExpectancy,GNP,GNPOld,LocalName,GovernmentForm,HeadOfState,Capital,Code2
0,ABW,Aruba,North America,Caribbean,193.0,,103000,78.4,828.0,793.0,Aruba,Nonmetropolitan Territory of The Netherlands,Beatrix,129.0,AW
1,AFG,Afghanistan,Asia,Southern and Central Asia,652090.0,1919.0,22720000,45.9,5976.0,,Afganistan/Afqanestan,Islamic Emirate,Mohammad Omar,1.0,AF
2,AGO,Angola,Africa,Central Africa,1246700.0,1975.0,12878000,38.3,6648.0,7984.0,Angola,Republic,JosÃ© Eduardo dos Santos,56.0,AO
4,ALB,Albania,Europe,Southern Europe,28748.0,1912.0,3401200,71.6,3205.0,2500.0,ShqipÃ«ria,Republic,Rexhep Mejdani,34.0,AL
8,ARG,Argentina,South America,South America,2780400.0,1816.0,37032000,75.1,340238.0,323310.0,Argentina,Federal Republic,Fernando de la RÃºa,69.0,AR
10,ASM,American Samoa,Oceania,Polynesia,199.0,,68000,75.1,334.0,,Amerika Samoa,US Territory,George W. Bush,54.0,AS
11,ATA,Antarctica,Antarctica,Antarctica,13120000.0,,0,,0.0,,Â–,Co-administrated,,,AQ


In [62]:
# Pick the most populous country of each continent: sort by Population in
# descending order first, then take the first row of every group.
df.sort_values("Population", ascending=False).groupby("Continent").head(1)

Unnamed: 0,Code,Name,Continent,Region,SurfaceArea,IndepYear,Population,LifeExpectancy,GNP,GNPOld,LocalName,GovernmentForm,HeadOfState,Capital,Code2
41,CHN,China,Asia,Eastern Asia,9572900.0,-1523.0,1277558000,71.4,982268.0,917719.0,Zhongquo,People'sRepublic,Jiang Zemin,1891.0,CN
223,USA,United States,North America,North America,9363520.0,1776.0,278357000,77.1,8510700.0,8110900.0,United States,Federal Republic,George W. Bush,3813.0,US
30,BRA,Brazil,South America,South America,8547403.0,1822.0,170115000,62.9,776739.0,804108.0,Brasil,Federal Republic,Fernando Henrique Cardoso,211.0,BR
181,RUS,Russian Federation,Europe,Eastern Europe,17075400.0,1991.0,146934000,67.2,276608.0,442989.0,Rossija,Federal Republic,Vladimir Putin,3580.0,RU
155,NGA,Nigeria,Africa,Western Africa,923768.0,1960.0,111506000,51.6,65707.0,58623.0,Nigeria,Federal Republic,Olusegun Obasanjo,2754.0,NG
14,AUS,Australia,Oceania,Australia and New Zealand,7741220.0,1901.0,18886000,79.8,351182.0,392911.0,Australia,"Constitutional Monarchy, Federation",Elisabeth II,135.0,AU
11,ATA,Antarctica,Antarctica,Antarctica,13120000.0,,0,,0.0,,Â–,Co-administrated,,,AQ


# 대륙별, 인구합계,인구평균, 면적합계 
- 대륙별 
    - 인구는 "합계","평균"
    - 면적은 "합계 " 

In [64]:
# Aggregate every group with sum, once by function name and once by NumPy callable.
# Only one table is shown below: the notebook displays the last expression's value.
gc.agg("sum")
gc.agg(np.sum)

Unnamed: 0_level_0,SurfaceArea,IndepYear,Population,LifeExpectancy,GNP,GNPOld,Capital
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Africa,30250377.0,100960.0,784475000,2996.6,580375.0,553264.0,123255.0
Antarctica,13132101.0,0.0,0,0.0,0.0,0.0,0.0
Asia,31881005.0,85001.0,3705025700,3439.5,7655392.0,8251969.0,103979.0
Europe,23049133.9,75271.0,730074600,3306.5,9498865.0,9382001.0,102294.0
North America,24214470.0,43783.0,482993000,2700.7,9688627.2,9288175.0,58934.0
Oceania,8564294.0,27542.0,30401150,1394.3,419774.7,469058.0,67448.0
South America,17864926.0,22117.0,345780000,922.3,1511874.0,1520641.0,24633.0


In [65]:
# Sum per continent, keeping only the 'Population' column of the result.
gc.agg('sum')['Population']

Continent
Africa            784475000
Antarctica                0
Asia             3705025700
Europe            730074600
North America     482993000
Oceania            30401150
South America     345780000
Name: Population, dtype: int64

In [68]:
# A single column can be selected directly from a GroupBy object.
gc['Population'].agg('sum')  # same result as the previous cell

Continent
Africa            784475000
Antarctica                0
Asia             3705025700
Europe            730074600
North America     482993000
Oceania            30401150
South America     345780000
Name: Population, dtype: int64

In [69]:
# Descriptive statistics for the DataFrame's numeric columns.
df.describe()

Unnamed: 0,SurfaceArea,IndepYear,Population,LifeExpectancy,GNP,GNPOld,Capital
count,239.0,192.0,239.0,222.0,239.0,178.0,232.0
mean,623248.1,1847.260417,25434100.0,66.486036,122823.9,165534.3,2071.306034
std,1924140.0,420.83137,109339800.0,11.519267,637997.6,720468.9,1184.095609
min,0.4,-1523.0,0.0,37.2,0.0,157.0,1.0
25%,2275.0,1906.75,238000.0,60.3,640.0,2187.0,915.75
50%,71740.0,1960.0,3869000.0,70.15,4787.0,8421.0,2449.5
75%,398754.5,1974.0,14935500.0,75.5,29944.5,71145.5,3065.25
max,17075400.0,1994.0,1277558000.0,83.5,8510700.0,8110900.0,4074.0


In [70]:
# describe() aggregation
gc.agg("describe")  # describe computes the descriptive statistics (count, mean, std, min, quartiles, max) in one call
                    # handy for a quick look at the overall shape of the data per group

Unnamed: 0_level_0,SurfaceArea,SurfaceArea,SurfaceArea,SurfaceArea,SurfaceArea,SurfaceArea,SurfaceArea,SurfaceArea,IndepYear,IndepYear,...,GNPOld,GNPOld,Capital,Capital,Capital,Capital,Capital,Capital,Capital,Capital
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Continent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Africa,58.0,521558.2,628863.2,78.0,28627.0,266834.0,818616.5,2505813.0,53.0,1904.90566,...,7835.0,129092.0,57.0,2162.368421,1089.66862,35.0,926.0,2462.0,3063.0,4068.0
Antarctica,5.0,2626420.0,5866090.0,59.0,359.0,3903.0,7780.0,13120000.0,0.0,,...,,,0.0,,,,,,,
Asia,51.0,625117.7,1466192.0,18.0,25428.0,147181.0,500607.5,9572900.0,47.0,1808.531915,...,146171.0,4192638.0,51.0,2038.803922,1133.096342,1.0,1237.0,2331.0,2902.0,4074.0
Europe,46.0,501068.1,2503180.0,0.4,29190.5,64945.0,230693.25,17075400.0,43.0,1750.488372,...,231804.75,2102826.0,46.0,2223.782609,1169.124761,5.0,1447.5,2472.0,3199.5,3791.0
North America,37.0,654445.1,2235040.0,53.0,347.0,1705.0,51100.0,9970610.0,23.0,1903.608696,...,15076.0,8110900.0,37.0,1592.810811,1297.572538,33.0,553.0,929.0,2882.0,4067.0
Oceania,28.0,305867.6,1460543.0,12.0,113.5,461.5,6047.25,7741220.0,14.0,1967.285714,...,5283.25,392911.0,27.0,2498.074074,1070.36284,54.0,2286.5,2881.0,3251.0,3537.0
South America,14.0,1276066.0,2217233.0,12173.0,185004.25,581689.0,1128830.75,8547403.0,12.0,1843.083333,...,92604.5,804108.0,14.0,1759.5,1385.014954,69.0,564.0,1592.5,2983.0,3539.0


In [71]:
# agg() also accepts a list of functions: sum, mean and max are computed for
# each selected column.
# The numeric columns are selected explicitly because mean cannot be computed
# on string columns; recent pandas versions raise instead of skipping them.
gc[df.select_dtypes(include="number").columns.tolist()].agg(['sum', 'mean', 'max'])

Unnamed: 0_level_0,SurfaceArea,SurfaceArea,SurfaceArea,IndepYear,IndepYear,IndepYear,Population,Population,Population,LifeExpectancy,LifeExpectancy,LifeExpectancy,GNP,GNP,GNP,GNPOld,GNPOld,GNPOld,Capital,Capital,Capital
Unnamed: 0_level_1,sum,mean,max,sum,mean,max,sum,mean,max,sum,...,max,sum,mean,max,sum,mean,max,sum,mean,max
Continent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Africa,30250377.0,521558.2,2505813.0,100960.0,1904.90566,1993.0,784475000,13525430.0,111506000,2996.6,...,76.8,580375.0,10006.465517,116729.0,553264.0,11065.28,129092.0,123255.0,2162.368421,4068.0
Antarctica,13132101.0,2626420.0,13120000.0,0.0,,,0,0.0,0,0.0,...,,0.0,0.0,0.0,0.0,,,0.0,,
Asia,31881005.0,625117.7,9572900.0,85001.0,1808.531915,1991.0,3705025700,72647560.0,1277558000,3439.5,...,81.6,7655392.0,150105.72549,3787042.0,8251969.0,183377.088889,4192638.0,103979.0,2038.803922,4074.0
Europe,23049133.9,501068.1,17075400.0,75271.0,1750.488372,1993.0,730074600,15871190.0,146934000,3306.5,...,83.5,9498865.0,206497.065217,2133367.0,9382001.0,260611.138889,2102826.0,102294.0,2223.782609,3791.0
North America,24214470.0,654445.1,9970610.0,43783.0,1903.608696,1983.0,482993000,13053860.0,278357000,2700.7,...,79.4,9688627.2,261854.789189,8510700.0,9288175.0,371527.0,8110900.0,58934.0,1592.810811,4067.0
Oceania,8564294.0,305867.6,7741220.0,27542.0,1967.285714,1994.0,30401150,1085755.0,18886000,1394.3,...,79.8,419774.7,14991.953571,351182.0,469058.0,46905.8,392911.0,67448.0,2498.074074,3537.0
South America,17864926.0,1276066.0,8547403.0,22117.0,1843.083333,1975.0,345780000,24698570.0,170115000,922.3,...,76.1,1511874.0,107991.0,776739.0,1520641.0,126720.083333,804108.0,24633.0,1759.5,3539.0


In [72]:
# Per continent:
# Population  -> "sum" and "mean"
# SurfaceArea -> "sum" only

# df.groupby("Continent").agg({})         passing a dict to agg()
# The dict has the form { "column name" : "operation" }.


gc.agg({
    "Population" : ['sum', 'mean'],
    "SurfaceArea" : ["sum"]
})

Unnamed: 0_level_0,Population,Population,SurfaceArea
Unnamed: 0_level_1,sum,mean,sum
Continent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Africa,784475000,13525430.0,30250377.0
Antarctica,0,0.0,13132101.0
Asia,3705025700,72647560.0,31881005.0
Europe,730074600,15871190.0,23049133.9
North America,482993000,13053860.0,24214470.0
Oceania,30401150,1085755.0,8564294.0
South America,345780000,24698570.0,17864926.0
