### 데이터셋 결합하기 : 병합과 조인

Pandas에서는 pd.merge() 함수를 활용하여 고성능 인메모리 조인과 병합 연산을 수행할 수 있다.

#### 조인 작업의 분류

pd.merge() 함수는 일대일, 다대일, 다대다 같은 여러 가지 조인 유형을 구현한다. 먼저 가장 단순한 일대일(One-to-one) 조인의 예를 살펴본다.

In [56]:
import pandas as pd
import numpy as np
# Employee-to-department table; used as the left-hand side of the merge examples.
df1 = pd.DataFrame(
    {
        "employee": ["Bob", "Jake", "Lisa", "Sue"],
        "group": ["Accounting", "Engineering", "Engineering", "HR"],
    }
)

In [57]:
# Hire year per employee; the rows are listed in a different order than in df1.
df2 = pd.DataFrame(
    {
        "employee": ["Lisa", "Bob", "Jake", "Sue"],
        "hire_date": [2004, 2000, 2012, 2014],
    }
)

In [58]:
print(df1) ; print(df2)

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
  employee  hire_date
0     Lisa       2004
1      Bob       2000
2     Jake       2012
3      Sue       2014


In [59]:
# One-to-one join: 'employee' is the only column name shared by df1 and df2,
# so pd.merge() uses it as the key without being told.
df3 = pd.merge(df1, df2)
df3

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2000
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


### 다대일(Many-to-one) 조인

병합하려는 두 키 열 중 하나에 중복된 항목이 포함된 경우의 조인이다.

In [60]:
# One supervisor per department; 'group' is unique in this table.
df4 = pd.DataFrame(
    {
        "group": ["Accounting", "Engineering", "HR"],
        "supervisor": ["Carly", "Guido", "Steve"],
    }
)

In [61]:
print(df4) ; print(df3)

         group supervisor
0   Accounting      Carly
1  Engineering      Guido
2           HR      Steve
  employee        group  hire_date
0      Bob   Accounting       2000
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014


In [62]:
# Many-to-one join on the shared 'group' column: 'Engineering' appears twice in
# df3 but only once in df4, so its supervisor is repeated for both matching rows.
pd.merge(df3, df4)

Unnamed: 0,employee,group,hire_date,supervisor
0,Bob,Accounting,2000,Carly
1,Jake,Engineering,2012,Guido
2,Lisa,Engineering,2004,Guido
3,Sue,HR,2014,Steve


In [63]:
### 다대다(Many-to-many) 조인

병합되는 두 데이터프레임의 키 열 모두에 중복 항목이 존재하면 다대다 조인이다.

In [64]:
# Two skills per department, so 'group' is duplicated in this table.
# NOTE(review): 'organiation' looks like a typo for 'organization'; it is kept
# as-is because the recorded cell outputs show the same spelling.
df5 = pd.DataFrame(
    {
        "group": [
            "Accounting",
            "Accounting",
            "Engineering",
            "Engineering",
            "HR",
            "HR",
        ],
        "skills": [
            "math",
            "spreadsheets",
            "coding",
            "linux",
            "spreadsheets",
            "organiation",
        ],
    }
)

In [65]:
print(df1) ; print(df5)

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
         group        skills
0   Accounting          math
1   Accounting  spreadsheets
2  Engineering        coding
3  Engineering         linux
4           HR  spreadsheets
5           HR   organiation


In [66]:
# Many-to-many join on 'group': the key repeats in both df1 and df5, so every
# matching pair of rows is emitted (8 rows in the recorded output).
pd.merge(df1,df5)

Unnamed: 0,employee,group,skills
0,Bob,Accounting,math
1,Bob,Accounting,spreadsheets
2,Jake,Engineering,coding
3,Jake,Engineering,linux
4,Lisa,Engineering,coding
5,Lisa,Engineering,linux
6,Sue,HR,spreadsheets
7,Sue,HR,organiation


#### 병합 키 지정

pd.merge()는 기본적으로 두 입력 데이터셋에서 이름이 일치하는 하나 이상의 열을 찾아 그것을 키로 사용한다.

#### on 키워드

on 키워드를 사용해 키로 쓸 열 이름을 명시적으로 지정할 수 있다.

In [67]:
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


In [68]:
df2

Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2000
2,Jake,2012
3,Sue,2014


In [69]:
# Name the key column explicitly instead of relying on automatic detection of
# the shared column.
pd.merge(df1, df2, on='employee')

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2000
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


#### left_on과 right_on 키워드

서로 다른 열 이름을 가진 두 데이터셋을 병합하려면 left_on과 right_on 키워드를 활용한다.

In [70]:
# Salary table keyed by 'name' (not 'employee'). This rebinds df3, replacing
# the merge result that was stored under the same name earlier.
df3 = pd.DataFrame(
    {
        "name": ["Bob", "Jake", "Lisa", "Sue"],
        "salary": [70000, 80000, 120000, 90000],
    }
)

In [71]:
print(df1); print(df3)

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
   name  salary
0   Bob   70000
1  Jake   80000
2  Lisa  120000
3   Sue   90000


In [72]:
# The key is called 'employee' in df1 but 'name' in df3, so each side's key
# column is named separately. Both key columns are kept in the result.
pd.merge(df1, df3, left_on="employee", right_on="name")

Unnamed: 0,employee,group,name,salary
0,Bob,Accounting,Bob,70000
1,Jake,Engineering,Jake,80000
2,Lisa,Engineering,Lisa,120000
3,Sue,HR,Sue,90000


In [73]:
# Merge 'employee' against 'name', then drop the redundant 'name' column so the
# key appears only once.
pd.merge(df1, df3, left_on="employee", right_on="name").drop('name', axis=1)

Unnamed: 0,employee,group,salary
0,Bob,Accounting,70000
1,Jake,Engineering,80000
2,Lisa,Engineering,120000
3,Sue,HR,90000


### left_index와 right_index 키워드 그리고 join() 메서드

기본적으로는 열을 기준으로 병합하지만, left_index와 right_index를 True로 지정하면 인덱스를 기준으로 병합할 수 있다. DataFrame의 join() 메서드는 기본적으로 인덱스를 기준으로 병합한다.

In [74]:
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


In [75]:
# Move 'employee' from a column into the index so it can be used as an index key.
df1a = df1.set_index('employee')

In [76]:
df1a

Unnamed: 0_level_0,group
employee,Unnamed: 1_level_1
Bob,Accounting
Jake,Engineering
Lisa,Engineering
Sue,HR


In [77]:
# Move 'employee' from a column into the index so it can be used as an index key.
df2a = df2.set_index('employee')

In [78]:
df2a

Unnamed: 0_level_0,hire_date
employee,Unnamed: 1_level_1
Lisa,2004
Bob,2000
Jake,2012
Sue,2014


In [79]:
# Merge on the index of both frames rather than on a column.
pd.merge(df1a, df2a, left_index=True, right_index=True)

Unnamed: 0_level_0,group,hire_date
employee,Unnamed: 1_level_1,Unnamed: 2_level_1
Bob,Accounting,2000
Jake,Engineering,2012
Lisa,Engineering,2004
Sue,HR,2014


In [80]:
# Index-based merge, followed by reset_index() to turn 'employee' back into an
# ordinary column.
pd.merge(df1a, df2a, left_index=True, right_index=True).reset_index()

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2000
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [81]:
# DataFrame.join() merges on the index by default; the recorded output matches
# the pd.merge(..., left_index=True, right_index=True) call.
df1a.join(df2a)

Unnamed: 0_level_0,group,hire_date
employee,Unnamed: 1_level_1,Unnamed: 2_level_1
Bob,Accounting,2000
Jake,Engineering,2012
Lisa,Engineering,2004
Sue,HR,2014


### 인덱스와 열

한쪽 데이터셋은 인덱스를, 다른 쪽은 열을 키로 사용해 병합하려면 left_index와 right_on, 또는 left_on과 right_index를 조합하면 된다.

In [82]:
print(df1a) ; print(df3)

                group
employee             
Bob        Accounting
Jake      Engineering
Lisa      Engineering
Sue                HR
   name  salary
0   Bob   70000
1  Jake   80000
2  Lisa  120000
3   Sue   90000


In [83]:
# Mix index and column keys: df1a's index is matched against df3's 'name' column.
pd.merge(df1a, df3, left_index=True, right_on='name')

Unnamed: 0,group,name,salary
0,Accounting,Bob,70000
1,Engineering,Jake,80000
2,Engineering,Lisa,120000
3,HR,Sue,90000


In [84]:
print(df1) ; print(df1a)

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
                group
employee             
Bob        Accounting
Jake      Engineering
Lisa      Engineering
Sue                HR


##### Quiz) 두 개(df1, df1a)의 데이터 프레임을 병합해 주세요

In [85]:
# Quiz answer: match df1's 'employee' column against df1a's index. Both frames
# have a 'group' column, so the result carries the _x/_y suffixes seen in the
# recorded output.
pd.merge(df1, df1a, left_on='employee', right_index=True)

Unnamed: 0,employee,group_x,group_y
0,Bob,Accounting,Accounting
1,Jake,Engineering,Engineering
2,Lisa,Engineering,Engineering
3,Sue,HR,HR
