# Fusión (merging/join) de datasets

* *60:00 min* | Última modificación: Agosto 11, 2021 | YouTube

In [1]:
import pandas as pd

## Fusión uno-a-uno

In [2]:
%%writefile /tmp/dataset_1.csv
clientId,name,location
10,Omar Y. Fletcher,6833 Mollis. Rd.
11,Buffy W. Vincent,"P.O. Box 345, 8390 Ante Avenue"
12,Mira N. Franklin,"P.O. Box 445, 323 Cursus Rd."
13,Ferris Q. Le,Ap #791-3809 Eu Street
14,Michael I. Gray,6715 Diam. Rd.
15,Alan T. Mullins,512-3640 Nisl Rd.
16,Morgan W. Zamora,416-6030 Vivamus Road
17,Lilah O. Morrison,3859 Mauris Ave
18,Chantale Z. Kelley,3433 Arcu. Rd.
19,Randall Q. Mcclure,Ap #584-7470 Nibh. Ave

Overwriting /tmp/dataset_1.csv


In [3]:
%%writefile /tmp/dataset_2.csv
clientId,name,location
20,Baker C. Hurst,249-6250 Velit. Rd.
21,Bevis W. Molina,"P.O. Box 935, 1521 At, St."
22,Martina B. Schroeder,689-7600 Mi St.
23,Kylie G. Bailey,1430 Diam. Road
24,Steel K. Glover,"757-681 Et, Av."
25,Lucas M. Suarez,699-9329 Magna Rd.
26,Anastasia Q. Mccray,"P.O. Box 780, 4487 Lobortis, St."
27,Germaine Q. Henson,589-4921 Duis Ave
28,Wilma U. Mcfadden,6917 Dictum Rd.
29,Merritt Q. Martinez,"P.O. Box 469, 7833 Euismod Av."

Overwriting /tmp/dataset_2.csv


In [4]:
%%writefile /tmp/bonus.csv
clientId,bonus
10,279
13,160
14,267
27,215
16,169
17,263
18,233
19,292
20,208
21,247
22,240
23,161
28,463
29,790
11,138
12,227
15,231
24,248
25,216
26,291

Overwriting /tmp/bonus.csv


In [5]:
# Load both client tables. They share the same columns (clientId, name,
# location), so they can be stacked row-wise into a single client table.
dataset_1 = pd.read_csv("/tmp/dataset_1.csv")
dataset_2 = pd.read_csv("/tmp/dataset_2.csv")

# ignore_index=True renumbers the rows 0..n-1. Without it the result keeps
# each file's own 0..9 labels, leaving duplicated index labels in the
# combined frame (e.g. .loc[0] would return two rows).
datasets_12 = pd.concat(
    [
        dataset_1,
        dataset_2,
    ],
    ignore_index=True,
)
datasets_12.head()

Unnamed: 0,clientId,name,location
0,10,Omar Y. Fletcher,6833 Mollis. Rd.
1,11,Buffy W. Vincent,"P.O. Box 345, 8390 Ante Avenue"
2,12,Mira N. Franklin,"P.O. Box 445, 323 Cursus Rd."
3,13,Ferris Q. Le,Ap #791-3809 Eu Street
4,14,Michael I. Gray,6715 Diam. Rd.


In [6]:
# Read the bonus table (columns: clientId, bonus) and preview its first rows.
bonus = pd.read_csv(filepath_or_buffer="/tmp/bonus.csv")
bonus.head(n=5)

Unnamed: 0,clientId,bonus
0,10,279
1,13,160
2,14,267
3,27,215
4,16,169


In [7]:
#
# Merge the client table with the bonus table.
# The 'on' parameter is optional (by default pandas joins on the columns
# whose names appear in both tables), but naming the key explicitly is
# useful when two or more columns coincide in both tables.
#
# validate="one_to_one" makes pandas raise MergeError if clientId is
# repeated on either side, so an accidental duplicate key cannot silently
# multiply rows in what is meant to be a one-to-one merge.
#
dataset_full = pd.merge(
    datasets_12,
    bonus,
    on="clientId",
    validate="one_to_one",
)
dataset_full

Unnamed: 0,clientId,name,location,bonus
0,10,Omar Y. Fletcher,6833 Mollis. Rd.,279
1,11,Buffy W. Vincent,"P.O. Box 345, 8390 Ante Avenue",138
2,12,Mira N. Franklin,"P.O. Box 445, 323 Cursus Rd.",227
3,13,Ferris Q. Le,Ap #791-3809 Eu Street,160
4,14,Michael I. Gray,6715 Diam. Rd.,267
5,15,Alan T. Mullins,512-3640 Nisl Rd.,231
6,16,Morgan W. Zamora,416-6030 Vivamus Road,169
7,17,Lilah O. Morrison,3859 Mauris Ave,263
8,18,Chantale Z. Kelley,3433 Arcu. Rd.,233
9,19,Randall Q. Mcclure,Ap #584-7470 Nibh. Ave,292


## Fusión uno-a-uno con registros incompletos

In [8]:
%%writefile /tmp/bonus.csv
clientId,bonus
10,279
13,160
27,215
16,169
17,263
18,233
20,208
21,247
22,240
28,463
11,138
12,227
24,248
26,291

Overwriting /tmp/bonus.csv


In [9]:
#
# Note that merge returned only the records with complete information:
# clients that have no row in the bonus file are absent from the result
# (the default join type is "inner", spelled out here).
#
bonus = pd.read_csv(filepath_or_buffer="/tmp/bonus.csv")
dataset_full = datasets_12.merge(bonus, how="inner", on="clientId")
dataset_full

Unnamed: 0,clientId,name,location,bonus
0,10,Omar Y. Fletcher,6833 Mollis. Rd.,279
1,11,Buffy W. Vincent,"P.O. Box 345, 8390 Ante Avenue",138
2,12,Mira N. Franklin,"P.O. Box 445, 323 Cursus Rd.",227
3,13,Ferris Q. Le,Ap #791-3809 Eu Street,160
4,16,Morgan W. Zamora,416-6030 Vivamus Road,169
5,17,Lilah O. Morrison,3859 Mauris Ave,263
6,18,Chantale Z. Kelley,3433 Arcu. Rd.,233
7,20,Baker C. Hurst,249-6250 Velit. Rd.,208
8,21,Bevis W. Molina,"P.O. Box 935, 1521 At, St.",247
9,22,Martina B. Schroeder,689-7600 Mi St.,240


## Fusión muchos-a-uno

In [10]:
%%writefile /tmp/sales.csv
clientId,month,sales
10,jan,1239
10,feb,387
11,jan,454
11,mar,495
11,sep,145
12,may,4959
12,dec,493
12,oct,4981
12,jan,484
15,may,394
15,sep,585
17,feb,487
17,jun,455
19,dec,948

Overwriting /tmp/sales.csv


In [11]:
# Load the monthly sales records (columns: clientId, month, sales); the same
# clientId can appear in several rows. Preview the first rows.
sales = pd.read_csv(filepath_or_buffer="/tmp/sales.csv")
sales.head(n=5)

Unnamed: 0,clientId,month,sales
0,10,jan,1239
1,10,feb,387
2,11,jan,454
3,11,mar,495
4,11,sep,145


In [12]:
# Reload the first client table (one row per clientId) and display it in full.
dataset_1 = pd.read_csv(filepath_or_buffer="/tmp/dataset_1.csv")
dataset_1

Unnamed: 0,clientId,name,location
0,10,Omar Y. Fletcher,6833 Mollis. Rd.
1,11,Buffy W. Vincent,"P.O. Box 345, 8390 Ante Avenue"
2,12,Mira N. Franklin,"P.O. Box 445, 323 Cursus Rd."
3,13,Ferris Q. Le,Ap #791-3809 Eu Street
4,14,Michael I. Gray,6715 Diam. Rd.
5,15,Alan T. Mullins,512-3640 Nisl Rd.
6,16,Morgan W. Zamora,416-6030 Vivamus Road
7,17,Lilah O. Morrison,3859 Mauris Ave
8,18,Chantale Z. Kelley,3433 Arcu. Rd.
9,19,Randall Q. Mcclure,Ap #584-7470 Nibh. Ave


In [13]:
# Many-to-one merge: several sales rows can point to the same client.
# The key is named explicitly instead of relying on the implicit default
# (join on every column name the two tables share), and
# validate="many_to_one" makes pandas raise MergeError if clientId were
# repeated in the client table, which would duplicate sales rows.
pd.merge(
    sales,
    dataset_1[["clientId", "name"]],
    on="clientId",
    validate="many_to_one",
)

Unnamed: 0,clientId,month,sales,name
0,10,jan,1239,Omar Y. Fletcher
1,10,feb,387,Omar Y. Fletcher
2,11,jan,454,Buffy W. Vincent
3,11,mar,495,Buffy W. Vincent
4,11,sep,145,Buffy W. Vincent
5,12,may,4959,Mira N. Franklin
6,12,dec,493,Mira N. Franklin
7,12,oct,4981,Mira N. Franklin
8,12,jan,484,Mira N. Franklin
9,15,may,394,Alan T. Mullins


## Fusión muchos-a-muchos

In [14]:
%%writefile /tmp/lines.csv
clientId,line
10,A
10,B
10,C
11,D
12,A
12,D
13,B
13,C
13,D
14,A
14,B
15,A
16,A
17,B
18,C
19,D

Overwriting /tmp/lines.csv


In [15]:
# Read the sales records again so this section starts from the file contents,
# then preview the first rows.
sales = pd.read_csv(filepath_or_buffer="/tmp/sales.csv")
sales.head(n=5)

Unnamed: 0,clientId,month,sales
0,10,jan,1239
1,10,feb,387
2,11,jan,454
3,11,mar,495
4,11,sep,145


In [16]:
# Load the client/line table (columns: clientId, line); a client can be
# associated with several lines. Preview the first rows.
lines = pd.read_csv(filepath_or_buffer="/tmp/lines.csv")
lines.head(n=5)

Unnamed: 0,clientId,line
0,10,A
1,10,B
2,10,C
3,11,D
4,12,A


In [17]:
# Many-to-many merge: clientId repeats in both tables, so each sales row of
# a client is paired with every line of that client. The key is named
# explicitly rather than relying on the implicit default (join on every
# column name the two tables share).
pd.merge(sales, lines, on="clientId")

Unnamed: 0,clientId,month,sales,line
0,10,jan,1239,A
1,10,jan,1239,B
2,10,jan,1239,C
3,10,feb,387,A
4,10,feb,387,B
5,10,feb,387,C
6,11,jan,454,D
7,11,mar,495,D
8,11,sep,145,D
9,12,may,4959,A
