## Understanding the differences between concat, join, and merge

In [2]:
import pandas as pd
import numpy as np

In [3]:
years = 2016, 2017, 2018
stock_tables = [pd.read_csv(
    "../python_cookbook/data/stocks_{}.csv".format(year), index_col="Symbol") for year in years]
stocks_2016, stocks_2017, stocks_2018 = stock_tables

In [4]:
pd.concat(stock_tables, keys=[2016, 2017, 2018])

Unnamed: 0_level_0,Unnamed: 1_level_0,Shares,Low,High
Unnamed: 0_level_1,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,AAPL,80,95,110
2016,TSLA,50,80,130
2016,WMT,40,55,70
2017,AAPL,50,120,140
2017,GE,100,30,40
2017,IBM,87,75,95
2017,SLB,20,55,85
2017,TXN,500,15,23
2017,TSLA,100,100,300
2018,AAPL,40,135,170


In [5]:
pd.concat(dict(zip(years, stock_tables)), axis="columns", join="outer")

Unnamed: 0_level_0,2016,2016,2016,2017,2017,2017,2018,2018,2018
Unnamed: 0_level_1,Shares,Low,High,Shares,Low,High,Shares,Low,High
Symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
AAPL,80.0,95.0,110.0,50.0,120.0,140.0,40.0,135.0,170.0
TSLA,50.0,80.0,130.0,100.0,100.0,300.0,50.0,220.0,400.0
WMT,40.0,55.0,70.0,,,,,,
GE,,,,100.0,30.0,40.0,,,
IBM,,,,87.0,75.0,95.0,,,
SLB,,,,20.0,55.0,85.0,,,
TXN,,,,500.0,15.0,23.0,,,
AMZN,,,,,,,8.0,900.0,1125.0


In [6]:
stocks_2016.join(stocks_2017, how="outer", lsuffix="_2016", rsuffix="_2017")

Unnamed: 0_level_0,Shares_2016,Low_2016,High_2016,Shares_2017,Low_2017,High_2017
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,80.0,95.0,110.0,50.0,120.0,140.0
GE,,,,100.0,30.0,40.0
IBM,,,,87.0,75.0,95.0
SLB,,,,20.0,55.0,85.0
TSLA,50.0,80.0,130.0,100.0,100.0,300.0
TXN,,,,500.0,15.0,23.0
WMT,40.0,55.0,70.0,,,


In [7]:
other = [stocks_2017.add_suffix("_2017"),
         stocks_2018.add_suffix("_2018")]
stocks_2017.add_suffix("_2016").join(other, how="outer")

Unnamed: 0_level_0,Shares_2016,Low_2016,High_2016,Shares_2017,Low_2017,High_2017,Shares_2018,Low_2018,High_2018
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AAPL,50.0,120.0,140.0,50.0,120.0,140.0,40.0,135.0,170.0
GE,100.0,30.0,40.0,100.0,30.0,40.0,,,
IBM,87.0,75.0,95.0,87.0,75.0,95.0,,,
SLB,20.0,55.0,85.0,20.0,55.0,85.0,,,
TXN,500.0,15.0,23.0,500.0,15.0,23.0,,,
TSLA,100.0,100.0,300.0,100.0,100.0,300.0,50.0,220.0,400.0
AMZN,,,,,,,8.0,900.0,1125.0


In [8]:
stocks_2016.merge(stocks_2017, left_index=True, right_index=True, how="outer")

Unnamed: 0_level_0,Shares_x,Low_x,High_x,Shares_y,Low_y,High_y
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,80.0,95.0,110.0,50.0,120.0,140.0
GE,,,,100.0,30.0,40.0
IBM,,,,87.0,75.0,95.0
SLB,,,,20.0,55.0,85.0
TSLA,50.0,80.0,130.0,100.0,100.0,300.0
TXN,,,,500.0,15.0,23.0
WMT,40.0,55.0,70.0,,,


In [9]:
stock_merge = (stocks_2016
                   .merge(stocks_2017, left_index=True, right_index=True, how="outer",
                          suffixes=("_2016", "_2017"))
                   .merge(stocks_2018.add_suffix("_2018"), left_index=True, right_index=True,
                          how="outer")
              )
stock_merge

Unnamed: 0_level_0,Shares_2016,Low_2016,High_2016,Shares_2017,Low_2017,High_2017,Shares_2018,Low_2018,High_2018
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AAPL,80.0,95.0,110.0,50.0,120.0,140.0,40.0,135.0,170.0
AMZN,,,,,,,8.0,900.0,1125.0
GE,,,,100.0,30.0,40.0,,,
IBM,,,,87.0,75.0,95.0,,,
SLB,,,,20.0,55.0,85.0,,,
TSLA,50.0,80.0,130.0,100.0,100.0,300.0,50.0,220.0,400.0
TXN,,,,500.0,15.0,23.0,,,
WMT,40.0,55.0,70.0,,,,,,


In [10]:
stock_join = stocks_2016.add_suffix("_2016").join(other, how="outer")

stock_concat = (pd.concat(
    dict(zip(years, stock_tables)), axis="columns")
    .swaplevel(axis=1)
    .pipe(lambda df_: df_.set_axis(df_.columns.to_flat_index(), axis=1))
    .rename(lambda label: "_".join([str(x) for x in label]), axis=1)
)
stock_join.equals(stock_concat)

True

In [11]:
stock_concat.sort_index().equals(stock_merge)

True

In [13]:
names = ["prices", "transactions"]
food_tables = [pd.read_csv("../python_cookbook/data/food_{}.csv".format(name)) for name in names]
food_prices, food_transactions = food_tables

In [15]:
food_prices

Unnamed: 0,item,store,price,Date
0,pear,A,0.99,2017
1,pear,B,1.99,2017
2,peach,A,2.99,2017
3,peach,B,3.49,2017
4,banana,A,0.39,2017
5,banana,B,0.49,2017
6,steak,A,5.99,2017
7,steak,B,6.99,2017
8,steak,B,4.99,2015


In [14]:
food_transactions

Unnamed: 0,custid,item,store,quantity
0,1,pear,A,5
1,1,banana,A,10
2,2,steak,B,3
3,2,pear,B,1
4,2,peach,B,2
5,2,steak,B,1
6,2,coconut,B,4


In [18]:
food_transactions.merge(food_prices, on=["item", "store"])

Unnamed: 0,custid,item,store,quantity,price,Date
0,1,pear,A,5,0.99,2017
1,1,banana,A,10,0.39,2017
2,2,steak,B,3,6.99,2017
3,2,steak,B,3,4.99,2015
4,2,steak,B,1,6.99,2017
5,2,steak,B,1,4.99,2015
6,2,pear,B,1,1.99,2017
7,2,peach,B,2,3.49,2017


In [21]:
food_transactions.merge(food_prices.query("Date == 2017"), how="left")

Unnamed: 0,custid,item,store,quantity,price,Date
0,1,pear,A,5,0.99,2017.0
1,1,banana,A,10,0.39,2017.0
2,2,steak,B,3,6.99,2017.0
3,2,pear,B,1,1.99,2017.0
4,2,peach,B,2,3.49,2017.0
5,2,steak,B,1,6.99,2017.0
6,2,coconut,B,4,,


In [22]:
food_prices_join = food_prices.query("Date == 2017").set_index(["item", "store"])
food_prices_join

Unnamed: 0_level_0,Unnamed: 1_level_0,price,Date
item,store,Unnamed: 2_level_1,Unnamed: 3_level_1
pear,A,0.99,2017
pear,B,1.99,2017
peach,A,2.99,2017
peach,B,3.49,2017
banana,A,0.39,2017
banana,B,0.49,2017
steak,A,5.99,2017
steak,B,6.99,2017


In [26]:
food_transactions.join(food_prices_join, on=["item", "store"])

Unnamed: 0,custid,item,store,quantity,price,Date
0,1,pear,A,5,0.99,2017.0
1,1,banana,A,10,0.39,2017.0
2,2,steak,B,3,6.99,2017.0
3,2,pear,B,1,1.99,2017.0
4,2,peach,B,2,3.49,2017.0
5,2,steak,B,1,6.99,2017.0
6,2,coconut,B,4,,


In [29]:
import glob

df_list = []
for filename in glob.glob("../python_cookbook/data/gas prices/*.csv"):
    df_list.append(pd.read_csv(filename, index_col="Week", parse_dates=["Week"]))
gas = pd.concat(df_list, axis="columns")
gas

Unnamed: 0_level_0,All Grades,Diesel,Midgrade,Premium,Regular
Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-25,2.701,2.788,2.859,3.105,2.583
2017-09-18,2.750,2.791,2.906,3.151,2.634
2017-09-11,2.800,2.802,2.953,3.197,2.685
2017-09-04,2.794,2.758,2.946,3.191,2.679
2017-08-28,2.513,2.605,2.668,2.901,2.399
...,...,...,...,...,...
2007-01-29,2.213,2.413,2.277,2.381,2.165
2007-01-22,2.216,2.430,2.285,2.391,2.165
2007-01-15,2.280,2.463,2.347,2.453,2.229
2007-01-08,2.354,2.537,2.418,2.523,2.306
