<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/pandas_merge_join_concat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pandas merging, joining, and concatenating
- combine 2 or more dataframes
- in Python Pandas:
  - join matches on the index of the other dataframe (the calling dataframe uses its own index or, via `on`, one of its columns)
  - merge is more flexible and can combine using index or columns

In [9]:
# libraries needed
import numpy as np
import pandas as pd

In [10]:
# Read the four restaurant CSV files used throughout this notebook straight
# from GitHub; each call returns one dataframe.
week1_sales = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/Restaurant%20-%20Week%201%20Sales.csv',
)
week2_sales = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/Restaurant%20-%20Week%202%20Sales.csv',
)
customers = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/Restaurant%20-%20Customers.csv',
)
foods = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/Restaurant%20-%20Foods.csv',
)

In [11]:
# examine data

# each row of week1_sales is a single week-1 order: the customer and the food they ordered
week1_sales.head(n=5)     # n=5 is the default, spelled out here

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9


In [12]:
# each row of week2_sales is a single week-2 order: the customer and the food they ordered
week2_sales.head(n=5)     # n=5 is the default, spelled out here

Unnamed: 0,Customer ID,Food ID
0,688,10
1,813,7
2,495,10
3,189,5
4,267,3


In [13]:
# customers holds one row per restaurant customer
# its 'ID' column corresponds to the 'Customer ID' column of week1_sales and week2_sales
customers.head(n=5)     # n=5 is the default, spelled out here

Unnamed: 0,ID,First Name,Last Name,Gender,Company,Occupation
0,1,Joseph,Perkins,Male,Dynazzy,Community Outreach Specialist
1,2,Jennifer,Alvarez,Female,DabZ,Senior Quality Engineer
2,3,Roger,Black,Male,Tagfeed,Account Executive
3,4,Steven,Evans,Male,Fatz,Registered Nurse
4,5,Judy,Morrison,Female,Demivee,Legal Assistant


In [14]:
# foods holds one row per food item
foods.head(n=5)     # n=5 is the default, spelled out here

Unnamed: 0,Food ID,Food Item,Price
0,1,Sushi,3.99
1,2,Burrito,9.99
2,3,Taco,2.99
3,4,Quesadilla,4.25
4,5,Pizza,2.49


## pd.concat() method
- stack dataframes on top of each other
- like SQL UNION ALL
- syntax is pd.concat([df1, df2], ignore_index = True or False)
  - ignore_index = False keeps the same indices
  - ignore_index = True resets the index, i.e., a fresh one
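- older alternative syntax was df1.append(df2, ignore_index = True or False); DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so use pd.concat() on current versions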

In [15]:
# the two weekly sales frames have the same structure, so they can be stacked vertically
#
# note: the result has 500 rows, yet the index labels only run 0-249
# each input keeps its own 0-249 index, so every label appears twice
#
# for a fresh 0-499 index, pass ignore_index = True (shown further below)
pd.concat([week1_sales, week2_sales])     # first frame on top, second frame underneath

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9
...,...,...
245,783,10
246,556,10
247,547,9
248,252,9


In [22]:
# two toy frames whose index labels differ (integers vs. letters)
fake_dat_a = pd.DataFrame([1, 2, 3], index=[0, 1, 2])
fake_dat_b = pd.DataFrame([4, 5, 6], index=list('abc'))

# ignore_index=False is the default: each frame's original index labels are kept;
# ignore_index=True would discard them and build a new index instead
pd.concat([fake_dat_a, fake_dat_b], ignore_index=False)

Unnamed: 0,0
0,1
1,2
2,3
a,4
b,5
c,6


In [23]:
# build one sales dataframe that combines week1_sales and week2_sales
# ignore_index=True throws away the original labels and assigns a new index
pd.concat([week1_sales, week2_sales], ignore_index=True)

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9
...,...,...
495,783,10
496,556,10
497,547,9
498,252,9


In [24]:
# above used to be equivalent to: week1_sales.append(week2_sales, ignore_index = True)
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# so calling it on current pandas raises AttributeError
# pd.concat gives the same stacked result and works on old and new versions alike
pd.concat(
    objs = [week1_sales, week2_sales],     # week1_sales on top; week2_sales below
    ignore_index = True                    # re-index, i.e., a new index
)

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9
...,...,...
495,783,10
496,556,10
497,547,9
498,252,9


In [33]:
# passing keys gives the result a multi-index: the outer level says which frame
# a row came from (week1 or week2), so the index entries are unique and there is
# no need to add an indicator column manually
sales = pd.concat([week1_sales, week2_sales], keys=['week1', 'week2'])

sales

Unnamed: 0,Unnamed: 1,Customer ID,Food ID
week1,0,537,9
week1,1,97,4
week1,2,658,1
week1,3,202,2
week1,4,155,9
...,...,...,...
week2,245,783,10
week2,246,556,10
week2,247,547,9
week2,248,252,9
