## Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

%load_ext autoreload
%autoreload 2

In [2]:
# Paths to the restaurant CSV files, relative to the notebook's working directory.
# The sales paths get their own names so that `week1` / `week2` only ever refer
# to DataFrames and never to a path string.
file_name_foods = 'data/restaurant/foods.csv'
file_name_customers = 'data/restaurant/customers.csv'
file_name_week1 = 'data/restaurant/week_1_sales.csv'
file_name_week2 = 'data/restaurant/week_2_sales.csv'

# Load the data
foods = pd.read_csv(file_name_foods)
customers = pd.read_csv(file_name_customers)
week1 = pd.read_csv(file_name_week1)
week2 = pd.read_csv(file_name_week2)

## 01 Concatenation

In [3]:
# Preview the first rows of the week 1 sales data.
week1.head()

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9


In [4]:
# Preview the first rows of the week 2 sales data.
week2.head()

Unnamed: 0,Customer ID,Food ID
0,688,10
1,813,7
2,495,10
3,189,5
4,267,3


In [6]:
# Compare the (rows, columns) shape of both weeks before stacking them.
week1.shape, week2.shape

((250, 2), (250, 2))

In [5]:
# Concatenate the two weeks row-wise. Each frame keeps its own index labels,
# so the labels used by week1 appear again on the week2 rows.
pd.concat([week1, week2])

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9
...,...,...
245,783,10
246,556,10
247,547,9
248,252,9


In [None]:
# Concatenate the two weeks and ignore the index: the original labels are
# dropped and the result is renumbered with a fresh 0..n-1 index.
pd.concat([week1, week2], ignore_index=True)

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9
...,...,...
495,783,10
496,556,10
497,547,9
498,252,9


In [9]:
# Stack both weeks, tagging each frame's rows with an outer index level
# ('week1' / 'week2'), then order the rows by that two-level index.
(
    pd.concat([week1, week2], keys=['week1', 'week2'])
    .sort_index()
)

Unnamed: 0,Unnamed: 1,Customer ID,Food ID
week1,0,537,9
week1,1,97,4
week1,2,658,1
week1,3,202,2
week1,4,155,9
...,...,...,...
week2,245,783,10
week2,246,556,10
week2,247,547,9
week2,248,252,9


In [5]:
# Two single-column frames sharing the default 0..2 index:
# df1 holds column 'A', df2 holds column 'B'.
df1 = pd.DataFrame({'A': [1, 2, 3]})
df2 = pd.DataFrame({'B': [4, 5, 6]})

# Place the frames side by side, aligning rows on their index.
pd.concat([df1, df2], axis='columns')

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [10]:
# Two frames whose columns only partly overlap ('A' is shared) and are not in
# alphabetical order, so the effect of `sort` on the column order is visible.
df1 = pd.DataFrame({'B': [1, 2], 'A': [3, 4]})
df2 = pd.DataFrame({'C': [5, 6], 'A': [7, 8]})
frames = [df1, df2]

# sort=False (default): columns stay in order of first appearance.
# sort=True: the non-concatenation axis (the columns) is sorted.
result1 = pd.concat(frames, sort=False)
result2 = pd.concat(frames, sort=True)

print("sort=False:\n", result1)
print("\nsort=True:\n", result2)

sort=False:
      B  A    C
0  1.0  3  NaN
1  2.0  4  NaN
0  NaN  7  5.0
1  NaN  8  6.0

sort=True:
    A    B    C
0  3  1.0  NaN
1  4  2.0  NaN
0  7  NaN  5.0
1  8  NaN  6.0


## 02 Left and right joins

In [10]:
# Preview week1 again: 'Food ID' is the column it will be joined to foods on.
week1.head()

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9


In [17]:
# Sorted distinct Food IDs sold in week 1 vs. those listed in the foods table.
np.sort(week1['Food ID'].unique()), np.sort(foods['Food ID'].unique())

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]))

In [19]:
# Keep only the week1 rows whose Food ID is not 10, so that foods contains an
# ID with no matching sale (used to contrast left and right joins).
week1_mod = week1.loc[week1['Food ID'].ne(10)]

In [20]:
# Confirm that Food ID 10 no longer appears in the filtered frame.
week1_mod['Food ID'].sort_values().unique()

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [22]:
# Number of rows left after dropping the Food ID 10 sales.
week1_mod.shape

(226, 2)

In [21]:
# Left join: every row of week1_mod is kept and gains the columns of the
# foods row with the same 'Food ID'.
week1_mod.merge(foods, how='left', on='Food ID')

Unnamed: 0,Customer ID,Food ID,Food Item,Price
0,537,9,Donut,0.99
1,97,4,Quesadilla,4.25
2,658,1,Sushi,3.99
3,202,2,Burrito,9.99
4,155,9,Donut,0.99
...,...,...,...,...
221,621,9,Donut,0.99
222,413,9,Donut,0.99
223,926,6,Pasta,13.99
224,134,3,Taco,2.99


In [23]:
# Right join: every row of foods is kept, including any food that has no
# matching sale in week1_mod (its week1_mod columns are filled with NaN).
week1_mod.merge(foods, how='right', on='Food ID')

Unnamed: 0,Customer ID,Food ID,Food Item,Price
0,658.0,1,Sushi,3.99
1,600.0,1,Sushi,3.99
2,155.0,1,Sushi,3.99
3,341.0,1,Sushi,3.99
4,20.0,1,Sushi,3.99
...,...,...,...,...
222,271.0,9,Donut,0.99
223,380.0,9,Donut,0.99
224,621.0,9,Donut,0.99
225,413.0,9,Donut,0.99


## 03 `left_on` and `right_on` parameters

In [25]:
# week2 identifies the customer in a column named 'Customer ID'.
week2.head()

Unnamed: 0,Customer ID,Food ID
0,688,10
1,813,7
2,495,10
3,189,5
4,267,3


In [26]:
# customers stores the same identifier in a column named 'ID'.
customers.head()

Unnamed: 0,ID,First Name,Last Name,Gender,Company,Occupation
0,1,Joseph,Perkins,Male,Dynazzy,Community Outreach Specialist
1,2,Jennifer,Alvarez,Female,DabZ,Senior Quality Engineer
2,3,Roger,Black,Male,Tagfeed,Account Executive
3,4,Steven,Evans,Male,Fatz,Registered Nurse
4,5,Judy,Morrison,Female,Demivee,Legal Assistant


In [28]:
# The key is called 'Customer ID' in week2 but 'ID' in customers, so name each
# side explicitly. Both key columns end up in the result; drop the redundant 'ID'.
(
    week2
    .merge(customers, how='left', left_on='Customer ID', right_on='ID')
    .drop(columns='ID')
)

Unnamed: 0,Customer ID,Food ID,First Name,Last Name,Gender,Company,Occupation
0,688,10,Carl,Williamson,Male,Thoughtmix,Graphic Designer
1,813,7,Johnny,Walker,Male,Kayveo,Developer II
2,495,10,Deborah,Little,Female,Babbleblab,VP Accounting
3,189,5,Roger,Gordon,Male,Skilith,Operator
4,267,3,Matthew,Wood,Male,Agimba,Product Engineer
...,...,...,...,...,...,...,...
245,783,10,Phyllis,Meyer,Female,Voolia,Information Systems Manager
246,556,10,Samuel,Bailey,Male,Oyoloo,Nurse
247,547,9,Tina,Watkins,Female,Thoughtstorm,Accountant II
248,252,9,Douglas,Powell,Male,Jetwire,Geologist IV


## 04 Inner join

In [39]:
# Inner join on Customer ID: only customers present in both weeks survive.
# The suffixes tell the two 'Food ID' columns apart by week.
weeks_inner = week1.merge(
    week2,
    how='inner',
    on='Customer ID',
    suffixes=(' week1', ' week2'),
)

In [48]:
# Verify that the inner join kept exactly the Customer IDs common to both weeks.

# Distinct customer IDs per week, as plain Python ints.
set1 = set(week1['Customer ID'].unique().astype(int).tolist())
set2 = set(week2['Customer ID'].unique().astype(int).tolist())

# IDs present in both weeks, sorted so they can be compared with a sorted list.
intersection = sorted(set1 & set2)
print(f'Intersection: {intersection[:10]} length: {len(intersection)}')

# Distinct customer IDs that actually appear in the join result, sorted.
keys = weeks_inner['Customer ID'].sort_values().unique().astype(int).tolist()
print(f'Keys in weeks_inner: {keys[:10]} length: {len(keys)}')

# Check if the keys in weeks_inner are the same as the intersection
if keys != intersection:
    print("The keys in weeks_inner are NOT the same as the intersection of week1 and week2.")
else:
    print("The keys in 'weeks_inner' are the SAME as the 'intersection' of week1 and week2.")

Intersection: [21, 30, 45, 75, 77, 80, 101, 155, 163, 189] length: 46
Keys in weeks_inner: [21, 30, 45, 75, 77, 80, 101, 155, 163, 189] length: 46
The keys in 'weeks_inner' are the SAME as the 'intersection' of week1 and week2.


In [49]:
# Preview the join result: one 'Food ID' column per week, named via the suffixes.
weeks_inner.head()

Unnamed: 0,Customer ID,Food ID week1,Food ID week2
0,537,9,5
1,155,9,3
2,503,5,8
3,503,5,9
4,155,1,3


In [51]:
# Look up customer 537's rows in each week to cross-check the join result.
week1.loc[week1['Customer ID'].eq(537)], week2.loc[week2['Customer ID'].eq(537)]

(   Customer ID  Food ID
 0          537        9,
     Customer ID  Food ID
 42          537        5)

In [53]:
# Look up customer 155's rows in each week to cross-check the join result.
week1.loc[week1['Customer ID'].eq(155)], week2.loc[week2['Customer ID'].eq(155)]

(    Customer ID  Food ID
 4           155        9
 17          155        1,
      Customer ID  Food ID
 208          155        3)

In [4]:
# Inner join on the (Customer ID, Food ID) pair: keeps only customers who
# ordered the same item in both weeks. Rows are paired per matching order, so
# a pair can show up more than once.
weeks_inner_food = week1.merge(
    week2,
    how='inner',
    on=['Customer ID', 'Food ID'],
    suffixes=(' week1', ' week2'),
)
weeks_inner_food

Unnamed: 0,Customer ID,Food ID
0,304,3
1,540,3
2,937,10
3,233,3
4,21,4
5,922,1
6,21,4
7,578,5
8,578,5


In [5]:
# Let's prove that the inner join has only the Customer IDs and Food IDs that are present in both weeks
# Each row is turned into a (Customer ID, Food ID) tuple so that the pairs can
# be stored in sets and compared.
set1 = set(week1[['Customer ID', 'Food ID']].apply(tuple, axis=1).unique().tolist())
set2 = set(week2[['Customer ID', 'Food ID']].apply(tuple, axis=1).unique().tolist())
intersection = set1.intersection(set2)
intersection = sorted(list(intersection))
print(f'Intersection: {intersection[:10]} length: {len(intersection)}')

# Distinct (Customer ID, Food ID) pairs present in the join result, sorted.
keys = weeks_inner_food[['Customer ID', 'Food ID']].apply(tuple, axis=1).sort_values().unique()
keys = keys.tolist()
print(f'Keys in weeks_inner_food: {keys[:10]} length: {len(keys)}')

if keys == intersection:
    print("The keys in 'weeks_inner_food' are the SAME as the 'intersection' of week1 and week2.")
else:
    # Report a mismatch explicitly so that a failed check is never silent.
    print("The keys in 'weeks_inner_food' are NOT the same as the 'intersection' of week1 and week2.")

Intersection: [(21, 4), (233, 3), (304, 3), (540, 3), (578, 5), (922, 1), (937, 10)] length: 7
Keys in weeks_inner_food: [(21, 4), (233, 3), (304, 3), (540, 3), (578, 5), (922, 1), (937, 10)] length: 7
The keys in 'weeks_inner_food' are the SAME as the 'intersection' of week1 and week2.


In [12]:
# Select the 'Customer ID' / 'Food ID' column pair as a two-column DataFrame.
week1[['Customer ID', 'Food ID']]

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9
...,...,...
245,413,9
246,926,6
247,134,3
248,396,6


In [11]:
# Apply `tuple` to every row (axis='columns' passes one row at a time), giving
# a Series whose values are (Customer ID, Food ID) tuples.
week1[['Customer ID', 'Food ID']].apply(tuple, axis='columns')

0       (537, 9)
1        (97, 4)
2       (658, 1)
3       (202, 2)
4       (155, 9)
         ...    
245     (413, 9)
246     (926, 6)
247     (134, 3)
248     (396, 6)
249    (535, 10)
Length: 250, dtype: object

In [15]:
# Show what happens to a single row: iterrows() yields (index label, row Series)
# pairs; only the first row is inspected.
for idx, row in week1[['Customer ID', 'Food ID']].head(1).iterrows():
    print(row)  # the row arrives as a Series
    t = tuple(row)  # its values become a plain tuple
    print(f'\n{t} type:{type(t)}')

Customer ID    537
Food ID          9
Name: 0, dtype: int64

(537, 9) type:<class 'tuple'>


## 05 Merging using an index

In [16]:
# Preview week1: 'Customer ID' is the column that will be matched against an index.
week1.head()

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9


In [18]:
# Move 'ID' from a column into the index so that later joins can match on it.
# The guard keeps this cell idempotent: once 'ID' is the index it is no longer
# a column, and calling set_index('ID') a second time would raise a KeyError.
if 'ID' in customers.columns:
    customers = customers.set_index('ID')

In [19]:
# Preview customers after re-indexing: 'ID' is now the row index.
customers.head()

Unnamed: 0_level_0,First Name,Last Name,Gender,Company,Occupation
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Joseph,Perkins,Male,Dynazzy,Community Outreach Specialist
2,Jennifer,Alvarez,Female,DabZ,Senior Quality Engineer
3,Roger,Black,Male,Tagfeed,Account Executive
4,Steven,Evans,Male,Fatz,Registered Nurse
5,Judy,Morrison,Female,Demivee,Legal Assistant


In [20]:
# Match week1's 'Customer ID' column against the index of customers
# (right_index=True) instead of against a column.
week1.merge(customers, how='left', left_on='Customer ID', right_index=True)

Unnamed: 0,Customer ID,Food ID,First Name,Last Name,Gender,Company,Occupation
0,537,9,Cheryl,Carroll,Female,Zoombeat,Registered Nurse
1,97,4,Amanda,Watkins,Female,Ozu,Account Coordinator
2,658,1,Patrick,Webb,Male,Browsebug,Community Outreach Specialist
3,202,2,Louis,Campbell,Male,Rhynoodle,Account Representative III
4,155,9,Carolyn,Diaz,Female,Gigazoom,Database Administrator III
...,...,...,...,...,...,...,...
245,413,9,Diane,Bailey,Female,Wikibox,Technical Writer
246,926,6,Anne,Wagner,Female,Skyba,Legal Assistant
247,134,3,Diana,Hall,Female,Quinu,Financial Advisor
248,396,6,Juan,Romero,Male,Zoonder,Analyst Programmer


In [21]:
# The same result with join method: `on` names the column of week1 that is
# matched against the index of customers.
week1.join(customers, on='Customer ID', how='left')

Unnamed: 0,Customer ID,Food ID,First Name,Last Name,Gender,Company,Occupation
0,537,9,Cheryl,Carroll,Female,Zoombeat,Registered Nurse
1,97,4,Amanda,Watkins,Female,Ozu,Account Coordinator
2,658,1,Patrick,Webb,Male,Browsebug,Community Outreach Specialist
3,202,2,Louis,Campbell,Male,Rhynoodle,Account Representative III
4,155,9,Carolyn,Diaz,Female,Gigazoom,Database Administrator III
...,...,...,...,...,...,...,...
245,413,9,Diane,Bailey,Female,Wikibox,Technical Writer
246,926,6,Anne,Wagner,Female,Skyba,Legal Assistant
247,134,3,Diana,Hall,Female,Quinu,Financial Advisor
248,396,6,Juan,Romero,Male,Zoonder,Analyst Programmer


## 06 Coding challenge

In [12]:
# 1 Concatenate the two weeks of sales data into one DataFrame. Assign the week1
# DataFrame a key of "Week 1" and the week2 DataFrame a key of "Week 2".
# Passing a mapping to concat uses its keys as the outer index level.
week = pd.concat({'Week 1': week1, 'Week 2': week2})
week

Unnamed: 0,Unnamed: 1,Customer ID,Food ID
Week 1,0,537,9
Week 1,1,97,4
Week 1,2,658,1
Week 1,3,202,2
Week 1,4,155,9
...,...,...,...
Week 2,245,783,10
Week 2,246,556,10
Week 2,247,547,9
Week 2,248,252,9


In [None]:
# 2 Find the customers who ate at the restaurant both weeks using inner join on 'Customer ID'.
# The two 'Food ID' columns get the default '_x' (week1) / '_y' (week2) suffixes.
week1.merge(week2, how='inner', on='Customer ID')

Unnamed: 0,Customer ID,Food ID_x,Food ID_y
0,537,9,5
1,155,9,3
2,503,5,8
3,503,5,9
4,155,1,3
...,...,...,...
57,945,5,4
58,343,3,5
59,343,3,2
60,343,3,7


In [14]:
# 3 Find the customers who ate at the restaurant both weeks and ordered the same
# item each week using inner join on 'Customer ID' and 'Food ID'
week1.merge(week2, how='inner', on=['Customer ID', 'Food ID'])

Unnamed: 0,Customer ID,Food ID
0,304,3
1,540,3
2,937,10
3,233,3
4,21,4
5,922,1
6,21,4
7,578,5
8,578,5


In [19]:
# 4 Identify which customers came in only on Week 1 and only on Week 2 using outer join
# indicator=True adds a '_merge' column with 'left_only' (week1 only),
# 'right_only' (week2 only) or 'both'.
outer = week1.merge(week2, how='outer', on='Customer ID', indicator=True)

# Keep the rows that did not find a match on the other side.
outer.loc[outer['_merge'] != 'both']

Unnamed: 0,Customer ID,Food ID_x,Food ID_y,_merge
0,3,2.0,,left_only
1,8,,6.0,right_only
2,10,2.0,,left_only
3,13,,2.0,right_only
4,20,1.0,,left_only
...,...,...,...,...
449,985,5.0,,left_only
450,991,2.0,,left_only
451,994,,2.0,right_only
452,996,,10.0,right_only


In [None]:
# 5 Each row in the week1 DataFrame identifies a customer who purchased a food
# item. For each row, pull in the customer’s information from the customers
# DataFrame
# `customers` may or may not already be indexed by 'ID', depending on which
# cells have run, so normalize it to an ID-indexed frame first.
customers_by_id = customers.set_index('ID') if 'ID' in customers.columns else customers

# Left join keeps every week1 row; 'Customer ID' is matched against the ID index.
week1.merge(customers_by_id, how='left', left_on='Customer ID', right_index=True)
