# Merge Examples

### pandas.merge - arguments
- left
- right
- how : Default inner - left, right, inner, outer, cross
- on : Common key
- left_on :  Key from left dataframe
- right_on : Key from right dataframe
- left_index : Default False - uses index from the left dataframe to join
- right_index : Default False - uses index from the right dataframe to join
- sort : Default False, sorts on the key
- suffixes : Default ("_x", "_y")
- copy : default True
- indicator : default False : adds a column "_merge" to the dataframe with merge information
- validate : optional - checks that the merge keys are of the stated type (1:1, 1:m, m:1 or m:m) and raises an error if they are not

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

## Reading the files

In [22]:
# Load the first sheet (position 0) of the workbook as the transaction table
# and preview its first rows.
transaction = pd.read_excel(io="Merge.xlsx", sheet_name=0)

transaction.head()

Unnamed: 0,CustomerID,Product,Date,Year_Month
0,3047,C,2013-09-10,2013_09
1,4782,C,2015-11-19,2015_11
2,6896,B,2019-08-09,2019_08
3,3014,D,2014-11-28,2014_11
4,1409,E,2015-07-02,2015_07


In [23]:
# Keep only the first 10 characters of the stringified Date
# (the YYYY-MM-DD part shown in the preview).
transaction['Date'] = transaction['Date'].astype(str).str[:10]

In [4]:
# Preview the first rows again now that Date holds 10-character strings.
transaction.head()

Unnamed: 0,CustomerID,Product,Date,Year_Month
0,3047,C,2013-09-10,2013_09
1,4782,C,2015-11-19,2015_11
2,6896,B,2019-08-09,2019_08
3,3014,D,2014-11-28,2014_11
4,1409,E,2015-07-02,2015_07


In [24]:
# Load the second sheet (position 1) of the workbook as the customer table
# and preview its first rows.
customer = pd.read_excel(io="Merge.xlsx", sheet_name=1)

customer.head()

Unnamed: 0,CustomerID,Income,DOB
0,3047,1261089,1981-06-18
1,4782,1862612,1978-08-11
2,6896,1340472,1972-06-09
3,3014,1715940,1986-09-15
4,1409,449795,1985-03-01


In [25]:
# Keep only the first 10 characters of the stringified DOB
# (the YYYY-MM-DD part shown in the preview).
customer['DOB'] = customer['DOB'].astype(str).str[:10]

In [26]:
# Preview the first rows again now that DOB holds 10-character strings.
customer.head()

Unnamed: 0,CustomerID,Income,DOB
0,3047,1261089,1981-06-18
1,4782,1862612,1978-08-11
2,6896,1340472,1972-06-09
3,3014,1715940,1986-09-15
4,1409,449795,1985-03-01


In [27]:
# Load the third sheet (position 2) of the workbook as the product table
# (one Price per Product in the preview) and show its first rows.
product = pd.read_excel(io="Merge.xlsx", sheet_name=2)

product.head()

Unnamed: 0,Product,Price
0,A,25
1,B,65
2,C,45
3,D,39
4,E,71


In [28]:
# Load the fourth sheet (position 3) of the workbook: price and discount per
# Product and Year_Month. Preview its first rows.
product_season = pd.read_excel(io="Merge.xlsx", sheet_name=3)

product_season.head()

Unnamed: 0,Product,Price,Discount,Year_Month
0,A,25,0.16,2019_01
1,B,65,0.2,2019_02
2,C,45,0.28,2019_03
3,D,39,0.29,2019_04
4,E,71,0.1,2019_05


## Question 1 :  Get the age of the customer in the transaction data

In [29]:
# Left join: keep every transaction and attach the customer's columns
# (Income, DOB) wherever CustomerID matches.
transaction_age = transaction.merge(customer, how='left', on='CustomerID')

transaction_age.head()

Unnamed: 0,CustomerID,Product,Date,Year_Month,Income,DOB
0,3047,C,2013-09-10,2013_09,1261089,1981-06-18
1,4782,C,2015-11-19,2015_11,1862612,1978-08-11
2,6896,B,2019-08-09,2019_08,1340472,1972-06-09
3,3014,D,2014-11-28,2014_11,1715940,1986-09-15
4,1409,E,2015-07-02,2015_07,449795,1985-03-01


In [30]:
# Inspect the column dtypes; Date and DOB were converted to strings earlier, so
# they have to go through pd.to_datetime before any date arithmetic.
transaction_age.dtypes

CustomerID     int64
Product       object
Date          object
Year_Month    object
Income         int64
DOB           object
dtype: object

In [31]:
# Age at the time of purchase, in years: the number of days between the
# purchase date and the date of birth, divided by 365 (leap days are not
# accounted for) and rounded to one decimal place.
transaction_age['Age'] = (
    pd.to_datetime(transaction_age['Date'])
    .sub(pd.to_datetime(transaction_age['DOB']))
    .dt.days
    .div(365)
    .round(1)
)

In [32]:
# Preview the frame with the newly added Age column.
transaction_age.head()

Unnamed: 0,CustomerID,Product,Date,Year_Month,Income,DOB,Age
0,3047,C,2013-09-10,2013_09,1261089,1981-06-18,32.3
1,4782,C,2015-11-19,2015_11,1862612,1978-08-11,37.3
2,6896,B,2019-08-09,2019_08,1340472,1972-06-09,47.2
3,3014,D,2014-11-28,2014_11,1715940,1986-09-15,28.2
4,1409,E,2015-07-02,2015_07,449795,1985-03-01,30.4


## Question 2 :  Get the price of the product in the transaction data

In [33]:
# (rows, columns) of the raw transaction table: the baseline for checking that
# the left joins do not add or drop rows.
transaction.shape

(181, 4)

In [34]:
# Compare the row count with transaction.shape: a left join keeps every
# transaction, and the count only grows if a CustomerID matches several
# customer rows.
transaction_age.shape

(181, 7)

In [35]:
# Left join on Product: every transaction row is kept and gains the Price
# column from the product table.
transaction_age_price = transaction_age.merge(product, how='left', on='Product')
transaction_age_price.head()

Unnamed: 0,CustomerID,Product,Date,Year_Month,Income,DOB,Age,Price
0,3047,C,2013-09-10,2013_09,1261089,1981-06-18,32.3,45
1,4782,C,2015-11-19,2015_11,1862612,1978-08-11,37.3,45
2,6896,B,2019-08-09,2019_08,1340472,1972-06-09,47.2,65
3,3014,D,2014-11-28,2014_11,1715940,1986-09-15,28.2,39
4,1409,E,2015-07-02,2015_07,449795,1985-03-01,30.4,71


In [36]:
# The row count should match transaction_age.shape: the left join on Product
# only adds the Price column.
transaction_age_price.shape

(181, 8)

## Question 3 : Add price and discount by season to the transaction data

### Left join to keep all the transaction data

In [38]:
# Count how many rows product_season holds for each Product.
product_season['Product'].value_counts()

Product
B    12
C    12
D    12
A     7
E     5
Name: count, dtype: int64

In [39]:
# Left join on the composite key (Product, Year_Month): all transactions are
# kept, and Price/Discount are NaN where product_season has no matching row.
transaction_ps = transaction.merge(
    product_season,
    how='left',
    on=['Product', 'Year_Month'],
)

transaction_ps

Unnamed: 0,CustomerID,Product,Date,Year_Month,Price,Discount
0,3047,C,2013-09-10,2013_09,,
1,4782,C,2015-11-19,2015_11,,
2,6896,B,2019-08-09,2019_08,,
3,3014,D,2014-11-28,2014_11,,
4,1409,E,2015-07-02,2015_07,,
...,...,...,...,...,...,...
176,4631,E,2019-05-05,2019_05,71.0,0.1
177,8686,E,2015-02-07,2015_02,,
178,72,C,2016-08-11,2016_08,,
179,8012,D,2017-03-08,2017_03,,


### Inner join to keep only common records with Product and Year_Month as key

In [40]:
# Inner join on the composite key (Product, Year_Month): only transactions
# that have a matching row in product_season survive.
transaction_ps = transaction.merge(
    product_season,
    how='inner',
    on=['Product', 'Year_Month'],
)

transaction_ps

Unnamed: 0,CustomerID,Product,Date,Year_Month,Price,Discount
0,5592,B,2019-10-05,2019_10,48,0.15
1,1560,D,2021-05-26,2021_05,51,0.3
2,7279,D,2020-06-01,2020_06,50,0.14
3,2204,C,2020-12-11,2020_12,49,0.23
4,9185,A,2020-10-22,2020_10,46,0.29
5,3237,D,2019-08-17,2019_08,50,0.12
6,8537,E,2019-05-02,2019_05,71,0.1
7,187,A,2021-08-03,2021_08,40,0.3
8,2204,E,2020-03-25,2020_03,32,0.11
9,4631,E,2019-05-05,2019_05,71,0.1


## As most of the prices are missing, let's add regular price for such cases

In [41]:
# Rebuild the left-joined frame (the inner join above overwrote it) so that
# every transaction is present again, with NaN Price/Discount where
# product_season has no matching (Product, Year_Month) row.
transaction_ps = transaction.merge(
    product_season,
    how='left',
    on=['Product', 'Year_Month'],
)

In [44]:
# Display the left-joined frame; Price and Discount are NaN wherever no
# product_season row matched the transaction's Product and Year_Month.
transaction_ps

Unnamed: 0,CustomerID,Product,Date,Year_Month,Price,Discount
0,3047,C,2013-09-10,2013_09,,
1,4782,C,2015-11-19,2015_11,,
2,6896,B,2019-08-09,2019_08,,
3,3014,D,2014-11-28,2014_11,,
4,1409,E,2015-07-02,2015_07,,
...,...,...,...,...,...,...
176,4631,E,2019-05-05,2019_05,71.0,0.1
177,8686,E,2015-02-07,2015_02,,
178,72,C,2016-08-11,2016_08,,
179,8012,D,2017-03-08,2017_03,,


### Using suffixes to identify the columns from different tables

In [46]:
# Bring in the regular price from `product`. Both frames carry a Price column,
# so the empty left suffix keeps the seasonal one as "Price" and the regular
# one arrives as "Price_p".
transaction_ps2 = transaction_ps.merge(
    product,
    how='left',
    on='Product',
    suffixes=("", "_p"),
)
transaction_ps2

Unnamed: 0,CustomerID,Product,Date,Year_Month,Price,Discount,Price_p
0,3047,C,2013-09-10,2013_09,,,45
1,4782,C,2015-11-19,2015_11,,,45
2,6896,B,2019-08-09,2019_08,,,65
3,3014,D,2014-11-28,2014_11,,,39
4,1409,E,2015-07-02,2015_07,,,71
...,...,...,...,...,...,...,...
176,4631,E,2019-05-05,2019_05,71.0,0.1,71
177,8686,E,2015-02-07,2015_02,,,71
178,72,C,2016-08-11,2016_08,,,45
179,8012,D,2017-03-08,2017_03,,,39


In [47]:
# Fall back to the regular price (Price_p) wherever no seasonal price matched.
# Series.fillna aligns on the index and both columns come from the same frame,
# so this is the vectorised equivalent of a row-by-row
# "if Price is NaN then Price_p" check, without a Python-level loop over rows.
transaction_ps2['Price'] = transaction_ps2['Price'].fillna(transaction_ps2['Price_p'])

In [48]:
# A NaN Discount means no product_season row matched the transaction, i.e. no
# seasonal discount applies, so store 0 for it. The fill is written back to the
# frame (a bare fillna call returns a copy that is only displayed and then
# thrown away) and is limited to Discount so that unrelated missing values are
# not silently turned into 0.
transaction_ps2['Discount'] = transaction_ps2['Discount'].fillna(0)
transaction_ps2

Unnamed: 0,CustomerID,Product,Date,Year_Month,Price,Discount,Price_p
0,3047,C,2013-09-10,2013_09,45.0,0.0,45
1,4782,C,2015-11-19,2015_11,45.0,0.0,45
2,6896,B,2019-08-09,2019_08,65.0,0.0,65
3,3014,D,2014-11-28,2014_11,39.0,0.0,39
4,1409,E,2015-07-02,2015_07,71.0,0.0,71
...,...,...,...,...,...,...,...
176,4631,E,2019-05-05,2019_05,71.0,0.1,71
177,8686,E,2015-02-07,2015_02,71.0,0.0,71
178,72,C,2016-08-11,2016_08,45.0,0.0,45
179,8012,D,2017-03-08,2017_03,39.0,0.0,39


# Revenue by year month and count of customer

In [49]:
# Per Year_Month: customer_count is the number of non-null CustomerID rows
# (i.e. transactions, not distinct customers) and Revenue is the sum of Price.
transaction_ps2.groupby(['Year_Month']).agg(
    customer_count=pd.NamedAgg(column='CustomerID', aggfunc='count'),
    Revenue=pd.NamedAgg(column='Price', aggfunc='sum'),
)

Unnamed: 0_level_0,customer_count,Revenue
Year_Month,Unnamed: 1_level_1,Unnamed: 2_level_1
2012_05,1,39.0
2012_07,3,143.0
2012_09,1,71.0
2012_10,1,39.0
2012_12,2,116.0
...,...,...
2023_12,1,65.0
2024_02,1,39.0
2024_04,3,161.0
2024_06,2,84.0


# Revenue by product and count of customer

In [50]:
# Per Product: customer_count is the number of non-null CustomerID rows
# (i.e. transactions, not distinct customers) and Revenue is the sum of Price.
transaction_ps2.groupby(['Product']).agg(
    customer_count=pd.NamedAgg(column='CustomerID', aggfunc='count'),
    Revenue=pd.NamedAgg(column='Price', aggfunc='sum'),
)

Unnamed: 0_level_0,customer_count,Revenue
Product,Unnamed: 1_level_1,Unnamed: 2_level_1
A,40,1036.0
B,35,2258.0
C,32,1444.0
D,30,1204.0
E,44,3085.0
