## 4.6 MERGING AND EXPORTING DATA

### This Script contains the following points:

#### 1. Create dictionaries of data (to experiment with)
#### 2. Convert the dictionaries into dataframes
#### 3. Concatenate the dataframes
#### 4. Merging the Data
#### 5. Indicator Argument to Verify Merge
#### 6. Test Merge without Overwriting
#### 7. Merging InstaCart Data

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os

#### 1. Create dictionaries of data (to experiment with)

In [2]:
# January 2020 sample purchases: each list holds one value per customer,
# in the same order as the 'customer_id' list
data1 = {
    'customer_id': ['6732', '767', '890', '635'],
    'month': ['Jan-20'] * 4,
    'purchased_meat': [0, 13, 3, 4],
    'purchased_alcohol': [1, 2, 10, 0],
    'purchased_snacks': [10, 5, 1, 7],
}

In [3]:
# February 2020 sample purchases for the same four customers,
# using the same keys as the January dictionary
data2 = {
    'customer_id': ['6732', '767', '890', '635'],
    'month': ['Feb-20'] * 4,
    'purchased_meat': [0, 10, 5, 3],
    'purchased_alcohol': [2, 4, 14, 0],
    'purchased_snacks': [15, 3, 2, 6],
}

#### 2. Convert the dictionaries into dataframes

In [4]:
# Convert each dictionary into a dataframe: dictionary keys become the
# column names and the rows are labelled explicitly with index 0-3.
# df holds the January data (data1), df_1 the February data (data2).
df = pd.DataFrame(data1,index=[0, 1, 2, 3])
df_1 = pd.DataFrame(data2,index=[0, 1, 2, 3])

In [5]:
# Display the January dataframe
df

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Jan-20,0,1,10
1,767,Jan-20,13,2,5
2,890,Jan-20,3,10,1
3,635,Jan-20,4,0,7


In [6]:
# Display the February dataframe
df_1

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Feb-20,0,2,15
1,767,Feb-20,10,4,3
2,890,Feb-20,5,14,2
3,635,Feb-20,3,0,6


#### 3. Concatenate the dataframes

In [7]:
# Collect the January and February dataframes in a list so they can be
# passed to pd.concat() together

frames = [df, df_1]

In [8]:
# Check output: the list should contain both dataframes, January first

frames

[  customer_id   month  purchased_meat  purchased_alcohol  purchased_snacks
 0        6732  Jan-20               0                  1                10
 1         767  Jan-20              13                  2                 5
 2         890  Jan-20               3                 10                 1
 3         635  Jan-20               4                  0                 7,
   customer_id   month  purchased_meat  purchased_alcohol  purchased_snacks
 0        6732  Feb-20               0                  2                15
 1         767  Feb-20              10                  4                 3
 2         890  Feb-20               5                 14                 2
 3         635  Feb-20               3                  0                 6]

In [9]:
# Verify that frames is a plain Python list (not a dataframe)

type(frames)

list

In [10]:
# Concatenate the dataframes using the default options: the frames are
# stacked on top of each other (long format) and each keeps its own
# index labels, so labels 0-3 appear twice in the result

df_concat = pd.concat(frames)

In [11]:
# Check output: expect eight rows (four per month) with the same five columns

df_concat

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Jan-20,0,1,10
1,767,Jan-20,13,2,5
2,890,Jan-20,3,10,1
3,635,Jan-20,4,0,7
0,6732,Feb-20,0,2,15
1,767,Feb-20,10,4,3
2,890,Feb-20,5,14,2
3,635,Feb-20,3,0,6


In [12]:
# Concatenate side by side (wide format) by passing axis = 1; rows are
# aligned on their index labels and the columns of both frames are kept.
# NOTE: this overwrites the long-format df_concat created earlier.

df_concat = pd.concat(frames, axis = 1)

In [13]:
# Check output: expect four rows with the January columns followed by the
# February columns

df_concat

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,customer_id.1,month.1,purchased_meat.1,purchased_alcohol.1,purchased_snacks.1
0,6732,Jan-20,0,1,10,6732,Feb-20,0,2,15
1,767,Jan-20,13,2,5,767,Feb-20,10,4,3
2,890,Jan-20,3,10,1,890,Feb-20,5,14,2
3,635,Jan-20,4,0,7,635,Feb-20,3,0,6


#### 4. Merging the Data

In [14]:
#Create Data with different columns from df and df_1

# January 2020 data with a column that df and df_1 do not have
# ('days_purchased_on'), keyed by the same customer ids and month
data3 = {
    'customer_id': ['6732', '767', '890', '635'],
    'month': ['Jan-20'] * 4,
    'days_purchased_on': [0, 10, 4, 1],
}

In [15]:
# Convert data3 to a dataframe, labelling the rows 0-3 like df and df_1

df_2 = pd.DataFrame(data3,index=[0, 1, 2, 3])

In [16]:
# Display the new dataframe
df_2

Unnamed: 0,customer_id,month,days_purchased_on
0,6732,Jan-20,0
1,767,Jan-20,10
2,890,Jan-20,4
3,635,Jan-20,1


In [17]:
# Create a new dataframe combining df and df_2, matching rows on
# 'customer_id' only. Because 'month' exists in both frames but is not
# part of the key, the result carries it twice as month_x and month_y.

df_merged = df.merge(df_2, on = 'customer_id')

In [18]:
# Display the merged dataframe (note the month_x / month_y columns)
df_merged

Unnamed: 0,customer_id,month_x,purchased_meat,purchased_alcohol,purchased_snacks,month_y,days_purchased_on
0,6732,Jan-20,0,1,10,Jan-20,0
1,767,Jan-20,13,2,5,Jan-20,10
2,890,Jan-20,3,10,1,Jan-20,4
3,635,Jan-20,4,0,7,Jan-20,1


In [19]:
# Use both 'customer_id' and 'month' as merge keys so that 'month' is
# matched rather than duplicated, eliminating month_x and month_y.
# This overwrites the previous df_merged.

df_merged = df.merge(df_2, on = ['customer_id', 'month'])

In [20]:
# Display the merged dataframe (a single 'month' column is expected)
df_merged

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on
0,6732,Jan-20,0,1,10,0
1,767,Jan-20,13,2,5,10
2,890,Jan-20,3,10,1,4
3,635,Jan-20,4,0,7,1


#### 5. Indicator Argument to Verify Merge

In [21]:
# Repeat the merge with indicator = True, which adds a '_merge' column
# recording where each row's key was found: 'both', 'left_only' or
# 'right_only'. This makes it possible to verify the merge.

df_merged = df.merge(df_2, on = ['customer_id', 'month'], indicator = True)

In [22]:
# Display the merged dataframe including the '_merge' indicator column
df_merged

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on,_merge
0,6732,Jan-20,0,1,10,0,both
1,767,Jan-20,13,2,5,10,both
2,890,Jan-20,3,10,1,4,both
3,635,Jan-20,4,0,7,1,both


In [23]:
# Further check by running value_counts() on the _merge column: every
# row should be counted under 'both' if all keys matched

df_merged['_merge'].value_counts()

_merge
both          4
left_only     0
right_only    0
Name: count, dtype: int64

#### 6. Test Merge without Overwriting

In [24]:
# Test the merge without overwriting: the result of pd.merge() is only
# displayed, not assigned, so df_merged and the source dataframes keep
# their current contents
pd.merge(df,df_2, on = ['customer_id', 'month'], indicator = True)

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on,_merge
0,6732,Jan-20,0,1,10,0,both
1,767,Jan-20,13,2,5,10,both
2,890,Jan-20,3,10,1,4,both
3,635,Jan-20,4,0,7,1,both


#### 7. Merging InstaCart Data