This exercise is based on the article "Pandas Join vs. Merge": https://towardsdatascience.com/pandas-join-vs-merge-c365fd4fbf49

In [2]:
import pandas as pd
import numpy as np

In [32]:
# Sales figure per employee, keyed by first name
sales = dict(Tony=103, Sally=202, Randy=380, Ellen=101, Fred=82)

In [28]:
# Region per employee, keyed by first name; np.nan marks an unknown region
region = dict(
    Tony='West',
    Sally='South',
    Carl='West',
    Archie='North',
    Randy='East',
    Ellen='South',
    Fred=np.nan,
    Mo='East',
    HanWei=np.nan,
)

In [33]:
# Build a one-column frame from the sales dict.
# orient='index' makes each dict key a row label instead of a column name.
sales_df = pd.DataFrame.from_dict(
    data=sales,
    orient='index',
    columns=['sales'],
)
sales_df

Unnamed: 0,sales
Tony,103
Sally,202
Randy,380
Ellen,101
Fred,82


In [29]:
# Build the region frame the same way as sales_df: names become the row labels,
# with a single 'Region' column
region_df = pd.DataFrame.from_dict(
    data=region,
    orient='index',
    columns=['Region'],
)
region_df

Unnamed: 0,Region
Tony,West
Sally,South
Carl,West
Archie,North
Randy,East
Ellen,South
Fred,
Mo,East
HanWei,


In [23]:
# Inspect the row labels of sales_df: with orient='index' the dict keys
# (employee names) became the index
sales_df.index

Index(['Tony', 'Sally', 'Randy', 'Ellen', 'Fred'], dtype='object')

In [24]:
# Inspect the row labels of region_df: it holds nine names, including every
# name that appears in sales_df
region_df.index

Index(['Tony', 'Sally', 'Carl', 'Archie', 'Randy', 'Ellen', 'Fred', 'Mo',
       'HanWei'],
      dtype='object')

In [34]:
# Label-based lookup: .loc selects the row whose index label is 'Tony'
# and returns it as a Series
sales_df.loc['Tony']

sales    103
Name: Tony, dtype: int64

### Join method

In [37]:
# Left join on the index: every row of region_df is kept, and 'sales' is NaN
# for names that have no entry in sales_df
joined_df = region_df.join(other=sales_df, how='left')
joined_df

Unnamed: 0,Region,sales
Tony,West,103.0
Sally,South,202.0
Carl,West,
Archie,North,
Randy,East,380.0
Ellen,South,101.0
Fred,,82.0
Mo,East,
HanWei,,


In [38]:
# Inner join on the index: only names present in both frames are kept
region_df.join(other=sales_df, how='inner')

Unnamed: 0,Region,sales
Tony,West,103
Sally,South,202
Randy,East,380
Ellen,South,101
Fred,,82


### Merge method

In [40]:
# merge() can reproduce the index-based left join above, but it must be told
# explicitly to match on both indexes
joined_df_merge = pd.merge(
    region_df,
    sales_df,
    how='left',
    left_index=True,
    right_index=True,
)
joined_df_merge

Unnamed: 0,Region,sales
Tony,West,103.0
Sally,South,202.0
Carl,West,
Archie,North,
Randy,East,380.0
Ellen,South,101.0
Fred,,82.0
Mo,East,
HanWei,,


In [46]:
# Average sales per region. groupby drops the rows whose Region is NaN, and a
# region with no recorded sales (North) averages to NaN.
# Only the numeric 'sales' column is aggregated, and reset_index() returns a new
# frame with 'Region' as a regular column instead of mutating in place.
# Note: grouped_df is re-bound by the sum cell that follows; the later merge
# uses the sums, not these means.
grouped_df = (
    joined_df_merge
    .groupby(by='Region')['sales']
    .mean()
    .reset_index()
)
grouped_df

Unnamed: 0,Region,sales
0,East,380.0
1,North,
2,South,151.5
3,West,103.0


In [47]:
# Total sales per region. groupby drops the rows whose Region is NaN, and
# sum() skips NaN values, so a region with no recorded sales (North) totals
# 0.0 rather than NaN.
# Only the numeric 'sales' column is aggregated, and reset_index() returns a new
# frame with 'Region' as a regular column instead of mutating in place.
grouped_df = (
    joined_df_merge
    .groupby(by='Region')['sales']
    .sum()
    .reset_index()
)
grouped_df

Unnamed: 0,Region,sales
0,East,380.0
1,North,0.0
2,South,303.0
3,West,103.0


In [49]:
# Attach each region's aggregated sales to every employee row by matching on
# the 'Region' column. Both frames carry a 'sales' column: the left one keeps
# its name and the right one is suffixed to 'sales_region'.
employee_contrib = joined_df_merge.merge(
    right=grouped_df,
    how='left',
    on='Region',
    suffixes=('', '_region'),
)
employee_contrib

Unnamed: 0,Region,sales,sales_region
0,West,103.0,103.0
1,South,202.0,303.0
2,West,,103.0
3,North,,0.0
4,East,380.0,380.0
5,South,101.0,303.0
6,,82.0,
7,East,,380.0
8,,,


In [51]:
# Merging on the 'Region' column replaced the employee-name index with a
# default 0..8 range, so restore the names from joined_df_merge.
# NOTE: this aligns by position, i.e. it relies on the left merge returning
# the rows in the same order as joined_df_merge.
employee_contrib = employee_contrib.set_index(joined_df_merge.index)
employee_contrib

Unnamed: 0,Region,sales,sales_region
Tony,West,103.0,103.0
Sally,South,202.0,303.0
Carl,West,,103.0
Archie,North,,0.0
Randy,East,380.0,380.0
Ellen,South,101.0,303.0
Fred,,82.0,
Mo,East,,380.0
HanWei,,,


In [52]:
# Data clean-up, step 1:
# keep only the employees whose Region is known (not NaN)
has_region = employee_contrib['Region'].notna()
employee_contrib = employee_contrib[has_region]
employee_contrib

Unnamed: 0,Region,sales,sales_region
Tony,West,103.0,103.0
Sally,South,202.0,303.0
Carl,West,,103.0
Archie,North,,0.0
Randy,East,380.0,380.0
Ellen,South,101.0,303.0
Mo,East,,380.0


In [55]:
# Treat a missing sales figure as zero sales; other columns are left untouched
employee_contrib = employee_contrib.assign(
    sales=lambda frame: frame['sales'].fillna(0)
)
employee_contrib

Unnamed: 0,Region,sales,sales_region
Tony,West,103.0,103.0
Sally,South,202.0,303.0
Carl,West,0.0,103.0
Archie,North,0.0,0.0
Randy,East,380.0,380.0
Ellen,South,101.0,303.0
Mo,East,0.0,380.0


In [57]:
# Each employee's share of their region's total sales, as a percentage.
# A region whose total is 0 (North) gives 0/0, which shows up as NaN.
share_of_region = employee_contrib['sales'] / employee_contrib['sales_region']
employee_contrib = employee_contrib.assign(**{'% of sales': share_of_region * 100})
employee_contrib

Unnamed: 0,Region,sales,sales_region,% of sales
Tony,West,103.0,103.0,100.0
Sally,South,202.0,303.0,66.666667
Carl,West,0.0,103.0,0.0
Archie,North,0.0,0.0,
Randy,East,380.0,380.0,100.0
Ellen,South,101.0,303.0,33.333333
Mo,East,0.0,380.0,0.0


In [58]:
# Final report: one row per employee, ordered by region and then by share of sales
report_columns = ['Region', 'sales', '% of sales']
sort_keys = ['Region', '% of sales']
contribution_report = employee_contrib[report_columns].sort_values(by=sort_keys)
print(contribution_report)

       Region  sales  % of sales
Mo       East    0.0    0.000000
Randy    East  380.0  100.000000
Archie  North    0.0         NaN
Ellen   South  101.0   33.333333
Sally   South  202.0   66.666667
Carl     West    0.0    0.000000
Tony     West  103.0  100.000000
