In [1]:
# Importing libs, reading the data
import numpy as np
import pandas as pd
retail = pd.read_csv('resources/online_retail2.csv')

In [2]:
retail.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [3]:
# how many rows and columns?
retail.shape

(1067371, 8)

In [4]:
# removing duplicates
retail = retail.drop_duplicates()

In [5]:
# after removing duplicates
retail.shape

(1033036, 8)

In [6]:
# checking null values
retail.Description.isnull()

0          False
1          False
2          False
3          False
4          False
           ...  
1067366    False
1067367    False
1067368    False
1067369    False
1067370    False
Name: Description, Length: 1033036, dtype: bool

In [7]:
# total null values
retail.Description.isnull().sum()

4275

In [8]:
# entries that are null inside rows
retail.isnull().sum().sum()

239426

In [9]:
# dropping nulls, axis = 0 means drop na from rows
retail = retail.dropna(axis = 0, how = 'any')

In [10]:
retail.shape

(797885, 8)

In [11]:
# checking column types
retail.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 797885 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Invoice      797885 non-null  object 
 1   StockCode    797885 non-null  object 
 2   Description  797885 non-null  object 
 3   Quantity     797885 non-null  int64  
 4   InvoiceDate  797885 non-null  object 
 5   Price        797885 non-null  float64
 6   Customer ID  797885 non-null  float64
 7   Country      797885 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 54.8+ MB


In [12]:
# looking at InvoceDate column
retail.InvoiceDate

0          2009-12-01 07:45:00
1          2009-12-01 07:45:00
2          2009-12-01 07:45:00
3          2009-12-01 07:45:00
4          2009-12-01 07:45:00
                  ...         
1067366    2011-12-09 12:50:00
1067367    2011-12-09 12:50:00
1067368    2011-12-09 12:50:00
1067369    2011-12-09 12:50:00
1067370    2011-12-09 12:50:00
Name: InvoiceDate, Length: 797885, dtype: object

In [13]:
# converting InvoiceDate to correct type (datetime)
retail['date'] = pd.to_datetime(retail['InvoiceDate'])

In [14]:
# confirming it worked
retail['date']

0         2009-12-01 07:45:00
1         2009-12-01 07:45:00
2         2009-12-01 07:45:00
3         2009-12-01 07:45:00
4         2009-12-01 07:45:00
                  ...        
1067366   2011-12-09 12:50:00
1067367   2011-12-09 12:50:00
1067368   2011-12-09 12:50:00
1067369   2011-12-09 12:50:00
1067370   2011-12-09 12:50:00
Name: date, Length: 797885, dtype: datetime64[ns]

In [15]:
# investigating further
retail['date'].describe(datetime_is_numeric=True)

count                           797885
mean     2011-01-02 13:17:34.141160704
min                2009-12-01 07:45:00
25%                2010-07-02 09:47:00
50%                2010-12-02 12:33:00
75%                2011-07-31 15:50:00
max                2011-12-09 12:50:00
Name: date, dtype: object

In [16]:
# getting counts of every Country
retail.Country.value_counts()

# fitlering for one specific country
retail[retail.Country == 'France']

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,date
71,489439,22065,CHRISTMAS PUDDING TRINKET POT,12,2009-12-01 09:28:00,1.45,12682.0,France,2009-12-01 09:28:00
72,489439,22138,BAKING SET 9 PIECE RETROSPOT,9,2009-12-01 09:28:00,4.95,12682.0,France,2009-12-01 09:28:00
73,489439,22139,RETRO SPOT TEA SET CERAMIC 11 PC,9,2009-12-01 09:28:00,4.95,12682.0,France,2009-12-01 09:28:00
74,489439,22352,LUNCHBOX WITH CUTLERY RETROSPOT,12,2009-12-01 09:28:00,2.55,12682.0,France,2009-12-01 09:28:00
75,489439,85014A,BLACK/BLUE DOTS RUFFLED UMBRELLA,3,2009-12-01 09:28:00,5.95,12682.0,France,2009-12-01 09:28:00
...,...,...,...,...,...,...,...,...,...
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France,2011-12-09 12:50:00
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France,2011-12-09 12:50:00
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France,2011-12-09 12:50:00
1067369,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France,2011-12-09 12:50:00


In [17]:
# multiple conditions filtering
retail[(retail.Country == 'France') | (retail.Country == 'EIRE')].Country.value_counts()

EIRE      16014
France    13897
Name: Country, dtype: int64

In [18]:
# creating a list for filtering
countries = ['France', 'EIRE', 'Spain']

In [19]:
retail[retail.Country.isin(countries)].Country.value_counts()

EIRE      16014
France    13897
Spain      3754
Name: Country, dtype: int64

In [20]:
# filtering for dates
retail[retail.date >= '2011-Aug']

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,date
810882,561904,22075,6 RIBBONS ELEGANT CHRISTMAS,96,2011-08-01 08:30:00,1.45,17941.0,United Kingdom,2011-08-01 08:30:00
810883,561904,85049E,SCANDINAVIAN REDS RIBBONS,156,2011-08-01 08:30:00,1.06,17941.0,United Kingdom,2011-08-01 08:30:00
810884,561905,21385,IVORY HANGING DECORATION HEART,24,2011-08-01 09:31:00,0.85,14947.0,United Kingdom,2011-08-01 09:31:00
810885,561905,84970L,SINGLE HEART ZINC T-LIGHT HOLDER,12,2011-08-01 09:31:00,0.95,14947.0,United Kingdom,2011-08-01 09:31:00
810886,561905,84970S,HANGING HEART ZINC T-LIGHT HOLDER,12,2011-08-01 09:31:00,0.85,14947.0,United Kingdom,2011-08-01 09:31:00
...,...,...,...,...,...,...,...,...,...
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France,2011-12-09 12:50:00
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France,2011-12-09 12:50:00
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France,2011-12-09 12:50:00
1067369,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France,2011-12-09 12:50:00


In [21]:
retail[(retail.date >= '2011-Aug-01') & (retail.date >= '2011-Sep-01')]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,date
846166,565080,20677,PINK POLKADOT BOWL,8,2011-09-01 08:25:00,1.25,13509.0,United Kingdom,2011-09-01 08:25:00
846167,565080,22128,PARTY CONES CANDY ASSORTED,24,2011-09-01 08:25:00,1.25,13509.0,United Kingdom,2011-09-01 08:25:00
846169,565082,22423,REGENCY CAKESTAND 3 TIER,2,2011-09-01 09:15:00,12.75,13305.0,United Kingdom,2011-09-01 09:15:00
846170,565082,15060B,FAIRY CAKE DESIGN UMBRELLA,8,2011-09-01 09:15:00,3.75,13305.0,United Kingdom,2011-09-01 09:15:00
846171,565082,23245,SET OF 3 REGENCY CAKE TINS,4,2011-09-01 09:15:00,4.95,13305.0,United Kingdom,2011-09-01 09:15:00
...,...,...,...,...,...,...,...,...,...
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France,2011-12-09 12:50:00
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France,2011-12-09 12:50:00
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France,2011-12-09 12:50:00
1067369,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France,2011-12-09 12:50:00


In [22]:
# Replacing values in the 'Country' and 'StockCode' columns to make specific changes
# 'EIRE' is replaced with 'Eastern Ireland' in the 'Country' column,
# and 'POST' is replaced with 'post' in the 'StockCode' column.
retail['Country'][retail.Country == 'EIRE'] = 'Eastern Ireland'
retail['StockCode'][retail.StockCode == 'POST'] = 'post'

In [23]:
retail[retail.StockCode == 'post']

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,date
89,489439,post,POSTAGE,3,2009-12-01 09:28:00,18.00,12682.0,France,2009-12-01 09:28:00
126,489444,post,POSTAGE,1,2009-12-01 09:55:00,141.00,12636.0,USA,2009-12-01 09:55:00
173,489447,post,POSTAGE,1,2009-12-01 10:10:00,130.00,12362.0,Belgium,2009-12-01 10:10:00
625,489526,post,POSTAGE,6,2009-12-01 11:50:00,18.00,12533.0,Germany,2009-12-01 11:50:00
927,C489538,post,POSTAGE,-1,2009-12-01 12:18:00,9.58,15796.0,United Kingdom,2009-12-01 12:18:00
...,...,...,...,...,...,...,...,...,...
1066677,581494,post,POSTAGE,2,2011-12-09 10:13:00,18.00,12518.0,Germany,2011-12-09 10:13:00
1067191,581570,post,POSTAGE,1,2011-12-09 11:59:00,18.00,12662.0,Germany,2011-12-09 11:59:00
1067228,581574,post,POSTAGE,2,2011-12-09 12:09:00,18.00,12526.0,Germany,2011-12-09 12:09:00
1067229,581578,post,POSTAGE,3,2011-12-09 12:16:00,18.00,12713.0,Germany,2011-12-09 12:16:00


In [24]:
# Use .loc to slice based on datetime index for the year 2011
# retail_2011 = retail.loc['2011']
# retail_2011.head()

In [25]:
retail = retail.reset_index()

In [26]:
# setting multiple index for slicing the data further
retail = retail.set_index(['Country', 'date'])

In [27]:
# sorting by indexes ascending
retail_sorted = retail.sort_index()
retail_sorted

Unnamed: 0_level_0,Unnamed: 1_level_0,index,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID
Country,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Australia,2009-12-01 10:33:00,178,C489449,22087,PAPER BUNTING WHITE LACE,-12,2009-12-01 10:33:00,2.95,16321.0
Australia,2009-12-01 10:33:00,179,C489449,85206A,CREAM FELT EASTER EGG BASKET,-6,2009-12-01 10:33:00,1.65,16321.0
Australia,2009-12-01 10:33:00,180,C489449,21895,POTTING SHED SOW 'N' GROW SET,-4,2009-12-01 10:33:00,4.25,16321.0
Australia,2009-12-01 10:33:00,181,C489449,21896,POTTING SHED TWINE,-6,2009-12-01 10:33:00,2.10,16321.0
Australia,2009-12-01 10:33:00,182,C489449,22083,PAPER CHAIN KIT RETRO SPOT,-12,2009-12-01 10:33:00,2.95,16321.0
...,...,...,...,...,...,...,...,...,...
West Indies,2010-08-23 11:58:00,314701,520018,20733,GOLD MINI TAPE MEASURE,3,2010-08-23 11:58:00,0.85,18140.0
West Indies,2010-08-23 11:58:00,314702,520018,20734,SILVER MINI TAPE MEASURE,3,2010-08-23 11:58:00,0.85,18140.0
West Indies,2010-08-23 11:58:00,314703,520018,20702,PINK PADDED MOBILE,2,2010-08-23 11:58:00,4.25,18140.0
West Indies,2010-08-23 11:58:00,314704,520018,21678,PAISLEY PATTERN STICKERS,6,2010-08-23 11:58:00,0.85,18140.0


In [28]:
# filtering further - remembering that there are two levels of indexes
retail_sorted.loc[('France', '2011'), ['Customer ID', 'Description']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Customer ID,Description
Country,date,Unnamed: 2_level_1,Unnamed: 3_level_1
France,2011-01-05 11:13:00,12494.0,RED RETROSPOT CAKE STAND
France,2011-01-05 11:36:00,12683.0,RED RETROSPOT CAKE STAND
France,2011-01-05 12:42:00,12681.0,GINGERBREAD MAN COOKIE CUTTER
France,2011-01-05 12:42:00,12681.0,CUTE CATS TAPE
France,2011-01-05 12:42:00,12681.0,CABIN BAG VINTAGE RETROSPOT
France,...,...,...
France,2011-12-09 12:50:00,12680.0,CHILDREN'S APRON DOLLY GIRL
France,2011-12-09 12:50:00,12680.0,CHILDRENS CUTLERY DOLLY GIRL
France,2011-12-09 12:50:00,12680.0,CHILDRENS CUTLERY CIRCUS PARADE
France,2011-12-09 12:50:00,12680.0,BAKING SET 9 PIECE RETROSPOT


In [29]:
### group by and aggregations
# count() - number of non-null observations
# sum() - sum of values
# mean() - mean of values
# median() - median of values
# min() - minimum
# max() - maximum
# mode() - mode
# std() - standard deviation
# var() - variance

In [30]:
retail = retail.reset_index()

In [31]:
retail.head()

Unnamed: 0,Country,date,index,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID
0,United Kingdom,2009-12-01 07:45:00,0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0
1,United Kingdom,2009-12-01 07:45:00,1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0
2,United Kingdom,2009-12-01 07:45:00,2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0
3,United Kingdom,2009-12-01 07:45:00,3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0
4,United Kingdom,2009-12-01 07:45:00,4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0


In [32]:
retail.groupby('Country').head()

Unnamed: 0,Country,date,index,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID
0,United Kingdom,2009-12-01 07:45:00,0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0
1,United Kingdom,2009-12-01 07:45:00,1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0
2,United Kingdom,2009-12-01 07:45:00,2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0
3,United Kingdom,2009-12-01 07:45:00,3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,13085.0
4,United Kingdom,2009-12-01 07:45:00,4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0
...,...,...,...,...,...,...,...,...,...,...
511759,European Community,2011-04-26 10:54:00,693610,551013,22839,3 TIER CAKE TIN GREEN AND CREAM,1,2011-04-26 10:54:00,14.95,15108.0
511760,European Community,2011-04-26 10:54:00,693611,551013,22840,ROUND CAKE TIN VINTAGE RED,2,2011-04-26 10:54:00,7.95,15108.0
511761,European Community,2011-04-26 10:54:00,693612,551013,22841,ROUND CAKE TIN VINTAGE GREEN,2,2011-04-26 10:54:00,7.95,15108.0
511762,European Community,2011-04-26 10:54:00,693613,551013,22457,NATURAL SLATE HEART CHALKBOARD,6,2011-04-26 10:54:00,2.95,15108.0


In [33]:
# calculating total sales for each country
retail.groupby('Country')['Quantity'].sum()

Country
Australia                103375
Austria                   11306
Bahrain                     755
Belgium                   34598
Brazil                      545
Canada                     3657
Channel Islands           20387
Cyprus                    10652
Czech Republic              592
Denmark                  234764
Eastern Ireland          309717
European Community          497
Finland                   14317
France                   179959
Germany                  221816
Greece                     7707
Iceland                    2967
Israel                     5119
Italy                     15122
Japan                     30138
Korea                       598
Lebanon                     386
Lithuania                  2306
Malta                      2491
Netherlands              381853
Nigeria                     103
Norway                    23528
Poland                     5504
Portugal                  27072
RSA                         943
Saudi Arabia                 75


In [34]:
# calculating mean qty and price for each country and description
retail.groupby(['Country', 'Description'])[['Quantity', 'Price']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Quantity,Price
Country,Description,Unnamed: 2_level_1,Unnamed: 3_level_1
Australia,DOLLY GIRL BEAKER,200.0,1.08
Australia,I LOVE LONDON MINI BACKPACK,4.0,4.15
Australia,10 COLOUR SPACEBOY PEN,48.0,0.85
Australia,12 PENCIL SMALL TUBE WOODLAND,384.0,0.55
Australia,12 PENCILS SMALL TUBE RED SPOTTY,24.0,0.65
...,...,...,...
West Indies,VINTAGE BEAD PINK SCARF,3.0,7.95
West Indies,WHITE AND BLUE CERAMIC OIL BURNER,6.0,1.25
West Indies,WOODLAND PARTY BAG + STICKER SET,1.0,1.65
West Indies,WOVEN BERRIES CUSHION COVER,2.0,4.95


In [35]:
# multiple aggregations
retail_grouped = retail.groupby(['Country', 'Description'])[['Quantity', 'Price']].agg([np.mean, np.median])
retail_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Quantity,Quantity,Price,Price
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,mean,median
Country,Description,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Australia,DOLLY GIRL BEAKER,200.0,200.0,1.08,1.08
Australia,I LOVE LONDON MINI BACKPACK,4.0,4.0,4.15,4.15
Australia,10 COLOUR SPACEBOY PEN,48.0,48.0,0.85,0.85
Australia,12 PENCIL SMALL TUBE WOODLAND,384.0,384.0,0.55,0.55
Australia,12 PENCILS SMALL TUBE RED SPOTTY,24.0,24.0,0.65,0.65
...,...,...,...,...,...
West Indies,VINTAGE BEAD PINK SCARF,3.0,3.0,7.95,7.95
West Indies,WHITE AND BLUE CERAMIC OIL BURNER,6.0,6.0,1.25,1.25
West Indies,WOODLAND PARTY BAG + STICKER SET,1.0,1.0,1.65,1.65
West Indies,WOVEN BERRIES CUSHION COVER,2.0,2.0,4.95,4.95


In [36]:
retail_grouped = retail_grouped.sort_index()
retail_grouped.loc[('Australia', '10 COLOUR SPACEBOY PEN'), ('Price', 'median')]

0.85

In [37]:
## without indexing
retail_grouped_no_index = retail.groupby(['Country', 'Description'])[['Quantity', 'Price']].agg([np.mean, np.median]).reset_index()
retail_grouped_no_index

Unnamed: 0_level_0,Country,Description,Quantity,Quantity,Price,Price
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,median,mean,median
0,Australia,DOLLY GIRL BEAKER,200.0,200.0,1.08,1.08
1,Australia,I LOVE LONDON MINI BACKPACK,4.0,4.0,4.15,4.15
2,Australia,10 COLOUR SPACEBOY PEN,48.0,48.0,0.85,0.85
3,Australia,12 PENCIL SMALL TUBE WOODLAND,384.0,384.0,0.55,0.55
4,Australia,12 PENCILS SMALL TUBE RED SPOTTY,24.0,24.0,0.65,0.65
...,...,...,...,...,...,...
29509,West Indies,VINTAGE BEAD PINK SCARF,3.0,3.0,7.95,7.95
29510,West Indies,WHITE AND BLUE CERAMIC OIL BURNER,6.0,6.0,1.25,1.25
29511,West Indies,WOODLAND PARTY BAG + STICKER SET,1.0,1.0,1.65,1.65
29512,West Indies,WOVEN BERRIES CUSHION COVER,2.0,2.0,4.95,4.95


In [38]:
retail_grouped_no_index = retail_grouped_no_index.droplevel(axis = 1, level = 0)
retail_grouped_no_index

Unnamed: 0,Unnamed: 1,Unnamed: 2,mean,median,mean.1,median.1
0,Australia,DOLLY GIRL BEAKER,200.0,200.0,1.08,1.08
1,Australia,I LOVE LONDON MINI BACKPACK,4.0,4.0,4.15,4.15
2,Australia,10 COLOUR SPACEBOY PEN,48.0,48.0,0.85,0.85
3,Australia,12 PENCIL SMALL TUBE WOODLAND,384.0,384.0,0.55,0.55
4,Australia,12 PENCILS SMALL TUBE RED SPOTTY,24.0,24.0,0.65,0.65
...,...,...,...,...,...,...
29509,West Indies,VINTAGE BEAD PINK SCARF,3.0,3.0,7.95,7.95
29510,West Indies,WHITE AND BLUE CERAMIC OIL BURNER,6.0,6.0,1.25,1.25
29511,West Indies,WOODLAND PARTY BAG + STICKER SET,1.0,1.0,1.65,1.65
29512,West Indies,WOVEN BERRIES CUSHION COVER,2.0,2.0,4.95,4.95


In [39]:
retail.groupby(['Country', 'Description']).agg(mean_qty = ('Quantity', np.mean),
                                              mean_price = ('Price', np.mean),
                                              median_qty = ('Quantity', np.median),
                                              median_price = ('Price', np.median)).reset_index()

Unnamed: 0,Country,Description,mean_qty,mean_price,median_qty,median_price
0,Australia,DOLLY GIRL BEAKER,200.0,1.08,200.0,1.08
1,Australia,I LOVE LONDON MINI BACKPACK,4.0,4.15,4.0,4.15
2,Australia,10 COLOUR SPACEBOY PEN,48.0,0.85,48.0,0.85
3,Australia,12 PENCIL SMALL TUBE WOODLAND,384.0,0.55,384.0,0.55
4,Australia,12 PENCILS SMALL TUBE RED SPOTTY,24.0,0.65,24.0,0.65
...,...,...,...,...,...,...
29509,West Indies,VINTAGE BEAD PINK SCARF,3.0,7.95,3.0,7.95
29510,West Indies,WHITE AND BLUE CERAMIC OIL BURNER,6.0,1.25,6.0,1.25
29511,West Indies,WOODLAND PARTY BAG + STICKER SET,1.0,1.65,1.0,1.65
29512,West Indies,WOVEN BERRIES CUSHION COVER,2.0,4.95,2.0,4.95


In [40]:
# subsetting before pivoting
country_date_quantity = retail[['Country', 'date', 'Quantity']]
country_date_quantity.head()

Unnamed: 0,Country,date,Quantity
0,United Kingdom,2009-12-01 07:45:00,12
1,United Kingdom,2009-12-01 07:45:00,12
2,United Kingdom,2009-12-01 07:45:00,12
3,United Kingdom,2009-12-01 07:45:00,48
4,United Kingdom,2009-12-01 07:45:00,24


In [41]:
# pivoting
country_pivot = pd.pivot_table(
    country_date_quantity, 
    index = 'date', 
    columns = 'Country', 
    values = 'Quantity', 
    fill_value = 0,
    aggfunc = np.sum).reset_index()

country_pivot

#swapped NaN for 0 as NaN means there were no sales at a particular datetime

Country,date,Australia,Austria,Bahrain,Belgium,Brazil,Canada,Channel Islands,Cyprus,Czech Republic,...,Singapore,Spain,Sweden,Switzerland,Thailand,USA,United Arab Emirates,United Kingdom,Unspecified,West Indies
0,2009-12-01 07:45:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,166,0,0
1,2009-12-01 07:46:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,60,0,0
2,2009-12-01 09:06:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,193,0,0
3,2009-12-01 09:08:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,145,0,0
4,2009-12-01 09:24:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,826,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41434,2011-12-09 12:23:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,76,0,0
41435,2011-12-09 12:25:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,120,0,0
41436,2011-12-09 12:31:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,278,0,0
41437,2011-12-09 12:49:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,66,0,0


In [42]:
# subsetting just the UK sales
country_pivot[['date', 'United Kingdom']]

Country,date,United Kingdom
0,2009-12-01 07:45:00,166
1,2009-12-01 07:46:00,60
2,2009-12-01 09:06:00,193
3,2009-12-01 09:08:00,145
4,2009-12-01 09:24:00,826
...,...,...
41434,2011-12-09 12:23:00,76
41435,2011-12-09 12:25:00,120
41436,2011-12-09 12:31:00,278
41437,2011-12-09 12:49:00,66


In [43]:
# altering the subset of df to include Price
country_date_quantity = retail[['Country', 'date', 'Quantity', 'Price']]
country_date_quantity.head()

# pivoting
country_pivot = pd.pivot_table(
    country_date_quantity, 
    index = 'date', 
    columns = 'Country', 
    values = ['Quantity', 'Price'], 
    fill_value = 0,
    aggfunc = np.sum).reset_index()

# now the df has two levels - price and qty
country_pivot

Unnamed: 0_level_0,date,Price,Price,Price,Price,Price,Price,Price,Price,Price,...,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity
Country,Unnamed: 1_level_1,Australia,Austria,Bahrain,Belgium,Brazil,Canada,Channel Islands,Cyprus,Czech Republic,...,Singapore,Spain,Sweden,Switzerland,Thailand,USA,United Arab Emirates,United Kingdom,Unspecified,West Indies
0,2009-12-01 07:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,166,0,0
1,2009-12-01 07:46:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,60,0,0
2,2009-12-01 09:06:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,193,0,0
3,2009-12-01 09:08:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,145,0,0
4,2009-12-01 09:24:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,826,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41434,2011-12-09 12:23:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,76,0,0
41435,2011-12-09 12:25:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,120,0,0
41436,2011-12-09 12:31:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,278,0,0
41437,2011-12-09 12:49:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,66,0,0


In [44]:
# pivoting with two different agg functions in aggfunc argument
country_pivot = pd.pivot_table(
    country_date_quantity, 
    index = 'date', 
    columns = 'Country', 
    values = ['Quantity', 'Price'], 
    fill_value = 0,
    aggfunc = {
        'Quantity': np.sum,
        'Price': np.mean
    }).reset_index()

country_pivot

Unnamed: 0_level_0,date,Price,Price,Price,Price,Price,Price,Price,Price,Price,...,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity
Country,Unnamed: 1_level_1,Australia,Austria,Bahrain,Belgium,Brazil,Canada,Channel Islands,Cyprus,Czech Republic,...,Singapore,Spain,Sweden,Switzerland,Thailand,USA,United Arab Emirates,United Kingdom,Unspecified,West Indies
0,2009-12-01 07:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,166,0,0
1,2009-12-01 07:46:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,60,0,0
2,2009-12-01 09:06:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,193,0,0
3,2009-12-01 09:08:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,145,0,0
4,2009-12-01 09:24:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,826,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41434,2011-12-09 12:23:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,76,0,0
41435,2011-12-09 12:25:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,120,0,0
41436,2011-12-09 12:31:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,278,0,0
41437,2011-12-09 12:49:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,66,0,0


In [45]:
country_pivot.columns = country_pivot.columns.map('_'.join)
country_pivot

Unnamed: 0,date_,Price_Australia,Price_Austria,Price_Bahrain,Price_Belgium,Price_Brazil,Price_Canada,Price_Channel Islands,Price_Cyprus,Price_Czech Republic,...,Quantity_Singapore,Quantity_Spain,Quantity_Sweden,Quantity_Switzerland,Quantity_Thailand,Quantity_USA,Quantity_United Arab Emirates,Quantity_United Kingdom,Quantity_Unspecified,Quantity_West Indies
0,2009-12-01 07:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,166,0,0
1,2009-12-01 07:46:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,60,0,0
2,2009-12-01 09:06:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,193,0,0
3,2009-12-01 09:08:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,145,0,0
4,2009-12-01 09:24:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,826,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41434,2011-12-09 12:23:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,76,0,0
41435,2011-12-09 12:25:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,120,0,0
41436,2011-12-09 12:31:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,278,0,0
41437,2011-12-09 12:49:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,66,0,0


In [46]:
pd.melt(country_pivot, id_vars = ['date_'], var_name = ['Measure'])

Unnamed: 0,date_,Measure,value
0,2009-12-01 07:45:00,Price_Australia,0.0
1,2009-12-01 07:46:00,Price_Australia,0.0
2,2009-12-01 09:06:00,Price_Australia,0.0
3,2009-12-01 09:08:00,Price_Australia,0.0
4,2009-12-01 09:24:00,Price_Australia,0.0
...,...,...,...
3397993,2011-12-09 12:23:00,Quantity_West Indies,0.0
3397994,2011-12-09 12:25:00,Quantity_West Indies,0.0
3397995,2011-12-09 12:31:00,Quantity_West Indies,0.0
3397996,2011-12-09 12:49:00,Quantity_West Indies,0.0


In [48]:
### joining

designation_data = pd.DataFrame({'name': ['mike', 'jonathan', 'Mo', 'Lisa', 'Raj', 'Teo'],
                                 'title': ['manager', 'supervisor', 'director', 'associate', 'assistant', 'sectionhead']})

age_data = pd.DataFrame({'name': ['mike', 'jonathan', 'lee', 'Lisa', 'tom', 'Teo'],
                                 'age': ['40', '50', '34', '52', '25', '80']})

In [49]:
designation_data

Unnamed: 0,name,title
0,mike,manager
1,jonathan,supervisor
2,Mo,director
3,Lisa,associate
4,Raj,assistant
5,Teo,sectionhead


In [50]:
age_data

Unnamed: 0,name,age
0,mike,40
1,jonathan,50
2,lee,34
3,Lisa,52
4,tom,25
5,Teo,80


In [51]:
# left join
pd.merge(designation_data, age_data, how = 'left')

Unnamed: 0,name,title,age
0,mike,manager,40.0
1,jonathan,supervisor,50.0
2,Mo,director,
3,Lisa,associate,52.0
4,Raj,assistant,
5,Teo,sectionhead,80.0


In [53]:
# full join
pd.merge(designation_data, age_data, how = 'outer')

Unnamed: 0,name,title,age
0,mike,manager,40.0
1,jonathan,supervisor,50.0
2,Mo,director,
3,Lisa,associate,52.0
4,Raj,assistant,
5,Teo,sectionhead,80.0
6,lee,,34.0
7,tom,,25.0


In [54]:
# inner join
pd.merge(designation_data, age_data, how = 'inner')

Unnamed: 0,name,title,age
0,mike,manager,40
1,jonathan,supervisor,50
2,Lisa,associate,52
3,Teo,sectionhead,80
