# Task 2 

In [21]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [22]:
# Load data
data = pd.read_csv('QVI_data.csv')
data

Unnamed: 0,LYLTY_CARD_NBR,DATE,STORE_NBR,TXN_ID,PROD_NBR,PROD_NAME,PROD_QTY,TOT_SALES,PACK_SIZE,BRAND,LIFESTAGE,PREMIUM_CUSTOMER
0,1000,2018-10-17,1,1,5,Natural Chip Compny SeaSalt175g,2,6.0,175,NATURAL,YOUNG SINGLES/COUPLES,Premium
1,1002,2018-09-16,1,2,58,Red Rock Deli Chikn&Garlic Aioli 150g,1,2.7,150,RRD,YOUNG SINGLES/COUPLES,Mainstream
2,1003,2019-03-07,1,3,52,Grain Waves Sour Cream&Chives 210G,1,3.6,210,GRNWVES,YOUNG FAMILIES,Budget
3,1003,2019-03-08,1,4,106,Natural ChipCo Hony Soy Chckn175g,1,3.0,175,NATURAL,YOUNG FAMILIES,Budget
4,1004,2018-11-02,1,5,96,WW Original Stacked Chips 160g,1,1.9,160,WOOLWORTHS,OLDER SINGLES/COUPLES,Mainstream
...,...,...,...,...,...,...,...,...,...,...,...,...
264829,2370701,2018-12-08,88,240378,24,Grain Waves Sweet Chilli 210g,2,7.2,210,GRNWVES,YOUNG FAMILIES,Mainstream
264830,2370751,2018-10-01,88,240394,60,Kettle Tortilla ChpsFeta&Garlic 150g,2,9.2,150,KETTLE,YOUNG FAMILIES,Premium
264831,2370961,2018-10-24,88,240480,70,Tyrrells Crisps Lightly Salted 165g,2,8.4,165,TYRRELLS,OLDER FAMILIES,Budget
264832,2370961,2018-10-27,88,240481,65,Old El Paso Salsa Dip Chnky Tom Ht300g,2,10.2,300,OLD,OLDER FAMILIES,Budget


In [27]:
# Check for null values
data.isna().value_counts()

LYLTY_CARD_NBR  DATE   STORE_NBR  TXN_ID  PROD_NBR  PROD_NAME  PROD_QTY  TOT_SALES  PACK_SIZE  BRAND  LIFESTAGE  PREMIUM_CUSTOMER  MONTH
False           False  False      False   False     False      False     False      False      False  False      False             False    264834
Name: count, dtype: int64

# Select control stores

We will start by defining a few metrics by which we will select stores. As the trial stores need to be similar to the trial stores, we will match based on these metrics. In order to do this we will need to transform the data first.

- Create a month column in the format of yyyymm

In [30]:
# Convert date column to datetime
data['DATE'] = pd.to_datetime(data['DATE'])

# Add month column in the format of yyyymm
data['MONTH'] = data['DATE'].dt.strftime('%Y%m').astype(int)
data.head()

Unnamed: 0,LYLTY_CARD_NBR,DATE,STORE_NBR,TXN_ID,PROD_NBR,PROD_NAME,PROD_QTY,TOT_SALES,PACK_SIZE,BRAND,LIFESTAGE,PREMIUM_CUSTOMER,MONTH
0,1000,2018-10-17,1,1,5,Natural Chip Compny SeaSalt175g,2,6.0,175,NATURAL,YOUNG SINGLES/COUPLES,Premium,201810
1,1002,2018-09-16,1,2,58,Red Rock Deli Chikn&Garlic Aioli 150g,1,2.7,150,RRD,YOUNG SINGLES/COUPLES,Mainstream,201809
2,1003,2019-03-07,1,3,52,Grain Waves Sour Cream&Chives 210G,1,3.6,210,GRNWVES,YOUNG FAMILIES,Budget,201903
3,1003,2019-03-08,1,4,106,Natural ChipCo Hony Soy Chckn175g,1,3.0,175,NATURAL,YOUNG FAMILIES,Budget,201903
4,1004,2018-11-02,1,5,96,WW Original Stacked Chips 160g,1,1.9,160,WOOLWORTHS,OLDER SINGLES/COUPLES,Mainstream,201811


- Aggregate the data into the monthly amounts of each column split by Store number

In [31]:
measureOverTime = data.groupby(["STORE_NBR", 'MONTH']).agg(
    totSales=('TOT_SALES', 'sum'),
    nCustomers=('LYLTY_CARD_NBR', 'nunique'),
    nTxnPerCust=('TXN_ID', 'count'),
    nChipsPerTxn=('PROD_QTY', 'sum'),
).reset_index()

# Calculate the average price per unit
measureOverTime['avgPricePerUnit'] = measureOverTime['totSales'] / measureOverTime['nChipsPerTxn']

# Sort by month
measureOverTime = measureOverTime.sort_values(['STORE_NBR','MONTH'])

measureOverTime

Unnamed: 0,STORE_NBR,MONTH,totSales,nCustomers,nTxnPerCust,nChipsPerTxn,avgPricePerUnit
0,1,201807,206.9,49,52,62,3.337097
1,1,201808,176.1,42,43,54,3.261111
2,1,201809,278.8,59,62,75,3.717333
3,1,201810,188.1,44,45,58,3.243103
4,1,201811,192.6,46,47,57,3.378947
...,...,...,...,...,...,...,...
3164,272,201902,395.5,45,48,91,4.346154
3165,272,201903,442.3,50,53,101,4.379208
3166,272,201904,445.1,54,56,105,4.239048
3167,272,201905,314.6,34,40,71,4.430986


- Now we can filter the data between the pre-trial data and the full observation period stores

In [32]:
# Full observation data 

# Find stores that were observed for the full period
observe_counts = measureOverTime['STORE_NBR'].value_counts()
observe_index = observe_counts[observe_counts == 12].index

# Filter into new dataframe with only the stores found
storesWithFullObs = measureOverTime[measureOverTime['STORE_NBR'].isin(observe_index)]
storesWithFullObs.head()

Unnamed: 0,STORE_NBR,MONTH,totSales,nCustomers,nTxnPerCust,nChipsPerTxn,avgPricePerUnit
0,1,201807,206.9,49,52,62,3.337097
1,1,201808,176.1,42,43,54,3.261111
2,1,201809,278.8,59,62,75,3.717333
3,1,201810,188.1,44,45,58,3.243103
4,1,201811,192.6,46,47,57,3.378947


In [33]:
# Pre-trial data

# Filter all stores in the full observation to only have the data before the trial period
preTrialMeasures = storesWithFullObs[storesWithFullObs['MONTH'] < 201902]
preTrialMeasures.head()

Unnamed: 0,STORE_NBR,MONTH,totSales,nCustomers,nTxnPerCust,nChipsPerTxn,avgPricePerUnit
0,1,201807,206.9,49,52,62,3.337097
1,1,201808,176.1,42,43,54,3.261111
2,1,201809,278.8,59,62,75,3.717333
3,1,201810,188.1,44,45,58,3.243103
4,1,201811,192.6,46,47,57,3.378947


Now that our data is filtered we can rank the similarity between control and trial stores

- Calculate correlation

In [50]:
# Correlation calculation
def calcCorrTable (metrics, comparison, preTrialMeasures = preTrialMeasures):
    # Create table of the trial stores
    controlStore_NBRs = preTrialMeasures[~preTrialMeasures['STORE_NBR'].isin([77, 86, 88])]['STORE_NBR'].unique()
    corr_table = pd.DataFrame(columns= ['MONTH', 'TRIAL_STORE', 'CONTROL_STORE', 'CORRELATION_SCORE'])
    trialStore = preTrialMeasures[preTrialMeasures['STORE_NBR'] == comparison][metrics].reset_index()
    for control in controlStore_NBRs:
        storage_df = pd.DataFrame(columns= ['MONTH', 'TRIAL_STORE', 'CONTROL_STORE', 'CORRELATION_SCORE'])
        control_store = preTrialMeasures['STORE_NBR' == control][metrics].reset_index()
        storage_df['MONTH'] = list(preTrialMeasures[preTrialMeasures['STORE_NBR'] == comparison]['MONTH'])
        storage_df['TRIAL_STORE'] = comparison
        storage_df['CONTROL_STORE'] =   control
        storage_df['CORRELATION_SCORE'] = trialStore.corrwith(control_store, axis=1)
        corr_table = pd.concat([corr_table, storage_df])
    return corr_table

In [51]:
# Create correlation table
corr_table = pd.DataFrame(columns= ['MONTH', 'TRIAL_STORE', 'CONTROL_STORE', 'CORRELATION_SCORE'])
for trial_num in [77, 86, 88]:
    corr_table = pd.concat([corr_table, calcCorrTable(["totSales", "nCustomers", "nTxnPerCust", "nChipsPerTxn", "avgPricePerUnit"], trial_num)])

corr_table.head(8)

KeyError: False