<a href="https://colab.research.google.com/github/iampatgrady/Colaboratory-Analytics-Demos/blob/master/Shapley_MTA_using_UA360_Data_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Measuring MTA using Shapley



# Set Up

In [1]:
#@markdown Load Dependencies
import datetime
import numpy as np
import pandas as pd
import glob

import itertools
import requests

import seaborn as sns
from matplotlib import rcParams
# figure size in inches
rcParams['figure.figsize'] = 11.7,8.27
#!pip install gcsfs
from IPython.display import clear_output
#clear_output()

pd.options.plotting.backend = "plotly"

import plotly.express as px

In [2]:
# @markdown Authentication
is_hosted = True #@param {type:"boolean"}
import pydata_google_auth

from google.colab import auth

if is_hosted:

  auth.authenticate_user()
  credentials = None
  print('Hosted - Authenticated')

else:
  SCOPES = [
    'https://www.googleapis.com/auth/cloud-platform',
    'https://www.googleapis.com/auth/drive',
  ]

  credentials = pydata_google_auth.get_user_credentials(
      SCOPES,
      # Set auth_local_webserver to True to have a slightly more convienient
      # authorization flow. Note, this doesn't work if you're running from a
      # notebook on a remote sever, such as over SSH or with Google Colab.
      auth_local_webserver=False,
  )
  print('Local - Authenticated')


Hosted - Authenticated


In [54]:
#@markdown Variables
billable_id='billable-project-here' #@param {type:"string"}
project_id='bigquery-public-data' #@param {type:"string"}
#bucket_id='as-dev-pat-mta-test' #@param {type:"string"}
view_id='google_analytics_sample' #@param {type:"string"}
start_date = "2016-08-01" #@param {type:"date"}
end_date = "2017-08-01" #@param {type:"date"}
start_date = start_date.replace("-","")
end_date = end_date.replace("-","")
#artifact_prefix='mta_itp_test' #@param {type:"string"}
from google.cloud.bigquery import magics
magics.context.project = billable_id

# Query Data

Note:  these queries can require high memory to store

In [34]:
#@markdown Query `sql` to `df`
unscrambled_sql = '''
WITH
  data as (
    SELECT --DISTINCT
      fullVisitorId,
      visitStartTime,
      channelGrouping,
      IF(totals.transactions IS NOT NULL, 1, 0) as conversion
    FROM
        `{0}.{1}.ga_sessions_*`, UNNEST(hits) as hits
    WHERE
       _TABLE_SUFFIX BETWEEN '{2}'
       AND '{3}'
  )
SELECT
  fullVisitorId,
  ARRAY_AGG(DISTINCT channelGrouping) AS channels,
  ARRAY_AGG(channelGrouping ORDER BY visitStartTime ASC) as channels_order, -- for shapley values
  MAX(conversion) AS converted
FROM (
  SELECT
    *,
    LAG(channelGrouping) OVER(partition by fullvisitorId ORDER BY visitStartTime) as channelPrev,
    SUM(conversion) OVER (PARTITION BY fullVisitorId ORDER BY visitStartTime DESC) AS conversionGroup,
  FROM data
)
WHERE channelGrouping != channelPrev OR channelPrev IS NULL
GROUP BY fullVisitorId, conversionGroup;
'''

sql = unscrambled_sql.format(project_id,view_id,start_date,end_date)
df = pd.io.gbq.read_gbq(
    query=sql,
    project_id=billable_id,
    use_bqstorage_api=True,
    progress_bar_type='tqdm_notebook',
    credentials=credentials or None
  )
df.info()

Downloading:   0%|          |

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 714805 entries, 0 to 714804
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   fullVisitorId   714805 non-null  object
 1   channels        714805 non-null  object
 2   channels_order  714805 non-null  object
 3   converted       714805 non-null  Int64 
dtypes: Int64(1), object(3)
memory usage: 22.5+ MB


# Shapley

In [36]:
#@markdown Shapley Functions
from itertools import chain, combinations
from tqdm import tqdm
from collections import Counter


class SimplifiedShapleyAttributionModel:
    def powerset(self, x):
        s = list(x)
        return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1))

    def _phi(self, channel_index):
        S_channel = [k for k in self.journeys.keys() if channel_index in k]
        score = 0
        print(f"Computing phi for channel {channel_index}...")
        for S in tqdm(S_channel):
            score += self.journeys[S] / len(S)
        print(f"Attribution score for channel {channel_index}: {score:.2f}")
        print()
        return score

    def attribute(self, journeys):
        self.P = set(chain(*journeys))
        print("Running Simplified Shapley Attribution Model...")
        print(f"Found {len(self.P)} unique channels!")

        print("Computing journey statistics...")
        self.journeys = Counter([frozenset(journey) for journey in journeys])

        print(f"Computing attributions...")
        print()
        return {j: self._phi(j) for j in self.P}



class OrderedShapleyAttributionModel:
    def __init__(self):
        self.P = set()

    def powerset(self, x):
        s = list(x)
        return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1))

    def _r(self, S, channel_index, touchpoint_index):
        return sum(
            [
                1 / journey.count(channel_index)
                if (
                    (S == journey_set)
                    and (journey[touchpoint_index - 1] == channel_index)
                )
                else 0
                for journey, journey_set in self.indexed_journeys[len(S)]
                if touchpoint_index <= len(journey)
            ]
        )

    def _phi(self, channel_index, touchpoint_index):
        S_all = [set(S) for S in self.P_power if channel_index in S]
        score = 0
        print(
            f"Computing phi for channel {channel_index}, touchpoint {touchpoint_index}..."
        )
        for S in tqdm(S_all):
            score += self._r(S, channel_index, touchpoint_index) / len(S)
        print(
            f"Attribution score for channel {channel_index}, touchpoint {touchpoint_index}: {score:.2f}"
        )
        print()
        return score

    def attribute(self, journeys):
        self.P = set(chain(*journeys))
        print("Running Ordered Shapley Attribution Model...")
        print(f"Found {len(self.P)} unique channels!")
        self.P_power = list(self.powerset(self.P))
        self.N = max([len(journey) for journey in journeys])
        print(f"Found {self.N} maximum touchpoints!")
        self.journeys = journeys
        self.indexed_journeys = {
            i: [(S, set(S)) for S in self.journeys if len(set(S)) == i]
            for i in range(1, len(self.P) + 1)
        }
        print(f"Proceeding to attribution computation...")
        print()
        return {j: [self._phi(j, i) for i in range(1, self.N + 1)] for j in self.P}

In [37]:
df_convs = df.loc[df.converted==1].reset_index(drop=True).copy()

In [38]:
DICT_NUM_CHANNEL = {}
DICT_CHANNEL_NUM = {}

channels = set()
for r in df_convs.channels:
  for c in r:
    channels.add(c)

for i, c in enumerate(channels):
  DICT_NUM_CHANNEL[i] = c
  DICT_CHANNEL_NUM[c] = i

In [39]:
df_convs

Unnamed: 0,fullVisitorId,channels,channels_order,converted
0,5137390233868271533,"[Referral, Direct]","[Direct, Referral]",1
1,0225621684999318305,"[Referral, Display]","[Display, Referral]",1
2,0395050647250429598,"[Organic Search, Referral]","[Organic Search, Referral]",1
3,1386841622492700501,"[Direct, Referral]","[Direct, Referral]",1
4,4880074144243472885,"[Referral, Organic Search]","[Referral, Organic Search]",1
...,...,...,...,...
5581,1520999328918083380,[Organic Search],[Organic Search],1
5582,0197277329550470329,[Organic Search],[Organic Search],1
5583,0883217425810297944,[Organic Search],[Organic Search],1
5584,1208436519613837144,[Organic Search],[Organic Search],1


# Ordered Shapley

In [41]:
import numpy as np
orderedShapMaxPathLength = 4
ordered_shap_paths_type = []
for i in df_convs.channels_order:
    tmp_tps = i.tolist()
    if len(i) <= orderedShapMaxPathLength:
        fillCount = orderedShapMaxPathLength - len(i)
        for j in range(0, fillCount):
            tmp_tps = ["NA"] + tmp_tps
    else:  # trim the path to have the last N touchpoints
        tmp_tps = tmp_tps[-orderedShapMaxPathLength:]
    ordered_shap_paths_type.append(tmp_tps)

ordered_shap = OrderedShapleyAttributionModel()
att_paths_ord = ordered_shap.attribute(ordered_shap_paths_type)
nas = att_paths_ord.pop('NA')

Running Ordered Shapley Attribution Model...
Found 9 unique channels!
Found 4 maximum touchpoints!
Proceeding to attribution computation...

Computing phi for channel Display, touchpoint 1...


100%|██████████| 256/256 [00:00<00:00, 18753.35it/s]


Attribution score for channel Display, touchpoint 1: 0.50

Computing phi for channel Display, touchpoint 2...


100%|██████████| 256/256 [00:00<00:00, 17056.52it/s]


Attribution score for channel Display, touchpoint 2: 2.50

Computing phi for channel Display, touchpoint 3...


100%|██████████| 256/256 [00:00<00:00, 16468.43it/s]


Attribution score for channel Display, touchpoint 3: 14.58

Computing phi for channel Display, touchpoint 4...


100%|██████████| 256/256 [00:00<00:00, 17533.63it/s]


Attribution score for channel Display, touchpoint 4: 28.58

Computing phi for channel (Other), touchpoint 1...


100%|██████████| 256/256 [00:00<00:00, 15138.26it/s]


Attribution score for channel (Other), touchpoint 1: 0.00

Computing phi for channel (Other), touchpoint 2...


100%|██████████| 256/256 [00:00<00:00, 19020.10it/s]


Attribution score for channel (Other), touchpoint 2: 0.00

Computing phi for channel (Other), touchpoint 3...


100%|██████████| 256/256 [00:00<00:00, 21502.79it/s]


Attribution score for channel (Other), touchpoint 3: 0.00

Computing phi for channel (Other), touchpoint 4...


100%|██████████| 256/256 [00:00<00:00, 20595.81it/s]


Attribution score for channel (Other), touchpoint 4: 0.50

Computing phi for channel Social, touchpoint 1...


100%|██████████| 256/256 [00:00<00:00, 13521.83it/s]


Attribution score for channel Social, touchpoint 1: 0.00

Computing phi for channel Social, touchpoint 2...


100%|██████████| 256/256 [00:00<00:00, 13604.76it/s]


Attribution score for channel Social, touchpoint 2: 0.25

Computing phi for channel Social, touchpoint 3...


100%|██████████| 256/256 [00:00<00:00, 10629.32it/s]


Attribution score for channel Social, touchpoint 3: 3.67

Computing phi for channel Social, touchpoint 4...


100%|██████████| 256/256 [00:00<00:00, 14061.76it/s]


Attribution score for channel Social, touchpoint 4: 30.42

Computing phi for channel Affiliates, touchpoint 1...


100%|██████████| 256/256 [00:00<00:00, 19010.00it/s]


Attribution score for channel Affiliates, touchpoint 1: 0.00

Computing phi for channel Affiliates, touchpoint 2...


100%|██████████| 256/256 [00:00<00:00, 18164.22it/s]


Attribution score for channel Affiliates, touchpoint 2: 0.00

Computing phi for channel Affiliates, touchpoint 3...


100%|██████████| 256/256 [00:00<00:00, 17080.12it/s]


Attribution score for channel Affiliates, touchpoint 3: 0.92

Computing phi for channel Affiliates, touchpoint 4...


100%|██████████| 256/256 [00:00<00:00, 17759.54it/s]


Attribution score for channel Affiliates, touchpoint 4: 2.33

Computing phi for channel Direct, touchpoint 1...


100%|██████████| 256/256 [00:00<00:00, 13860.28it/s]


Attribution score for channel Direct, touchpoint 1: 1.00

Computing phi for channel Direct, touchpoint 2...


100%|██████████| 256/256 [00:00<00:00, 15542.10it/s]


Attribution score for channel Direct, touchpoint 2: 3.50

Computing phi for channel Direct, touchpoint 3...


100%|██████████| 256/256 [00:00<00:00, 17191.42it/s]


Attribution score for channel Direct, touchpoint 3: 84.25

Computing phi for channel Direct, touchpoint 4...


100%|██████████| 256/256 [00:00<00:00, 15756.26it/s]


Attribution score for channel Direct, touchpoint 4: 506.17

Computing phi for channel Organic Search, touchpoint 1...


100%|██████████| 256/256 [00:00<00:00, 14109.80it/s]


Attribution score for channel Organic Search, touchpoint 1: 3.08

Computing phi for channel Organic Search, touchpoint 2...


100%|██████████| 256/256 [00:00<00:00, 18037.59it/s]


Attribution score for channel Organic Search, touchpoint 2: 7.75

Computing phi for channel Organic Search, touchpoint 3...


100%|██████████| 256/256 [00:00<00:00, 16528.51it/s]


Attribution score for channel Organic Search, touchpoint 3: 77.17

Computing phi for channel Organic Search, touchpoint 4...


100%|██████████| 256/256 [00:00<00:00, 15696.60it/s]


Attribution score for channel Organic Search, touchpoint 4: 926.67

Computing phi for channel Referral, touchpoint 1...


100%|██████████| 256/256 [00:00<00:00, 15125.25it/s]


Attribution score for channel Referral, touchpoint 1: 1.25

Computing phi for channel Referral, touchpoint 2...


100%|██████████| 256/256 [00:00<00:00, 17335.19it/s]


Attribution score for channel Referral, touchpoint 2: 7.67

Computing phi for channel Referral, touchpoint 3...


100%|██████████| 256/256 [00:00<00:00, 15541.65it/s]


Attribution score for channel Referral, touchpoint 3: 22.75

Computing phi for channel Referral, touchpoint 4...


100%|██████████| 256/256 [00:00<00:00, 16004.02it/s]


Attribution score for channel Referral, touchpoint 4: 1024.50

Computing phi for channel Paid Search, touchpoint 1...


100%|██████████| 256/256 [00:00<00:00, 12679.39it/s]


Attribution score for channel Paid Search, touchpoint 1: 1.08

Computing phi for channel Paid Search, touchpoint 2...


100%|██████████| 256/256 [00:00<00:00, 11342.78it/s]


Attribution score for channel Paid Search, touchpoint 2: 1.50

Computing phi for channel Paid Search, touchpoint 3...


100%|██████████| 256/256 [00:00<00:00, 18466.31it/s]


Attribution score for channel Paid Search, touchpoint 3: 33.50

Computing phi for channel Paid Search, touchpoint 4...


100%|██████████| 256/256 [00:00<00:00, 13636.38it/s]


Attribution score for channel Paid Search, touchpoint 4: 138.50

Computing phi for channel NA, touchpoint 1...


100%|██████████| 256/256 [00:00<00:00, 13464.18it/s]


Attribution score for channel NA, touchpoint 1: 938.58

Computing phi for channel NA, touchpoint 2...


100%|██████████| 256/256 [00:00<00:00, 18904.90it/s]


Attribution score for channel NA, touchpoint 2: 912.17

Computing phi for channel NA, touchpoint 3...


100%|██████████| 256/256 [00:00<00:00, 12122.54it/s]


Attribution score for channel NA, touchpoint 3: 810.67

Computing phi for channel NA, touchpoint 4...


100%|██████████| 256/256 [00:00<00:00, 15938.22it/s]

Attribution score for channel NA, touchpoint 4: 0.00






In [43]:
import pandas as pd
df_ordered_shap = pd.DataFrame.from_dict(att_paths_ord, orient='index')[range(0,orderedShapMaxPathLength)].sort_values(by=0, ascending=False).T
df_ordered_shap = df_ordered_shap * len(df) / df_ordered_shap.sum().sum()
df_ordered_shap

Unnamed: 0,Organic Search,Referral,Paid Search,Direct,Display,(Other),Social,Affiliates
0,753.605499,305.515743,264.780311,244.412594,122.206297,0.0,0.0,0.0
1,1894.197606,1873.82989,366.618892,855.44408,611.031486,0.0,61.103149,0.0
2,18860.5052,5560.386522,8187.821912,20591.761077,3564.350335,0.0,896.179513,224.044878
3,226489.004132,250400.702949,33851.144323,123713.508192,6986.126656,122.206297,7434.216413,570.296054


In [44]:
fig = px.bar(df_ordered_shap, barmode='group')
fig.show()

In [45]:
fig = px.bar(df_ordered_shap, barmode='stack')
fig.update_xaxes(title='position')
fig.update_layout(title="Ordered Shap Positional Distribution")

fig.show()

fig = px.bar(df_ordered_shap, barmode='stack')
fig.update_xaxes(title='position')
fig.update_layout(title="Ordered Shap Positional Distribution Percent", barmode='relative', barnorm='percent')

fig.show()

# Simply Shapley

In [46]:
DICT_NUM_CHANNEL = {}
DICT_CHANNEL_NUM = {}

channels = set()
for r in df_convs.channels:
  for c in r:
    channels.add(c)

for i, c in enumerate(channels):
  DICT_NUM_CHANNEL[i] = c
  DICT_CHANNEL_NUM[c] = i

In [47]:
DICT_CHANNEL_NUM

{'Display': 0,
 '(Other)': 1,
 'Social': 2,
 'Affiliates': 3,
 'Direct': 4,
 'Organic Search': 5,
 'Referral': 6,
 'Paid Search': 7}

In [48]:
channel_paths = []
for p in df_convs.channels:
  channel_paths.append(
      [DICT_CHANNEL_NUM[c] for c in p]
  )

channel_paths[:5]

[[6, 4], [6, 0], [5, 6], [4, 6], [6, 5]]

In [49]:
shap = SimplifiedShapleyAttributionModel() # using the simple one that doesn't care about the order due to mem issues if I use the ordered one
att = shap.attribute(channel_paths)

Running Simplified Shapley Attribution Model...
Found 8 unique channels!
Computing journey statistics...
Computing attributions...

Computing phi for channel 0...


100%|██████████| 14/14 [00:00<00:00, 102122.18it/s]


Attribution score for channel 0: 79.75

Computing phi for channel 1...


100%|██████████| 1/1 [00:00<00:00, 2902.63it/s]


Attribution score for channel 1: 1.00

Computing phi for channel 2...


100%|██████████| 8/8 [00:00<00:00, 81049.35it/s]


Attribution score for channel 2: 63.17

Computing phi for channel 3...


100%|██████████| 5/5 [00:00<00:00, 50533.78it/s]


Attribution score for channel 3: 5.67

Computing phi for channel 4...


100%|██████████| 12/12 [00:00<00:00, 109179.28it/s]


Attribution score for channel 4: 1143.42

Computing phi for channel 5...


100%|██████████| 19/19 [00:00<00:00, 159383.55it/s]


Attribution score for channel 5: 1949.42

Computing phi for channel 6...


100%|██████████| 17/17 [00:00<00:00, 135815.56it/s]


Attribution score for channel 6: 2024.33

Computing phi for channel 7...


100%|██████████| 13/13 [00:00<00:00, 89240.51it/s]

Attribution score for channel 7: 319.25






In [50]:
print (f"Number of all conversions: {len(df_convs)}")
print (f"Total SUM of all attributions: { sum([v for k, v in att.items()]) }")
print ('[Number should be the same] Confirming Shapley value for each channel is how many conversions a channel is responsible for!')

Number of all conversions: 5586
Total SUM of all attributions: 5586.0
[Number should be the same] Confirming Shapley value for each channel is how many conversions a channel is responsible for!


In [51]:
df_shap = []
for k, v in att.items():
  df_shap.append(
      {'channel': DICT_NUM_CHANNEL[k], 'shapleyValue': v}
  )
df_shap = pd.DataFrame(df_shap)
df_shap.set_index('channel', inplace=True)
#df_shap['pctResponsibleConversions'] = 100 * (df_shap.shapleyValue / len(df_shap))
df_shap['pctResponsibleConversions'] = 100 * df_shap.shapleyValue / df_shap.shapleyValue.sum()
df_shap

Unnamed: 0_level_0,shapleyValue,pctResponsibleConversions
channel,Unnamed: 1_level_1,Unnamed: 2_level_1
Display,79.75,1.427676
(Other),1.0,0.017902
Social,63.166667,1.130803
Affiliates,5.666667,0.101444
Direct,1143.416667,20.469328
Organic Search,1949.416667,34.898258
Referral,2024.333333,36.239408
Paid Search,319.25,5.715181


In [52]:
df_shap.pctResponsibleConversions.sum()

100.00000000000001

In [53]:
df_shap[['pctResponsibleConversions']].sort_values(by='pctResponsibleConversions').plot(kind='barh')