In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

pd.options.mode.chained_assignment = None
from collections import defaultdict
import os

In [2]:
# Hand pandas the path itself so that it opens and closes the file on its own;
# no file handle is left dangling in the kernel.
df = pd.read_csv(os.path.expanduser("~/documents/data/attribution.csv"))
df['time'] = pd.to_datetime(df['time'])
# Keep each cookie's rows together and order its touchpoints chronologically.
df = df.sort_values(['cookie', 'time'],
                    ascending=[False, True])
# 1-based position of each touchpoint within its cookie's journey.
df['order'] = df.groupby('cookie').cumcount() + 1
df.head()

Unnamed: 0,cookie,time,interaction,conversion,conversion_value,channel,order
586736,ooooohAFofEnonEikhAi3fF9o,2018-07-14 17:17:12+00:00,impression,0,0.0,Paid Search,1
586734,ooooiBh70D3k3BfAhDFfii9h7,2018-07-03 12:57:25+00:00,impression,0,0.0,Paid Search,1
586735,ooooiBh70D3k3BfAhDFfii9h7,2018-07-19 08:17:59+00:00,impression,0,0.0,Online Video,2
586731,ooooEiB0CCoEf9fiiC90Dfhfk,2018-07-06 23:30:38+00:00,impression,0,0.0,Online Display,1
586732,ooooEiB0CCoEf9fiiC90Dfhfk,2018-07-12 23:50:45+00:00,impression,0,0.0,Online Display,2


In [3]:
# For ease of future analysis, I will reshape the DataFrame from long-form (multiple rows per cookie) to one row per cookie that holds the cookie's whole path

In [4]:
# Collapse the touch log to one row per cookie holding the distinct channels the
# cookie touched, in order of first appearance (rows are already sorted by time
# within each cookie).
df_paths = (
    df.groupby('cookie')['channel']
    .aggregate(lambda touches: touches.unique().tolist())
    .reset_index()
)

# The conversion flag on a cookie's final interaction decides how its path ends.
df_last_interaction = df.drop_duplicates('cookie', keep='last')[['cookie', 'conversion']]
df_paths = pd.merge(df_paths, df_last_interaction, how='left', on='cookie')

# np.where picks the terminal state per cookie: 'Null' when the last interaction
# did not convert, 'Conversion' otherwise. The path is assembled directly as a
# list (no join/split round trip), so channel names are carried over verbatim
# even if they contain ', '.
df_paths['path'] = [
    ['Start', *channels, end_state]
    for channels, end_state in zip(
        df_paths['channel'],
        np.where(df_paths['conversion'] == 0, 'Null', 'Conversion').tolist(),
    )
]

df_paths = df_paths[['cookie', 'path']]
df_paths.head()

Unnamed: 0,cookie,path
0,00000FkCnDfDDf0iC97iC703B,"[Start, Instagram, Online Display, Null]"
1,0000nACkD9nFkBBDECD3ki00E,"[Start, Paid Search, Null]"
2,0003EfE37E93D0BC03iBhBBhF,"[Start, Paid Search, Null]"
3,00073CFE3FoFCn70fBhB3kfon,"[Start, Instagram, Null]"
4,00079hhBkDF3k3kDkiFi9EFAD,"[Start, Paid Search, Null]"


In [5]:
# The output gives a single Cookie (unique customer ID) and their path, from first to last touchpoint

In [6]:
# Using 'np.where' lets us specify that a non-converted customer will have their journey end in 'Null'

In [7]:
# One path per cookie (a list of states running from 'Start' to 'Conversion' or 'Null').
# Despite the name, this is the 'path' column as a pandas Series, not a Python list.
paths_list = df_paths['path']

def transition(paths_list):
    """Count observed state-to-state transitions across all user paths.

    Parameters
    ----------
    paths_list : iterable of list of str
        One path per user, e.g. ``['Start', 'Facebook', 'Conversion']``.

    Returns
    -------
    dict
        Maps ``'origin>destination'`` to the number of times that step occurs.
        Every ordered pair of states seen in ``paths_list`` is present, with 0
        for pairs that were never observed. Steps leaving 'Conversion' or
        'Null' are not counted.
    """
    terminal_states = ('Conversion', 'Null')
    unique_channel_list = set(x for element in paths_list for x in element)
    transition_counts = {x + '>' + y: 0 for x in unique_channel_list for y in unique_channel_list}

    for user_path in paths_list:
        # Walk the consecutive pairs of each path exactly once. States are
        # compared by equality, so a state whose name is a substring of another
        # (e.g. 'Search' vs 'Paid Search') is never counted twice, and a path
        # that ends on a non-terminal state simply has no further step to count.
        for origin, destination in zip(user_path, user_path[1:]):
            if origin not in terminal_states:
                transition_counts[origin + '>' + destination] += 1

    return transition_counts

# Despite the name, `transition_list` is a dict keyed by 'origin>destination'
# whose values are transition counts.
transition_list = transition(paths_list)

In [8]:
# Next, I form a Probabilistic Model for Transitions

In [9]:
def p_transition(transition_dict):
    """Turn transition counts into transition probabilities.

    Parameters
    ----------
    transition_dict : dict
        Maps ``'origin>destination'`` to an observed count.

    Returns
    -------
    collections.defaultdict
        Maps ``'origin>destination'`` to ``count / total count leaving origin``.
        Only transitions with a positive count are included, and transitions
        leaving 'Conversion' or 'Null' are skipped. The container type is kept
        as ``defaultdict(dict)`` for backward compatibility; the stored values
        are floats.
    """
    terminal_states = ('Conversion', 'Null')

    # Total outgoing count per origin, gathered in a single pass. The origin is
    # taken from the key itself, so the function depends only on its argument
    # and origins are matched exactly rather than by substring.
    outgoing_totals = defaultdict(int)
    for key, count in transition_dict.items():
        if count > 0:
            outgoing_totals[key.split('>')[0]] += count

    probabilities = defaultdict(dict)
    for key, count in transition_dict.items():
        origin = key.split('>')[0]
        if count > 0 and origin not in terminal_states:
            probabilities[key] = float(count) / float(outgoing_totals[origin])

    return probabilities

# NOTE: this rebinds the name `p_transition` from the function to its result
# (the probability mapping). After this line the function can no longer be
# called by name until the `def` above is executed again.
p_transition = p_transition(transition_list)

In [10]:
# Matrix Transformation of Probability Model

def matrix(paths_list, transition_probabilities):
    """Build the square transition-probability matrix as a DataFrame.

    Parameters
    ----------
    paths_list : iterable of list of str
        User paths; the states found in them become the row and column labels.
    transition_probabilities : mapping
        Maps ``'origin>destination'`` to a probability.

    Returns
    -------
    pandas.DataFrame
        Rows are origin states, columns are destination states. 'Conversion'
        and 'Null' carry 1.0 on their diagonal; every cell not listed in
        ``transition_probabilities`` is 0.0.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the row and
    # column layout is the same on every run.
    states = list(dict.fromkeys(x for element in paths_list for x in element))

    # Allocate the whole frame at once instead of growing it cell by cell.
    transition_matrix = pd.DataFrame(0.0, index=states, columns=states)

    # Single-step .at writes are guaranteed to land in the frame itself.
    for absorbing_state in ('Conversion', 'Null'):
        if absorbing_state in transition_matrix.index:
            transition_matrix.at[absorbing_state, absorbing_state] = 1.0

    for key, value in transition_probabilities.items():
        origin, destination = key.split('>')
        transition_matrix.at[origin, destination] = value

    return transition_matrix

# NOTE: this rebinds the name `matrix` from the builder function to the
# resulting DataFrame. After this line the function can no longer be called by
# name until the `def` above is executed again.
matrix = matrix(paths_list, p_transition)


In [11]:
# Total conversions: count the 'Conversion' markers across every path.
conversion_total = sum(journey.count('Conversion') for journey in df_paths['path'])
# Baseline conversion rate: conversions divided by the number of paths.
conversion_rate = conversion_total / len(paths_list)

def removal_effects(df, conversion_rate):
    """Compute the removal effect of every channel in a transition matrix.

    For each channel (every column label except 'Start', 'Null' and
    'Conversion') the channel's row and column are dropped, the probability of
    reaching 'Conversion' from 'Start' is recomputed, and the removal effect is
    ``1 - removal_cvr / conversion_rate``.

    Parameters
    ----------
    df : pandas.DataFrame
        Square transition matrix; rows are origin states, columns are
        destination states. Must contain 'Start' and 'Conversion'.
    conversion_rate : float
        Baseline conversion rate that the reduced chain is compared against.

    Returns
    -------
    dict
        Maps channel name to its removal effect.

    Raises
    ------
    KeyError
        If 'Start' or 'Conversion' is missing from the matrix.
    numpy.linalg.LinAlgError
        If ``I - Q`` is singular for a reduced chain.
    """
    absorbing_states = ['Null', 'Conversion']
    channels = [channel for channel in df.columns
                if channel not in ['Start'] + absorbing_states]

    removal_effects_dict = {}
    for channel in channels:
        removals = df.drop(channel, axis=1).drop(channel, axis=0)

        # Probability that used to flow into the removed channel is simply left
        # out. For the conversion probability this is equivalent to sending it
        # to 'Null', because only the transient block Q and the 'Conversion'
        # column enter the calculation below.
        transient_states = [state for state in removals.index
                            if state not in absorbing_states]

        # Select by label on both axes so that Q and the conversion vector stay
        # aligned even if the matrix's rows and columns are ordered differently.
        transient_to_transient = np.asarray(
            removals.loc[transient_states, transient_states], dtype=float)
        transient_to_conversion = np.asarray(
            removals.loc[transient_states, 'Conversion'], dtype=float)

        # Fundamental matrix N = (I - Q)^-1; N @ r is the probability of being
        # absorbed in 'Conversion' from each transient starting state.
        fundamental = np.linalg.inv(
            np.identity(len(transient_states)) - transient_to_transient)
        conversion_probability = pd.Series(
            np.dot(fundamental, transient_to_conversion), index=transient_states)

        removal_cvr = conversion_probability.loc['Start']
        removal_effects_dict[channel] = 1 - removal_cvr / conversion_rate

    return removal_effects_dict

# Removal effect per channel, keyed by channel name. `matrix` here is the
# transition-matrix DataFrame bound in the previous cell, not the builder function.
removal_effects_dict = removal_effects(matrix, conversion_rate)

In [12]:
def markov_chain_allocations(removal_effects, conversion_total):
    """Distribute the total conversions across channels by removal effect.

    Each channel receives ``conversion_total`` multiplied by its removal effect
    divided by the sum of all removal effects.

    Parameters
    ----------
    removal_effects : dict
        Maps channel name to removal effect.
    conversion_total : number
        Number of conversions to distribute.

    Returns
    -------
    dict
        Maps channel name to the conversions credited to it.
    """
    total_effect = np.sum(list(removal_effects.values()))

    allocations = {}
    for channel, effect in removal_effects.items():
        share = effect / total_effect
        allocations[channel] = share * conversion_total
    return allocations

# Conversions credited to each channel, as a dict keyed by channel name.
attribution = markov_chain_allocations(
    removal_effects=removal_effects_dict,
    conversion_total=conversion_total,
)

print(attribution)

{'Online Display': 2153.2469267590836, 'Instagram': 3031.521548555893, 'Facebook': 4948.892177847523, 'Online Video': 2886.4480895461456, 'Paid Search': 4618.891257291356}


In [14]:
# Rebinds `attribution` from a dict to a pandas Series sorted by credited
# conversions (smallest first); the percentage cell below relies on the Series form.
attribution = pd.Series(attribution).sort_values()
print(attribution)

Online Display    2153.246927
Online Video      2886.448090
Instagram         3031.521549
Paid Search       4618.891257
Facebook          4948.892178
dtype: float64


In [15]:
# As we can see, Facebook is assigned the highest number of credits for this dataset. If this data is representative
# of the entire user base then Facebook would be deemed the most valuable touchpoint for a given journey.

In [18]:
# Paid Search is close to being as important a touchpoint in a given user journey; the drop-off past these two is
# significant, with Instagram, Online Video and Online Display being much less prominent.

In [22]:
# Each channel's share of the total credited conversions, shown as a whole-number percentage.
attribution.div(sum(attribution)).map('{:.0%}'.format)

Online Display    12%
Online Video      16%
Instagram         17%
Paid Search       26%
Facebook          28%
dtype: object

In [20]:
# If we take a converted customer journey at random, we expect more than half of the assigned credit to be
# attributed to Facebook and Paid Search alone. The expected credit for Facebook is equal to the combined credit
# that we expect for Online Display and Online Video.