In [1]:
##Load the package
import re
import datetime
from pychattr.channel_attribution import MarkovModel
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

In [4]:
##Load the data
df = pd.read_csv('/home/mam_jupyter/jupyter_dir/artefact/attribution/tp_analysis/data/tp_analysis_base.csv',sep = '\t',header = None) # 大数表
id_mapping = pd.read_csv('/home/mam_jupyter/jupyter_dir/artefact/attribution/tp_analysis/data/id_mapping.csv') # 出点码表

In [5]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,13000000133,002011005004_tp,2020-10-28 11:16:45,1,,,,
1,13000001122,009002001000_tp,2020-09-11 11:08:51,17,2020-09-10 16:42:19,009002003000_tp,17.0,-1.0
2,13000009999,009002001000_tp,2020-06-17 21:51:59,25,2020-06-12 22:11:50,009002001000_tp,25.0,-5.0
3,13001035033,002009003001_tp,2020-06-18 13:33:55,24,2020-06-18 13:33:32,002009001003_tp,24.0,0.0
4,13001055006,009002001000_tp,2020-11-02 09:38:09,10,2020-10-28 10:46:06,002011005004_tp,10.0,-5.0


In [6]:
df = df.rename(columns = {0:'mobile',1:'touchpoint_id',2:'action_time'})

In [7]:
df = df[['mobile','action_time','touchpoint_id']]
df.head()

Unnamed: 0,mobile,action_time,touchpoint_id
0,13000000133,2020-10-28 11:16:45,002011005004_tp
1,13000001122,2020-09-11 11:08:51,009002001000_tp
2,13000009999,2020-06-17 21:51:59,009002001000_tp
3,13001035033,2020-06-18 13:33:55,002009003001_tp
4,13001055006,2020-11-02 09:38:09,009002001000_tp


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55565405 entries, 0 to 55565404
Data columns (total 3 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   mobile         int64 
 1   action_time    object
 2   touchpoint_id  object
dtypes: int64(1), object(2)
memory usage: 1.2+ GB


In [9]:
def preprocess(df):
    df.dropna(inplace = True)
    df['mobile'] = df['mobile'].astype('string')
    ##filter the uncleaned mobile
    df = df[df.mobile.str.contains('^1\d{10}$')]
    ##phone number of dealer to remove
    mobile_to_remove = pd.read_csv('mobile_to_remove.csv').drop(columns =[ 'Unnamed: 0'])
    mobile_to_remove_list = mobile_to_remove.mobile.unique()
    df = df[~df.mobile.isin(mobile_to_remove_list)]
    ##删除无效节点：005000000000_tp，004000000000_tp
    df = df[~df.touchpoint_id.isin(['005000000000_tp','004000000000_tp'])]
    return df

In [10]:
df = preprocess(df)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49830374 entries, 0 to 55565402
Data columns (total 3 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   mobile         string
 1   action_time    object
 2   touchpoint_id  object
dtypes: object(2), string(1)
memory usage: 1.5+ GB


In [13]:
# link consumer journey
df = df.sort_values(['mobile', 'action_time'],ascending=[False, True])
df_paths = df.groupby('mobile')['touchpoint_id'].aggregate(lambda x: x.tolist()).reset_index()

In [14]:
df_paths.head()

Unnamed: 0,mobile,touchpoint_id
0,13000000003,"[009002002000_tp, 008002007000_tp, 00201100500..."
1,13000000004,"[002011002000_tp, 002011002000_tp]"
2,13000000005,"[009002004000_tp, 008002007000_tp, 00201100500..."
3,13000000008,"[002011005001_tp, 002011005004_tp, 00201100400..."
4,13000000010,"[002011005001_tp, 002011005003_tp, 00201100500..."


In [15]:
def if_has_endpoint(touchpoint_id,list1):
    if touchpoint_id in list1:
        return 1
    else:
        return 0

def exclude_endpoint(touchpoint_id,list1):
    if touchpoint_id in list1:
        list1 = list1[:list1.index(touchpoint_id)]
    return list1

In [16]:
##只保留战败用户
df_paths['if_ever_failed'] = df_paths['touchpoint_id'].apply(lambda x:if_has_endpoint('014004000000_tp',x))

In [17]:
df_paths.if_ever_failed.value_counts()

0    2779046
1     808766
Name: if_ever_failed, dtype: int64

In [18]:
df_paths = df_paths[df_paths.if_ever_failed == 1][['mobile','touchpoint_id']]
df_paths.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 808766 entries, 0 to 3587809
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   mobile         808766 non-null  object
 1   touchpoint_id  808766 non-null  object
dtypes: object(2)
memory usage: 18.5+ MB


In [19]:
##产生战败唤醒的标签
df_paths['conversion'] = df_paths['touchpoint_id'].apply(lambda x:if_has_endpoint('015000000000_tp',x))
df_paths.conversion.value_counts()

0    657431
1    151335
Name: conversion, dtype: int64

In [20]:
##删除空路径并且只保留目标节点之前的路径
df_paths.loc[df_paths.conversion == 1,'touchpoint_id'] = df_paths.loc[df_paths.conversion == 1,'touchpoint_id'].apply(lambda x:exclude_endpoint('015000000000_tp',x))
df_paths = df_paths[df_paths['touchpoint_id'].str.len() > 0]

In [21]:
df_paths.conversion.value_counts()

0    657431
1    151335
Name: conversion, dtype: int64

In [22]:
df_paths.head()

Unnamed: 0,mobile,touchpoint_id,conversion
0,13000000003,"[009002002000_tp, 008002007000_tp, 00201100500...",0
2,13000000005,"[009002004000_tp, 008002007000_tp, 00201100500...",0
137,13000001000,"[001001001000_tp, 001001002000_tp, 00100300100...",0
138,13000001110,"[001008001001_tp, 001001001000_tp, 00100100200...",0
140,13000001123,"[001001002000_tp, 001001001000_tp, 00100700200...",0


In [23]:
df_paths['path'] = df_paths['touchpoint_id'].apply(lambda x: ",".join(x))

In [24]:
df_paths = df_paths[['mobile','path','conversion']]

In [25]:
df_paths.head()

Unnamed: 0,mobile,path,conversion
0,13000000003,"009002002000_tp,008002007000_tp,002011005003_t...",0
2,13000000005,"009002004000_tp,008002007000_tp,002011005003_t...",0
137,13000001000,"001001001000_tp,001001002000_tp,001003001002_t...",0
138,13000001110,"001008001001_tp,001001001000_tp,001001002000_t...",0
140,13000001123,"001001002000_tp,001001001000_tp,001007002004_t...",0


## 战败激活归因

In [26]:
data = {
    "path": df_paths.path.values.tolist(),
    "conversions": df_paths.conversion.values.tolist()
}

data = pd.DataFrame(data)

df = pd.DataFrame(data)

path_feature="path"
conversion_feature="conversions"
null_feature=None

separator=","
k_order=1
n_simulations=10000
max_steps=None
return_transition_probs=True
random_state=26

# instantiate the model
mm = MarkovModel(path_feature=path_feature,
                 conversion_feature=conversion_feature,
                 null_feature=null_feature,
                 separator=separator,
                 k_order=k_order,
                 n_simulations=n_simulations,
                 max_steps=max_steps,
                 return_transition_probs=return_transition_probs,
                 random_state=random_state)

# fit the model
mm.fit(df)


  num_transitions[k] /= vsm[w]


<pychattr.channel_attribution.markov.MarkovModel at 0x7f165df126d0>

In [27]:
remove_effects = mm.removal_effects_.merge(id_mapping[['touchpoint_id','touchpoint_name']],left_on = 'channel_name',right_on = 'touchpoint_id').sort_values(by = 'removal_effect',ascending = False)[['touchpoint_id','touchpoint_name','removal_effect']]
## get share in results
remove_effects['share_in_result'] = remove_effects['removal_effect'] / remove_effects.removal_effect.sum()

In [28]:
remove_effects.head(50)

Unnamed: 0,touchpoint_id,touchpoint_name,removal_effect,share_in_result
13,009002001000_tp,销售代表跟进邀约,0.6102,0.075272
4,014003000000_tp,同意战败申请,0.5636,0.069524
0,009002002000_tp,销售代表跟进电话,0.5491,0.067735
20,009002007000_tp,销售代表跟进其他,0.5408,0.066711
3,014004000000_tp,完全战败,0.5118,0.063134
14,009002003000_tp,销售代表跟进展厅接待,0.461,0.056867
11,001001002000_tp,线索首触-地域,0.4598,0.056719
10,001001001000_tp,线索首触-品牌,0.4519,0.055745
16,001008001001_tp,汽车之家采买,0.3065,0.037809
28,001008003001_tp,易车网采买,0.2436,0.03005


In [29]:
remove_effects.to_csv('战败激活贡献度_final_report.csv',index = False)