In [2]:
import re
import datetime
from pychattr.channel_attribution import MarkovModel
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

In [3]:
df = pd.read_csv('/home/mam_jupyter/jupyter_dir/artefact/attribution/tp_analysis/data/tp_analysis_base.csv',sep = '\t',header = None)
id_mapping = pd.read_csv('/home/mam_jupyter/jupyter_dir/artefact/attribution/tp_analysis/data/id_mapping.csv')

df = df.rename(columns = {0:'mobile',1:'touchpoint_id',2:'action_time'})
df = df[['mobile','action_time','touchpoint_id']]
df.head()

In [5]:
def preprocess(df):
    df.dropna(inplace = True)
    df['mobile'] = df['mobile'].astype('string')
    ##filter the uncleaned mobile
    df = df[df.mobile.str.contains('^1\d{10}$')]
    ##phone number of dealer to remove
    mobile_to_remove = pd.read_csv('mobile_to_remove.csv').drop(columns =[ 'Unnamed: 0'])
    mobile_to_remove_list = mobile_to_remove.mobile.unique()
    df = df[~df.mobile.isin(mobile_to_remove_list)]
    ##删除无效节点：005000000000_tp，004000000000_tp
    df = df[~df.touchpoint_id.isin(['005000000000_tp','004000000000_tp'])]
    return df

In [6]:
df = preprocess(df)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49830374 entries, 0 to 55565402
Data columns (total 3 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   mobile         string
 1   action_time    object
 2   touchpoint_id  object
dtypes: object(2), string(1)
memory usage: 1.5+ GB


In [14]:
##focus on people has app behaviors
mobile_to_pick = list(df[df.touchpoint_id.str.startswith('00800200')].mobile.unique())

In [17]:
df = df[df.mobile.isin(mobile_to_pick)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13921364 entries, 27 to 55565377
Data columns (total 3 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   mobile         string
 1   action_time    object
 2   touchpoint_id  object
dtypes: object(2), string(1)
memory usage: 424.8+ MB


In [18]:
# link consumer journey
df = df.sort_values(['mobile', 'action_time'],ascending=[False, True])
df_paths = df.groupby('mobile')['touchpoint_id'].aggregate(lambda x: x.tolist()).reset_index()

In [19]:
df_paths.head()

Unnamed: 0,mobile,touchpoint_id
0,13000000003,"[009002002000_tp, 008002007000_tp, 00201100500..."
1,13000000005,"[009002004000_tp, 008002007000_tp, 00201100500..."
2,13000986326,"[001008003001_tp, 009002002000_tp, 00900200700..."
3,13001103210,"[006000000000_tp, 001001002000_tp, 00100100100..."
4,13001222821,"[008002006003_tp, 002011005003_tp]"


In [20]:
def if_has_endpoint(touchpoint_id,list1):
    if touchpoint_id in list1:
        return 1
    else:
        return 0

def exclude_endpoint(touchpoint_id,list1):
    list1 = list1[:list1.index(touchpoint_id)]
    return list1

In [21]:
df_paths['if_order'] = df_paths['touchpoint_id'].apply(lambda x:if_has_endpoint('011000000000_tp',x))

In [22]:
df_paths.if_order.value_counts()

0    159504
1     19616
Name: if_order, dtype: int64

In [23]:
df_paths.loc[df_paths.if_order == 1,'touchpoint_id'] = df_paths.loc[df_paths.if_order == 1,'touchpoint_id'].apply(lambda x:exclude_endpoint('011000000000_tp',x))
df_paths = df_paths[df_paths['touchpoint_id'].str.len() > 0]

In [24]:
df_paths.if_order.value_counts()

0    159504
1     17951
Name: if_order, dtype: int64

In [25]:
df_paths.head()

Unnamed: 0,mobile,touchpoint_id,if_order
0,13000000003,"[009002002000_tp, 008002007000_tp, 00201100500...",0
1,13000000005,"[009002004000_tp, 008002007000_tp, 00201100500...",0
2,13000986326,"[001008003001_tp, 009002002000_tp, 00900200700...",0
3,13001103210,"[006000000000_tp, 001001002000_tp, 00100100100...",1
4,13001222821,"[008002006003_tp, 002011005003_tp]",0


In [26]:
df_paths['path'] = df_paths['touchpoint_id'].apply(lambda x: ",".join(x))

In [27]:
df_paths = df_paths[['mobile','path','if_order']]

In [28]:
df_paths.head()

Unnamed: 0,mobile,path,if_order
0,13000000003,"009002002000_tp,008002007000_tp,002011005003_t...",0
1,13000000005,"009002004000_tp,008002007000_tp,002011005003_t...",0
2,13000986326,"001008003001_tp,009002002000_tp,009002007000_t...",0
3,13001103210,"006000000000_tp,001001002000_tp,001001001000_t...",1
4,13001222821,"008002006003_tp,002011005003_tp",0


## 根据app行为时间切割人群

In [52]:
mobile_list_202006 = list(df[(df.touchpoint_id.str.startswith('00800200'))&(df.action_time >= '2020-06-01')&(df.action_time < '2020-07-01')].mobile.unique())

In [53]:
len(mobile_list_202006)

2381

In [54]:
target_paths = df_paths[df_paths.mobile.isin(mobile_list_202006)]

In [55]:
def get_attribution_score(df_paths):
    data = {
    "path": df_paths.path.values.tolist(),
    "conversions": df_paths.if_order.values.tolist()
}

    data = pd.DataFrame(data)

    df = pd.DataFrame(data)

    path_feature="path"
    conversion_feature="conversions"
    null_feature=None

    separator=","
    k_order=1
    n_simulations=10000
    max_steps=None
    return_transition_probs=True
    random_state=26

    # instantiate the model
    mm = MarkovModel(path_feature=path_feature,
                     conversion_feature=conversion_feature,
                     null_feature=null_feature,
                     separator=separator,
                     k_order=k_order,
                     n_simulations=n_simulations,
                     max_steps=max_steps,
                     return_transition_probs=return_transition_probs,
                     random_state=random_state)

    # fit the model
    mm.fit(df)
    
    remove_effects = mm.removal_effects_.merge(id_mapping[['touchpoint_id','touchpoint_name']],left_on = 'channel_name',right_on = 'touchpoint_id').sort_values(by = 'removal_effect',ascending = False)[['touchpoint_id','touchpoint_name','removal_effect']]
    ## get share in results
    remove_effects['share_in_result'] = remove_effects['removal_effect'] / remove_effects.removal_effect.sum()
    return remove_effects


In [56]:
remove_effects_202006 = get_attribution_score(target_paths)

  num_transitions[k] /= vsm[w]


In [57]:
remove_effects_202006.head()

Unnamed: 0,touchpoint_id,touchpoint_name,removal_effect,share_in_result
1,009002001000_tp,销售代表跟进邀约,0.8296,0.058368
7,009002002000_tp,销售代表跟进电话,0.7731,0.054393
3,009002007000_tp,销售代表跟进其他,0.7212,0.050742
4,009002003000_tp,销售代表跟进展厅接待,0.6955,0.048933
6,014003000000_tp,同意战败申请,0.6069,0.0427


In [58]:
mobile_list_202007 = list(df[(df.touchpoint_id.str.startswith('00800200'))&(df.action_time >= '2020-07-01')&(df.action_time < '2020-08-01')].mobile.unique())
target_paths = df_paths[df_paths.mobile.isin(mobile_list_202007)]
remove_effects_202007 = get_attribution_score(target_paths)
remove_effects_202007.head()

  num_transitions[k] /= vsm[w]


Unnamed: 0,touchpoint_id,touchpoint_name,removal_effect,share_in_result
29,009002001000_tp,销售代表跟进邀约,0.7276,0.039711
34,009002002000_tp,销售代表跟进电话,0.6214,0.033914
8,009002007000_tp,销售代表跟进其他,0.6078,0.033172
5,009002003000_tp,销售代表跟进展厅接待,0.5667,0.030929
23,002009003002_tp,每日福利首页浏览,0.5236,0.028577


In [59]:
mobile_list_202008 = list(df[(df.touchpoint_id.str.startswith('00800200'))&(df.action_time >= '2020-08-01')&(df.action_time < '2020-09-01')].mobile.unique())
target_paths = df_paths[df_paths.mobile.isin(mobile_list_202008)]
remove_effects_202008 = get_attribution_score(target_paths)
remove_effects_202008.head()

  num_transitions[k] /= vsm[w]


Unnamed: 0,touchpoint_id,touchpoint_name,removal_effect,share_in_result
1,009002001000_tp,销售代表跟进邀约,0.7053,0.046507
36,009002003000_tp,销售代表跟进展厅接待,0.6093,0.040177
37,009002002000_tp,销售代表跟进电话,0.5681,0.03746
42,009002007000_tp,销售代表跟进其他,0.564,0.03719
2,009002004000_tp,销售代表跟进进入展厅,0.4913,0.032396


In [60]:
mobile_list_202009 = list(df[(df.touchpoint_id.str.startswith('00800200'))&(df.action_time >= '2020-09-01')&(df.action_time < '2020-10-01')].mobile.unique())
target_paths = df_paths[df_paths.mobile.isin(mobile_list_202009)]
remove_effects_202009 = get_attribution_score(target_paths)
remove_effects_202009.head()

  num_transitions[k] /= vsm[w]


Unnamed: 0,touchpoint_id,touchpoint_name,removal_effect,share_in_result
1,009002001000_tp,销售代表跟进邀约,0.6041,0.045603
7,009002002000_tp,销售代表跟进电话,0.579,0.043709
9,009002007000_tp,销售代表跟进其他,0.534,0.040312
6,009002003000_tp,销售代表跟进展厅接待,0.5319,0.040153
10,009002004000_tp,销售代表跟进进入展厅,0.4394,0.03317


In [61]:
mobile_list_202010 = list(df[(df.touchpoint_id.str.startswith('00800200'))&(df.action_time >= '2020-10-01')&(df.action_time < '2020-11-01')].mobile.unique())
target_paths = df_paths[df_paths.mobile.isin(mobile_list_202010)]
remove_effects_202010 = get_attribution_score(target_paths)
remove_effects_202010.head()

  num_transitions[k] /= vsm[w]


Unnamed: 0,touchpoint_id,touchpoint_name,removal_effect,share_in_result
46,009002001000_tp,销售代表跟进邀约,0.656,0.036054
5,002009003002_tp,每日福利首页浏览,0.552,0.030338
4,002009001003_tp,签到获取积分,0.5347,0.029387
45,009002003000_tp,销售代表跟进展厅接待,0.5323,0.029255
1,002009001001_tp,获取积分,0.527,0.028964


In [62]:
mobile_list_202011 = list(df[(df.touchpoint_id.str.startswith('00800200'))&(df.action_time >= '2020-11-01')&(df.action_time < '2020-12-01')].mobile.unique())
target_paths = df_paths[df_paths.mobile.isin(mobile_list_202011)]
remove_effects_202011 = get_attribution_score(target_paths)
remove_effects_202011.head()

  num_transitions[k] /= vsm[w]


Unnamed: 0,touchpoint_id,touchpoint_name,removal_effect,share_in_result
4,009002001000_tp,销售代表跟进邀约,0.7416,0.035952
12,002009004001_tp,服务首页访问,0.6671,0.03234
11,002009003002_tp,每日福利首页浏览,0.6637,0.032176
13,003001004000_tp,展厅首页点击,0.6592,0.031957
18,002008003001_tp,消息中心首页访问,0.6468,0.031356


In [63]:
mobile_list_202012 = list(df[(df.touchpoint_id.str.startswith('00800200'))&(df.action_time >= '2020-12-01')&(df.action_time < '2021-01-01')].mobile.unique())
target_paths = df_paths[df_paths.mobile.isin(mobile_list_202012)]
remove_effects_202012 = get_attribution_score(target_paths)
remove_effects_202012.head()

  num_transitions[k] /= vsm[w]


Unnamed: 0,touchpoint_id,touchpoint_name,removal_effect,share_in_result
16,009002001000_tp,销售代表跟进邀约,0.7055,0.045569
3,002009003002_tp,每日福利首页浏览,0.5244,0.033872
20,002009004001_tp,服务首页访问,0.5213,0.033672
59,009002003000_tp,销售代表跟进展厅接待,0.5199,0.033581
15,001001002000_tp,线索首触-地域,0.5136,0.033174


In [64]:
mobile_list_202101 = list(df[(df.touchpoint_id.str.startswith('00800200'))&(df.action_time >= '2021-01-01')&(df.action_time < '2021-02-01')].mobile.unique())
target_paths = df_paths[df_paths.mobile.isin(mobile_list_202101)]
remove_effects_202101 = get_attribution_score(target_paths)
remove_effects_202101.head()

  num_transitions[k] /= vsm[w]


Unnamed: 0,touchpoint_id,touchpoint_name,removal_effect,share_in_result
15,009002001000_tp,销售代表跟进邀约,0.6883,0.046968
11,009002003000_tp,销售代表跟进展厅接待,0.5572,0.038022
9,001001002000_tp,线索首触-地域,0.5189,0.035408
14,002009004001_tp,服务首页访问,0.484,0.033027
19,002009003002_tp,每日福利首页浏览,0.4761,0.032488


In [65]:
mobile_list_202102 = list(df[(df.touchpoint_id.str.startswith('00800200'))&(df.action_time >= '2021-02-01')&(df.action_time < '2021-03-01')].mobile.unique())
target_paths = df_paths[df_paths.mobile.isin(mobile_list_202102)]
remove_effects_202102 = get_attribution_score(target_paths)
remove_effects_202102.head()

  num_transitions[k] /= vsm[w]


Unnamed: 0,touchpoint_id,touchpoint_name,removal_effect,share_in_result
15,009002001000_tp,销售代表跟进邀约,0.695,0.048036
16,009002003000_tp,销售代表跟进展厅接待,0.5201,0.035948
13,001001002000_tp,线索首触-地域,0.5027,0.034745
10,002009004001_tp,服务首页访问,0.4893,0.033819
8,002009003002_tp,每日福利首页浏览,0.4706,0.032527


In [66]:
mobile_list_202103 = list(df[(df.touchpoint_id.str.startswith('00800200'))&(df.action_time >= '2021-03-01')&(df.action_time < '2021-04-01')].mobile.unique())
target_paths = df_paths[df_paths.mobile.isin(mobile_list_202103)]
remove_effects_202103 = get_attribution_score(target_paths)
remove_effects_202103.head()

  num_transitions[k] /= vsm[w]


Unnamed: 0,touchpoint_id,touchpoint_name,removal_effect,share_in_result
5,009002001000_tp,销售代表跟进邀约,0.6897,0.047742
7,009002003000_tp,销售代表跟进展厅接待,0.5315,0.036791
19,002009003002_tp,每日福利首页浏览,0.4952,0.034278
26,002009004001_tp,服务首页访问,0.4845,0.033538
3,001001002000_tp,线索首触-地域,0.484,0.033503


In [67]:
mobile_list_202104 = list(df[(df.touchpoint_id.str.startswith('00800200'))&(df.action_time >= '2021-04-01')&(df.action_time < '2021-05-01')].mobile.unique())
target_paths = df_paths[df_paths.mobile.isin(mobile_list_202104)]
remove_effects_202104 = get_attribution_score(target_paths)
remove_effects_202104.head()

  num_transitions[k] /= vsm[w]


Unnamed: 0,touchpoint_id,touchpoint_name,removal_effect,share_in_result
38,009002001000_tp,销售代表跟进邀约,0.6678,0.049383
2,001001002000_tp,线索首触-地域,0.521,0.038527
5,009002003000_tp,销售代表跟进展厅接待,0.5091,0.037647
1,001001001000_tp,线索首触-品牌,0.4778,0.035333
7,002009004001_tp,服务首页访问,0.475,0.035126


In [68]:
mobile_list_202105 = list(df[(df.touchpoint_id.str.startswith('00800200'))&(df.action_time >= '2021-05-01')&(df.action_time < '2021-06-01')].mobile.unique())
target_paths = df_paths[df_paths.mobile.isin(mobile_list_202105)]
remove_effects_202105 = get_attribution_score(target_paths)
remove_effects_202105.head()

  num_transitions[k] /= vsm[w]


Unnamed: 0,touchpoint_id,touchpoint_name,removal_effect,share_in_result
47,009002001000_tp,销售代表跟进邀约,0.6903,0.050201
43,009002003000_tp,销售代表跟进展厅接待,0.5285,0.038435
39,001001002000_tp,线索首触-地域,0.4998,0.036348
6,002009004001_tp,服务首页访问,0.4592,0.033395
40,001001001000_tp,线索首触-品牌,0.4465,0.032471


In [85]:
remove_effects_202105.to_csv('remove_effects_202105.csv',index = False)