In [1]:
##import packages
import re
import datetime
from pychattr.channel_attribution import MarkovModel
import pandas as pd
pd.set_option('display.max_colwidth', -1)
import numpy as np
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

  """


In [4]:
##load the data
df = pd.read_csv('/home/mam_jupyter/jupyter_dir/artefact/attribution/tp_analysis/data/tp_analysis_base.csv',sep = '\t',header = None)
id_mapping = pd.read_csv('/home/mam_jupyter/jupyter_dir/artefact/attribution/tp_analysis/data/id_mapping.csv')

In [5]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,13000000133,002011005004_tp,2020-10-28 11:16:45,1,,,,
1,13000001122,009002001000_tp,2020-09-11 11:08:51,17,2020-09-10 16:42:19,009002003000_tp,17.0,-1.0
2,13000009999,009002001000_tp,2020-06-17 21:51:59,25,2020-06-12 22:11:50,009002001000_tp,25.0,-5.0
3,13001035033,002009003001_tp,2020-06-18 13:33:55,24,2020-06-18 13:33:32,002009001003_tp,24.0,0.0
4,13001055006,009002001000_tp,2020-11-02 09:38:09,10,2020-10-28 10:46:06,002011005004_tp,10.0,-5.0


In [6]:
df = df.rename(columns = {0:'mobile',1:'touchpoint_id',2:'action_time'})

In [7]:
df = df[['mobile','action_time','touchpoint_id']]
df.head()

Unnamed: 0,mobile,action_time,touchpoint_id
0,13000000133,2020-10-28 11:16:45,002011005004_tp
1,13000001122,2020-09-11 11:08:51,009002001000_tp
2,13000009999,2020-06-17 21:51:59,009002001000_tp
3,13001035033,2020-06-18 13:33:55,002009003001_tp
4,13001055006,2020-11-02 09:38:09,009002001000_tp


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55565405 entries, 0 to 55565404
Data columns (total 3 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   mobile         int64 
 1   action_time    object
 2   touchpoint_id  object
dtypes: int64(1), object(2)
memory usage: 1.2+ GB


In [9]:
##preprocess data
def preprocess(df):
    df.dropna(inplace = True)
    df['mobile'] = df['mobile'].astype('string')
    ##filter the uncleaned mobile
    df = df[df.mobile.str.contains('^1\d{10}$')]
    ##phone number of dealer to remove
    mobile_to_remove = pd.read_csv('mobile_to_remove.csv').drop(columns =[ 'Unnamed: 0'])
    mobile_to_remove_list = mobile_to_remove.mobile.unique()
    df = df[~df.mobile.isin(mobile_to_remove_list)]
    ##删除无效节点：005000000000_tp，004000000000_tp
    df = df[~df.touchpoint_id.isin(['005000000000_tp','004000000000_tp'])]
    return df

In [10]:
df = preprocess(df)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49830374 entries, 0 to 55565402
Data columns (total 3 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   mobile         string
 1   action_time    object
 2   touchpoint_id  object
dtypes: object(2), string(1)
memory usage: 1.5+ GB


In [29]:
# link consumer journey
df = df.sort_values(['mobile', 'action_time'],ascending=[False, True])
df_paths = df.groupby('mobile')['touchpoint_id'].aggregate(lambda x: x.tolist()).reset_index()

In [30]:
df_paths.head()

Unnamed: 0,mobile,touchpoint_id
0,13000000003,"[009002002000_tp, 008002007000_tp, 002011005003_tp, 014004000000_tp, 014003000000_tp, 009002002000_tp, 002011005001_tp, 002011005001_tp, 002011005004_tp, 002011002000_tp, 002011002000_tp, 002011001001_tp, 002011005004_tp, 002011005004_tp]"
1,13000000004,"[002011002000_tp, 002011002000_tp]"
2,13000000005,"[009002004000_tp, 008002007000_tp, 002011005003_tp, 009002002000_tp, 014003000000_tp, 014004000000_tp, 002011005001_tp]"
3,13000000008,"[002011005001_tp, 002011005004_tp, 002011004000_tp]"
4,13000000010,"[002011005001_tp, 002011005003_tp, 002011005001_tp]"


In [31]:
def if_has_endpoint(touchpoint_id,list1):
    if touchpoint_id in list1:
        return 1
    else:
        return 0

def exclude_endpoint(touchpoint_id,list1):
    if touchpoint_id in list1:
        list1 = list1[:list1.index(touchpoint_id)]
    return list1

In [32]:
##只保留战败用户
df_paths['if_ever_failed'] = df_paths['touchpoint_id'].apply(lambda x:if_has_endpoint('014004000000_tp',x))

In [33]:
# df_paths.if_ever_failed.value_counts()

In [34]:
df_paths = df_paths[df_paths.if_ever_failed == 1][['mobile','touchpoint_id']]
df_paths.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 808766 entries, 0 to 3587809
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   mobile         808766 non-null  object
 1   touchpoint_id  808766 non-null  object
dtypes: object(2)
memory usage: 18.5+ MB


In [35]:
##产生战败唤醒的标签
df_paths['if_activated'] = df_paths['touchpoint_id'].apply(lambda x:if_has_endpoint('015000000000_tp',x))
df_paths.if_activated.value_counts()

0    657431
1    151335
Name: if_activated, dtype: int64

In [36]:
##只保留目标节点之前的触点并且删除无效路径
df_paths.loc[df_paths.if_activated == 1,'touchpoint_id'] = df_paths.loc[df_paths.if_activated == 1,'touchpoint_id'].apply(lambda x:exclude_endpoint('015000000000_tp',x))
df_paths = df_paths[df_paths['touchpoint_id'].str.len() > 0]

In [37]:
df_paths.if_activated.value_counts()

0    657431
1    151335
Name: if_activated, dtype: int64

In [38]:
df_paths.head()

Unnamed: 0,mobile,touchpoint_id,if_activated
0,13000000003,"[009002002000_tp, 008002007000_tp, 002011005003_tp, 014004000000_tp, 014003000000_tp, 009002002000_tp, 002011005001_tp, 002011005001_tp, 002011005004_tp, 002011002000_tp, 002011002000_tp, 002011001001_tp, 002011005004_tp, 002011005004_tp]",0
2,13000000005,"[009002004000_tp, 008002007000_tp, 002011005003_tp, 009002002000_tp, 014003000000_tp, 014004000000_tp, 002011005001_tp]",0
137,13000001000,"[001001001000_tp, 001001002000_tp, 001003001002_tp, 009002001000_tp, 014003000000_tp, 014004000000_tp, 009002003000_tp, 002011004000_tp, 002011004000_tp, 002011004000_tp, 002011004000_tp, 002011004000_tp, 002011004000_tp, 002011004000_tp, 002011004000_tp, 002011004000_tp, 002011004000_tp]",0
138,13000001110,"[001008001001_tp, 001001001000_tp, 001001002000_tp, 009002001000_tp, 009002001000_tp, 009002001000_tp, 009002002000_tp, 014003000000_tp, 014003000000_tp, 009002003000_tp, 014004000000_tp, 009002002000_tp, 014003000000_tp, 002011005001_tp]",0
140,13000001123,"[001001002000_tp, 001001001000_tp, 001007002004_tp, 009002001000_tp, 001007002002_tp, 009002001000_tp, 009002003000_tp, 014004000000_tp, 014003000000_tp, 002011005001_tp, 002011004000_tp, 002011004000_tp]",0


In [39]:
## 为每个触点产生是否存在的特征
touchpoint_list = df.touchpoint_id.unique()
for i in touchpoint_list:
    tmp = '{}'.format(i)
    df_paths[tmp] = df_paths['touchpoint_id'].apply(lambda x:if_has_endpoint(i,x))

In [46]:
X = df_paths.drop(columns = ['mobile','touchpoint_id','if_activated'])
y = df_paths['if_activated']

In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [48]:
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify = y,random_state = 20)

In [49]:
clf = RandomForestClassifier(n_estimators = 20,verbose=3,n_jobs=-1,random_state = 20)
clf.fit(X_train,y_train)

RandomForestClassifier(random_state=20)

In [50]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94    164358
           1       0.79      0.57      0.66     37834

    accuracy                           0.89    202192
   macro avg       0.85      0.77      0.80    202192
weighted avg       0.89      0.89      0.88    202192



In [53]:
#Create a DataFrame using a Dictionary
data={'feature_names':X_train.columns,'feature_importance':clf.feature_importances_}
fi_importance = pd.DataFrame(data).sort_values(by=['feature_importance'], ascending=False)


In [54]:
fi_importance.head()

Unnamed: 0,feature_names,feature_importance
12,002011005001_tp,0.266066
14,001006002005_tp,0.109046
11,009002001000_tp,0.035057
8,002011005004_tp,0.029556
16,002011004000_tp,0.026174


In [55]:
fi_importance = fi_importance.merge(id_mapping[['touchpoint_id','touchpoint_name']], left_on = 'feature_names',right_on = 'touchpoint_id')
fi_importance = fi_importance[['touchpoint_id','touchpoint_name','feature_importance']]
fi_importance.head()

Unnamed: 0,touchpoint_id,touchpoint_name,feature_importance
0,002011005001_tp,收到短信活动主题,0.266066
1,001006002005_tp,金线索,0.109046
2,009002001000_tp,销售代表跟进邀约,0.035057
3,002011005004_tp,收到短信推介主题,0.029556
4,002011004000_tp,AI外呼未接通,0.026174


In [56]:
fi_importance.to_csv('feature_importance_fail_final_report.csv',index = False)