### 以名爵为例

#### 1.1 分析口径定义

In [None]:
# Reference Hive SQL, kept as an unexecuted string, that builds the input of
# this analysis:
#   Step 1: collect the phone numbers active between 2020-06-01 and 2021-04-30
#           that have no matching row in the dealer-employee table.
#   Step 2: pull those users' touchpoint events between 2020-01-01 and
#           2021-06-20 and rank them per user by action time.
#   Step 3: export the Step 2 table to a local file for the cells below.
# Step 2 must drop its own output table (tmp_tp_convert_base); dropping
# tmp_tp_analysis_users there would remove the table Step 2 joins against.
'''
-- Step 1: 圈出目标分析时间段的人的电话号码
drop table if exists marketing_modeling.tmp_tp_analysis_users;
create table marketing_modeling.tmp_tp_analysis_users as
select 
    a.mobile
from marketing_modeling.dw_mg_tp_ts_all_i a
left join dtwarehouse.cdm_dim_dealer_employee_info b
on a.mobile = b.mobile
where b.mobile is null
and a.action_time between '2020-06-01' and '2021-04-30'
and a.mobile regexp '^[1][3-9][0-9]{9}$'
group by a.mobile;

-- Step 2: 圈出步骤一中圈出的人的相应时间段的行为
drop table if exists marketing_modeling.tmp_tp_convert_base;
create table marketing_modeling.tmp_tp_convert_base as 
select 
a.*,
ROW_NUMBER() OVER (PARTITION BY a.mobile ORDER BY action_time asc) action_rank
from
(
    select 
        a.mobile, action_time, touchpoint_id    
    from marketing_modeling.dw_mg_tp_ts_all_i a
    left join marketing_modeling.tmp_tp_analysis_users b
    on a.mobile = b.mobile
    where b.mobile is not null
    and touchpoint_id is not null and action_time is not null
    and action_time <= '2021-06-20' and action_time >= '2020-01-01'
    group by a.mobile, action_time, touchpoint_id
) a;

-- Step 3: 考虑到运算速度，我们会把step2中生成的表拉取到本地存成csv，供后续的分析使用
hive -e "select  * from marketing_modeling.tmp_tp_convert_base" > tp_analysis_base.csv
'''

#### 1.2 触点覆盖

In [None]:
# Load the exported touchpoint events; the file is tab-separated and the
# column names are supplied here rather than read from the file.
names = ['mobile', 'action_time', 'touchpoint_id', 'action_rank']
df = pd.read_csv('tp_analysis_base.csv', sep='\t', names=names)
# Phone numbers are identifiers, so keep them as strings.
df = df.assign(mobile=df['mobile'].astype(str))

In [None]:
# Load the touchpoint id mapping and build an id -> touchpoint name lookup.
id_mapping = pd.read_csv('data/id_mapping.csv')
tp_name = dict(zip(id_mapping['touchpoint_id'], id_mapping['touchpoint_name']))

# Attach the mapping columns to every event row; the left join keeps events
# whose touchpoint_id has no entry in the mapping.
df = pd.merge(df, id_mapping, on='touchpoint_id', how='left')

##### 1.2.1 用户触点总数

In [None]:
# Deduplicate the touchpoints each user went through, then count them per
# user. No restriction is placed on the action time here.
user_tp_pairs = df[['mobile', 'touchpoint_id', 'touchpoint_name']].drop_duplicates()
cov_res = user_tp_pairs.groupby(by=['mobile'], as_index=False).agg({'touchpoint_id': 'count'})

# Bin edges: (-1, 5], (5, 10], ..., (70, 75], (75, 1000].
bins = [-1] + list(range(5, 80, 5)) + [1000]
cov_res['tp_vol'] = pd.cut(cov_res['touchpoint_id'], bins=bins).astype(str)

# Print the number of users that fall into each bin.
print(cov_res['tp_vol'].value_counts())

##### 1.2.2 人均触点种数、次数 —— By各触点大类

In [None]:
# Average number of distinct touchpoints per user, by level-1 category:
# deduplicate (user, touchpoint) pairs first, then count rows per category.
tp_cnt_dp = (
    df[['mobile', 'touchpoint_id', 'level_1']]
    .drop_duplicates()
    .groupby(by=['level_1'], as_index=False)
    .agg({'touchpoint_id': 'count'})
    .rename(columns={'touchpoint_id': 'uv'})
)
tp_cnt_dp['tp_name'] = tp_cnt_dp['level_1'].apply(lambda level_id: tp_name[level_id])
tp_cnt_dp['avg_tp_type_cnt'] = tp_cnt_dp['uv'] / 2982100  # divide by the total number of users

# Average number of touchpoint events per user, by level-1 category:
# same aggregation without deduplication, so repeated events all count.
tp_cnt = (
    df[['mobile', 'touchpoint_id', 'level_1']]
    .groupby(by=['level_1'], as_index=False)
    .agg({'touchpoint_id': 'count'})
    .rename(columns={'touchpoint_id': 'pv'})
)
tp_cnt['tp_name'] = tp_cnt['level_1'].apply(lambda level_id: tp_name[level_id])
tp_cnt['avg_tp_freq_cnt'] = tp_cnt['pv'] / 2982100  # divide by the total number of users

##### 1.2.3 第一至四层级的Top15高“覆盖度”、“活跃度”触点

In [None]:
def cov_and_act(group_name,user_cnt):
    """Compute coverage and activity for every value of one grouping column.

    Reads the module-level ``df`` (event rows) and ``tp_name`` (id -> name
    lookup).

    Args:
        group_name: Name of the column in ``df`` to group by.
        user_cnt: Total number of users, used as the coverage denominator.

    Returns:
        A DataFrame with one row per group value and the columns
        ``touchpoint_id`` (the group value), ``uv`` (distinct users),
        ``pv`` (event rows), ``activity`` (pv / uv), ``coverage``
        (uv / user_cnt) and ``touchpoint_name``.

    Raises:
        KeyError: If a group value is missing from ``tp_name``.
    """
    user_group = df[['mobile', group_name]]

    # PV: every event row counts.
    visits = user_group.groupby(by=[group_name], as_index=False).agg({'mobile': 'count'})
    visits = visits.rename(columns={'mobile': 'pv'})

    # UV: each user counts once per group value.
    visitors = user_group.drop_duplicates()
    visitors = visitors.groupby(by=[group_name], as_index=False).agg({'mobile': 'count'})
    visitors = visitors.rename(columns={'mobile': 'uv'})

    stats = visitors.merge(visits, on=group_name, how='left')
    stats['activity'] = stats['pv'] * 1.0 / stats['uv']
    stats['coverage'] = stats['uv'] * 1.0 / user_cnt

    # Give the grouping column a common name so results of different levels
    # share one schema.
    stats.columns = ['touchpoint_id', 'uv', 'pv', 'activity', 'coverage']
    stats['touchpoint_name'] = stats['touchpoint_id'].apply(lambda tp_id: tp_name[tp_id])

    return stats

# Compute coverage/activity for each of the four touchpoint levels and stack
# the results into one frame. The per-level frames are collected in a list and
# concatenated once: DataFrame.append was removed in pandas 2.0 and copied the
# accumulated frame on every iteration.
level_frames = []
for name in ['level_1','level_2','level_3','level_4']:
    print('processing: ',name)
    level_frames.append(cov_and_act(name,2982100))  # 2982100 = total number of users

cov_df = pd.concat(level_frames)

print(cov_df)