In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter
import tensorflow as tf

import os
import pickle
import re
from tensorflow.python.ops import math_ops

# 数据预处理为FeatureColumn
原始数据文档
* https://tianchi.aliyun.com/datalab/dataSet.html?dataId=408


## 处理样本骨架特征

### Item Features	

205	Item ID.

206	Category ID to which the item belongs to.

207	Shop ID to which item belongs to.

210	Intention node ID which the item belongs to.

216	Brand ID of the item.

### Combination Features	
508	The combination of features with 109_14 and 206.

509	The combination of features with 110_14 and 207.

702	The combination of features with 127_14 and 216.

853	The combination of features with 150_14 and 210.

### Context Features	
301	A categorical expression of position.

### 训练样本

In [6]:

# Flatten the training sample-skeleton file: split each row's packed `feature_list`
# into one column per feature field and save the result as CSV.
sample_feature_columns = ['sample_id', 'click', 'buy', 'md5', 'feature_num', 'feature_list']
sample_table = pd.read_csv('./ctr_cvr_data/BuyWeight_sample_skeleton_train_sample_2_percent.csv',
                           sep=',', header=None, names=sample_feature_columns, engine='python')
# Output column order; corresponds to field ids 205, 206, 207, 210, 216, 508, 509, 702, 853, 301.
feature_name_list = ['ItemID', 'CategoryID', 'ShopID', 'NodeID', 'BrandID', 'Com_CateID',
                     'Com_ShopID', 'Com_BrandID', 'Com_NodeID', 'PID']
# Raw field id -> output column name.
field_id_name = {'205': 'ItemID',
                 '206': 'CategoryID',
                 '207': 'ShopID',
                 '210': 'NodeID',
                 '216': 'BrandID',
                 '508': 'Com_CateID',
                 '509': 'Com_ShopID',
                 '702': 'Com_BrandID',
                 '853': 'Com_NodeID',
                 '301': 'PID'}

# Column name -> list of per-row values; every row appends exactly one value to each list.
entire_fea_dict = {name: [] for name in field_id_name.values()}
for index, row in sample_table.iterrows():
    # Field id -> feature ids found in this row.
    fea_dict = {field_id: [] for field_id in field_id_name}
    # Entries are separated by \001; each entry is <field_id>\002<feature_id>\003<value>.
    for fea_kv in row['feature_list'].split('\001'):
        fea_parts = fea_kv.split('\002')
        fea_field_id = fea_parts[0]
        # Only the feature id is kept; the value after \003 is not used.
        fea_id = fea_parts[1].split('\003')[0]
        # A field id that is not in field_id_name raises KeyError here.
        fea_dict[fea_field_id].append(fea_id)
    for field_id, fea_ids in fea_dict.items():
        # Fields absent from the row get the '<PAD>' placeholder; repeated fields are '|'-joined.
        entire_fea_dict[field_id_name[field_id]].append('|'.join(fea_ids) if fea_ids else '<PAD>')
    if index % 10000 == 0:
        print("current_index:", index)

entire_fea_table = pd.DataFrame(data=entire_fea_dict, columns=feature_name_list)

sample_table = sample_table.drop('feature_list', axis=1)

# `join_axes` was removed from pd.concat in pandas 1.0 (TypeError on newer versions).
# Reindexing the concatenated frame on the sample index gives the same row alignment
# and also works on older pandas.
sample_table = pd.concat([sample_table, entire_fea_table], axis=1).reindex(sample_table.index)

sample_table.to_csv('./ctr_cvr_data/BuyWeight_sampled_sample_skeleton_train_sample_feature_column.csv', index=False)
print(0)

0


### 测试集样本

In [8]:

# Flatten the test sample-skeleton file: split each row's packed `feature_list`
# into one column per feature field and save the result as CSV.
sample_feature_columns = ['sample_id', 'click', 'buy', 'md5', 'feature_num', 'feature_list']
# read_csv with sep=',' is equivalent to the former read_table(sep=',') call and
# matches how the training file is loaded.
sample_table = pd.read_csv('./ctr_cvr_data/BuyWeight_sample_skeleton_test_sample_2_percent.csv',
                           sep=',', header=None, names=sample_feature_columns, engine='python')
# Output column order; corresponds to field ids 205, 206, 207, 210, 216, 508, 509, 702, 853, 301.
feature_name_list = ['ItemID', 'CategoryID', 'ShopID', 'NodeID', 'BrandID', 'Com_CateID',
                     'Com_ShopID', 'Com_BrandID', 'Com_NodeID', 'PID']
# Raw field id -> output column name.
field_id_name = {'205': 'ItemID',
                 '206': 'CategoryID',
                 '207': 'ShopID',
                 '210': 'NodeID',
                 '216': 'BrandID',
                 '508': 'Com_CateID',
                 '509': 'Com_ShopID',
                 '702': 'Com_BrandID',
                 '853': 'Com_NodeID',
                 '301': 'PID'}

# Column name -> list of per-row values; every row appends exactly one value to each list.
entire_fea_dict = {name: [] for name in field_id_name.values()}
for index, row in sample_table.iterrows():
    # Field id -> feature ids found in this row.
    fea_dict = {field_id: [] for field_id in field_id_name}
    # Entries are separated by \001; each entry is <field_id>\002<feature_id>\003<value>.
    for fea_kv in row['feature_list'].split('\001'):
        fea_parts = fea_kv.split('\002')
        fea_field_id = fea_parts[0]
        # Only the feature id is kept; the value after \003 is not used.
        fea_id = fea_parts[1].split('\003')[0]
        # A field id that is not in field_id_name raises KeyError here.
        fea_dict[fea_field_id].append(fea_id)
    for field_id, fea_ids in fea_dict.items():
        # Fields absent from the row get the '<PAD>' placeholder; repeated fields are '|'-joined.
        entire_fea_dict[field_id_name[field_id]].append('|'.join(fea_ids) if fea_ids else '<PAD>')
    if index % 10000 == 0:
        print("current_index:", index)

entire_fea_table = pd.DataFrame(data=entire_fea_dict, columns=feature_name_list)

sample_table = sample_table.drop('feature_list', axis=1)

# `join_axes` was removed from pd.concat in pandas 1.0 (TypeError on newer versions).
# Reindexing the concatenated frame on the sample index gives the same row alignment
# and also works on older pandas.
sample_table = pd.concat([sample_table, entire_fea_table], axis=1).reindex(sample_table.index)

sample_table.to_csv('./ctr_cvr_data/BuyWeight_sampled_sample_skeleton_test_sample_feature_column.csv', index=False)
print(0)

current_index: 0
current_index: 10000
current_index: 20000
current_index: 30000
current_index: 40000
current_index: 50000
current_index: 60000
current_index: 70000
current_index: 80000
current_index: 90000
current_index: 100000
current_index: 110000
current_index: 120000
current_index: 130000
current_index: 140000
current_index: 150000
current_index: 160000
current_index: 170000
current_index: 180000
current_index: 190000
current_index: 200000
current_index: 210000
current_index: 220000
current_index: 230000
current_index: 240000
current_index: 250000
current_index: 260000
current_index: 270000
current_index: 280000
current_index: 290000
current_index: 300000
current_index: 310000
current_index: 320000
current_index: 330000
current_index: 340000
current_index: 350000
current_index: 360000
current_index: 370000
current_index: 380000
current_index: 390000
current_index: 400000
current_index: 410000
current_index: 420000
current_index: 430000
current_index: 440000
current_index: 450000
cu

## 处理Common 用户特征
### User Features	
101	User ID.

109_14	User historical behaviors of category ID and count*.

110_14	User historical behaviors of shop ID and count*.

127_14	User historical behaviors of brand ID and count*.

150_14	User historical behaviors of intention node ID and count*.

121	Categorical ID of User Profile.

122	Categorical group ID of User Profile.

124	Users Gender ID.

125	Users Age ID.

126	Users Consumption Level Type I.

127	Users Consumption Level Type II.

128	Users Occupation: whether or not to work.

129	Users Geography Informations.

### 训练集common feature

In [1]:
# Field ids of the four user historical-behavior (`*_14`) features.
black_list = {'109_14', '110_14', '127_14', '150_14'}


In [7]:

# Flatten the training common-features (user) file: split each row's packed
# `feature_list` into one column per user feature field and save the result as CSV.
common_table_columns = ['md5', 'feature_num', 'feature_list']
common_table = pd.read_table('./ctr_cvr_data/BuyWeight_common_features_skeleton_train_sample_2_percent.csv',
                             sep=',', header=None, names=common_table_columns, engine='python')
# Output column order of the parsed user-feature table.
feature_name_list = ['UserID', 'User_CateIDs', 'User_ShopIDs', 'User_BrandIDs', 'User_NodeIDs', 'User_Cluster',
                     'User_ClusterID', 'User_Gender', 'User_Age', 'User_Level1', 'User_Level2',
                     'User_Occupation', 'User_Geo']
# Raw field id -> output column name.
field_id_name = {'101': 'UserID',
                 '109_14': 'User_CateIDs',
                 '110_14': 'User_ShopIDs',
                 '127_14': 'User_BrandIDs',
                 '150_14': 'User_NodeIDs',
                 '121': 'User_Cluster',
                 '122': 'User_ClusterID',
                 '124': 'User_Gender',
                 '125': 'User_Age',
                 '126': 'User_Level1',
                 '127': 'User_Level2',
                 '128': 'User_Occupation',
                 '129': 'User_Geo'}

# Field ids skipped while parsing; their columns end up as '<PAD>' for every row.
# Alternative that drops all four multi-hot fields: {'109_14', '110_14', '127_14', '150_14'}.
black_list = set(['110_14', '150_14'])
# Keep at most this many ids per field and row. Intended for the multi-hot id
# fields, but the cap is applied to every field.
max_ids_per_field = 100

# Column name -> list of per-row values; every row appends exactly one value to each list.
entire_fea_dict = {name: [] for name in field_id_name.values()}
for index, row in common_table.iterrows():
    # Field id -> feature ids found in this row.
    fea_dict = {field_id: [] for field_id in field_id_name}
    # Entries are separated by \001; each entry is <field_id>\002<feature_id>\003<value>.
    for fea_kv in row['feature_list'].split('\001'):
        fea_parts = fea_kv.split('\002')
        fea_field_id = fea_parts[0]
        # Only the feature id is kept; the value after \003 is not used.
        fea_id = fea_parts[1].split('\003')[0]
        if fea_field_id in black_list:
            continue
        # A field id that is not in field_id_name raises KeyError here.
        if len(fea_dict[fea_field_id]) < max_ids_per_field:
            fea_dict[fea_field_id].append(fea_id)
    for field_id, fea_ids in fea_dict.items():
        # Fields absent from the row get the '<PAD>' placeholder; repeated fields are '|'-joined.
        entire_fea_dict[field_id_name[field_id]].append('|'.join(fea_ids) if fea_ids else '<PAD>')
    if index % 1000 == 0:
        print("current_index:", index)

entire_fea_table = pd.DataFrame(data=entire_fea_dict, columns=feature_name_list)
print(entire_fea_table.shape)
print(common_table.shape)
common_table = common_table.drop('feature_list', axis=1)

# `join_axes` was removed from pd.concat in pandas 1.0 (TypeError on newer versions).
# Reindexing the concatenated frame on the common-table index gives the same row
# alignment and also works on older pandas.
common_table = pd.concat([common_table, entire_fea_table], axis=1).reindex(common_table.index)

common_table.to_csv('./ctr_cvr_data/BuyWeight_sampled_common_features_skeleton_train_sample_feature_column.csv', index=False)
print(common_table.shape)

current_index: 0
current_index: 1000
current_index: 2000
current_index: 3000
current_index: 4000
current_index: 5000
current_index: 6000
current_index: 7000
current_index: 8000
current_index: 9000
current_index: 10000
current_index: 11000
current_index: 12000
current_index: 13000
current_index: 14000
current_index: 15000
current_index: 16000
current_index: 17000
current_index: 18000
current_index: 19000
current_index: 20000
current_index: 21000
current_index: 22000
current_index: 23000
current_index: 24000
current_index: 25000
current_index: 26000
current_index: 27000
current_index: 28000
current_index: 29000
current_index: 30000
current_index: 31000
current_index: 32000
current_index: 33000
current_index: 34000
current_index: 35000
current_index: 36000
current_index: 37000
current_index: 38000
current_index: 39000
current_index: 40000
current_index: 41000
current_index: 42000
current_index: 43000
current_index: 44000
current_index: 45000
current_index: 46000
current_index: 47000
curre

current_index: 378000
current_index: 379000
current_index: 380000
current_index: 381000
current_index: 382000
current_index: 383000
current_index: 384000
current_index: 385000
current_index: 386000
current_index: 387000
current_index: 388000
current_index: 389000
current_index: 390000
current_index: 391000
current_index: 392000
current_index: 393000
current_index: 394000
current_index: 395000
current_index: 396000
current_index: 397000
current_index: 398000
current_index: 399000
current_index: 400000
current_index: 401000
current_index: 402000
current_index: 403000
current_index: 404000
current_index: 405000
current_index: 406000
current_index: 407000
current_index: 408000
current_index: 409000
current_index: 410000
current_index: 411000
current_index: 412000
current_index: 413000
current_index: 414000
current_index: 415000
current_index: 416000
current_index: 417000
current_index: 418000
current_index: 419000
current_index: 420000
current_index: 421000
current_index: 422000
current_in

In [8]:
# Preview the first five rows of the parsed common-feature table.
common_table.head(n=5)

Unnamed: 0,md5,feature_num,UserID,User_CateIDs,User_ShopIDs,User_BrandIDs,User_NodeIDs,User_Cluster,User_ClusterID,User_Gender,User_Age,User_Level1,User_Level2,User_Occupation,User_Geo
0,84dceed2e3a667f8,343,31319,450877|447414|446442|450989|451636|449082|4572...,<PAD>,3781041|3850935|3850235|3638768|3858194|359279...,<PAD>,3438687,3438762,3438769,3438774,3438779,3438782,3864885,3864888
1,0000350f0c2121e7,811,392326,447553|445995|450247|449070|450980|445135|4454...,<PAD>,3716224|3514627|3772871|3543283|3728186|371080...,<PAD>,3438725,3438760,3438769,3438772,3438778,3438782,3864885,3864888
2,000091a89d1867ab,7,<PAD>,<PAD>,<PAD>,<PAD>,<PAD>,3438658,3438761,3438769,3438773,<PAD>,3438781,3864885,3864889
3,0001fa8246be0940,374,407969,451311|450954|450462|451530|451099|450656|4490...,<PAD>,3504052|3507496|3622158|3630324|3566530|352097...,<PAD>,3438737,3438757,3438768,3438774,3438778,3438782,3864885,3864888
4,000260b23f85aadb,266,168295,450837|451033|450838|449949|455349|455827|4553...,<PAD>,3627914|3760360|3763560|3496527|3689932|384471...,<PAD>,3438705,3438765,3438768,3438771,3438777,3438782,3864886,3864888


### 测试集common feature

In [9]:

# Flatten the test common-features (user) file: split each row's packed
# `feature_list` into one column per user feature field and save the result as CSV.
common_table_columns = ['md5', 'feature_num', 'feature_list']
common_table = pd.read_table('./ctr_cvr_data/BuyWeight_common_features_skeleton_test_sample_2_percent.csv',
                             sep=',', header=None, names=common_table_columns, engine='python')
# Output column order of the parsed user-feature table.
feature_name_list = ['UserID', 'User_CateIDs', 'User_ShopIDs', 'User_BrandIDs', 'User_NodeIDs', 'User_Cluster',
                     'User_ClusterID', 'User_Gender', 'User_Age', 'User_Level1', 'User_Level2',
                     'User_Occupation', 'User_Geo']
# Raw field id -> output column name.
field_id_name = {'101': 'UserID',
                 '109_14': 'User_CateIDs',
                 '110_14': 'User_ShopIDs',
                 '127_14': 'User_BrandIDs',
                 '150_14': 'User_NodeIDs',
                 '121': 'User_Cluster',
                 '122': 'User_ClusterID',
                 '124': 'User_Gender',
                 '125': 'User_Age',
                 '126': 'User_Level1',
                 '127': 'User_Level2',
                 '128': 'User_Occupation',
                 '129': 'User_Geo'}

# To reduce memory use so the notebook can run on a single machine, some multi-hot
# fields are skipped while parsing; their columns end up as '<PAD>' for every row.
# Alternative that drops all four multi-hot fields: {'109_14', '110_14', '127_14', '150_14'}.
black_list = set(['110_14', '150_14'])
# Keep at most this many ids per field and row. Intended for the multi-hot id
# fields, but the cap is applied to every field.
max_ids_per_field = 100

# Column name -> list of per-row values; every row appends exactly one value to each list.
entire_fea_dict = {name: [] for name in field_id_name.values()}
for index, row in common_table.iterrows():
    # Field id -> feature ids found in this row.
    fea_dict = {field_id: [] for field_id in field_id_name}
    # Entries are separated by \001; each entry is <field_id>\002<feature_id>\003<value>.
    for fea_kv in row['feature_list'].split('\001'):
        fea_parts = fea_kv.split('\002')
        fea_field_id = fea_parts[0]
        # Only the feature id is kept; the value after \003 is not used.
        fea_id = fea_parts[1].split('\003')[0]
        if fea_field_id in black_list:
            continue
        # A field id that is not in field_id_name raises KeyError here.
        if len(fea_dict[fea_field_id]) < max_ids_per_field:
            fea_dict[fea_field_id].append(fea_id)
    for field_id, fea_ids in fea_dict.items():
        # Fields absent from the row get the '<PAD>' placeholder; repeated fields are '|'-joined.
        entire_fea_dict[field_id_name[field_id]].append('|'.join(fea_ids) if fea_ids else '<PAD>')
    if index % 1000 == 0:
        print("current_index:", index)

entire_fea_table = pd.DataFrame(data=entire_fea_dict, columns=feature_name_list)
print(entire_fea_table.shape)
common_table = common_table.drop('feature_list', axis=1)
# `join_axes` was removed from pd.concat in pandas 1.0 (TypeError on newer versions).
# Reindexing the concatenated frame on the common-table index gives the same row
# alignment and also works on older pandas.
common_table = pd.concat([common_table, entire_fea_table], axis=1).reindex(common_table.index)

common_table.to_csv('./ctr_cvr_data/BuyWeight_sampled_common_features_skeleton_test_sample_feature_column.csv', index=False)
print(0)

current_index: 0
current_index: 1000
current_index: 2000
current_index: 3000
current_index: 4000
current_index: 5000
current_index: 6000
current_index: 7000
current_index: 8000
current_index: 9000
current_index: 10000
current_index: 11000
current_index: 12000
current_index: 13000
current_index: 14000
current_index: 15000
current_index: 16000
current_index: 17000
current_index: 18000
current_index: 19000
current_index: 20000
current_index: 21000
current_index: 22000
current_index: 23000
current_index: 24000
current_index: 25000
current_index: 26000
current_index: 27000
current_index: 28000
current_index: 29000
current_index: 30000
current_index: 31000
current_index: 32000
current_index: 33000
current_index: 34000
current_index: 35000
current_index: 36000
current_index: 37000
current_index: 38000
current_index: 39000
current_index: 40000
current_index: 41000
current_index: 42000
current_index: 43000
current_index: 44000
current_index: 45000
current_index: 46000
current_index: 47000
curre

current_index: 378000
current_index: 379000
current_index: 380000
current_index: 381000
current_index: 382000
current_index: 383000
current_index: 384000
current_index: 385000
current_index: 386000
current_index: 387000
current_index: 388000
current_index: 389000
current_index: 390000
current_index: 391000
current_index: 392000
current_index: 393000
current_index: 394000
current_index: 395000
current_index: 396000
current_index: 397000
current_index: 398000
current_index: 399000
current_index: 400000
current_index: 401000
current_index: 402000
current_index: 403000
current_index: 404000
current_index: 405000
current_index: 406000
current_index: 407000
current_index: 408000
current_index: 409000
current_index: 410000
current_index: 411000
current_index: 412000
current_index: 413000
current_index: 414000
current_index: 415000
current_index: 416000
current_index: 417000
current_index: 418000
current_index: 419000
current_index: 420000
current_index: 421000
current_index: 422000
current_in

In [10]:
# Preview the first five rows of the parsed common-feature table.
common_table.head(n=5)

Unnamed: 0,md5,feature_num,UserID,User_CateIDs,User_ShopIDs,User_BrandIDs,User_NodeIDs,User_Cluster,User_ClusterID,User_Gender,User_Age,User_Level1,User_Level2,User_Occupation,User_Geo
0,0010d0b9633bb5b0,250,66015,455028|451998|451100|445269|445990|450099|4557...,<PAD>,3520924|3505215|3588720|3541711|3801132|382945...,<PAD>,3438670,3438756,3438769,3438771,3438777,3438782,3864886,3864889
1,0012aad1f55312b6,170,121803,451822|451095|449537|449301|455342|449077|4490...,<PAD>,3518975|3697970|3784310|3821497|3698768|345218...,<PAD>,3438658,3438766,3438768,3438772,<PAD>,3438782,3864885,3864889
2,0013e5c24e8dd3a6,617,135732,452511|450721|449079|450276|450656|449078|4493...,<PAD>,3707605|3632935|3809314|3703188|3700287|356905...,<PAD>,3438670,3438756,3438769,3438771,3438777,3438782,3864885,3864889
3,001459b610a7c186,395,235356,450880|456589|450870|450658|451641|451639|4508...,<PAD>,3765068|3620179|3619511|3668302|3610026|378643...,<PAD>,<PAD>,<PAD>,<PAD>,<PAD>,<PAD>,<PAD>,<PAD>,<PAD>
4,001efa0ef1001dd1,849,131668,445600|450229|449070|449317|450658|449178|4509...,<PAD>,3628671|3853135|3805119|3848839|3651862|372880...,<PAD>,3438685,3438762,3438769,3438774,3438778,3438782,3864885,3864888


In [2]:
# Scratch cell: build a two-element integer array and display it.
a = np.array((1, 2))
a

array([1, 2])