In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter
import tensorflow as tf

import os
import pickle
import re
from tensorflow.python.ops import math_ops

# 数据预处理为FeatureColumn
原始数据文档
* https://tianchi.aliyun.com/datalab/dataSet.html?dataId=408


## 处理样本骨架特征

### Item Features	

205	Item ID.

206	Category ID to which the item belongs to.

207	Shop ID to which item belongs to.

210	Intention node ID which the item belongs to.

216	Brand ID of the item.

### Combination Features	
508	The combination of features with 109_14 and 206.

509	The combination of features with 110_14 and 207.

702	The combination of features with 127_14 and 216.

853	The combination of features with 150_14 and 210.

### Context Features	
301	A categorical expression of position.

### 训练样本

In [202]:

# Expand the packed `feature_list` column of the training sample skeleton into
# one column per item / combination / context feature field, then write the
# widened table back to CSV.
sample_feature_columns = ['sample_id', 'click', 'buy', 'md5', 'feature_num', 'feature_list']
sample_table = pd.read_table('./ctr_cvr_data/sample_skeleton_train_sample_2_percent.csv',
                             sep=',', header=None, names=sample_feature_columns, engine='python')

# Output column order of the expanded feature table.
feature_name_list = ['ItemID', 'CategoryID', 'ShopID', 'NodeID', 'BrandID', 'Com_CateID',
                     'Com_ShopID', 'Com_BrandID', 'Com_NodeID', 'PID']
# Feature field ID (as it appears inside `feature_list`) -> output column name.
field_id_name = {'205': 'ItemID',
                 '206': 'CategoryID',
                 '207': 'ShopID',
                 '210': 'NodeID',
                 '216': 'BrandID',
                 '508': 'Com_CateID',
                 '509': 'Com_ShopID',
                 '702': 'Com_BrandID',
                 '853': 'Com_NodeID',
                 '301': 'PID'}

# Column name -> list with one string per sample row.
entire_fea_dict = {name: [] for name in field_id_name.values()}

# Iterating over the single column avoids building a Series per row, which is
# what DataFrame.iterrows() does; the (index, value) pairs are the same.
for index, feature_list in sample_table['feature_list'].items():
    # Feature IDs seen in this row, grouped by field ID.
    fea_dict = {field_id: [] for field_id in field_id_name}
    for fea_kv in feature_list.split('\001'):
        # Entry layout: <field_id> \002 <feature_id> \003 <value>.
        # Only the feature ID is kept; the value after \003 is not used.
        kv_parts = fea_kv.split('\002')
        fea_field_id = kv_parts[0]
        fea_id = kv_parts[1].split('\003')[0]
        # A field ID that is missing from field_id_name raises KeyError here.
        fea_dict[fea_field_id].append(fea_id)
    for field_id, fea_ids in fea_dict.items():
        # Fields absent from the row get the '<PAD>' placeholder; multi-valued
        # fields are joined with '|'.
        entire_fea_dict[field_id_name[field_id]].append('|'.join(fea_ids) if fea_ids else '<PAD>')
    if index % 10000 == 0:
        print("current_index:", index)

entire_fea_table = pd.DataFrame(data=entire_fea_dict, columns=feature_name_list)

sample_table = sample_table.drop('feature_list', axis=1)

# `join_axes` was removed from pd.concat in pandas 1.0; reindexing the right-hand
# frame to the left-hand index is the documented equivalent.
sample_table = pd.concat([sample_table, entire_fea_table.reindex(sample_table.index)], axis=1)

sample_table.to_csv('./ctr_cvr_data/sampled_sample_skeleton_train_sample_feature_column.csv', index=False)
print(0)

current_index: 0
current_index: 10000
current_index: 20000
current_index: 30000
current_index: 40000
current_index: 50000
current_index: 60000
current_index: 70000
current_index: 80000
current_index: 90000
current_index: 100000
current_index: 110000
current_index: 120000
current_index: 130000
current_index: 140000
current_index: 150000
current_index: 160000
current_index: 170000
current_index: 180000
current_index: 190000
current_index: 200000
current_index: 210000
current_index: 220000
current_index: 230000
current_index: 240000
current_index: 250000
current_index: 260000
current_index: 270000
current_index: 280000
current_index: 290000
current_index: 300000
current_index: 310000
current_index: 320000
current_index: 330000
current_index: 340000
current_index: 350000
current_index: 360000
current_index: 370000
current_index: 380000
current_index: 390000
current_index: 400000
current_index: 410000
current_index: 420000
current_index: 430000
current_index: 440000
current_index: 450000
cu

### 测试集样本

In [203]:

# Expand the packed `feature_list` column of the test sample skeleton into one
# column per item / combination / context feature field, then write the widened
# table back to CSV.
sample_feature_columns = ['sample_id', 'click', 'buy', 'md5', 'feature_num', 'feature_list']
sample_table = pd.read_table('./ctr_cvr_data/sample_skeleton_test_sample_2_percent.csv',
                             sep=',', header=None, names=sample_feature_columns, engine='python')

# Output column order of the expanded feature table.
feature_name_list = ['ItemID', 'CategoryID', 'ShopID', 'NodeID', 'BrandID', 'Com_CateID',
                     'Com_ShopID', 'Com_BrandID', 'Com_NodeID', 'PID']
# Feature field ID (as it appears inside `feature_list`) -> output column name.
field_id_name = {'205': 'ItemID',
                 '206': 'CategoryID',
                 '207': 'ShopID',
                 '210': 'NodeID',
                 '216': 'BrandID',
                 '508': 'Com_CateID',
                 '509': 'Com_ShopID',
                 '702': 'Com_BrandID',
                 '853': 'Com_NodeID',
                 '301': 'PID'}

# Column name -> list with one string per sample row.
entire_fea_dict = {name: [] for name in field_id_name.values()}

# Iterating over the single column avoids building a Series per row, which is
# what DataFrame.iterrows() does; the (index, value) pairs are the same.
for index, feature_list in sample_table['feature_list'].items():
    # Feature IDs seen in this row, grouped by field ID.
    fea_dict = {field_id: [] for field_id in field_id_name}
    for fea_kv in feature_list.split('\001'):
        # Entry layout: <field_id> \002 <feature_id> \003 <value>.
        # Only the feature ID is kept; the value after \003 is not used.
        kv_parts = fea_kv.split('\002')
        fea_field_id = kv_parts[0]
        fea_id = kv_parts[1].split('\003')[0]
        # A field ID that is missing from field_id_name raises KeyError here.
        fea_dict[fea_field_id].append(fea_id)
    for field_id, fea_ids in fea_dict.items():
        # Fields absent from the row get the '<PAD>' placeholder; multi-valued
        # fields are joined with '|'.
        entire_fea_dict[field_id_name[field_id]].append('|'.join(fea_ids) if fea_ids else '<PAD>')
    if index % 10000 == 0:
        print("current_index:", index)

entire_fea_table = pd.DataFrame(data=entire_fea_dict, columns=feature_name_list)

sample_table = sample_table.drop('feature_list', axis=1)

# `join_axes` was removed from pd.concat in pandas 1.0; reindexing the right-hand
# frame to the left-hand index is the documented equivalent.
sample_table = pd.concat([sample_table, entire_fea_table.reindex(sample_table.index)], axis=1)

sample_table.to_csv('./ctr_cvr_data/sampled_sample_skeleton_test_sample_feature_column.csv', index=False)
print(0)

current_index: 0
current_index: 10000
current_index: 20000
current_index: 30000
current_index: 40000
current_index: 50000
current_index: 60000
current_index: 70000
current_index: 80000
current_index: 90000
current_index: 100000
current_index: 110000
current_index: 120000
current_index: 130000
current_index: 140000
current_index: 150000
current_index: 160000
current_index: 170000
current_index: 180000
current_index: 190000
current_index: 200000
current_index: 210000
current_index: 220000
current_index: 230000
current_index: 240000
current_index: 250000
current_index: 260000
current_index: 270000
current_index: 280000
current_index: 290000
current_index: 300000
current_index: 310000
current_index: 320000
current_index: 330000
current_index: 340000
current_index: 350000
current_index: 360000
current_index: 370000
current_index: 380000
current_index: 390000
current_index: 400000
current_index: 410000
current_index: 420000
current_index: 430000
current_index: 440000
current_index: 450000
cu

## 处理Common 用户特征
### User Features	
101	User ID.

109_14	User historical behaviors of category ID and count*.

110_14	User historical behaviors of shop ID and count*.

127_14	User historical behaviors of brand ID and count*.

150_14	User historical behaviors of intention node ID and count*.

121	Categorical ID of User Profile.

122	Categorical group ID of User Profile.

124	Users Gender ID.

125	Users Age ID.

126	Users Consumption Level Type I.

127	Users Consumption Level Type II.

128	User's occupation: whether or not the user works.

129	User's geography information.

### 训练集common feature

In [204]:

# Expand the packed `feature_list` column of the training common-features file
# into one column per user feature field, then write the widened table to CSV.
common_table_columns = ['md5', 'feature_num', 'feature_list']
common_table = pd.read_table('./ctr_cvr_data/common_features_skeleton_train_sample_2_percent.csv',
                             sep=',', header=None, names=common_table_columns, engine='python')

# Output column order of the expanded feature table.
feature_name_list = ['UserID', 'User_CateIDs', 'User_ShopIDs', 'User_BrandIDs', 'User_NodeIDs', 'User_Cluster',
                     'User_ClusterID', 'User_Gender', 'User_Age', 'User_Level1', 'User_Level2',
                     'User_Occupation', 'User_Geo']
# Feature field ID (as it appears inside `feature_list`) -> output column name.
field_id_name = {'101': 'UserID',
                 '109_14': 'User_CateIDs',
                 '110_14': 'User_ShopIDs',
                 '127_14': 'User_BrandIDs',
                 '150_14': 'User_NodeIDs',
                 '121': 'User_Cluster',
                 '122': 'User_ClusterID',
                 '124': 'User_Gender',
                 '125': 'User_Age',
                 '126': 'User_Level1',
                 '127': 'User_Level2',
                 '128': 'User_Occupation',
                 '129': 'User_Geo'}

# Column name -> list with one string per common-feature row.
entire_fea_dict = {name: [] for name in field_id_name.values()}

# Iterating over the single column avoids building a Series per row, which is
# what DataFrame.iterrows() does; the (index, value) pairs are the same.
for index, feature_list in common_table['feature_list'].items():
    # Feature IDs seen in this row, grouped by field ID.
    fea_dict = {field_id: [] for field_id in field_id_name}
    for fea_kv in feature_list.split('\001'):
        # Entry layout: <field_id> \002 <feature_id> \003 <value>.
        # Only the feature ID is kept; the value after \003 is not used.
        kv_parts = fea_kv.split('\002')
        fea_field_id = kv_parts[0]
        fea_id = kv_parts[1].split('\003')[0]
        # A field ID that is missing from field_id_name raises KeyError here.
        fea_dict[fea_field_id].append(fea_id)
    for field_id, fea_ids in fea_dict.items():
        # Fields absent from the row get the '<PAD>' placeholder; multi-valued
        # fields are joined with '|'.
        entire_fea_dict[field_id_name[field_id]].append('|'.join(fea_ids) if fea_ids else '<PAD>')
    if index % 1000 == 0:
        print("current_index:", index)

entire_fea_table = pd.DataFrame(data=entire_fea_dict, columns=feature_name_list)
print(entire_fea_table.shape)
print(common_table.shape)
common_table = common_table.drop('feature_list', axis=1)

# `join_axes` was removed from pd.concat in pandas 1.0; reindexing the right-hand
# frame to the left-hand index is the documented equivalent.
common_table = pd.concat([common_table, entire_fea_table.reindex(common_table.index)], axis=1)

common_table.to_csv('./ctr_cvr_data/sampled_common_features_skeleton_train_sample_feature_column.csv', index=False)
print(common_table.shape)

current_index: 0
current_index: 1000
current_index: 2000
current_index: 3000
current_index: 4000
current_index: 5000
current_index: 6000
current_index: 7000
current_index: 8000
current_index: 9000
current_index: 10000
current_index: 11000
current_index: 12000
current_index: 13000
current_index: 14000
current_index: 15000
current_index: 16000
current_index: 17000
current_index: 18000
(18158, 13)
(18158, 3)
(18158, 15)


### 测试集common feature

In [206]:

# Expand the packed `feature_list` column of the test common-features file into
# one column per user feature field, then write the widened table to CSV.
common_table_columns = ['md5', 'feature_num', 'feature_list']
common_table = pd.read_table('./ctr_cvr_data/common_features_skeleton_test_sample_2_percent.csv',
                             sep=',', header=None, names=common_table_columns, engine='python')

# Output column order of the expanded feature table.
feature_name_list = ['UserID', 'User_CateIDs', 'User_ShopIDs', 'User_BrandIDs', 'User_NodeIDs', 'User_Cluster',
                     'User_ClusterID', 'User_Gender', 'User_Age', 'User_Level1', 'User_Level2',
                     'User_Occupation', 'User_Geo']
# Feature field ID (as it appears inside `feature_list`) -> output column name.
field_id_name = {'101': 'UserID',
                 '109_14': 'User_CateIDs',
                 '110_14': 'User_ShopIDs',
                 '127_14': 'User_BrandIDs',
                 '150_14': 'User_NodeIDs',
                 '121': 'User_Cluster',
                 '122': 'User_ClusterID',
                 '124': 'User_Gender',
                 '125': 'User_Age',
                 '126': 'User_Level1',
                 '127': 'User_Level2',
                 '128': 'User_Occupation',
                 '129': 'User_Geo'}

# Column name -> list with one string per common-feature row.
entire_fea_dict = {name: [] for name in field_id_name.values()}

# Iterating over the single column avoids building a Series per row, which is
# what DataFrame.iterrows() does; the (index, value) pairs are the same.
for index, feature_list in common_table['feature_list'].items():
    # Feature IDs seen in this row, grouped by field ID.
    fea_dict = {field_id: [] for field_id in field_id_name}
    for fea_kv in feature_list.split('\001'):
        # Entry layout: <field_id> \002 <feature_id> \003 <value>.
        # Only the feature ID is kept; the value after \003 is not used.
        kv_parts = fea_kv.split('\002')
        fea_field_id = kv_parts[0]
        fea_id = kv_parts[1].split('\003')[0]
        # A field ID that is missing from field_id_name raises KeyError here.
        fea_dict[fea_field_id].append(fea_id)
    for field_id, fea_ids in fea_dict.items():
        # Fields absent from the row get the '<PAD>' placeholder; multi-valued
        # fields are joined with '|'.
        entire_fea_dict[field_id_name[field_id]].append('|'.join(fea_ids) if fea_ids else '<PAD>')
    if index % 1000 == 0:
        print("current_index:", index)

entire_fea_table = pd.DataFrame(data=entire_fea_dict, columns=feature_name_list)
print(entire_fea_table.shape)
common_table = common_table.drop('feature_list', axis=1)
# `join_axes` was removed from pd.concat in pandas 1.0; reindexing the right-hand
# frame to the left-hand index is the documented equivalent.
common_table = pd.concat([common_table, entire_fea_table.reindex(common_table.index)], axis=1)

common_table.to_csv('./ctr_cvr_data/sampled_common_features_skeleton_test_sample_feature_column.csv', index=False)
print(0)

current_index: 0
current_index: 1000
current_index: 2000
current_index: 3000
current_index: 4000
current_index: 5000
current_index: 6000
current_index: 7000
current_index: 8000
current_index: 9000
current_index: 10000
current_index: 11000
current_index: 12000
current_index: 13000
current_index: 14000
current_index: 15000
current_index: 16000
current_index: 17000
current_index: 18000
current_index: 19000
current_index: 20000
current_index: 21000
(21866, 13)
0


In [None]:
# Scratch cell: build a two-element integer array and echo it as the cell output.
a = np.array((1, 2))
a