In [1]:
import pandas as pd
import numpy as np
import os
import sys
import re
import gc
import time
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Show all columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Show all rows when a DataFrame is displayed.
pd.set_option('display.max_rows', None)
# Show up to 100 characters per cell value (the pandas default is 50).
pd.set_option('max_colwidth',100)
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last
# one; the bare string "progress markers" in later cells rely on this.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
from collections import defaultdict 
from tqdm import tqdm
from common_val import *
from common_utils import *

### 公共函数

In [3]:
def feature_list_split(x):
    """Group the entries of one raw feature-list string by feature field id.

    The string is a sequence of entries separated by the 0x01 character.
    Inside an entry, the field id and its payload are separated by 0x02.
    The payload is stored unparsed (it still contains its 0x03 separator).

    Args:
        x: Raw feature-list string.

    Returns:
        A ``defaultdict(list)`` mapping field id -> list of payload strings in
        order of appearance. An empty string yields an empty mapping.

    Raises:
        IndexError: If a non-empty entry contains no 0x02 separator.
    """
    feature_dict = defaultdict(list)
    for fea in x.split('\x01'):
        if not fea:
            # An empty input, or a leading/trailing/doubled 0x01, produces
            # empty entries. They carry no feature, so skip them rather than
            # failing on the missing payload.
            continue
        # The delimiter is a literal character, so plain str.split suffices.
        field = fea.split('\x02')
        feature_dict[field[0]].append(field[1])
    return feature_dict

def mul_list_to_dict(x):
    """Convert a list of 'id<0x03>value' strings into an ``{id: value}`` dict.

    Each item is split on the 0x03 character; the first piece becomes the key
    and the second piece the value (both stay strings). When an id repeats,
    the last occurrence wins.

    Args:
        x: Iterable of payload strings.

    Returns:
        dict mapping id string -> value string.

    Raises:
        IndexError: If an item contains no 0x03 separator.
    """
    pieces_per_item = (entry.split('\x03') for entry in x)
    return {pieces[0]: pieces[1] for pieces in pieces_per_item}

In [4]:
# Field ids found in the common (user-side) feature list.
user_features = [
    '101',                        # user ID
    '121', '122',                 # user category IDs
    '124',                        # gender
    '125',                        # age bucket
    '126', '127',                 # consumption level I / II
    '128',                        # employment status
    '129',                        # geography
    '150_14', '127_14', '109_14', '110_14',  # per-intention / brand / category / shop history counts
]
# Field ids found in the per-sample (item-side) feature list.
item_features = [
    '205', '206', '207',          # item ID, item category ID, shop ID
    '210', '216',                 # intention IDs linked to the item, brand ID
    '508', '509', '702', '853',   # user x item combination features
    '301',                        # business scenario
]

# Column names for the header-less common-feature CSV files.
fea_cols = ['common_feature_index', 'feature_num2', 'feature_list2']
# Column names for the header-less sample-skeleton CSV files.
sample_cols = [
    'sample_id', 'click', 'conversion',
    'common_feature_index', 'feature_num1', 'feature_list1',
]

## 公共特征预处理

In [5]:
def user_fea_fn(data):
    """Expand the raw ``feature_list2`` column into per-user feature columns.

    Works in place on ``data`` and returns ``None``. The frame must contain
    the columns ``feature_num2`` and ``feature_list2``; both are removed at
    the end, together with the temporary parsed-dict column.

    Columns added (0 always means "field absent"):
        userid                      raw id of field 101
        usercate1 / usercate2       fields 121 / 122, shifted to start at 1
        gender                      field 124, recoded to 1/2
        age                         field 125, shifted to start at 1
        user_consume1 / 2           fields 126 / 127, recoded to 1..3
        work                        field 128, recoded to 1/2
        location                    field 129, recoded to 1..4
        user_intention_count, user_brand_count, user_cate_count,
        user_shop_count             unparsed payload lists of fields
                                    150_14 / 127_14 / 109_14 / 110_14

    All recoded columns except ``userid`` are converted to ``category``.
    Raw ids missing from a recode table become NaN (``Series.map`` with a
    dict).
    """
    def first_id(field):
        # Mapper factory: integer id of the first entry of `field`, 0 if absent.
        def pick(fea_dict):
            if field in fea_dict:
                return int(fea_dict[field][0].split('\x03')[0])
            return 0
        return pick

    def rebase(offset):
        # Mapper factory: shift a non-zero raw id so that `offset` becomes 1;
        # the 0 "missing" marker is left untouched.
        def shift(raw_id):
            return raw_id - offset + 1 if raw_id != 0 else raw_id
        return shift

    def payloads(field):
        # Mapper factory: the list of unparsed payload strings for `field`.
        def pick(fea_dict):
            return fea_dict[field]
        return pick

    data['fea_dict2'] = data['feature_list2'].map(feature_list_split)
    parsed = data['fea_dict2']

    data['userid'] = parsed.map(first_id('101'))  # user ID

    # User category IDs.
    data['usercate1'] = parsed.map(first_id('121')).map(rebase(3438658))
    data['usercate2'] = parsed.map(first_id('122')).map(rebase(3438755))

    # Gender category ID.
    data['gender'] = parsed.map(first_id('124')).map({3438769: 2, 3438768: 1, 0: 0})

    # Age category ID.
    data['age'] = parsed.map(first_id('125')).map(rebase(3438770))

    # Consumption level categories I and II.
    data['user_consume1'] = parsed.map(first_id('126')).map(
        {3438777: 1, 3438778: 2, 3438779: 3, 0: 0})
    data['user_consume2'] = parsed.map(first_id('127')).map(
        {3438780: 1, 3438781: 2, 3438782: 3, 0: 0})

    # Whether the user is employed.
    data['work'] = parsed.map(first_id('128')).map({3864885: 1, 3864886: 2, 0: 0})

    # Geographic category ID.
    data['location'] = parsed.map(first_id('129')).map(
        {3864887: 1, 3864888: 2, 3864889: 3, 3864890: 4, 0: 0})

    # Multi-valued history features: each entry pairs an id (intention, brand,
    # category, shop) with the user's accumulated behaviour count on it.
    # The entries are kept as raw payload strings.
    multi_valued = (
        ('user_intention_count', '150_14'),
        ('user_brand_count', '127_14'),
        ('user_cate_count', '109_14'),
        ('user_shop_count', '110_14'),
    )
    for column, field in multi_valued:
        data[column] = parsed.map(payloads(field))

    # Low-cardinality codes are stored as pandas categoricals.
    categorical_columns = (
        'usercate1', 'usercate2', 'gender', 'age',
        'user_consume1', 'user_consume2', 'work', 'location',
    )
    for column in categorical_columns:
        data[column] = data[column].astype('category')

    data.drop(columns=['feature_num2','feature_list2', 'fea_dict2'], inplace=True)

In [None]:
# Lazy readers over the common-feature CSVs, yielding 100000-row chunks.
# NOTE: the full-load cells further down rebind `fea_train` / `fea_test` to
# DataFrames, so these readers only stay usable until those cells run.
fea_train = pd.read_csv(
    common_features_train_csv,
    names=fea_cols,
    header=None,
    chunksize=100000,
    iterator=True,
)
fea_test = pd.read_csv(
    common_features_test_csv,
    names=fea_cols,
    header=None,
    chunksize=100000,
    iterator=True,
)

###  内存大，一次性读入处理

In [6]:
# Full-load path: read the whole training common-feature CSV in one go.
fea_train = pd.read_csv(common_features_train_csv, names=fea_cols, header=None)
# Bare strings are echoed as cell output (ast_node_interactivity is "all").
'read done'
# Expands the raw feature string into typed user columns, in place.
user_fea_fn(fea_train)
'done!'
fea_train.to_pickle(os.path.join(data_path2, 'fea_train.pkl'))

'read done'

'done!'

In [6]:
# Full-load path: read the whole test common-feature CSV in one go.
fea_test = pd.read_csv(common_features_test_csv, names=fea_cols, header=None)
# Bare strings are echoed as cell output (ast_node_interactivity is "all").
'read done'
# Expands the raw feature string into typed user columns, in place.
user_fea_fn(fea_test)
'done!'
fea_test.to_pickle(os.path.join(data_path2, 'fea_test.pkl'))

'read done'

'done!'

###  内存小，分块读入处理

In [None]:
# Low-memory path: stream the training common-feature CSV in 100000-row
# chunks, expand each chunk in place and pickle it as fea_train<i>.pkl.
# The reader is opened here instead of reusing the notebook-level `fea_train`
# name: the full-load cell rebinds `fea_train` to a DataFrame, and iterating a
# DataFrame yields its column labels rather than chunks.
for i, chunk_df in enumerate(
        pd.read_csv(common_features_train_csv, header=None, names=fea_cols, chunksize=100000)):
    print(i,chunk_df.shape)
    user_fea_fn(chunk_df)
    chunk_df.to_pickle(os.path.join(data_path2, f'fea_train{i}.pkl'))

In [7]:
# Low-memory path: stream the test common-feature CSV in 100000-row chunks,
# expand each chunk in place and pickle it as fea_test<i>.pkl.
# The reader is opened here instead of reusing the notebook-level `fea_test`
# name: the full-load cell rebinds `fea_test` to a DataFrame, and iterating a
# DataFrame yields its column labels rather than chunks.
for i, chunk_df in enumerate(
        pd.read_csv(common_features_test_csv, header=None, names=fea_cols, chunksize=100000)):
    print(i,chunk_df.shape)
    user_fea_fn(chunk_df)
    chunk_df.to_pickle(os.path.join(data_path2, f'fea_test{i}.pkl'))

0 (100000, 3)
1 (100000, 3)
2 (100000, 3)
3 (100000, 3)
4 (100000, 3)
5 (100000, 3)
6 (100000, 3)
7 (100000, 3)
8 (84212, 3)


## 样本特征处理

In [5]:
# Lazy readers over the sample-skeleton CSVs, yielding 2500000-row chunks.
# Each reader can be consumed only once; re-run this cell before iterating
# over it again.
sample_train = pd.read_csv(
    sample_skeleton_train_csv,
    names=sample_cols,
    header=None,
    chunksize=2500000,
    iterator=True,
)
sample_test = pd.read_csv(
    sample_skeleton_test_csv,
    names=sample_cols,
    header=None,
    chunksize=2500000,
    iterator=True,
)

In [6]:
def item_fea_fn(data):
    """Expand the raw ``feature_list1`` column into per-sample item columns.

    Works in place on ``data`` and returns ``None``. The frame must contain
    the columns ``feature_num1`` and ``feature_list1``; both are removed at
    the end, together with the temporary parsed-dict column.

    Columns added:
        itemid, item_cate, shopid, brandid, business
            integer id of the first entry of fields 205 / 206 / 207 / 216 /
            301, or 0 when the field is absent.
        user_cate_val, user_shop_val, user_brand_val
            value part (kept as a string) of the first entry of fields
            508 / 509 / 702, or NaN when the field is absent. Per the
            original author's notes these are combination features of
            109_14 x 206, 110_14 x 207 and 127_14 x 216 respectively,
            verified to correspond one-to-one.
        user_intentions
            list of the id parts of every entry of field 210 (multi-valued).
        user_intentions_val
            list of the unparsed entries of field 853 (combination of 150_14
            and 210; multi-valued).
    """
    def first_id(field):
        # Mapper factory: integer id of the first entry of `field`, 0 if absent.
        def pick(fea_dict):
            if field in fea_dict:
                return int(fea_dict[field][0].split('\x03')[0])
            return 0
        return pick

    def first_value(field):
        # Mapper factory: value part of the first entry of `field`, NaN if absent.
        def pick(fea_dict):
            entries = fea_dict.get(field)
            if entries:
                return entries[0].split('\x03')[1]
            # np.nan, not np.NaN: the capitalised alias was removed in
            # NumPy 2.0 and would raise AttributeError here.
            return np.nan
        return pick

    data['fea_dict1'] = data['feature_list1'].map(feature_list_split)
    parsed = data['fea_dict1']

    data['itemid'] = parsed.map(first_id('205'))     # item ID
    data['item_cate'] = parsed.map(first_id('206'))  # category ID of the item
    data['shopid'] = parsed.map(first_id('207'))     # shop ID of the item
    data['brandid'] = parsed.map(first_id('216'))    # brand ID of the item
    data['business'] = parsed.map(first_id('301'))   # business scenario category

    data['user_cate_val'] = parsed.map(first_value('508'))
    data['user_shop_val'] = parsed.map(first_value('509'))
    data['user_brand_val'] = parsed.map(first_value('702'))

    # Multi-valued fields; .get keeps the lookup side-effect free and yields
    # an empty list when the field is absent.
    data['user_intentions'] = parsed.map(
        lambda fea_dict: [entry.split('\x03')[0] for entry in fea_dict.get('210', [])])
    data['user_intentions_val'] = parsed.map(lambda fea_dict: fea_dict.get('853', []))

    data.drop(columns=['feature_num1','feature_list1', 'fea_dict1'], inplace=True)


In [7]:
# Load the processed training common features written by the full-load cell
# (the single fea_train.pkl file, not the per-chunk fea_train<i>.pkl files).
common_fea_train = pd.read_pickle(os.path.join(data_path2, 'fea_train.pkl'))

In [9]:
# For every chunk of training samples: expand the item-side features, attach
# the user-side columns and pickle the result as sample_train<i>.pkl.
i = 1  # output files are numbered from 1
for sam_chunk in sample_train:
    print(i, sam_chunk.shape)
    # Adds the item columns and drops the raw feature columns, in place.
    item_fea_fn(sam_chunk)

    # Left join keeps every sample row even when its common_feature_index has
    # no match in the common-feature table.
    sam_chunk = sam_chunk.merge(common_fea_train, on='common_feature_index', how='left')
    sam_chunk.to_pickle(os.path.join(data_path2, f'sample_train{i}.pkl'))
    i += 1

1 (2500000, 6)
2 (2500000, 6)
3 (2500000, 6)
4 (2500000, 6)
5 (2500000, 6)
6 (2500000, 6)
7 (2500000, 6)
8 (2500000, 6)
9 (2500000, 6)
10 (2500000, 6)
11 (2500000, 6)
12 (2500000, 6)
13 (2500000, 6)
14 (2500000, 6)
15 (2500000, 6)
16 (2300135, 6)
