**特征提取思路**
1. 以用户为主体，groupby用户的id，将用户的个人画像信息提取出来

2. 以商家为主体，groupby商家的id，将每个商家的画像信息提取出来

3. 将用户与商家相结合，同时groupby用户的id和商家的id，将用户与商家的关联建立起来

### 读取数据

In [3]:
import gc
import pandas as pd 

# Directory that holds the "format1" CSV files.
paths = './data_format1'

# Behaviour log. time_stamp is read as text so pandas does not convert it
# to a number before it is parsed later on.
data = pd.read_csv(
    paths + '/user_log_format1.csv',
    dtype={'time_stamp': 'str'},
)
# Per-user table; it is later joined onto the train/test rows by user_id.
data1 = pd.read_csv(paths + '/user_info_format1.csv')
# Training rows.
data2 = pd.read_csv(paths + '/train_format1.csv')
# Test rows (the submission template).
submission = pd.read_csv(paths + '/test_format1.csv')

### 数据预处理及缺失值处理

In [4]:
# Rename the log's seller column so it matches the 'merchant_id' key used
# by the train/test tables.
data.rename(columns={'seller_id': 'merchant_id'}, inplace=True)

# Missing brand ids become 0 so the column can be cast to an integer type.
# The result is assigned back rather than calling fillna(inplace=True) on
# the selected column: that chained in-place form is deprecated and does
# not modify `data` when pandas Copy-on-Write is active, which would make
# the integer cast below fail on the remaining NaN values.
data['brand_id'] = data['brand_id'].fillna(0)

# Shrink the id columns to 32-bit integers.
for column in ('user_id', 'merchant_id', 'item_id', 'cat_id', 'brand_id'):
    data[column] = data[column].astype('int32')

# NOTE(review): the digits are parsed as hour/minute ('%H%M'). If the raw
# values are really month/day codes, this is only a numeric trick; the
# duration features computed later read `.dt.seconds`, so the format must
# not be changed here without changing those computations too - confirm.
data['time_stamp'] = pd.to_datetime(data['time_stamp'], format='%H%M')

In [5]:
# Tag every row with its source so train and test rows can be told apart
# once they are stacked into a single frame.
data2['origin'] = 'train'
submission['origin'] = 'test'

# Stack train on top of test, drop the 'prob' column, then attach the
# per-user columns with a left join on user_id.
matrix = (
    pd.concat([data2, submission], ignore_index=True, sort=False)
    .drop(columns=['prob'])
    .merge(data1, how='left', on='user_id')
)

# The source frames are no longer needed; release them.
del data1, data2
gc.collect()

0

In [6]:
# Missing profile values get explicit placeholder codes (0 for age_range,
# 2 for gender) and are then stored as 8-bit integers.
# The filled column is assigned back instead of using
# fillna(inplace=True) on the selected column: that chained in-place form
# is deprecated and leaves `matrix` untouched under pandas Copy-on-Write,
# so the int8 casts would fail on the remaining NaN values.
matrix['age_range'] = matrix['age_range'].fillna(0).astype('int8')
matrix['gender'] = matrix['gender'].fillna(2).astype('int8')

# The label is kept as text.
matrix['label'] = matrix['label'].astype('str')

# Use the same 32-bit key dtype as the behaviour log for the join keys.
matrix['user_id'] = matrix['user_id'].astype('int32')
matrix['merchant_id'] = matrix['merchant_id'].astype('int32')

### 画像信息提取

用户画像

In [7]:
# Per-user features u1..u10, all derived from the behaviour log grouped by
# user_id and left-joined onto `matrix`.
groups = data.groupby(['user_id'])

# u1: number of log rows per user.
temp = groups.size().reset_index().rename(columns={0: 'u1'})
matrix = matrix.merge(temp, on='user_id', how='left')

# u2..u5: number of distinct items, categories, merchants and brands.
for feature, column in (
    ('u2', 'item_id'),
    ('u3', 'cat_id'),
    ('u4', 'merchant_id'),
    ('u5', 'brand_id'),
):
    temp = groups[column].agg([(feature, 'nunique')]).reset_index()
    matrix = matrix.merge(temp, on='user_id', how='left')

# u6: gap between each user's earliest and latest time_stamp, in hours.
# `.dt.seconds` is only the seconds component of the difference.
temp = groups['time_stamp'].agg([('F_time', 'min'), ('L_time', 'max')]).reset_index()
span = temp['L_time'] - temp['F_time']
temp['u6'] = span.dt.seconds/3600
matrix = matrix.merge(temp[['user_id', 'u6']], on='user_id', how='left')

# u7..u10: row count for each action_type value 0..3.
# value_counts() on the grouped column returns a (user_id, action_type)
# MultiIndex; unstack() pivots the action_type level into columns.
action_counts = groups['action_type'].value_counts().unstack()
temp = action_counts.reset_index().rename(
    columns={0: 'u7', 1: 'u8', 2: 'u9', 3: 'u10'}
)
matrix = matrix.merge(temp, on='user_id', how='left')

商家画像提取

In [8]:
# Per-merchant features m1..m9, derived from the behaviour log grouped by
# merchant_id and left-joined onto `matrix`.
groups = data.groupby(['merchant_id'])

# m1: number of log rows per merchant.
temp = groups.size().reset_index().rename(columns={0: 'm1'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

# m2..m5: number of distinct users, items, categories and brands.
# The columns are selected with a list: selecting several columns of a
# groupby with a bare tuple (groups['a', 'b']) was deprecated and raises
# on pandas >= 2.0.
temp = groups[['user_id', 'item_id', 'cat_id', 'brand_id']].nunique().reset_index().rename(columns={
    'user_id': 'm2',
    'item_id': 'm3',
    'cat_id': 'm4',
    'brand_id': 'm5'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

# m6..m9: row count for each action_type value 0..3 (unstack() pivots the
# action_type level of the value_counts() result into columns).
temp = groups['action_type'].value_counts().unstack().reset_index().rename(
    columns={0: 'm6', 1: 'm7', 2: 'm8', 3: 'm9'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

提取用户与商家组合信息

In [9]:
# Features um1..um9 for every (user, merchant) pair, derived from the
# behaviour log and left-joined onto `matrix` on both keys.
groups = data.groupby(['user_id', 'merchant_id'])

# um1: number of log rows for the pair.
temp = groups.size().reset_index().rename(columns={0: 'um1'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

# um2..um4: number of distinct items, categories and brands for the pair.
# The columns are selected with a list: selecting several columns of a
# groupby with a bare tuple was deprecated and raises on pandas >= 2.0.
temp = groups[['item_id', 'cat_id', 'brand_id']].nunique().reset_index().rename(columns={
    'item_id': 'um2',
    'cat_id': 'um3',
    'brand_id': 'um4'
})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

# um5..um8: row count for each action_type value 0..3.
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={
    0: 'um5',
    1: 'um6',
    2: 'um7',
    3: 'um8'
})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

# um9: gap between the pair's earliest and latest time_stamp, in hours.
# `.dt.seconds` is only the seconds component of the difference.
temp = groups['time_stamp'].agg([('first', 'min'), ('last', 'max')]).reset_index()
temp['um9'] = (temp['last'] - temp['first']).dt.seconds/3600
temp.drop(['first', 'last'], axis=1, inplace=True)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

# Drop the reference to the log first, then collect; collecting before the
# `del` cannot release the frame because it is still referenced.
del data
gc.collect()

从已提取的特征中，进一步构造有意义的比值特征

In [10]:
# Purchase-to-click ratios at three levels:
#   r1 - per user, r2 - per merchant, r3 - per (user, merchant) pair.
for ratio, purchases, clicks in (
    ('r1', 'u9', 'u7'),
    ('r2', 'm8', 'm6'),
    ('r3', 'um7', 'um5'),
):
    matrix[ratio] = matrix[purchases]/matrix[clicks]

# Every remaining NaN in the frame becomes 0. Infinite ratios (non-zero
# count divided by zero) are not NaN and are left untouched here.
matrix.fillna(0, inplace=True)

In [11]:
# 对年龄性别等类别信息使用独热编码，一定程度上提升模型准确率
temp = pd.get_dummies(matrix['age_range'], prefix='age')
matrix = pd.concat([matrix, temp], axis=1)
temp = pd.get_dummies(matrix['gender'], prefix='g')
matrix = pd.concat([matrix, temp], axis=1)
matrix.drop(['age_range', 'gender'], axis=1, inplace=True)

1.  从最终变量matrix中，得到训练集以及待求解的测试集

2. um5 和 u7 存在很多零值，导致 r1, r3 出现 inf 值

3. 这里分别用 r1 与 r3 中非 inf 值的均值来替换各自的 inf 值

4. 划分训练集并获取numpy类型

In [12]:
import numpy as np

# r1 and r3 contain +inf wherever a non-zero count was divided by zero.
# Replace those entries with the mean of the column's finite entries.
indices_r1 = np.isinf(matrix['r1'])
indices_r3 = np.isinf(matrix['r3'])
mean_r1 = matrix.loc[~indices_r1, 'r1'].mean()
mean_r3 = matrix.loc[~indices_r3, 'r3'].mean()

# `np.inf` is used because the `np.Inf` alias was removed in NumPy 2.0.
# The result is assigned back because replace(inplace=True) on a selected
# column is a chained in-place update that does not modify `matrix` under
# pandas Copy-on-Write.
matrix['r1'] = matrix['r1'].replace(np.inf, mean_r1)
matrix['r3'] = matrix['r3'].replace(np.inf, mean_r3)

# Use the additional dataset provided by Tianchi: for every merchant,
# count its rows in train_format2 for each label value
# (m10: label == 1, m11: label == -1, m12: label == 0).
data_train = pd.read_csv('./data_format2/train_format2.csv')
for feature, label_value in (('m10', 1), ('m11', -1), ('m12', 0)):
    rows = data_train[data_train['label']==label_value]
    temp = rows.groupby(['merchant_id']).size().reset_index().rename(columns={0: feature})
    matrix = matrix.merge(temp, on='merchant_id', how='left')

# Merchants without rows for a given label get a count of 0.
matrix.fillna(0, inplace=True)

保存特征提取后的数据

In [14]:
# Persist the finished feature table. The row index is written as well,
# since `index` is left at its default.
output_path = './result/matrix.csv'
matrix.to_csv(output_path)