In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, roc_curve, auc
import lightgbm as lgb

In [3]:
user_log_data = pd.read_csv('user_log_format1.csv', dtype={'time_stamp':'str'})
# 格式化
user_log_data = user_log_data.rename(columns={'seller_id': 'merchant_id'})
user_log_data['user_id'] = user_log_data['user_id'].astype('int32')
user_log_data['merchant_id'] = user_log_data['merchant_id'].astype('int32')
user_log_data['item_id'] = user_log_data['item_id'].astype('int32')
user_log_data['cat_id'] = user_log_data['cat_id'].astype('int32')
user_log_data['brand_id'].fillna(0, inplace=True)
user_log_data['brand_id'] = user_log_data['brand_id'].astype('int32')
user_log_data['time_stamp'] = pd.to_datetime(user_log_data['time_stamp'], format='%m%d')
user_log_data['month'] = user_log_data['time_stamp'].astype(str).str[5:7]

user_info_data = pd.read_csv('user_info_format1.csv')

train_format_data = pd.read_csv('train_format1.csv')

test_format_data = pd.read_csv('test_format1.csv')

In [4]:
# 将train_format和test_format接在一起计算特征，最后通过origin来区分来源
train_format_data['origin'] = 'train'
test_format_data['origin'] = 'test'
matrix = pd.concat([train_format_data, test_format_data], ignore_index=True)
matrix = matrix.drop(['prob'], axis=1)

# 合并用户信息
matrix = matrix.merge(user_info_data, on='user_id', how='left')

# 填入缺失值
matrix['age_range'].fillna(0, inplace=True)
matrix['gender'].fillna(2, inplace=True)

#格式化
matrix['age_range'] = matrix['age_range'].astype('int8')
matrix['gender'] = matrix['gender'].astype('int8')
matrix['label'] = matrix['label'].astype('str')
matrix['user_id'] = matrix['user_id'].astype('int32')
matrix['merchant_id'] = matrix['merchant_id'].astype('int32')

# 用one hot encode方法修改age_range和gender属性
temp = pd.get_dummies(matrix['age_range'], prefix='age')
matrix = pd.concat([matrix, temp], axis=1)

temp = pd.get_dummies(matrix['gender'], prefix='g')
matrix = pd.concat([matrix, temp], axis=1)

matrix.drop(['age_range', 'gender'], axis=1, inplace=True)

matrix.head()

Unnamed: 0,user_id,merchant_id,label,origin,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,age_8,g_0,g_1,g_2
0,34176,3906,0.0,train,0,0,0,0,0,0,1,0,0,1,0,0
1,34176,121,0.0,train,0,0,0,0,0,0,1,0,0,1,0,0
2,34176,4356,1.0,train,0,0,0,0,0,0,1,0,0,1,0,0
3,34176,2217,0.0,train,0,0,0,0,0,0,1,0,0,1,0,0
4,230784,4818,0.0,train,1,0,0,0,0,0,0,0,0,1,0,0


In [5]:
# 每个用户的特征
groups = user_log_data.groupby(['user_id'])

groups.head()

Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,action_type,month
0,328862,323294,833,2882,2661,1900-08-29,0,08
1,328862,844400,1271,2882,2661,1900-08-29,0,08
2,328862,575153,1271,2882,2661,1900-08-29,0,08
3,328862,996875,1271,2882,2661,1900-08-29,0,08
4,328862,1086186,1271,1253,1049,1900-08-29,0,08
...,...,...,...,...,...,...,...,...
54925276,208016,51712,898,3763,8434,1900-11-11,2,11
54925277,208016,1024462,898,3763,8434,1900-11-11,2,11
54925278,208016,272094,898,1346,7995,1900-11-11,0,11
54925279,208016,107662,898,1346,7995,1900-11-11,0,11


In [6]:
# 点击、加购、购买、收藏总记录数：u1
temp = groups.size().reset_index().rename(columns={0:'u1'})
matrix = matrix.merge(temp, on='user_id', how='left')

# 用户有操作记录的item_id的个数（去重后）：u2
temp = groups['item_id'].agg([('u2', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')

# 用户有操作记录的cat_id的个数（去重后）：u3
temp = groups['cat_id'].agg([('u3', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')

# 用户有操作记录的merchant_id的个数（去重后）：u4
temp = groups['merchant_id'].agg([('u4', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')

# 用户有操作记录的brand_id的个数（去重后）：u5
temp = groups['brand_id'].agg([('u5', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')

# 用户有操作记录的最近时间和最晚时间的间隔天数：u6 
temp = groups['time_stamp'].agg([('F_time', 'min'), ('L_time', 'max')]).reset_index()
temp['u6'] = (temp['L_time'] - temp['F_time']).dt.days
matrix = matrix.merge(temp[['user_id', 'u6']], on='user_id', how='left')

# 点击、加购、购买、收藏分别记录数：u7、u8、u9、u10
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'u7', 1:'u8', 2:'u9', 3:'u10'})
matrix = matrix.merge(temp, on='user_id', how='left')

# 缺失值填补
matrix['u7'].fillna(0, inplace=True)
matrix['u8'].fillna(0, inplace=True)
matrix['u9'].fillna(0, inplace=True)
matrix['u10'].fillna(0, inplace=True)

matrix.head()

Unnamed: 0,user_id,merchant_id,label,origin,age_0,age_1,age_2,age_3,age_4,age_5,...,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10
0,34176,3906,0.0,train,0,0,0,0,0,0,...,451,256,45,109,108,174,410.0,0.0,34.0,7.0
1,34176,121,0.0,train,0,0,0,0,0,0,...,451,256,45,109,108,174,410.0,0.0,34.0,7.0
2,34176,4356,1.0,train,0,0,0,0,0,0,...,451,256,45,109,108,174,410.0,0.0,34.0,7.0
3,34176,2217,0.0,train,0,0,0,0,0,0,...,451,256,45,109,108,174,410.0,0.0,34.0,7.0
4,230784,4818,0.0,train,1,0,0,0,0,0,...,54,31,17,20,19,163,47.0,0.0,7.0,0.0


In [7]:
# 每个用户在5-11月内的购买数量
buy_11 = user_log_data[(user_log_data['month'] == "11") & (user_log_data['action_type'] == 2)]
buy_10 = user_log_data[(user_log_data['month'] == "10") & (user_log_data['action_type'] == 2)]
buy_9 = user_log_data[(user_log_data['month'] == "09") & (user_log_data['action_type'] == 2)]
buy_8 = user_log_data[(user_log_data['month'] == "08") & (user_log_data['action_type'] == 2)]
buy_7 = user_log_data[(user_log_data['month'] == "07") & (user_log_data['action_type'] == 2)]
buy_6 = user_log_data[(user_log_data['month'] == "06") & (user_log_data['action_type'] == 2)]
buy_5 = user_log_data[(user_log_data['month'] == "05") & (user_log_data['action_type'] == 2)]

user_buy_11 = buy_11.groupby('user_id', as_index=False)['month'].agg({'user_buy_11': 'count'}).fillna(0)
user_buy_10 = buy_10.groupby('user_id', as_index=False)['month'].agg({'user_buy_10': 'count'}).fillna(0)
user_buy_9 = buy_9.groupby('user_id', as_index=False)['month'].agg({'user_buy_9': 'count'}).fillna(0)
user_buy_8 = buy_8.groupby('user_id', as_index=False)['month'].agg({'user_buy_8': 'count'}).fillna(0)
user_buy_7 = buy_7.groupby('user_id', as_index=False)['month'].agg({'user_buy_7': 'count'}).fillna(0)
user_buy_6 = buy_6.groupby('user_id', as_index=False)['month'].agg({'user_buy_6': 'count'}).fillna(0)
user_buy_5 = buy_5.groupby('user_id', as_index=False)['month'].agg({'user_buy_5': 'count'}).fillna(0)

matrix = matrix.merge(user_buy_11, on=['user_id'], how='left')
matrix = matrix.merge(user_buy_10, on=['user_id'], how='left')
matrix = matrix.merge(user_buy_9, on=['user_id'], how='left')
matrix = matrix.merge(user_buy_8, on=['user_id'], how='left')
matrix = matrix.merge(user_buy_7, on=['user_id'], how='left')
matrix = matrix.merge(user_buy_6, on=['user_id'], how='left')
matrix = matrix.merge(user_buy_5, on=['user_id'], how='left')

matrix.head()

Unnamed: 0,user_id,merchant_id,label,origin,age_0,age_1,age_2,age_3,age_4,age_5,...,u8,u9,u10,user_buy_11,user_buy_10,user_buy_9,user_buy_8,user_buy_7,user_buy_6,user_buy_5
0,34176,3906,0.0,train,0,0,0,0,0,0,...,0.0,34.0,7.0,15,6.0,2.0,1.0,1.0,4.0,5.0
1,34176,121,0.0,train,0,0,0,0,0,0,...,0.0,34.0,7.0,15,6.0,2.0,1.0,1.0,4.0,5.0
2,34176,4356,1.0,train,0,0,0,0,0,0,...,0.0,34.0,7.0,15,6.0,2.0,1.0,1.0,4.0,5.0
3,34176,2217,0.0,train,0,0,0,0,0,0,...,0.0,34.0,7.0,15,6.0,2.0,1.0,1.0,4.0,5.0
4,230784,4818,0.0,train,1,0,0,0,0,0,...,0.0,7.0,0.0,1,1.0,,,1.0,4.0,


In [8]:
# 每个商店的特征
groups = user_log_data.groupby(['merchant_id'])

groups.head()

Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,action_type,month
0,328862,323294,833,2882,2661,1900-08-29,0,08
1,328862,844400,1271,2882,2661,1900-08-29,0,08
2,328862,575153,1271,2882,2661,1900-08-29,0,08
3,328862,996875,1271,2882,2661,1900-08-29,0,08
4,328862,1086186,1271,1253,1049,1900-08-29,0,08
...,...,...,...,...,...,...,...,...
5008380,178610,832432,1181,4103,7112,1900-11-10,0,11
6003615,326907,490942,276,4103,7112,1900-11-11,0,11
6174959,353280,832432,1181,4103,7112,1900-11-11,0,11
6505733,338289,832432,1181,4103,7112,1900-11-10,0,11


In [9]:
# 商店被点击、加购、购买、收藏总记录数：m1
temp = groups.size().reset_index().rename(columns={0:'m1'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

# 商店有操作记录的user_id的个数（去重后）：m2
temp = groups['user_id'].agg([('m2', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='merchant_id', how='left')

# 商店有操作记录的item_id的个数（去重后）：m3
temp = groups['item_id'].agg([('m3', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='merchant_id', how='left')

# 商店有操作记录的cat_id的个数（去重后）：m4
temp = groups['cat_id'].agg([('m4', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='merchant_id', how='left')

# 商店有操作记录的brand_id的个数（去重后）：m5
temp = groups['brand_id'].agg([('u5', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='merchant_id', how='left')

# 商店被点击、加购、购买、收藏各自的记录数：m6、m7、m8、m9
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'m6', 1:'m7', 2:'m8', 3:'m9'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

# 缺失值填补
matrix['m6'].fillna(0, inplace=True)
matrix['m7'].fillna(0, inplace=True)
matrix['m8'].fillna(0, inplace=True)
matrix['m9'].fillna(0, inplace=True)

matrix.head()

Unnamed: 0,user_id,merchant_id,label,origin,age_0,age_1,age_2,age_3,age_4,age_5,...,user_buy_5,m1,m2,m3,m4,u5_y,m6,m7,m8,m9
0,34176,3906,0.0,train,0,0,0,0,0,0,...,5.0,16269,5819,308,20,2,14870.0,28.0,410.0,961.0
1,34176,121,0.0,train,0,0,0,0,0,0,...,5.0,79865,10931,1179,26,2,72265.0,121.0,4780.0,2699.0
2,34176,4356,1.0,train,0,0,0,0,0,0,...,5.0,7269,2281,67,15,2,6094.0,16.0,963.0,196.0
3,34176,2217,0.0,train,0,0,0,0,0,0,...,5.0,60202,16870,377,5,2,52230.0,101.0,3721.0,4150.0
4,230784,4818,0.0,train,1,0,0,0,0,0,...,,48089,7500,461,27,2,43268.0,129.0,2733.0,1959.0


In [10]:
# 每个商店在5-11月内的购买数量
merchant_buy_11 = buy_11.groupby('merchant_id', as_index=False)['month'].agg({'merchant_buy_11': 'count'}).fillna(0)
merchant_buy_10 = buy_10.groupby('merchant_id', as_index=False)['month'].agg({'merchant_buy_10': 'count'}).fillna(0)
merchant_buy_9 = buy_9.groupby('merchant_id', as_index=False)['month'].agg({'merchant_buy_9': 'count'}).fillna(0)
merchant_buy_8 = buy_8.groupby('merchant_id', as_index=False)['month'].agg({'merchant_buy_8': 'count'}).fillna(0)
merchant_buy_7 = buy_7.groupby('merchant_id', as_index=False)['month'].agg({'merchant_buy_7': 'count'}).fillna(0)
merchant_buy_6 = buy_6.groupby('merchant_id', as_index=False)['month'].agg({'merchant_buy_6': 'count'}).fillna(0)
merchant_buy_5 = buy_5.groupby('merchant_id', as_index=False)['month'].agg({'merchant_buy_5': 'count'}).fillna(0)

matrix = matrix.merge(merchant_buy_11, on=['merchant_id'], how='left')
matrix = matrix.merge(merchant_buy_10, on=['merchant_id'], how='left')
matrix = matrix.merge(merchant_buy_9, on=['merchant_id'], how='left')
matrix = matrix.merge(merchant_buy_8, on=['merchant_id'], how='left')
matrix = matrix.merge(merchant_buy_7, on=['merchant_id'], how='left')
matrix = matrix.merge(merchant_buy_6, on=['merchant_id'], how='left')
matrix = matrix.merge(merchant_buy_5, on=['merchant_id'], how='left')

matrix.head()

Unnamed: 0,user_id,merchant_id,label,origin,age_0,age_1,age_2,age_3,age_4,age_5,...,m7,m8,m9,merchant_buy_11,merchant_buy_10,merchant_buy_9,merchant_buy_8,merchant_buy_7,merchant_buy_6,merchant_buy_5
0,34176,3906,0.0,train,0,0,0,0,0,0,...,28.0,410.0,961.0,178,29.0,61.0,16.0,14.0,62.0,50.0
1,34176,121,0.0,train,0,0,0,0,0,0,...,121.0,4780.0,2699.0,3299,267.0,449.0,164.0,299.0,77.0,225.0
2,34176,4356,1.0,train,0,0,0,0,0,0,...,16.0,963.0,196.0,167,107.0,123.0,149.0,391.0,26.0,
3,34176,2217,0.0,train,0,0,0,0,0,0,...,101.0,3721.0,4150.0,2715,371.0,169.0,81.0,129.0,156.0,100.0
4,230784,4818,0.0,train,1,0,0,0,0,0,...,129.0,2733.0,1959.0,2161,102.0,86.0,68.0,89.0,130.0,97.0


In [14]:
# 每个商店的复购次数
temp = user_log_data[user_log_data['action_type'] == 2]
temp = temp.drop_duplicates(subset=['merchant_id', 'time_stamp', 'user_id'])
temp = temp.groupby('merchant_id').apply(lambda x: x['user_id'].count() - x['user_id'].nunique())
temp = temp.reset_index().rename(columns={0:'repeat_count'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

matrix.head()

Unnamed: 0,user_id,merchant_id,label,origin,age_0,age_1,age_2,age_3,age_4,age_5,...,m8,m9,merchant_buy_11,merchant_buy_10,merchant_buy_9,merchant_buy_8,merchant_buy_7,merchant_buy_6,merchant_buy_5,repeat_count
0,34176,3906,0.0,train,0,0,0,0,0,0,...,410.0,961.0,178,29.0,61.0,16.0,14.0,62.0,50.0,16
1,34176,121,0.0,train,0,0,0,0,0,0,...,4780.0,2699.0,3299,267.0,449.0,164.0,299.0,77.0,225.0,182
2,34176,4356,1.0,train,0,0,0,0,0,0,...,963.0,196.0,167,107.0,123.0,149.0,391.0,26.0,,66
3,34176,2217,0.0,train,0,0,0,0,0,0,...,3721.0,4150.0,2715,371.0,169.0,81.0,129.0,156.0,100.0,77
4,230784,4818,0.0,train,1,0,0,0,0,0,...,2733.0,1959.0,2161,102.0,86.0,68.0,89.0,130.0,97.0,109


In [15]:
# 每个（用户——商店）对的特征
groups = user_log_data.groupby(['user_id', 'merchant_id'])

groups.head()

Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,action_type,month
0,328862,323294,833,2882,2661,1900-08-29,0,08
1,328862,844400,1271,2882,2661,1900-08-29,0,08
2,328862,575153,1271,2882,2661,1900-08-29,0,08
3,328862,996875,1271,2882,2661,1900-08-29,0,08
4,328862,1086186,1271,1253,1049,1900-08-29,0,08
...,...,...,...,...,...,...,...,...
54925294,208016,484085,955,4577,2841,1900-11-11,0,11
54925295,208016,1024462,898,3763,8434,1900-11-10,0,11
54925296,208016,188869,898,3716,7995,1900-11-10,0,11
54925318,208016,572517,898,4992,6137,1900-11-10,0,11


In [16]:
# 用户——商店点击、加购、购买、收藏总记录数：um1
temp = groups.size().reset_index().rename(columns={0:'um1'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

# 用户——商店有操作记录的item_id的个数（去重后）：um2
temp = groups['item_id'].agg([('um2', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

# 用户——商店有操作记录的cat_id的个数（去重后）：um3
temp = groups['cat_id'].agg([('um3', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

# 用户——商店有操作记录的brand_id的个数（去重后）：um4
temp = groups['brand_id'].agg([('um4', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

# 用户——商店被点击、加购、购买、收藏各自的记录数：um5、um6、um7、um8
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'um5',1:'um6',2:'um7',3:'um8'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

# 用户——商店有操作记录的最近时间和最晚时间的间隔天数：um9 
temp = groups['time_stamp'].agg([('frist', 'min'), ('last', 'max')]).reset_index()
temp['um9'] = (temp['last'] - temp['frist']).dt.days
temp.drop(['frist', 'last'], axis=1, inplace=True)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

# 缺失值填补
matrix['um5'].fillna(0, inplace=True)
matrix['um6'].fillna(0, inplace=True)
matrix['um7'].fillna(0, inplace=True)
matrix['um8'].fillna(0, inplace=True)

matrix.head()

Unnamed: 0,user_id,merchant_id,label,origin,age_0,age_1,age_2,age_3,age_4,age_5,...,repeat_count,um1,um2,um3,um4,um5,um6,um7,um8,um9
0,34176,3906,0.0,train,0,0,0,0,0,0,...,16,39,20,6,1,36.0,0.0,1.0,2.0,22
1,34176,121,0.0,train,0,0,0,0,0,0,...,182,14,1,1,1,13.0,0.0,1.0,0.0,3
2,34176,4356,1.0,train,0,0,0,0,0,0,...,66,18,2,1,1,12.0,0.0,6.0,0.0,1
3,34176,2217,0.0,train,0,0,0,0,0,0,...,77,2,1,1,1,1.0,0.0,1.0,0.0,0
4,230784,4818,0.0,train,1,0,0,0,0,0,...,109,8,1,1,1,7.0,0.0,1.0,0.0,3


In [17]:
# 每个用户——商店在5-11月内的购买数量
user_merchant_buy_11 = buy_11.groupby(['user_id','merchant_id'], as_index=False)['month'].agg({'user_merchant_buy_11': 'count'}).fillna(0)
user_merchant_buy_10 = buy_10.groupby(['user_id','merchant_id'], as_index=False)['month'].agg({'user_merchant_buy_10': 'count'}).fillna(0)
user_merchant_buy_9 = buy_9.groupby(['user_id','merchant_id'], as_index=False)['month'].agg({'user_merchant_buy_9': 'count'}).fillna(0)
user_merchant_buy_8 = buy_8.groupby(['user_id','merchant_id'], as_index=False)['month'].agg({'user_merchant_buy_8': 'count'}).fillna(0)
user_merchant_buy_7 = buy_7.groupby(['user_id','merchant_id'], as_index=False)['month'].agg({'user_merchant_buy_7': 'count'}).fillna(0)
user_merchant_buy_6 = buy_6.groupby(['user_id','merchant_id'], as_index=False)['month'].agg({'user_merchant_buy_6': 'count'}).fillna(0)
user_merchant_buy_5 = buy_5.groupby(['user_id','merchant_id'], as_index=False)['month'].agg({'user_merchant_buy_5': 'count'}).fillna(0)

matrix = matrix.merge(user_merchant_buy_11, on=['user_id','merchant_id'], how='left')
matrix = matrix.merge(user_merchant_buy_10, on=['user_id','merchant_id'], how='left')
matrix = matrix.merge(user_merchant_buy_9, on=['user_id','merchant_id'], how='left')
matrix = matrix.merge(user_merchant_buy_8, on=['user_id','merchant_id'], how='left')
matrix = matrix.merge(user_merchant_buy_7, on=['user_id','merchant_id'], how='left')
matrix = matrix.merge(user_merchant_buy_6, on=['user_id','merchant_id'], how='left')
matrix = matrix.merge(user_merchant_buy_5, on=['user_id','merchant_id'], how='left')

matrix.head()

Unnamed: 0,user_id,merchant_id,label,origin,age_0,age_1,age_2,age_3,age_4,age_5,...,um7,um8,um9,user_merchant_buy_11,user_merchant_buy_10,user_merchant_buy_9,user_merchant_buy_8,user_merchant_buy_7,user_merchant_buy_6,user_merchant_buy_5
0,34176,3906,0.0,train,0,0,0,0,0,0,...,1.0,2.0,22,1,,,,,,
1,34176,121,0.0,train,0,0,0,0,0,0,...,1.0,0.0,3,1,,,,,,
2,34176,4356,1.0,train,0,0,0,0,0,0,...,6.0,0.0,1,6,,,,,,
3,34176,2217,0.0,train,0,0,0,0,0,0,...,1.0,0.0,0,1,,,,,,
4,230784,4818,0.0,train,1,0,0,0,0,0,...,1.0,0.0,3,1,,,,,,


In [18]:
# 计算购买点击比
matrix['r1'] = matrix['u9']/matrix['u7'] # 用户购买点击比
matrix['r2'] = matrix['m8']/matrix['m6'] # 商家购买点击比
matrix['r3'] = matrix['um7']/matrix['um5'] #不同用户不同商家购买点击比

matrix.fillna(0, inplace=True)

matrix.head()

Unnamed: 0,user_id,merchant_id,label,origin,age_0,age_1,age_2,age_3,age_4,age_5,...,user_merchant_buy_11,user_merchant_buy_10,user_merchant_buy_9,user_merchant_buy_8,user_merchant_buy_7,user_merchant_buy_6,user_merchant_buy_5,r1,r2,r3
0,34176,3906,0.0,train,0,0,0,0,0,0,...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.082927,0.027572,0.027778
1,34176,121,0.0,train,0,0,0,0,0,0,...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.082927,0.066145,0.076923
2,34176,4356,1.0,train,0,0,0,0,0,0,...,6,0.0,0.0,0.0,0.0,0.0,0.0,0.082927,0.158024,0.5
3,34176,2217,0.0,train,0,0,0,0,0,0,...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.082927,0.071243,1.0
4,230784,4818,0.0,train,1,0,0,0,0,0,...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.148936,0.063164,0.142857


In [19]:
# 利用origin的标记将train_format和test_format分开
train_data = matrix[matrix['origin'] == 'train'].drop(['origin'], axis=1)
test_data = matrix[matrix['origin'] == 'test'].drop(['label', 'origin'], axis=1)

train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)