In [0]:
# 重启
# !kill -9 -1
# 安装包
# !pip install lightgbm

In [1]:
# 挂载谷歌硬盘 从drive导入数据
from google.colab import drive
drive.mount('/content/drive')
# 创建数据文件夹
!mkdir /content/dataset
# 复制文件到本地
# !cp -r /content/drive/My\ Drive/Colab_Notebooks/RS6/L11/data_format1_small /content/dataset/data_format1_small
!cp /content/drive/My\ Drive/Dataset/data_format2.zip /content/dataset/data_format2.zip
!cp /content/drive/My\ Drive/Dataset/data_format1.zip /content/dataset/data_format1.zip
# 切换当前目录
%cd /content/dataset/
# 解压缩文件
!unzip data_format2.zip
!unzip data_format1.zip
# !unzip ./data_format1_small/data_format1_small.zip

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/dataset
Archive:  data_format2.zip
   creating: data_format2/
  inflating: data_format2/test_format2.csv  
  inflating: data_format2/train_format2.csv  
Archive:  data_format1.zip
   creating: data_format1/
  inflating: data_format1/test_format1.csv  
  inflating: data_format1/train_format1.csv  
  inflating: data_format1/user_info_format1.csv  
  inflating: data_format1/user_log_format1.csv  


### 加载数据

In [0]:
# 导入包
import pandas as pd
import gc

In [0]:
# 加载小样本
user_log = pd.read_csv('./data_format1_small/sample_user_log.csv', dtype={'time_stamp':'str'})
user_info = pd.read_csv('./data_format1_small/sample_user_info.csv')
train_data1 = pd.read_csv('./data_format1_small/train.csv')
submission = pd.read_csv('./data_format1_small/test.csv')
train_data = pd.read_csv('./data_format2/train_format2.csv')

In [0]:
# 加载全量样本
user_log = pd.read_csv('./data_format1/user_log_format1.csv', dtype={'time_stamp':'str'})
user_info = pd.read_csv('./data_format1/user_info_format1.csv')
train_data1 = pd.read_csv('./data_format1/train_format1.csv')
submission = pd.read_csv('./data_format1/test_format1.csv')
train_data = pd.read_csv('./data_format2/train_format2.csv')

### 数据预处理

In [4]:
train_data1['origin'] = 'train'
submission['origin'] = 'test'
matrix = pd.concat([train_data1, submission], ignore_index=True, sort=False)
matrix.drop(['prob'], axis=1, inplace=True)
# 连接user_info表，通过user_id关联
matrix = matrix.merge(user_info, on='user_id', how='left')
# 使用merchant_id（原列名seller_id）
user_log.rename(columns={'seller_id':'merchant_id'}, inplace=True)
matrix

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender
0,34176,3906,0.0,train,6.0,0.0
1,34176,121,0.0,train,6.0,0.0
2,34176,4356,1.0,train,6.0,0.0
3,34176,2217,0.0,train,6.0,0.0
4,230784,4818,0.0,train,0.0,0.0
...,...,...,...,...,...,...
522336,228479,3111,,test,6.0,0.0
522337,97919,2341,,test,8.0,1.0
522338,97919,3971,,test,8.0,1.0
522339,32639,3536,,test,0.0,0.0


In [5]:
# 格式化
user_log['user_id'] = user_log['user_id'].astype('int32')
user_log['merchant_id'] = user_log['merchant_id'].astype('int32')
user_log['item_id'] = user_log['item_id'].astype('int32')
user_log['cat_id'] = user_log['cat_id'].astype('int32')
user_log['brand_id'].fillna(0, inplace=True)
user_log['brand_id'] = user_log['brand_id'].astype('int32')
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], format='%H%M')
# 1 for <18; 2 for [18,24]; 3 for [25,29]; 4 for [30,34]; 5 for [35,39]; 6 for [40,49]; 7 and 8 for >= 50; 0 and NULL for unknown
matrix['age_range'].fillna(0, inplace=True)
# 0:female, 1:male, 2:unknown
matrix['gender'].fillna(2, inplace=True)
matrix['age_range'] = matrix['age_range'].astype('int8')
matrix['gender'] = matrix['gender'].astype('int8')
matrix['label'] = matrix['label'].astype('str')
matrix['user_id'] = matrix['user_id'].astype('int32')
matrix['merchant_id'] = matrix['merchant_id'].astype('int32')
del user_info, train_data1
gc.collect()
matrix

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender
0,34176,3906,0.0,train,6,0
1,34176,121,0.0,train,6,0
2,34176,4356,1.0,train,6,0
3,34176,2217,0.0,train,6,0
4,230784,4818,0.0,train,0,0
...,...,...,...,...,...,...
522336,228479,3111,,test,6,0
522337,97919,2341,,test,8,1
522338,97919,3971,,test,8,1
522339,32639,3536,,test,0,0


### 特征处理

In [6]:
# User特征处理
groups = user_log.groupby(['user_id'])
# 用户交互行为数量 u1
temp = groups.size().reset_index().rename(columns={0:'u1'})
matrix = matrix.merge(temp, on='user_id', how='left')
# 使用agg 基于列的聚合操作，统计唯一值的个数 item_id, cat_id, merchant_id, brand_id
#temp = groups['item_id', 'cat_id', 'merchant_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'u2', 'cat_id':'u3', 'merchant_id':'u4', 'brand_id':'u5'})
temp = groups['item_id'].agg([('u2', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['cat_id'].agg([('u3', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['merchant_id'].agg([('u4', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['brand_id'].agg([('u5', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')

# 时间间隔特征 u6 按照小时
temp = groups['time_stamp'].agg([('F_time', 'min'), ('L_time', 'max')]).reset_index()
temp['u6'] = (temp['L_time'] - temp['F_time']).dt.seconds/3600
matrix = matrix.merge(temp[['user_id', 'u6']], on='user_id', how='left')
# 统计操作类型为0，1，2，3的个数
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'u7', 1:'u8', 2:'u9', 3:'u10'})
matrix = matrix.merge(temp, on='user_id', how='left')
matrix

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10
0,34176,3906,0.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0
1,34176,121,0.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0
2,34176,4356,1.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0
3,34176,2217,0.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0
4,230784,4818,0.0,train,0,0,54,31,17,20,19,5.166667,47.0,,7.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522336,228479,3111,,test,6,0,2004,1173,71,278,282,6.000000,1770.0,,26.0,208.0
522337,97919,2341,,test,8,1,55,29,14,17,17,4.750000,46.0,,8.0,1.0
522338,97919,3971,,test,8,1,55,29,14,17,17,4.750000,46.0,,8.0,1.0
522339,32639,3536,,test,0,0,72,46,24,33,35,5.800000,62.0,1.0,8.0,1.0


In [9]:
# 商家特征处理
groups = user_log.groupby(['merchant_id'])
# 商家被交互行为数量 m1
temp = groups.size().reset_index().rename(columns={0:'m1'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的user_id, item_id, cat_id, brand_id 唯一值
temp = groups['user_id', 'item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'user_id':'m2', 'item_id':'m3', 'cat_id':'m4', 'brand_id':'m5'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的action_type 唯一值
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'m6', 1:'m7', 2:'m8', 3:'m9'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 按照merchant_id 统计随机负采样的个数
temp = train_data[train_data['label']==-1].groupby(['merchant_id']).size().reset_index().rename(columns={0:'m10'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
matrix

  import sys


Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,m1_x,m2_x,m3_x,m4_x,m5_x,m6_x,m7_x,m8_x,m9_x,m1_y,m2_y,m3_y,m4_y,m5_y,m6_y,m7_y,m8_y,m9_y,m10
0,34176,3906,0.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0,16269,5819,308,20,2,14870.0,28.0,410.0,961.0,16269,5819,308,20,2,14870.0,28.0,410.0,961.0,2861
1,34176,121,0.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0,79865,10931,1179,26,2,72265.0,121.0,4780.0,2699.0,79865,10931,1179,26,2,72265.0,121.0,4780.0,2699.0,4530
2,34176,4356,1.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0,7269,2281,67,15,2,6094.0,16.0,963.0,196.0,7269,2281,67,15,2,6094.0,16.0,963.0,196.0,1088
3,34176,2217,0.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0,60202,16870,377,5,2,52230.0,101.0,3721.0,4150.0,60202,16870,377,5,2,52230.0,101.0,3721.0,4150.0,7268
4,230784,4818,0.0,train,0,0,54,31,17,20,19,5.166667,47.0,,7.0,,48089,7500,461,27,2,43268.0,129.0,2733.0,1959.0,48089,7500,461,27,2,43268.0,129.0,2733.0,1959.0,3102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522336,228479,3111,,test,6,0,2004,1173,71,278,282,6.000000,1770.0,,26.0,208.0,10105,4154,542,50,18,8997.0,9.0,687.0,412.0,10105,4154,542,50,18,8997.0,9.0,687.0,412.0,1982
522337,97919,2341,,test,8,1,55,29,14,17,17,4.750000,46.0,,8.0,1.0,5543,1592,352,93,19,4548.0,6.0,815.0,174.0,5543,1592,352,93,19,4548.0,6.0,815.0,174.0,703
522338,97919,3971,,test,8,1,55,29,14,17,17,4.750000,46.0,,8.0,1.0,28892,7587,272,7,2,24602.0,94.0,2608.0,1588.0,28892,7587,272,7,2,24602.0,94.0,2608.0,1588.0,3050
522339,32639,3536,,test,0,0,72,46,24,33,35,5.800000,62.0,1.0,8.0,1.0,14027,4956,322,19,3,12807.0,29.0,793.0,398.0,14027,4956,322,19,3,12807.0,29.0,793.0,398.0,2177


In [10]:
# 按照user_id, merchant_id分组
groups = user_log.groupby(['user_id', 'merchant_id'])
temp = groups.size().reset_index().rename(columns={0:'um1'}) #统计行为个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'um2', 'cat_id':'um3', 'brand_id':'um4'}) #统计item_id, cat_id, brand_id唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'um5', 1:'um6', 2:'um7', 3:'um8'})#统计不同action_type唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['time_stamp'].agg([('first', 'min'), ('last', 'max')]).reset_index()
temp['um9'] = (temp['last'] - temp['first']).dt.seconds/3600
temp.drop(['first', 'last'], axis=1, inplace=True)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left') #统计时间间隔
matrix

  """


Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,m1_x,m2_x,m3_x,m4_x,m5_x,m6_x,m7_x,m8_x,m9_x,m1_y,m2_y,m3_y,m4_y,m5_y,m6_y,m7_y,m8_y,m9_y,m10,um1,um2,um3,um4,um5,um6,um7,um8,um9
0,34176,3906,0.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0,16269,5819,308,20,2,14870.0,28.0,410.0,961.0,16269,5819,308,20,2,14870.0,28.0,410.0,961.0,2861,39,20,6,1,36.0,,1.0,2.0,0.850000
1,34176,121,0.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0,79865,10931,1179,26,2,72265.0,121.0,4780.0,2699.0,79865,10931,1179,26,2,72265.0,121.0,4780.0,2699.0,4530,14,1,1,1,13.0,,1.0,,0.050000
2,34176,4356,1.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0,7269,2281,67,15,2,6094.0,16.0,963.0,196.0,7269,2281,67,15,2,6094.0,16.0,963.0,196.0,1088,18,2,1,1,12.0,,6.0,,0.016667
3,34176,2217,0.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0,60202,16870,377,5,2,52230.0,101.0,3721.0,4150.0,60202,16870,377,5,2,52230.0,101.0,3721.0,4150.0,7268,2,1,1,1,1.0,,1.0,,0.000000
4,230784,4818,0.0,train,0,0,54,31,17,20,19,5.166667,47.0,,7.0,,48089,7500,461,27,2,43268.0,129.0,2733.0,1959.0,48089,7500,461,27,2,43268.0,129.0,2733.0,1959.0,3102,8,1,1,1,7.0,,1.0,,0.050000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522336,228479,3111,,test,6,0,2004,1173,71,278,282,6.000000,1770.0,,26.0,208.0,10105,4154,542,50,18,8997.0,9.0,687.0,412.0,10105,4154,542,50,18,8997.0,9.0,687.0,412.0,1982,5,2,1,1,4.0,,1.0,,0.016667
522337,97919,2341,,test,8,1,55,29,14,17,17,4.750000,46.0,,8.0,1.0,5543,1592,352,93,19,4548.0,6.0,815.0,174.0,5543,1592,352,93,19,4548.0,6.0,815.0,174.0,703,2,1,1,1,1.0,,1.0,,0.000000
522338,97919,3971,,test,8,1,55,29,14,17,17,4.750000,46.0,,8.0,1.0,28892,7587,272,7,2,24602.0,94.0,2608.0,1588.0,28892,7587,272,7,2,24602.0,94.0,2608.0,1588.0,3050,16,5,2,1,12.0,,4.0,,0.150000
522339,32639,3536,,test,0,0,72,46,24,33,35,5.800000,62.0,1.0,8.0,1.0,14027,4956,322,19,3,12807.0,29.0,793.0,398.0,14027,4956,322,19,3,12807.0,29.0,793.0,398.0,2177,3,2,1,1,2.0,,1.0,,0.000000


In [16]:
matrix

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,m1_x,m2_x,m3_x,m4_x,m5_x,m6_x,m7_x,m8_x,m9_x,m1_y,m2_y,m3_y,m4_y,m5_y,m6_y,m7_y,m8_y,m9_y,m10,um1,um2,um3,um4,um5,um6,um7,um8,um9,r1
0,34176,3906,0.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0,16269,5819,308,20,2,14870.0,28.0,410.0,961.0,16269,5819,308,20,2,14870.0,28.0,410.0,961.0,2861,39,20,6,1,36.0,,1.0,2.0,0.850000,0.082927
1,34176,121,0.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0,79865,10931,1179,26,2,72265.0,121.0,4780.0,2699.0,79865,10931,1179,26,2,72265.0,121.0,4780.0,2699.0,4530,14,1,1,1,13.0,,1.0,,0.050000,0.082927
2,34176,4356,1.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0,7269,2281,67,15,2,6094.0,16.0,963.0,196.0,7269,2281,67,15,2,6094.0,16.0,963.0,196.0,1088,18,2,1,1,12.0,,6.0,,0.016667,0.082927
3,34176,2217,0.0,train,6,0,451,256,45,109,108,5.833333,410.0,,34.0,7.0,60202,16870,377,5,2,52230.0,101.0,3721.0,4150.0,60202,16870,377,5,2,52230.0,101.0,3721.0,4150.0,7268,2,1,1,1,1.0,,1.0,,0.000000,0.082927
4,230784,4818,0.0,train,0,0,54,31,17,20,19,5.166667,47.0,,7.0,,48089,7500,461,27,2,43268.0,129.0,2733.0,1959.0,48089,7500,461,27,2,43268.0,129.0,2733.0,1959.0,3102,8,1,1,1,7.0,,1.0,,0.050000,0.148936
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522336,228479,3111,,test,6,0,2004,1173,71,278,282,6.000000,1770.0,,26.0,208.0,10105,4154,542,50,18,8997.0,9.0,687.0,412.0,10105,4154,542,50,18,8997.0,9.0,687.0,412.0,1982,5,2,1,1,4.0,,1.0,,0.016667,0.014689
522337,97919,2341,,test,8,1,55,29,14,17,17,4.750000,46.0,,8.0,1.0,5543,1592,352,93,19,4548.0,6.0,815.0,174.0,5543,1592,352,93,19,4548.0,6.0,815.0,174.0,703,2,1,1,1,1.0,,1.0,,0.000000,0.173913
522338,97919,3971,,test,8,1,55,29,14,17,17,4.750000,46.0,,8.0,1.0,28892,7587,272,7,2,24602.0,94.0,2608.0,1588.0,28892,7587,272,7,2,24602.0,94.0,2608.0,1588.0,3050,16,5,2,1,12.0,,4.0,,0.150000,0.173913
522339,32639,3536,,test,0,0,72,46,24,33,35,5.800000,62.0,1.0,8.0,1.0,14027,4956,322,19,3,12807.0,29.0,793.0,398.0,14027,4956,322,19,3,12807.0,29.0,793.0,398.0,2177,3,2,1,1,2.0,,1.0,,0.000000,0.129032


In [17]:
#用户购买点击比
matrix['r1'] = matrix['u9']/matrix['u7'] 
#商家购买点击比
# matrix['r2'] = matrix['m8']/matrix['m6'] 
#不同用户不同商家购买点击比
matrix['r3'] = matrix['um7']/matrix['um5']
matrix.fillna(0, inplace=True)
# # 修改age_range字段名称为 age_0, age_1, age_2... age_8
temp = pd.get_dummies(matrix['age_range'], prefix='age')
matrix = pd.concat([matrix, temp], axis=1)
temp = pd.get_dummies(matrix['gender'], prefix='g')
matrix = pd.concat([matrix, temp], axis=1)
matrix.drop(['age_range', 'gender'], axis=1, inplace=True)
matrix

Unnamed: 0,user_id,merchant_id,label,origin,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,m1_x,m2_x,m3_x,m4_x,m5_x,m6_x,m7_x,m8_x,m9_x,m1_y,m2_y,m3_y,m4_y,m5_y,m6_y,m7_y,m8_y,m9_y,m10,um1,um2,um3,um4,um5,um6,um7,um8,um9,r1,r3,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,age_8,g_0,g_1,g_2
0,34176,3906,0.0,train,451,256,45,109,108,5.833333,410.0,0.0,34.0,7.0,16269,5819,308,20,2,14870.0,28.0,410.0,961.0,16269,5819,308,20,2,14870.0,28.0,410.0,961.0,2861,39,20,6,1,36.0,0.0,1.0,2.0,0.850000,0.082927,0.027778,0,0,0,0,0,0,1,0,0,1,0,0
1,34176,121,0.0,train,451,256,45,109,108,5.833333,410.0,0.0,34.0,7.0,79865,10931,1179,26,2,72265.0,121.0,4780.0,2699.0,79865,10931,1179,26,2,72265.0,121.0,4780.0,2699.0,4530,14,1,1,1,13.0,0.0,1.0,0.0,0.050000,0.082927,0.076923,0,0,0,0,0,0,1,0,0,1,0,0
2,34176,4356,1.0,train,451,256,45,109,108,5.833333,410.0,0.0,34.0,7.0,7269,2281,67,15,2,6094.0,16.0,963.0,196.0,7269,2281,67,15,2,6094.0,16.0,963.0,196.0,1088,18,2,1,1,12.0,0.0,6.0,0.0,0.016667,0.082927,0.500000,0,0,0,0,0,0,1,0,0,1,0,0
3,34176,2217,0.0,train,451,256,45,109,108,5.833333,410.0,0.0,34.0,7.0,60202,16870,377,5,2,52230.0,101.0,3721.0,4150.0,60202,16870,377,5,2,52230.0,101.0,3721.0,4150.0,7268,2,1,1,1,1.0,0.0,1.0,0.0,0.000000,0.082927,1.000000,0,0,0,0,0,0,1,0,0,1,0,0
4,230784,4818,0.0,train,54,31,17,20,19,5.166667,47.0,0.0,7.0,0.0,48089,7500,461,27,2,43268.0,129.0,2733.0,1959.0,48089,7500,461,27,2,43268.0,129.0,2733.0,1959.0,3102,8,1,1,1,7.0,0.0,1.0,0.0,0.050000,0.148936,0.142857,1,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522336,228479,3111,,test,2004,1173,71,278,282,6.000000,1770.0,0.0,26.0,208.0,10105,4154,542,50,18,8997.0,9.0,687.0,412.0,10105,4154,542,50,18,8997.0,9.0,687.0,412.0,1982,5,2,1,1,4.0,0.0,1.0,0.0,0.016667,0.014689,0.250000,0,0,0,0,0,0,1,0,0,1,0,0
522337,97919,2341,,test,55,29,14,17,17,4.750000,46.0,0.0,8.0,1.0,5543,1592,352,93,19,4548.0,6.0,815.0,174.0,5543,1592,352,93,19,4548.0,6.0,815.0,174.0,703,2,1,1,1,1.0,0.0,1.0,0.0,0.000000,0.173913,1.000000,0,0,0,0,0,0,0,0,1,0,1,0
522338,97919,3971,,test,55,29,14,17,17,4.750000,46.0,0.0,8.0,1.0,28892,7587,272,7,2,24602.0,94.0,2608.0,1588.0,28892,7587,272,7,2,24602.0,94.0,2608.0,1588.0,3050,16,5,2,1,12.0,0.0,4.0,0.0,0.150000,0.173913,0.333333,0,0,0,0,0,0,0,0,1,0,1,0
522339,32639,3536,,test,72,46,24,33,35,5.800000,62.0,1.0,8.0,1.0,14027,4956,322,19,3,12807.0,29.0,793.0,398.0,14027,4956,322,19,3,12807.0,29.0,793.0,398.0,2177,3,2,1,1,2.0,0.0,1.0,0.0,0.000000,0.129032,0.500000,1,0,0,0,0,0,0,0,0,1,0,0


### 训练集和测试集划分

In [18]:
# 分割训练数据和测试数据
train_data = matrix[matrix['origin'] == 'train'].drop(['origin'], axis=1)
test_data = matrix[matrix['origin'] == 'test'].drop(['label', 'origin'], axis=1)
train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']
del temp, matrix
gc.collect()

1555

### 模型训练

In [0]:
# 导入用到的模型包
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb
# 将训练集进行切分，20%用于验证
X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=.2)

In [23]:
# 使用XGBoost
model = xgb.XGBClassifier(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,    
    seed=42     
)
model.fit(
    X_train, y_train,
    eval_metric='auc', 
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    verbose=True,
    #早停法，如果auc在10epoch没有进步就stop
    early_stopping_rounds=10 
)
model.fit(X_train, y_train)

[0]	validation_0-auc:0.59588	validation_1-auc:0.598048
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.618922	validation_1-auc:0.625476
[2]	validation_0-auc:0.62867	validation_1-auc:0.6339
[3]	validation_0-auc:0.627127	validation_1-auc:0.640181
[4]	validation_0-auc:0.625149	validation_1-auc:0.64167
[5]	validation_0-auc:0.630772	validation_1-auc:0.651432
[6]	validation_0-auc:0.631947	validation_1-auc:0.654318
[7]	validation_0-auc:0.63257	validation_1-auc:0.653732
[8]	validation_0-auc:0.633137	validation_1-auc:0.650658
[9]	validation_0-auc:0.631945	validation_1-auc:0.649701
[10]	validation_0-auc:0.632001	validation_1-auc:0.646786
[11]	validation_0-auc:0.629908	validation_1-auc:0.643137
[12]	validation_0-auc:0.62925	validation_1-auc:0.641778
[13]	validation_0-auc:0.630849	validation_1-auc:0.641225
[14]	validation_0-auc:0.632077	validation_1-auc:0.645202
[15]	

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eta=0.3, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=300, missing=None, n_estimators=1000, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              silent=None, subsample=0.8, verbosity=1)

In [20]:
# 使用LightGBM模型
model = lgb.LGBMClassifier(
    num_leaves=51,
    max_depth=10,
    boosting_type='gbdt',
    objective='binary',
    learning_rate=0.015,
    n_estimators=2000,
    subsample=0.75,
    subsample_freq=2,
    reg_lambda=0.28,
    reg_alpha=0.12,
    colsample_bytree=0.8,
    min_child_samples=300,
    min_split_gain=0.1
)
model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_metric='auc', 
    early_stopping_rounds=100
)

[1]	training's auc: 0.644572	training's binary_logloss: 0.229872	valid_1's auc: 0.636098	valid_1's binary_logloss: 0.229493
Training until validation scores don't improve for 100 rounds.
[2]	training's auc: 0.647401	training's binary_logloss: 0.229549	valid_1's auc: 0.636064	valid_1's binary_logloss: 0.229225
[3]	training's auc: 0.652825	training's binary_logloss: 0.229232	valid_1's auc: 0.63964	valid_1's binary_logloss: 0.228962
[4]	training's auc: 0.656787	training's binary_logloss: 0.22893	valid_1's auc: 0.642268	valid_1's binary_logloss: 0.228706
[5]	training's auc: 0.660627	training's binary_logloss: 0.228675	valid_1's auc: 0.645861	valid_1's binary_logloss: 0.228483
[6]	training's auc: 0.660755	training's binary_logloss: 0.228414	valid_1's auc: 0.646503	valid_1's binary_logloss: 0.228252
[7]	training's auc: 0.661864	training's binary_logloss: 0.22815	valid_1's auc: 0.648312	valid_1's binary_logloss: 0.228029
[8]	training's auc: 0.662586	training's binary_logloss: 0.227902	valid_1

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
               importance_type='split', learning_rate=0.015, max_depth=10,
               min_child_samples=300, min_child_weight=0.001,
               min_split_gain=0.1, n_estimators=2000, n_jobs=-1, num_leaves=51,
               objective='binary', random_state=None, reg_alpha=0.12,
               reg_lambda=0.28, silent=True, subsample=0.75,
               subsample_for_bin=200000, subsample_freq=2)

### 测试集预测

In [0]:
prob = model.predict_proba(test_data)
submission['prob'] = pd.Series(prob[:,1])
submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('prediction.csv', index=False)