### 1. 加载数据

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline 

In [2]:
# user_log = pd.read_csv('./data_format1/user_log_format1.csv', dtype={'time_stamp':'str'})
# user_info = pd.read_csv('./data_format1/user_info_format1.csv')
# train_data1 = pd.read_csv('./data_format1/train_format1.csv')
# submission = pd.read_csv('./data_format1/test_format1.csv')

In [3]:

# 加载小样本
user_log = pd.read_csv('./data_format1_small/sample_user_log.csv', dtype={'time_stamp':'str'})
user_info = pd.read_csv('./data_format1_small/sample_user_info.csv')
train_data1 = pd.read_csv('./data_format1_small/train.csv')
submission = pd.read_csv('./data_format1_small/test.csv')
train_data = pd.read_csv('./data_format2/train_format2.csv')


#### merge data 

In [4]:
train_data1['origin'] = 'train'
submission['origin'] = 'test'
matrix = pd.concat([train_data1, submission], ignore_index=True, sort=False)

matrix.drop(['prob'], axis=1, inplace=True)
# 连接user_info表，通过user_id关联
matrix = matrix.merge(user_info, on='user_id', how='left')

In [5]:
matrix 

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender
0,365952,1203,0.0,train,0.0,1.0
1,42624,946,0.0,train,2.0,0.0
2,240000,2278,0.0,train,3.0,0.0
3,177792,951,0.0,train,0.0,1.0
4,322944,1892,0.0,train,7.0,0.0
...,...,...,...,...,...,...
23888,47231,1748,,test,0.0,0.0
23889,59519,798,,test,3.0,0.0
23890,263039,639,,test,2.0,1.0
23891,263039,3954,,test,2.0,1.0


In [6]:
user_log 

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,89355,664924,1429,3136,579.0,1111,2
1,89355,131438,1505,780,516.0,1110,0
2,89355,673082,1429,3136,579.0,1110,0
3,89355,664924,1429,3136,579.0,1110,0
4,89355,183665,1505,780,516.0,1110,0
...,...,...,...,...,...,...,...
2473900,263674,176091,389,4143,4594.0,1111,0
2473901,263674,794804,662,4143,4594.0,1111,0
2473902,263674,794804,662,4143,4594.0,1111,0
2473903,263674,176091,389,4143,4594.0,1111,0


In [7]:
# 使用merchant_id（原列名seller_id）
user_log.rename(columns={'seller_id':'merchant_id'}, inplace=True)

In [8]:
# 格式化
user_log['user_id'] = user_log['user_id'].astype('int32')
user_log['merchant_id'] = user_log['merchant_id'].astype('int32')
user_log['item_id'] = user_log['item_id'].astype('int32')
user_log['cat_id'] = user_log['cat_id'].astype('int32')
user_log['brand_id'].fillna(0, inplace=True)
user_log['brand_id'] = user_log['brand_id'].astype('int32')
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], format='%H%M')
# 1 for <18; 2 for [18,24]; 3 for [25,29]; 4 for [30,34]; 5 for [35,39]; 6 for [40,49]; 7 and 8 for >= 50; 0 and NULL for unknown
matrix['age_range'].fillna(0, inplace=True)
# 0:female, 1:male, 2:unknown
matrix['gender'].fillna(2, inplace=True)
matrix['age_range'] = matrix['age_range'].astype('int8')
matrix['gender'] = matrix['gender'].astype('int8')
matrix['label'] = matrix['label'].astype('str')
matrix['user_id'] = matrix['user_id'].astype('int32')
matrix['merchant_id'] = matrix['merchant_id'].astype('int32')
del user_info, train_data1

In [9]:
matrix.isnull().sum()

user_id        0
merchant_id    0
label          0
origin         0
age_range      0
gender         0
dtype: int64

#### 按照用户分组

In [10]:
groups = user_log.groupby("user_id")

In [11]:
# 用户交互行为数量 u1 ,统计用户购买行为的次数， 或许有钱人或者购物狂比较容易复购
temp = groups.size().reset_index().rename(columns={0:'u1'})
matrix = matrix.merge(temp, on='user_id', how='left')
# 使用agg 基于列的聚合操作，统计唯一值的个数 item_id, cat_id, merchant_id, brand_id
#temp = groups['item_id', 'cat_id', 'merchant_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'u2', 'cat_id':'u3', 'merchant_id':'u4', 'brand_id':'u5'})
temp = groups['item_id'].agg([('u2', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['cat_id'].agg([('u3', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['merchant_id'].agg([('u4', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['brand_id'].agg([('u5', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')

In [12]:
# 时间间隔特征 u6 按照小时
temp = groups['time_stamp'].agg([('F_time', 'min'), ('L_time', 'max')]).reset_index()
temp['u6'] = (temp['L_time'] - temp['F_time']).dt.seconds/3600
matrix = matrix.merge(temp[['user_id', 'u6']], on='user_id', how='left')
# 统计操作类型为0，1，2，3的个数
# 统计用户购买的行为 ， 比如只是喜欢看看， 或者加入购物车满足满足， 或者直接购买的等等， 一定程度反映用户的购物喜爱或者购买力吧
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'u7', 1:'u8', 2:'u9', 3:'u10'})
matrix = matrix.merge(temp, on='user_id', how='left')

In [13]:
matrix

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10
0,365952,1203,0.0,train,0,1,46,29,12,16,16,4.933333,45.0,,1.0,
1,42624,946,0.0,train,2,0,365,198,46,46,45,5.866667,313.0,,21.0,31.0
2,240000,2278,0.0,train,3,0,47,31,14,15,17,5.833333,42.0,,5.0,
3,177792,951,0.0,train,0,1,234,105,23,35,36,5.833333,177.0,,11.0,46.0
4,322944,1892,0.0,train,7,0,186,106,34,40,39,5.866667,147.0,,25.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23888,47231,1748,,test,0,0,128,97,28,39,40,5.816667,122.0,,6.0,
23889,59519,798,,test,3,0,1286,540,55,93,96,6.000000,1182.0,,16.0,88.0
23890,263039,639,,test,2,1,9,8,7,7,7,5.783333,7.0,,2.0,
23891,263039,3954,,test,2,1,9,8,7,7,7,5.783333,7.0,,2.0,


#### 2 .按照商家进行分组统计

In [14]:

# 商家特征处理
groups = user_log.groupby(['merchant_id'])
# 商家被交互行为数量 m1 ，商家是否受用户喜好
temp = groups.size().reset_index().rename(columns={0:'m1'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的user_id, item_id, cat_id, brand_id 唯一值
# 商家可能存在热门的商品， 或者品类， 进行统计
temp = groups['user_id', 'item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'user_id':'m2', 'item_id':'m3', 'cat_id':'m4', 'brand_id':'m5'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的action_type 唯一值
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'m6', 1:'m7', 2:'m8', 3:'m9'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 按照merchant_id 统计随机负采样的个数
temp = train_data[train_data['label']==-1].groupby(['merchant_id']).size().reset_index().rename(columns={0:'m10'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

  


In [15]:
matrix 

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender,u1,u2,u3,u4,...,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10
0,365952,1203,0.0,train,0,1,46,29,12,16,...,1915,408,175,19,2,1639.0,4.0,201.0,71.0,3518
1,42624,946,0.0,train,2,0,365,198,46,46,...,1965,292,320,6,3,1809.0,6.0,104.0,46.0,2816
2,240000,2278,0.0,train,3,0,47,31,14,15,...,1125,254,72,12,2,928.0,5.0,138.0,54.0,2604
3,177792,951,0.0,train,0,1,234,105,23,35,...,574,210,117,8,2,483.0,1.0,63.0,27.0,1932
4,322944,1892,0.0,train,7,0,186,106,34,40,...,6852,664,554,69,3,6028.0,27.0,473.0,324.0,5471
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23888,47231,1748,,test,0,0,128,97,28,39,...,72,33,14,2,1,66.0,,2.0,4.0,356
23889,59519,798,,test,3,0,1286,540,55,93,...,4892,907,109,20,2,4323.0,12.0,304.0,253.0,8650
23890,263039,639,,test,2,1,9,8,7,7,...,480,201,9,2,2,418.0,1.0,49.0,12.0,1943
23891,263039,3954,,test,2,1,9,8,7,7,...,515,171,31,7,3,434.0,3.0,68.0,10.0,1592


In [16]:

# 按照user_id, merchant_id分组
groups = user_log.groupby(['user_id', 'merchant_id'])
temp = groups.size().reset_index().rename(columns={0:'um1'}) #统计行为个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'um2', 'cat_id':'um3', 'brand_id':'um4'}) #统计item_id, cat_id, brand_id唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'um5', 1:'um6', 2:'um7', 3:'um8'})#统计不同action_type唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['time_stamp'].agg([('first', 'min'), ('last', 'max')]).reset_index()
temp['um9'] = (temp['last'] - temp['first']).dt.seconds/3600
temp.drop(['first', 'last'], axis=1, inplace=True)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left') #统计时间间隔

  """


In [17]:
matrix.groupby( ['user_id' , "merchant_id"]).size()

user_id  merchant_id
9        2721           1
21       3350           1
22       1564           1
45       2566           1
48       4026           1
                       ..
424110   1000           1
         1352           1
         3619           1
424139   1784           1
         4523           1
Length: 23488, dtype: int64

In [18]:
#### 用户个人品牌偏好信息统计  

In [19]:
groups =  user_log.groupby( ["user_id" ,"brand_id"]) 

In [20]:
temp0= groups.size().reset_index().rename( columns ={ 0 :"ub0"})
user_log = user_log.merge(temp0,on =["user_id" ,"brand_id"] , how = 'left' )

In [21]:
temp1 = groups.action_type.value_counts().unstack().reset_index().\
    rename( columns = { 0: "ub1"  , 1:"ub2" ,2 :"ub3" , 3 : "ub4"})
user_log = user_log.merge( temp1 , on =  ["user_id" ,"brand_id"])
user_log

Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,action_type,ub0,ub1,ub2,ub3,ub4
0,89355,664924,1429,3136,579,1900-01-01 11:11:00,2,4,2.0,,2.0,
1,89355,673082,1429,3136,579,1900-01-01 11:10:00,0,4,2.0,,2.0,
2,89355,664924,1429,3136,579,1900-01-01 11:10:00,0,4,2.0,,2.0,
3,89355,1039855,1429,3136,579,1900-01-01 11:11:00,2,4,2.0,,2.0,
4,89355,131438,1505,780,516,1900-01-01 11:10:00,0,2,2.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2473900,263674,176091,389,4143,4594,1900-01-01 11:11:00,0,30,25.0,,3.0,2.0
2473901,263674,794804,662,4143,4594,1900-01-01 11:11:00,0,30,25.0,,3.0,2.0
2473902,263674,794804,662,4143,4594,1900-01-01 11:11:00,0,30,25.0,,3.0,2.0
2473903,263674,176091,389,4143,4594,1900-01-01 11:11:00,0,30,25.0,,3.0,2.0


In [22]:
 groups.action_type.value_counts().unstack().reset_index()

action_type,user_id,brand_id,0,1,2,3
0,9,347,2.0,,,
1,9,620,17.0,,1.0,
2,9,1286,,,,1.0
3,9,1316,1.0,,1.0,
4,9,2276,,,,1.0
...,...,...,...,...,...,...
630417,424139,3077,1.0,,,
630418,424139,3402,3.0,,3.0,
630419,424139,6296,1.0,,,
630420,424139,7936,2.0,,2.0,


####  商家的品牌销售 ，专卖店或者综合类商铺的信息

In [23]:
temp0 = user_log.groupby( ["merchant_id" ,"brand_id"] , as_index = False).size().\
reset_index().rename( columns ={ 0 : "mb0"} )
temp1 = user_log.groupby( ["merchant_id" ,"brand_id"]).action_type.value_counts().\
        unstack().reset_index().rename( columns ={
        0 :"mb1" , 1 :"mb2", 2 :"mb3" , 3: "mb4"
})
user_log = user_log.merge( temp0 , on = ["merchant_id" ,"brand_id"] , how ="left" )
user_log = user_log.merge( temp1 , on =["merchant_id" ,"brand_id"] ,how ="left"  )
user_log

Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,action_type,ub0,ub1,ub2,ub3,ub4,mb0,mb1,mb2,mb3,mb4
0,89355,664924,1429,3136,579,1900-01-01 11:11:00,2,4,2.0,,2.0,,347,318.0,,18.0,11.0
1,89355,673082,1429,3136,579,1900-01-01 11:10:00,0,4,2.0,,2.0,,347,318.0,,18.0,11.0
2,89355,664924,1429,3136,579,1900-01-01 11:10:00,0,4,2.0,,2.0,,347,318.0,,18.0,11.0
3,89355,1039855,1429,3136,579,1900-01-01 11:11:00,2,4,2.0,,2.0,,347,318.0,,18.0,11.0
4,89355,131438,1505,780,516,1900-01-01 11:10:00,0,2,2.0,,,,1866,1656.0,,24.0,186.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473900,263674,176091,389,4143,4594,1900-01-01 11:11:00,0,30,25.0,,3.0,2.0,3580,3326.0,,92.0,162.0
2473901,263674,794804,662,4143,4594,1900-01-01 11:11:00,0,30,25.0,,3.0,2.0,3580,3326.0,,92.0,162.0
2473902,263674,794804,662,4143,4594,1900-01-01 11:11:00,0,30,25.0,,3.0,2.0,3580,3326.0,,92.0,162.0
2473903,263674,176091,389,4143,4594,1900-01-01 11:11:00,0,30,25.0,,3.0,2.0,3580,3326.0,,92.0,162.0


In [24]:
user_log_feature  = user_log.drop(['item_id' ,'merchant_id' , 'cat_id' ,"brand_id" ,"action_type" ,'time_stamp'], axis =1 )
user_log_feature.fillna( 0  ,  inplace = True)

In [25]:
user_log_feature = user_log_feature.astype(np.int32)

In [26]:
user_log_feature.groupby( "user_id").sum().reset_index()

Unnamed: 0,user_id,ub0,ub1,ub2,ub3,ub4,mb0,mb1,mb2,mb3,mb4
0,9,921,850,0,51,20,91227,82854,0,3874,4499
1,21,31906,30696,0,1086,124,6791422,6193882,0,193902,403638
2,22,188629,188146,0,483,0,12158692,11844635,0,104390,209667
3,45,64,60,0,4,0,23094,21164,0,702,1228
4,48,105,64,0,29,12,35319,30636,0,2922,1761
...,...,...,...,...,...,...,...,...,...,...,...
19107,424069,1464,1281,0,183,0,116823,102958,0,8435,5430
19108,424074,127,112,0,15,0,324453,297818,0,8047,18588
19109,424076,303,241,0,62,0,22519,18125,0,3461,933
19110,424110,961,939,0,22,0,19103,16935,1,911,1256


In [27]:
matrix = matrix.merge(user_log_feature.groupby( "user_id").sum().reset_index(), 
            on = 'user_id', 
             how ="left" )

In [28]:
#用户购买点击比
matrix['r1'] = matrix['u9']/matrix['u7'] 
#商家购买点击比
matrix['r2'] = matrix['m8']/matrix['m6'] 
#不同用户不同商家购买点击比
matrix['r3'] = matrix['um7']/matrix['um5']

In [29]:
matrix.fillna(0, inplace=True)
# # 修改age_range字段名称为 age_0, age_1, age_2... age_8
temp = pd.get_dummies(matrix['age_range'], prefix='age')
matrix = pd.concat([matrix, temp], axis=1)
temp = pd.get_dummies(matrix['gender'], prefix='g')
matrix = pd.concat([matrix, temp], axis=1)
matrix.drop(['age_range', 'gender'], axis=1, inplace=True)
print(matrix)

matrix.to_csv("last_train_test.csv" , index =False )

       user_id  merchant_id label origin    u1   u2  u3  u4  u5        u6  \
0       365952         1203   0.0  train    46   29  12  16  16  4.933333   
1        42624          946   0.0  train   365  198  46  46  45  5.866667   
2       240000         2278   0.0  train    47   31  14  15  17  5.833333   
3       177792          951   0.0  train   234  105  23  35  36  5.833333   
4       322944         1892   0.0  train   186  106  34  40  39  5.866667   
...        ...          ...   ...    ...   ...  ...  ..  ..  ..       ...   
23888    47231         1748   nan   test   128   97  28  39  40  5.816667   
23889    59519          798   nan   test  1286  540  55  93  96  6.000000   
23890   263039          639   nan   test     9    8   7   7   7  5.783333   
23891   263039         3954   nan   test     9    8   7   7   7  5.783333   
23892   423551         2954   nan   test   197   85  36  39  40  5.916667   

       ...  age_2  age_3  age_4  age_5  age_6  age_7  age_8  g_0  g_1  g_2 

In [30]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [31]:
matrix = reduce_mem_usage(matrix)

Memory usage of dataframe is 8099727.00 MB
Memory usage after optimization is: 3321327.00 MB
Decreased by 59.0%


In [32]:
matrix

Unnamed: 0,user_id,merchant_id,label,origin,u1,u2,u3,u4,u5,u6,...,age_2,age_3,age_4,age_5,age_6,age_7,age_8,g_0,g_1,g_2
0,365952,1203,0.0,train,46,29,12,16,16,4.933594,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,42624,946,0.0,train,365,198,46,46,45,5.867188,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,240000,2278,0.0,train,47,31,14,15,17,5.832031,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,177792,951,0.0,train,234,105,23,35,36,5.832031,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,322944,1892,0.0,train,186,106,34,40,39,5.867188,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23888,47231,1748,,test,128,97,28,39,40,5.816406,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
23889,59519,798,,test,1286,540,55,93,96,6.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
23890,263039,639,,test,9,8,7,7,7,5.785156,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
23891,263039,3954,,test,9,8,7,7,7,5.785156,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [33]:
train_data = matrix[matrix['origin'] == 'train'].drop(['origin'], axis=1)
test_data = matrix[matrix['origin'] == 'test'].drop(['label', 'origin'], axis=1)
train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']


In [34]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
import xgboost as xgb
# 将训练集进行切分，20%用于验证
X_train, X_valid, y_train, y_valid = train_test_split(
                    train_X, train_y, test_size=.2)


In [None]:
# 使用XGBoost
model = xgb.XGBClassifier(
    max_depth=10, #  树的深度
    n_estimators=1000, #  分类器
    colsample_bytree=0.8,  # 列采样
    subsample=0.9 , # 样本随机采样
    eta=0.01,    # 学习率 
    scale_pos_weight = 1 ,
    seed=42    
) 
model.fit(
    X_train, y_train,
    eval_metric='auc', 
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    verbose=True,
    #早停法，如果auc在10epoch没有进步就stop
    early_stopping_rounds=10 
)


model.fit(X_train, y_train)

prob = model.predict_proba(test_data)
print (prob )


[0]	validation_0-auc:0.62690	validation_1-auc:0.56685
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.64610	validation_1-auc:0.59192
[2]	validation_0-auc:0.65610	validation_1-auc:0.59619
[3]	validation_0-auc:0.65729	validation_1-auc:0.59395
[4]	validation_0-auc:0.65772	validation_1-auc:0.59358
[5]	validation_0-auc:0.65922	validation_1-auc:0.59416
[6]	validation_0-auc:0.65983	validation_1-auc:0.59235
[7]	validation_0-auc:0.67060	validation_1-auc:0.59072
[8]	validation_0-auc:0.67165	validation_1-auc:0.59019
[9]	validation_0-auc:0.67240	validation_1-auc:0.59164
[10]	validation_0-auc:0.67398	validation_1-auc:0.59371
[11]	validation_0-auc:0.67459	validation_1-auc:0.59355
[12]	validation_0-auc:0.67393	validation_1-auc:0.59445
Stopping. Best iteration:
[2]	validation_0-auc:0.65610	validation_1-auc:0.59619



In [None]:
# coding: utf-8
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

print('Starting training...')
# train
gbm = lgb.LGBMClassifier(
    boosting_type='gbdt',
    num_leaves= 1000, 
    max_depth= 8 ,
    learning_rate=0.01, 
    n_estimators= 1000,
    objective='binary', 
    subsample= 0.8, 
    colsample_bytree = 0.5 , 
    n_jobs=-1,
#     class_weight= 'balanced' ,
#     reg_alpha= 1.5, 
    reg_lambda= 100,
    silent=False)

gbm.fit( X_train, y_train ,
        eval_set=[(X_train, y_train) ,(X_valid, y_valid)],
        eval_metric='auc',
        early_stopping_rounds=10)
prob = gbm.predict_proba(test_data)


In [None]:
submission['prob'] = pd.Series(prob[:,1])

submission = submission.drop( 'origin' , axis =1)
submission.to_csv('prediction.csv', index=False)

In [None]:
submission