In [1]:
import lightgbm as lgb
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

# Reproducibility: seed both the stdlib and the NumPy global random generators.
random.seed(2019)
np.random.seed(2019)

# Notebook display settings; set_option accepts several name/value pairs in one call.
pd.set_option(
    'display.max_rows', 20,
    'display.max_columns', 500,
    'display.width', 280,
    'display.max_colwidth', 150,
)

# Dataset directory.
# NOTE(review): absolute, machine-specific path; it is not referenced by the
# cells visible in this notebook -- confirm whether it is still needed.
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'

In [2]:
# Load the pre-computed training frame and convert it to an all-float table.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# produced by this project.
train_df = (
    pd.read_pickle('train3.pkl')
    # The literal string "\N" (presumably a NULL marker from the raw export --
    # TODO confirm) is encoded as -1 so every column can be cast to float.
    .replace("\\N", -1)
    # astype() has no `inplace` parameter (passing one raises TypeError on
    # pandas >= 1.0); it always returns a new frame.
    .astype(float)
)

# Shift both label columns down by one (presumably 1-based -> 0-based class ids).
train_df['age'] = train_df['age'] - 1
train_df['gender'] = train_df['gender'] - 1

print(train_df)

         user_id  active_days  click_times_total  age  gender  industry_0  industry_1  industry_2  industry_3  industry_4  industry_5  industry_6  industry_7  industry_8  industry_9  industry_10  industry_11  industry_12  industry_13  industry_14  industry_15  advertiser_id_0  \
0            1.0         10.0               14.0  3.0     0.0   -9.808951   37.956196    7.941766   28.995136   58.130238   11.259385    5.808714  -13.792348   33.056450   26.614384     3.608611    36.279411    29.554661    -2.248151    21.378246    -2.297685        19.044678   
1            2.0         28.0               46.0  9.0     0.0 -128.622192 -112.712936    0.871585  215.566101  110.969002  -68.562981  -11.905766  144.337372   75.368713   49.570419   -88.037415   180.578781    18.416990     1.990750    39.252247   -67.675323        33.550327   
2            3.0         23.0               30.0  6.0     1.0   29.386518   -1.617431  -70.038635   90.258026  -76.959198  131.956375   -9.560961  -42.434490   

In [3]:
# --- Model inputs and target --------------------------------------------------
# Every column except the two labels and the user identifier is a model input;
# this cell trains the gender model, so `gender` is the target.
final_train_x_df = train_df.drop(['age', 'user_id', 'gender'], axis=1)
final_train_y_df = train_df['gender']
X_train, X_test, y_train, y_test = train_test_split(
    final_train_x_df, final_train_y_df, test_size=0.20, random_state=42)

# --- Column names of the statistics-based feature set -------------------------
# These lists are NOT used by the datasets built below; they are kept so that
# any other cell referring to them keeps working.
num_normal_features = ['_clicks_max_click_cnt', '_max_clicked_ratio',
                       '_clicks_min_click_cnt', '_min_clicked_ratio',
                       '_clicks_len', '_clicks_mean', '_clicks_median', '_clicks_std']
num_date_features = ['_clicks_max_click_cnt', '_clicks_min_click_cnt',
                     '_clicks_len', '_clicks_mean', '_clicks_median', '_clicks_std']
num_features = (
    ['click_times_total']
    + [f'{prefix}{suffix}'
       for prefix in ('date', 'wday', 'month')
       for suffix in num_date_features]
    + [f'{prefix}{suffix}'
       for prefix in ('product_id', 'product_category', 'industry', 'advertiser_id')
       for suffix in num_normal_features]
)

c_features = ['industry_clicks_max_click', 'industry_clicks_min_click',
              'advertiser_id_clicks_max_click', 'advertiser_id_clicks_min_click',
              'product_id_clicks_max_click', 'product_id_clicks_min_click',
              'product_category_clicks_max_click', 'product_category_clicks_min_click',
              ]

# --- Feature names handed to LightGBM -----------------------------------------
# LightGBM applies an explicit `feature_name` list positionally. The previous
# hand-written list (industry_*, advertiser_id_*, active_days, click_times_total)
# was in a different order than the frame's columns (active_days and
# click_times_total come first), so every feature was mislabelled in the model
# and in the feature importances. Taking the names from the frame itself keeps
# names and data aligned without changing the training matrix.
features = list(final_train_x_df.columns)

train_data = lgb.Dataset(X_train, label=y_train, feature_name=features,
                         free_raw_data=False)

# The validation set is built exactly like the training set. `c_features` is
# deliberately not passed as categorical_feature: none of those columns exist
# in this data and the training set does not declare them either.
eval_data = lgb.Dataset(X_test, label=y_test, feature_name=features,
                        free_raw_data=False, reference=train_data)



In [4]:
# Training configuration: gradient-boosted trees with a 2-class softmax
# objective, evaluated with the multi-class error rate.
params = {
    'boosting_type': 'gbdt',
    'objective': 'softmax',
    'num_class': 2,
    'metric': 'multi_error',
    'num_leaves': 256,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'verbose': 1
}
print('Starting training...')

# Train for at most 2000 rounds, stopping once the validation error has not
# improved for 50 consecutive rounds. Early stopping is requested through the
# callback API: the `early_stopping_rounds=` keyword of lgb.train() was removed
# in LightGBM 4.0, whereas the callback works on old and new releases alike.
gbm = lgb.train(
    params,
    train_data,
    num_boost_round=2000,
    valid_sets=[train_data, eval_data],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)

gbm.save_model('gender_emb.txt')
print('Feature importances:', list(gbm.feature_importance()))

Starting training...




[1]	training's multi_error: 0.330383	valid_1's multi_error: 0.330633
Training until validation scores don't improve for 50 rounds.
[2]	training's multi_error: 0.330383	valid_1's multi_error: 0.330633
[3]	training's multi_error: 0.327421	valid_1's multi_error: 0.327817
[4]	training's multi_error: 0.256446	valid_1's multi_error: 0.258794
[5]	training's multi_error: 0.219035	valid_1's multi_error: 0.221639
[6]	training's multi_error: 0.194422	valid_1's multi_error: 0.198078
[7]	training's multi_error: 0.180735	valid_1's multi_error: 0.184483
[8]	training's multi_error: 0.17096	valid_1's multi_error: 0.175483
[9]	training's multi_error: 0.163881	valid_1's multi_error: 0.168789
[10]	training's multi_error: 0.158443	valid_1's multi_error: 0.163533
[11]	training's multi_error: 0.154424	valid_1's multi_error: 0.159783
[12]	training's multi_error: 0.151129	valid_1's multi_error: 0.156683
[13]	training's multi_error: 0.14851	valid_1's multi_error: 0.154211
[14]	training's multi_error: 0.146322	v

[119]	training's multi_error: 0.0959833	valid_1's multi_error: 0.117539
[120]	training's multi_error: 0.0958139	valid_1's multi_error: 0.1176
[121]	training's multi_error: 0.0955819	valid_1's multi_error: 0.117478
[122]	training's multi_error: 0.0953403	valid_1's multi_error: 0.117394
[123]	training's multi_error: 0.0950944	valid_1's multi_error: 0.117372
[124]	training's multi_error: 0.0949069	valid_1's multi_error: 0.117294
[125]	training's multi_error: 0.0947125	valid_1's multi_error: 0.117128
[126]	training's multi_error: 0.0945639	valid_1's multi_error: 0.117156
[127]	training's multi_error: 0.0943861	valid_1's multi_error: 0.117056
[128]	training's multi_error: 0.0941736	valid_1's multi_error: 0.116972
[129]	training's multi_error: 0.0939569	valid_1's multi_error: 0.116761
[130]	training's multi_error: 0.0938194	valid_1's multi_error: 0.116756
[131]	training's multi_error: 0.0936264	valid_1's multi_error: 0.116689
[132]	training's multi_error: 0.0934611	valid_1's multi_error: 0.1

[234]	training's multi_error: 0.0760444	valid_1's multi_error: 0.113367
[235]	training's multi_error: 0.0759278	valid_1's multi_error: 0.113394
[236]	training's multi_error: 0.0757792	valid_1's multi_error: 0.113478
[237]	training's multi_error: 0.0755569	valid_1's multi_error: 0.113378
[238]	training's multi_error: 0.0753875	valid_1's multi_error: 0.113367
[239]	training's multi_error: 0.0751931	valid_1's multi_error: 0.113361
[240]	training's multi_error: 0.0750639	valid_1's multi_error: 0.1133
[241]	training's multi_error: 0.0749111	valid_1's multi_error: 0.113183
[242]	training's multi_error: 0.0747528	valid_1's multi_error: 0.113222
[243]	training's multi_error: 0.0746236	valid_1's multi_error: 0.1132
[244]	training's multi_error: 0.074475	valid_1's multi_error: 0.113111
[245]	training's multi_error: 0.0743181	valid_1's multi_error: 0.113189
[246]	training's multi_error: 0.0741486	valid_1's multi_error: 0.1132
[247]	training's multi_error: 0.0739819	valid_1's multi_error: 0.113178

[350]	training's multi_error: 0.0591236	valid_1's multi_error: 0.111956
[351]	training's multi_error: 0.0589931	valid_1's multi_error: 0.111844
[352]	training's multi_error: 0.05885	valid_1's multi_error: 0.11175
[353]	training's multi_error: 0.0587389	valid_1's multi_error: 0.111844
[354]	training's multi_error: 0.0585583	valid_1's multi_error: 0.111794
[355]	training's multi_error: 0.0584014	valid_1's multi_error: 0.1118
[356]	training's multi_error: 0.0582875	valid_1's multi_error: 0.111861
[357]	training's multi_error: 0.0581903	valid_1's multi_error: 0.111867
[358]	training's multi_error: 0.0580889	valid_1's multi_error: 0.111867
[359]	training's multi_error: 0.0579625	valid_1's multi_error: 0.111822
[360]	training's multi_error: 0.0578111	valid_1's multi_error: 0.111817
[361]	training's multi_error: 0.0577319	valid_1's multi_error: 0.111911
[362]	training's multi_error: 0.0576208	valid_1's multi_error: 0.111917
[363]	training's multi_error: 0.0574444	valid_1's multi_error: 0.1118

[466]	training's multi_error: 0.0445708	valid_1's multi_error: 0.111417
[467]	training's multi_error: 0.0444403	valid_1's multi_error: 0.111461
[468]	training's multi_error: 0.0443292	valid_1's multi_error: 0.111367
[469]	training's multi_error: 0.0441847	valid_1's multi_error: 0.111433
[470]	training's multi_error: 0.0440431	valid_1's multi_error: 0.111422
[471]	training's multi_error: 0.0439264	valid_1's multi_error: 0.111417
[472]	training's multi_error: 0.0437861	valid_1's multi_error: 0.111367
[473]	training's multi_error: 0.0437194	valid_1's multi_error: 0.111406
[474]	training's multi_error: 0.0436236	valid_1's multi_error: 0.111411
[475]	training's multi_error: 0.0435014	valid_1's multi_error: 0.1114
[476]	training's multi_error: 0.0433931	valid_1's multi_error: 0.111433
[477]	training's multi_error: 0.043225	valid_1's multi_error: 0.111328
[478]	training's multi_error: 0.0431403	valid_1's multi_error: 0.111361
[479]	training's multi_error: 0.0430194	valid_1's multi_error: 0.11

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Reshape the training labels into a single-column 2-D array, the input layout
# OneHotEncoder expects.
before_one_hot = y_train.to_numpy().reshape(-1, 1)
print(before_one_hot)

# Fit the encoder on the labels and expand them into a dense indicator matrix
# (one column per distinct label value).
enc = OneHotEncoder()
one_hoted_y = enc.fit_transform(before_one_hot).toarray()
print(one_hoted_y.shape)


[[1.]
 [1.]
 [1.]
 ...
 [0.]
 [1.]
 [0.]]
(720000, 2)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [6]:
# Score the training split: gbm.predict returns one score column per class.
y_pred = gbm.predict(X_train)

# Convert the scores into a one-hot matrix marking the highest-scoring class of
# each row. Vectorised with argmax instead of a Python double loop; on an exact
# tie only the first maximal class is marked, so each row stays a valid one-hot
# vector.
best_class = np.argmax(y_pred, axis=1)
y_pred = np.zeros_like(y_pred)
y_pred[np.arange(len(best_class)), best_class] = 1

print(y_pred)
print(one_hoted_y)
# The report is a string; it has to be printed to show up in the cell output.
print(classification_report(one_hoted_y, y_pred))
# Micro-averaged precision over the one-hot columns. Because every row has
# exactly one predicted and one true class, this equals plain accuracy.
precision_score(one_hoted_y, y_pred, average='micro')

[[0. 1.]
 [0. 1.]
 [0. 1.]
 ...
 [1. 0.]
 [0. 1.]
 [1. 0.]]
[[0. 1.]
 [0. 1.]
 [0. 1.]
 ...
 [1. 0.]
 [0. 1.]
 [1. 0.]]


0.9604680555555556

In [7]:
from sklearn.preprocessing import OneHotEncoder

# Reshape the held-out labels into a single-column 2-D array, the input layout
# OneHotEncoder expects.
before_one_hot = y_test.to_numpy().reshape(-1, 1)
print(before_one_hot)

# Fit a fresh encoder on the held-out labels and expand them into a dense
# indicator matrix (one column per distinct label value).
enc = OneHotEncoder()
one_hoted_y = enc.fit_transform(before_one_hot).toarray()
print(one_hoted_y.shape)

[[0.]
 [0.]
 [1.]
 ...
 [0.]
 [0.]
 [0.]]
(180000, 2)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [8]:
# Score the held-out split: gbm.predict returns one score column per class.
y_pred = gbm.predict(X_test)

# Convert the scores into a one-hot matrix marking the highest-scoring class of
# each row. Vectorised with argmax instead of a Python double loop; on an exact
# tie only the first maximal class is marked, so each row stays a valid one-hot
# vector.
best_class = np.argmax(y_pred, axis=1)
y_pred = np.zeros_like(y_pred)
y_pred[np.arange(len(best_class)), best_class] = 1

print(y_pred)
print(one_hoted_y)
# The report is a string; it has to be printed to show up in the cell output.
print(classification_report(one_hoted_y, y_pred))
# Micro-averaged precision over the one-hot columns. Because every row has
# exactly one predicted and one true class, this equals plain accuracy.
precision_score(one_hoted_y, y_pred, average='micro')

[[1. 0.]
 [1. 0.]
 [0. 1.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]
[[1. 0.]
 [1. 0.]
 [0. 1.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]


0.8887888888888889