In [1]:
import lightgbm as lgb
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

np.random.seed(2019)
random.seed(2019)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 280)
pd.set_option('display.max_colwidth', 150)
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'

In [2]:
train_df =pd.read_pickle('train4.pkl')
train_df['age']  = train_df['age'] -1
train_df['gender']  = train_df['gender'] -1

valid_df =pd.read_pickle('valid4.pkl')
valid_df['gender']  = valid_df['gender'] -1

print(train_df)
print(valid_df)

        user_id  industry_0  industry_1  industry_2  industry_3  industry_4  industry_5  industry_6  industry_7  industry_8  industry_9  industry_10  industry_11  industry_12  industry_13  industry_14  industry_15  industry_top1_0  industry_top1_1  industry_top1_2  \
0         43718   -3.750512    1.645344   -0.083850   -4.099385   -1.014840    0.212098   -2.912475   -0.542816    0.502872    1.921272    -2.787402     0.725691    -1.245059    -0.047494     2.990072     1.889894        -5.544167         2.163624        -6.742880   
1        438200    1.114121   -0.625552    0.338307   -0.836104   -0.004797    0.725242   -1.546091   -1.569359   -1.329555   -0.773546    -0.010685    -0.027870    -1.450398     0.025883     0.986282     1.749809         3.614543        -0.057782        -1.737635   
2         74183    0.921379    1.065745    1.196028   -1.817816    0.382766   -0.806924   -0.931389   -2.712002   -0.037335    0.225830     0.027066     1.322888     0.541383     2.622776    -1.01

In [3]:
final_train_x_df = train_df.drop(['age','user_id','gender'], axis=1)
#final_train_x_df = train_df.drop(['age','user_id','gender','active_days'], axis=1)
final_train_y_df = train_df['gender']

final_valid_x_df = valid_df.drop(['age','user_id','gender'], axis=1)
final_valid_y_df = valid_df['gender']
num_normal_features = ['_clicks_max_click_cnt','_max_clicked_ratio','_clicks_min_click_cnt','_min_clicked_ratio','_clicks_len','_clicks_mean','_clicks_median','_clicks_std']
num_date_features  = [ '_clicks_max_click_cnt', '_clicks_min_click_cnt','_clicks_len','_clicks_mean','_clicks_median','_clicks_std']
num_features = ['click_times_total'] +\
                [f'date{i}'  for i in num_date_features] + \
                [f'wday{i}'  for i in num_date_features] + \
                [f'month{i}'  for i in num_date_features] + \
                 [f'product_id{i}'  for i in num_normal_features] + \
                 [f'product_category{i}'  for i in num_normal_features] + \
                [f'industry{i}'  for i in num_normal_features] + \
                [f'advertiser_id{i}'  for i in num_normal_features]

#print(num_features)

c_features = ['industry_clicks_max_click','industry_clicks_min_click',
              'advertiser_id_clicks_max_click','advertiser_id_clicks_min_click',
              'product_id_clicks_max_click','product_id_clicks_min_click',
              'product_category_clicks_max_click','product_category_clicks_min_click',
             ]
features= num_features + c_features
features= [f"industry_{i}" for i in range(16)] + [f"advertiser_id_{i}" for i in range(64)] +['active_days','click_times_total'] + [f"industry_top1_{i}" for i in range(16)] + [f"advertiser_id_top1_{i}" for i in range(64)]
#train_data = lgb.Dataset(final_train_x_df, label=final_train_y_df, feature_name=[   'max_clicked_industry', 'max_clicked_advertiser_id' ], categorical_feature=['max_clicked_industry','max_clicked_advertiser_id'])
#train_data = lgb.Dataset(X_train, label=y_train, feature_name=features, categorical_feature=c_features,free_raw_data=False)
train_data = lgb.Dataset(final_train_x_df, label=final_train_y_df, feature_name=features,free_raw_data=False)
eval_data = lgb.Dataset(final_valid_x_df, label=final_valid_y_df, feature_name=features,free_raw_data=False,reference=train_data)


In [4]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'softmax',
    'num_class':2,
    'metric': 'multi_error',
    'num_leaves': 128,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'verbose': 1
}
print('Starting training...')
# train

gbm = lgb.train(params,
                train_data,
                valid_sets=[train_data,eval_data],
                num_boost_round = 2000,
                early_stopping_rounds=50
               )
                #early_stopping_rounds=5)
gbm.save_model('gender_emb.txt')
print('Feature importances:', list(gbm.feature_importance()))

Starting training...
[1]	training's multi_error: 0.330718	valid_1's multi_error: 0.329294
Training until validation scores don't improve for 50 rounds.
[2]	training's multi_error: 0.330718	valid_1's multi_error: 0.329294
[3]	training's multi_error: 0.330424	valid_1's multi_error: 0.329
[4]	training's multi_error: 0.243329	valid_1's multi_error: 0.243189
[5]	training's multi_error: 0.209054	valid_1's multi_error: 0.209561
[6]	training's multi_error: 0.191026	valid_1's multi_error: 0.193028
[7]	training's multi_error: 0.179172	valid_1's multi_error: 0.181028
[8]	training's multi_error: 0.169614	valid_1's multi_error: 0.171806
[9]	training's multi_error: 0.162981	valid_1's multi_error: 0.165444
[10]	training's multi_error: 0.157478	valid_1's multi_error: 0.160256
[11]	training's multi_error: 0.153446	valid_1's multi_error: 0.155967
[12]	training's multi_error: 0.150264	valid_1's multi_error: 0.153161
[13]	training's multi_error: 0.147439	valid_1's multi_error: 0.150344
[14]	training's mul

[118]	training's multi_error: 0.101394	valid_1's multi_error: 0.113494
[119]	training's multi_error: 0.101281	valid_1's multi_error: 0.113506
[120]	training's multi_error: 0.101163	valid_1's multi_error: 0.113472
[121]	training's multi_error: 0.101033	valid_1's multi_error: 0.113433
[122]	training's multi_error: 0.100919	valid_1's multi_error: 0.113333
[123]	training's multi_error: 0.100822	valid_1's multi_error: 0.113194
[124]	training's multi_error: 0.100668	valid_1's multi_error: 0.113172
[125]	training's multi_error: 0.100596	valid_1's multi_error: 0.113211
[126]	training's multi_error: 0.10044	valid_1's multi_error: 0.113183
[127]	training's multi_error: 0.100293	valid_1's multi_error: 0.113006
[128]	training's multi_error: 0.100143	valid_1's multi_error: 0.112978
[129]	training's multi_error: 0.10005	valid_1's multi_error: 0.112939
[130]	training's multi_error: 0.0999194	valid_1's multi_error: 0.112806
[131]	training's multi_error: 0.0997514	valid_1's multi_error: 0.112767
[132]	

[233]	training's multi_error: 0.0893111	valid_1's multi_error: 0.109878
[234]	training's multi_error: 0.0892194	valid_1's multi_error: 0.109972
[235]	training's multi_error: 0.0891014	valid_1's multi_error: 0.109867
[236]	training's multi_error: 0.0890764	valid_1's multi_error: 0.1098
[237]	training's multi_error: 0.0889972	valid_1's multi_error: 0.109833
[238]	training's multi_error: 0.088925	valid_1's multi_error: 0.1098
[239]	training's multi_error: 0.0887597	valid_1's multi_error: 0.10985
[240]	training's multi_error: 0.0886861	valid_1's multi_error: 0.109911
[241]	training's multi_error: 0.0886056	valid_1's multi_error: 0.109928
[242]	training's multi_error: 0.0885417	valid_1's multi_error: 0.10995
[243]	training's multi_error: 0.0884792	valid_1's multi_error: 0.109939
[244]	training's multi_error: 0.0883847	valid_1's multi_error: 0.109889
[245]	training's multi_error: 0.0883042	valid_1's multi_error: 0.109867
[246]	training's multi_error: 0.0882528	valid_1's multi_error: 0.109922

[348]	training's multi_error: 0.0799236	valid_1's multi_error: 0.109161
[349]	training's multi_error: 0.0798778	valid_1's multi_error: 0.109083
[350]	training's multi_error: 0.0798222	valid_1's multi_error: 0.109078
[351]	training's multi_error: 0.0797417	valid_1's multi_error: 0.109083
[352]	training's multi_error: 0.0796639	valid_1's multi_error: 0.109067
[353]	training's multi_error: 0.0796083	valid_1's multi_error: 0.109083
[354]	training's multi_error: 0.0794861	valid_1's multi_error: 0.109144
[355]	training's multi_error: 0.0793833	valid_1's multi_error: 0.109111
[356]	training's multi_error: 0.0793347	valid_1's multi_error: 0.109133
[357]	training's multi_error: 0.0792917	valid_1's multi_error: 0.109072
[358]	training's multi_error: 0.0792042	valid_1's multi_error: 0.109072
[359]	training's multi_error: 0.0790903	valid_1's multi_error: 0.109094
[360]	training's multi_error: 0.0790292	valid_1's multi_error: 0.109022
[361]	training's multi_error: 0.0789833	valid_1's multi_error: 0

[463]	training's multi_error: 0.0712347	valid_1's multi_error: 0.10905
[464]	training's multi_error: 0.0711389	valid_1's multi_error: 0.108994
[465]	training's multi_error: 0.0710361	valid_1's multi_error: 0.108983
[466]	training's multi_error: 0.0709778	valid_1's multi_error: 0.109011
[467]	training's multi_error: 0.0709306	valid_1's multi_error: 0.108983
[468]	training's multi_error: 0.0708347	valid_1's multi_error: 0.109067
[469]	training's multi_error: 0.0707639	valid_1's multi_error: 0.108983
[470]	training's multi_error: 0.0706958	valid_1's multi_error: 0.108994
[471]	training's multi_error: 0.0706208	valid_1's multi_error: 0.108983
[472]	training's multi_error: 0.0705528	valid_1's multi_error: 0.108906
[473]	training's multi_error: 0.0704833	valid_1's multi_error: 0.108917
[474]	training's multi_error: 0.0704042	valid_1's multi_error: 0.108822
[475]	training's multi_error: 0.0703556	valid_1's multi_error: 0.108811
[476]	training's multi_error: 0.0702806	valid_1's multi_error: 0.

In [5]:
from sklearn.preprocessing import  OneHotEncoder

before_one_hot =  final_train_y_df.values.reshape([-1,1])
print(before_one_hot)
enc = OneHotEncoder()
enc.fit(before_one_hot)

one_hoted_y  = enc.transform(before_one_hot).toarray()
print(one_hoted_y.shape)


[[0]
 [0]
 [0]
 ...
 [1]
 [0]
 [0]]
(720000, 2)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [6]:
y_pred = gbm.predict(final_train_x_df)
for i in range(len(y_pred)):
        max_value=max(y_pred[i])
        for j in range(len(y_pred[i])):
            if max_value==y_pred[i][j]:
                y_pred[i][j]=1
            else:
                y_pred[i][j]=0
print(y_pred)
print(one_hoted_y)
classification_report(one_hoted_y, y_pred)
precision_score(one_hoted_y, y_pred,average='micro')

[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [0. 1.]
 [1. 0.]
 [1. 0.]]
[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [0. 1.]
 [1. 0.]
 [1. 0.]]


0.9266277777777778

In [7]:
ret = []
for user_id,age in zip(range(1000000),y_pred):
    ret.append([int(user_id),int(age.tolist().index(1) + 1)])

ret_df = pd.DataFrame(ret,columns=['user_id','predicted_age'])
print(ret_df)
print(ret_df['predicted_age'].value_counts())

        user_id  predicted_age
0             0              1
1             1              1
2             2              1
3             3              2
4             4              1
5             5              1
6             6              1
7             7              1
8             8              2
9             9              1
...         ...            ...
719990   719990              1
719991   719991              1
719992   719992              1
719993   719993              1
719994   719994              1
719995   719995              2
719996   719996              1
719997   719997              2
719998   719998              1
719999   719999              1

[720000 rows x 2 columns]
1    492263
2    227737
Name: predicted_age, dtype: int64
