In [1]:
import lightgbm as lgb
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

# Reproducibility: seed both the stdlib and the NumPy global generators.
random.seed(2019)
np.random.seed(2019)

# Display limits for DataFrame output in this notebook.
pd.options.display.max_rows = 20
pd.options.display.max_columns = 500
pd.options.display.width = 280
pd.options.display.max_colwidth = 150

# Root directory of the competition dataset on the training machine.
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'

In [60]:
# Load the training table and make every column numeric.
#
# NOTE(review): read_pickle executes arbitrary code embedded in the file;
# only load pickles produced by a trusted pipeline.
#
# * "\\N" is the literal two-character string \N (presumably the export's
#   missing-value marker -- confirm); it is mapped to -1 so the cast succeeds.
# * DataFrame.astype has no `inplace` option: it always returns a new frame,
#   and passing `inplace=True` raises TypeError on current pandas, so the
#   keyword is dropped and the result is chained instead.
train_df = (
    pd.read_pickle('train1.pkl')
    .replace("\\N", -1)
    .astype(float)
)
# Shift the age label down by one (presumably from 1-based to 0-based class
# ids, which is what a multiclass objective expects -- confirm).
train_df['age'] = train_df['age'] - 1
print(train_df)

In [63]:
# Target is the age column; predictors are every remaining column except
# 'user_id' and 'gender'.
# (An earlier experiment additionally dropped 'active_days' from the predictors.)
final_train_y_df = train_df['age']
final_train_x_df = train_df.drop(columns=['age', 'user_id', 'gender'])

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# partition stable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    final_train_x_df,
    final_train_y_df,
    test_size=0.20,
    random_state=42,
)
# Suffixes of the per-entity click statistics (count, ratio, length, moments).
num_normal_features = [
    '_clicks_max_click_cnt',
    '_max_clicked_ratio',
    '_clicks_min_click_cnt',
    '_min_clicked_ratio',
    '_clicks_len',
    '_clicks_mean',
    '_clicks_median',
    '_clicks_std',
]
# Same statistics for the date-derived groupings, without the two ratio columns.
num_date_features = [
    '_clicks_max_click_cnt',
    '_clicks_min_click_cnt',
    '_clicks_len',
    '_clicks_mean',
    '_clicks_median',
    '_clicks_std',
]

# Full numeric feature list: the total click count, then prefix+suffix names
# for the date groupings, then for the ad-entity groupings (order matters).
num_features = ['click_times_total']
num_features += [
    prefix + suffix
    for prefix in ('date', 'wday', 'month')
    for suffix in num_date_features
]
num_features += [
    prefix + suffix
    for prefix in ('product_id', 'product_category', 'industry', 'advertiser_id')
    for suffix in num_normal_features
]

#print(num_features)

# Categorical columns: for each ad entity, the id with the most and the
# fewest clicks (names are "<entity>_clicks_<max|min>_click").
c_features = [
    f'{entity}_clicks_{extreme}_click'
    for entity in ('industry', 'advertiser_id', 'product_id', 'product_category')
    for extreme in ('max', 'min')
]
# Column names handed to LightGBM: tfidf_0 ... tfidf_316 followed by the two
# activity counters. The earlier `features = num_features + c_features`
# assignment was overwritten on the very next line (dead code) and has been
# removed; num_features / c_features stay defined for other experiments.
# NOTE(review): feature_name is applied positionally, so this list must match
# the column order of X_train -- confirm against the pickled frame.
features = [f"tfidf_{i}" for i in range(317)] + ['active_days', 'click_times_total']

# free_raw_data=False keeps the raw frames attached to the Dataset objects.
train_data = lgb.Dataset(
    X_train,
    label=y_train,
    feature_name=features,
    free_raw_data=False,
)

# Validation set built against the training set via `reference`.
# The previous `categorical_feature=c_features` argument is dropped: none of
# those names appear in `features`, and train_data declares no categorical
# columns, so the two datasets were configured inconsistently.
eval_data = lgb.Dataset(
    X_test,
    label=y_test,
    feature_name=features,
    free_raw_data=False,
    reference=train_data,
)


In [64]:
# LightGBM configuration: gradient-boosted trees with a 10-class softmax
# objective, scored by multiclass error rate.
params = dict(
    boosting_type='gbdt',
    objective='softmax',
    num_class=10,
    metric='multi_error',
    # Tree size and step size.
    num_leaves=256,
    learning_rate=0.1,
    # Column subsampling, plus row subsampling every 5 iterations.
    feature_fraction=0.8,
    bagging_fraction=0.7,
    bagging_freq=5,
    verbose=1,
)
print('Starting training...')

# Train for at most 1000 boosting rounds, reporting the metric on both the
# training and the validation Dataset, and stop once the validation metric
# has not improved for 100 consecutive rounds.
#
# Early stopping is requested through the callback API rather than the
# `early_stopping_rounds=` keyword: the keyword was removed from lgb.train in
# LightGBM 4.0, while the callback is accepted by older and newer releases.
# NOTE(review): on releases that no longer log every round by default, add
# lgb.log_evaluation(period=1) to `callbacks` to get the per-round log back.
gbm = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, eval_data],
    callbacks=[lgb.early_stopping(100)],
)

# One importance value per feature, in the order of the Dataset's feature names.
print('Feature importances:', list(gbm.feature_importance()))

Starting training...
[1]	training's multi_error: 0.774357	valid_1's multi_error: 0.774978
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_error: 0.774125	valid_1's multi_error: 0.774861
[3]	training's multi_error: 0.772618	valid_1's multi_error: 0.773894
[4]	training's multi_error: 0.768843	valid_1's multi_error: 0.771444
[5]	training's multi_error: 0.763867	valid_1's multi_error: 0.768667
[6]	training's multi_error: 0.758508	valid_1's multi_error: 0.765206
[7]	training's multi_error: 0.751949	valid_1's multi_error: 0.761106
[8]	training's multi_error: 0.744914	valid_1's multi_error: 0.756494
[9]	training's multi_error: 0.738356	valid_1's multi_error: 0.751972
[10]	training's multi_error: 0.732706	valid_1's multi_error: 0.7488
[11]	training's multi_error: 0.726985	valid_1's multi_error: 0.745194
[12]	training's multi_error: 0.72175	valid_1's multi_error: 0.742317
[13]	training's multi_error: 0.717508	valid_1's multi_error: 0.7402
[14]	training's mult

[118]	training's multi_error: 0.570304	valid_1's multi_error: 0.711756
[119]	training's multi_error: 0.569528	valid_1's multi_error: 0.711917
[120]	training's multi_error: 0.568728	valid_1's multi_error: 0.711789
[121]	training's multi_error: 0.567656	valid_1's multi_error: 0.711656
[122]	training's multi_error: 0.56679	valid_1's multi_error: 0.711622
[123]	training's multi_error: 0.565703	valid_1's multi_error: 0.711717
[124]	training's multi_error: 0.564672	valid_1's multi_error: 0.711778
[125]	training's multi_error: 0.563907	valid_1's multi_error: 0.711633
[126]	training's multi_error: 0.562856	valid_1's multi_error: 0.711583
[127]	training's multi_error: 0.561967	valid_1's multi_error: 0.711533
[128]	training's multi_error: 0.560956	valid_1's multi_error: 0.711561
[129]	training's multi_error: 0.56015	valid_1's multi_error: 0.711644
[130]	training's multi_error: 0.559265	valid_1's multi_error: 0.71175
[131]	training's multi_error: 0.558307	valid_1's multi_error: 0.7118
[132]	train

In [65]:
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder expects a 2-D input, so turn the training labels into a
# single-column matrix first.
before_one_hot = y_train.values.reshape(-1, 1)
print(before_one_hot)

# Learn the label categories from the training labels and expand them into a
# dense indicator matrix (one column per category seen during fit).
enc = OneHotEncoder()
one_hoted_y = enc.fit(before_one_hot).transform(before_one_hot).toarray()
print(one_hoted_y.shape)


[[2.]
 [1.]
 [3.]
 ...
 [5.]
 [2.]
 [6.]]
(720000, 10)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [None]:
# Per-class scores for every training row (2-D: rows x classes).
# NOTE(review): this scores the model on the rows it was trained on, so the
# numbers below are optimistic; evaluate X_test / y_test for a hold-out figure.
y_proba = gbm.predict(X_train)

# Turn the scores into a one-hot prediction matrix with a vectorised argmax
# instead of a Python loop over every row and class. Exactly one column per
# row is set to 1 (on an exact tie the first maximal class wins).
y_pred = np.zeros_like(y_proba)
y_pred[np.arange(len(y_proba)), y_proba.argmax(axis=1)] = 1

print(y_pred)
print(one_hoted_y)
# classification_report returns its table as a string; it was previously
# computed and discarded, so print it explicitly.
print(classification_report(one_hoted_y, y_pred))
# Micro-averaged precision over the indicator matrices; as the last
# expression of the cell it is displayed as the cell result.
precision_score(one_hoted_y, y_pred, average='micro')

[[0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
