In [17]:
import lightgbm as lgb
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder

# Seed both random number sources the notebook imports so reruns are repeatable.
random.seed(2019)
np.random.seed(2019)

# DataFrame display settings: show only a handful of rows, but allow many wide columns.
pd.set_option(
    'display.max_rows', 6,
    'display.max_columns', 500,
    'display.width', 280,
    'display.max_colwidth', 150,
)

# Input locations: preprocess_path holds the pickled feature table,
# data_path holds the dataset directory the label CSV is read from.
preprocess_path = 'preprocess'
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'

In [5]:
# Labels (age / gender per user_id) from the raw dataset directory.
label_df = pd.read_csv(f'{data_path}/train_preliminary/user.csv')

# Build the training table in one pass:
#   1. load the pickled per-user feature table,
#   2. attach the labels by user_id (default inner join),
#   3. shift age down by one (the prediction cells add the 1 back),
#   4. cast every column, user_id included, to float.
train_df = (
    pd.read_pickle(f'{preprocess_path}/train_user.pkl')
    .merge(label_df, on='user_id')
    .assign(age=lambda merged: merged['age'] - 1)
    .astype(float)
)
print(train_df)

         user_id  total_clks  active_days  day_clks_max  day_clks_min  day_clks_mean  day_clks_std  week_clks_max  week_clks_min  week_clks_mean  week_clks_std  weekend_clks_max  weekend_clks_min  weekend_clks_mean  weekend_clks_std  workday_clks_max  workday_clks_min  \
0            1.0        14.0         10.0           3.0           1.0       1.300000      0.674949            3.0            1.0        2.166667       0.983192               2.0               1.0               1.50          0.707107               3.0               1.0   
1            2.0        46.0         28.0           4.0           1.0       1.607143      0.994030            7.0            1.0        4.090909       2.256304               1.0               1.0               1.00          0.000000               7.0               1.0   
2            3.0        30.0         23.0           3.0           1.0       1.304348      0.634950            6.0            1.0        3.000000       1.763834               2.0       

In [34]:
# Feature matrix: everything except age, gender and the user_id column.
x = train_df.drop(columns=['age', 'user_id', 'gender'])
# Target: the age column as integer class codes.
y = train_df['age'].astype(int)
print(x)
print(y)

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
train_x, valid_x, train_y, valid_y = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=42,
)
print(train_x.shape)

# LightGBM datasets built from the raw arrays, with column names passed explicitly.
# The validation set references the training set.
train_data = lgb.Dataset(
    train_x.values,
    label=train_y,
    feature_name=list(train_x.columns),
    free_raw_data=False,
)
valid_data = lgb.Dataset(
    valid_x.values,
    label=valid_y,
    feature_name=list(train_x.columns),
    free_raw_data=False,
    reference=train_data,
)



0         3
1         9
2         6
         ..
838468    3
838469    2
838470    2
Name: age, Length: 838471, dtype: int64
(670776, 22)


In [32]:
# LightGBM configuration: 10-class softmax objective, evaluated with multi_error.
params = {
    'boosting_type': 'gbdt',
    'objective': 'softmax',
    'num_class': 10,
    'metric': 'multi_error',
    'num_leaves': 128,
    'learning_rate': 0.1,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'verbose': 1
}
print('Starting training...')

# Train for up to 2000 rounds, stopping once the validation metric has not
# improved for 100 rounds.
#
# Early stopping is requested through the callback rather than the
# `early_stopping_rounds=` keyword: the keyword was removed from lgb.train()
# in LightGBM 4.0, while the callback behaves the same on older releases.
# On LightGBM >= 4.0 per-iteration metric lines are only printed if
# lgb.log_evaluation() is also added to `callbacks`.
gbm = lgb.train(
    params,
    train_data,
    num_boost_round=2000,
    valid_sets=[train_data, valid_data],
    callbacks=[lgb.early_stopping(stopping_rounds=100)],
)

# Persist the trained booster as a text model file and report per-feature importances.
gbm.save_model('age_emb1.txt')
print('Feature importances:', list(gbm.feature_importance()))

Starting training...
[1]	training's multi_error: 0.775163	valid_1's multi_error: 0.773977
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_error: 0.775156	valid_1's multi_error: 0.773977
[3]	training's multi_error: 0.775154	valid_1's multi_error: 0.773977
[4]	training's multi_error: 0.775098	valid_1's multi_error: 0.773988
[5]	training's multi_error: 0.775043	valid_1's multi_error: 0.773988
[6]	training's multi_error: 0.774949	valid_1's multi_error: 0.773994
[7]	training's multi_error: 0.774804	valid_1's multi_error: 0.773983
[8]	training's multi_error: 0.774642	valid_1's multi_error: 0.773977
[9]	training's multi_error: 0.774402	valid_1's multi_error: 0.773887
[10]	training's multi_error: 0.774154	valid_1's multi_error: 0.773839
[11]	training's multi_error: 0.773886	valid_1's multi_error: 0.773822
[12]	training's multi_error: 0.773608	valid_1's multi_error: 0.77381
[13]	training's multi_error: 0.773252	valid_1's multi_error: 0.773774
[14]	training's 

[118]	training's multi_error: 0.737031	valid_1's multi_error: 0.775259
Early stopping, best iteration is:
[18]	training's multi_error: 0.771415	valid_1's multi_error: 0.773666
Feature importances: [1564, 1304, 448, 6, 1947, 1642, 611, 207, 1403, 1885, 289, 113, 1041, 1123, 564, 182, 1567, 1776, 1092, 771, 1530, 1795]


In [None]:
# NOTE(review): `final_train_y_df` is not defined in any cell visible here --
# confirm where it comes from before relying on a Restart & Run All.
# Reshape the labels into a single-column 2-D array for OneHotEncoder.
before_one_hot = final_train_y_df.values.reshape(-1, 1)
print(before_one_hot)

# The encoder is fitted on these same labels, so its output columns are the
# distinct label values present in this array.
enc = OneHotEncoder().fit(before_one_hot)
one_hoted_y = enc.transform(before_one_hot).toarray()
print(one_hoted_y.shape)


In [None]:
# NOTE(review): `final_train_x_df` is not defined in any cell visible here --
# confirm where it comes from before relying on a Restart & Run All.
# Per-class scores, one row per sample and one column per class.
y_pred = gbm.predict(final_train_x_df)

# Index of the highest-scoring class per row. argmax keeps the first maximum,
# so a tie yields exactly one predicted class instead of several 1s in a row.
predicted_class = np.argmax(y_pred, axis=1)

# Replace the scores with their one-hot form (1.0 at the predicted class, 0.0
# elsewhere). This is vectorized instead of a Python loop over every row and class.
y_pred = np.eye(y_pred.shape[1])[predicted_class]

print(precision_score(one_hoted_y, y_pred, average='micro'))

# Prediction table with one row per sample (no fixed 1,000,000-row cap).
# predicted_age is the class index + 1, undoing the "age - 1" shift applied at load time.
# NOTE(review): user_id here is the row position (0, 1, 2, ...), not the
# dataset's user_id column -- confirm this is intended.
ret_df = pd.DataFrame({
    'user_id': np.arange(len(predicted_class)),
    'predicted_age': predicted_class + 1,
})
print(ret_df['predicted_age'].value_counts())

In [None]:

# NOTE(review): `final_valid_y_df` is not defined in any cell visible here --
# confirm where it comes from before relying on a Restart & Run All.
# Reshape the labels into a single-column 2-D array for OneHotEncoder.
before_one_hot = final_valid_y_df.values.reshape(-1, 1)
print(before_one_hot)

# The encoder is fitted on these same labels, so its output columns are the
# distinct label values present in this array.
enc = OneHotEncoder().fit(before_one_hot)
one_hoted_y = enc.transform(before_one_hot).toarray()
print(one_hoted_y.shape)


In [None]:
# NOTE(review): `final_valid_x_df` is not defined in any cell visible here --
# confirm where it comes from before relying on a Restart & Run All.
# Per-class scores, one row per sample and one column per class.
y_pred = gbm.predict(final_valid_x_df)

# Replace the scores with their one-hot form: 1.0 at the highest-scoring class,
# 0.0 elsewhere. argmax keeps the first maximum, so a tie yields exactly one 1
# per row, and the work is vectorized instead of a Python loop over every entry.
y_pred = np.eye(y_pred.shape[1])[np.argmax(y_pred, axis=1)]

# Left as the last expression so the notebook displays the score.
precision_score(one_hoted_y, y_pred, average='micro')


In [None]:
# Build the prediction table from the one-hot `y_pred` left by the previous cell.
# argmax returns the position of the first 1 in each row; adding 1 undoes the
# "age - 1" shift applied at load time.
predicted_age = np.argmax(y_pred, axis=1) + 1

# One row per prediction, with no fixed 1,000,000-row cap that would silently
# drop rows beyond it.
# NOTE(review): user_id here is the row position (0, 1, 2, ...), not the
# dataset's user_id column -- confirm this is intended.
ret_df = pd.DataFrame({
    'user_id': np.arange(len(predicted_age)),
    'predicted_age': predicted_age,
})
print(ret_df['predicted_age'].value_counts())