In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import sys
sys.path.append('../')

from config import *
import pandas as pd
from tools import *

import numpy as np
from sklearn.model_selection import KFold
import lightgbm as lgb
from tqdm import tqdm
from scipy import sparse

from multiprocessing import Pool 
from multiprocessing import cpu_count

In [2]:
# Load the training user log and the matching ad-info table from pickles.
# TRAIN_DIR, USER_LOG_PATH and AD_INFO_PATH are presumably supplied by
# `from config import *` -- TODO confirm.
# NOTE(review): unpickling can execute arbitrary code; only read trusted files.
tr_user_log = pd.read_pickle(TRAIN_DIR+USER_LOG_PATH)
tr_ad_info = pd.read_pickle(TRAIN_DIR+AD_INFO_PATH)

In [3]:
# Load the test user log and the matching ad-info table from pickles.
# TEST_DIR, USER_LOG_PATH and AD_INFO_PATH are presumably supplied by
# `from config import *` -- TODO confirm.
# NOTE(review): unpickling can execute arbitrary code; only read trusted files.
ts_user_log = pd.read_pickle(TEST_DIR+USER_LOG_PATH)
ts_ad_info = pd.read_pickle(TEST_DIR+AD_INFO_PATH)

In [4]:
# Targets: one label per user, taken from the first log row of that user.
# Both Series are indexed by user_id.
AGE = tr_user_log.groupby('user_id')['age'].first()
GENDER = tr_user_log.groupby('user_id')['gender'].first()

In [5]:
# Put the ad attributes next to every log row (train and test).
# pd.concat(axis=1) aligns rows on the index, not on position, so checking
# that creative_id matches row by row is not enough: the two frames must also
# share the same index, otherwise rows would be silently misaligned.
ad_columns = ['ad_id', 'product_id', 'product_category', 'advertiser_id', 'industry']

assert tr_user_log.index.equals(tr_ad_info.index), "train log / ad info index mismatch"
assert np.array_equal(tr_user_log['creative_id'].values, tr_ad_info['creative_id'].values)
grid_df = pd.concat([tr_user_log, tr_ad_info[ad_columns]], axis=1)

assert ts_user_log.index.equals(ts_ad_info.index), "test log / ad info index mismatch"
assert np.array_equal(ts_user_log['creative_id'].values, ts_ad_info['creative_id'].values)
grid_df_test = pd.concat([ts_user_log, ts_ad_info[ad_columns]], axis=1)

# 统计特征

##### 点击每天特征

In [6]:
# Training users: total click_times per (user, time), spread into one column
# per distinct `time` value (user_id becomes the index).
tr_clk_time_per_day = (
    tr_user_log
    .groupby(['user_id', 'time'])
    .agg({'click_times': np.sum})
    .reset_index()
    .pivot_table(index='user_id', columns='time', values='click_times')
)
tr_clk_time_per_day.columns = ['time_' + str(day) for day in tr_clk_time_per_day.columns]

# A user with no rows for a given `time` value gets 0 instead of NaN.
tr_clk_time_per_day = tr_clk_time_per_day.fillna(0)

In [28]:
# Persist the training per-day click table next to the training data.
tr_clk_time_per_day.to_pickle(TRAIN_DIR+'per_day_click.pkl')

In [7]:
# Test users: total click_times per (user, time), spread into one column
# per distinct `time` value (user_id becomes the index).
ts_clk_time_per_day = (
    ts_user_log
    .groupby(['user_id', 'time'])
    .agg({'click_times': np.sum})
    .reset_index()
    .pivot_table(index='user_id', columns='time', values='click_times')
)
ts_clk_time_per_day.columns = ['time_' + str(day) for day in ts_clk_time_per_day.columns]

# A user with no rows for a given `time` value gets 0 instead of NaN.
ts_clk_time_per_day = ts_clk_time_per_day.fillna(0)

In [29]:
# Persist the test per-day click table next to the test data.
ts_clk_time_per_day.to_pickle(TEST_DIR+'per_day_click.pkl')

##### K-fold Target Encode

In [8]:
# Number of folds; used for the target-encoding split and for the model CV below.
N_SPLITS = 5

In [9]:
# Shuffled K-fold splitter with a fixed seed, so the user -> fold assignment
# used for target encoding is the same on every run.
folds = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2020)

In [10]:
# One row per user (sorted by user_id), then label every user with the fold
# in which it is part of the validation split.
user_kfold = grid_df.groupby(['user_id']).first().reset_index()
for fold_, (trn_idx, val_idx) in enumerate(folds.split(user_kfold, user_kfold)):
    # val_idx holds row positions; after reset_index they equal the row labels.
    user_kfold.loc[val_idx, 'fold'] = fold_

# Only the user -> fold mapping is needed downstream.
user_kfold = user_kfold.loc[:, ['user_id', 'fold']]

In [11]:
# Tag every log row with its user's fold id; the left join keeps all log rows.
kfold_grid_df = grid_df.merge(user_kfold, on=['user_id'], how='left')

In [12]:
### kfold split target encode
##############################
def target_encode_df(tr_df, vl_df, feat, label='age', value='click_times'):
    """Target-encode ``feat`` on ``vl_df`` with label shares learned from ``tr_df``.

    For every distinct ``feat`` value in ``tr_df`` the summed ``value`` is
    split by ``label`` class and normalised so that the shares of one ``feat``
    value add up to 1. The shares are joined onto the rows of ``vl_df`` and
    multiplied by each row's own ``value``.

    Parameters
    ----------
    tr_df : pandas.DataFrame
        Frame the shares are learned from; needs ``feat``, ``label``, ``value``.
    vl_df : pandas.DataFrame
        Frame to encode; needs ``user_id``, ``feat`` and ``value``.
    feat : str
        Column to encode.
    label : str, default 'age'
        Target column. ``'age'`` is treated as 10 classes, anything else as 2;
        classes are expected to be numbered from 1.
    value : str, default 'click_times'
        Weight column that is summed and used to scale the shares.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``vl_df`` with ``user_id`` followed by the columns
        ``<feat>_<label>_rate<k>``. ``feat`` values unseen in ``tr_df`` and
        classes absent from ``tr_df`` yield 0.
    """
    col_cnt = 10 if label == 'age' else 2
    rate_cols = [feat + "_" + label + '_rate' + str(i) for i in range(1, col_cnt + 1)]

    def target_encode(df, feat, label='age', value='click_times'):
        # Summed weight per (feat value, class).
        totals = df.groupby([feat, label])[value].sum()
        # transform('sum') keeps the (feat, label) index untouched, unlike
        # groupby.apply, whose index layout depends on the pandas version.
        shares = totals / totals.groupby(level=0).transform('sum')
        table = shares.unstack(level=1)
        table.columns = [feat + "_" + label + '_rate' + str(i) for i in table.columns]
        return table

    temp_pcts = target_encode(tr_df, feat, label, value)

    # Guarantee every expected class column exists even if a class never
    # occurs in tr_df; any unexpected extra columns are kept after them.
    ordered = rate_cols + [c for c in temp_pcts.columns if c not in rate_cols]
    temp_pcts = temp_pcts.reindex(columns=ordered).reset_index()

    temp_pcts = vl_df[['user_id', feat, value]].merge(temp_pcts, on=[feat], how='left').fillna(0)

    # Weight each share by the row's own value (e.g. its click count).
    for rate_col in rate_cols:
        temp_pcts[rate_col] = temp_pcts[rate_col] * temp_pcts[value]

    del temp_pcts[feat], temp_pcts[value]
    return temp_pcts

In [15]:
### get kfold target encode dataframe
##############################
def get_te_df(feat):
    """Build out-of-fold target-encoding statistics per user for ``feat``.

    For each fold, the rows of that fold in ``kfold_grid_df`` are encoded with
    ``target_encode_df`` using only the rows of the other folds, then reduced
    per user to mean / median / max / std of every encoded column.

    Returns a DataFrame indexed by ``user_id`` and sorted by it, with columns
    named ``<encoded column>_<stat>``.
    """
    fold_frames = []

    for fold_id in range(N_SPLITS):
        in_fold = kfold_grid_df['fold'] == fold_id
        encoded = target_encode_df(kfold_grid_df[~in_fold], kfold_grid_df[in_fold], feat=feat)

        per_stat = []
        for stat in ('mean', 'median', 'max', 'std'):
            reduced = encoded.groupby(['user_id']).agg(stat)
            reduced.columns = [name + '_' + stat for name in reduced.columns]
            per_stat.append(reduced)

        fold_frames.append(pd.concat(per_stat, axis=1).reset_index())

    stacked = pd.concat(fold_frames, axis=0)
    return stacked.sort_values(by=['user_id']).set_index('user_id')

In [16]:
# Encode each feature in its own worker process (one process per feature).
t_split = ['time', 'product_id', 'product_category', 'advertiser_id', 'industry']
pool = Pool(5)
try:
    dfs = pool.map(get_te_df, t_split)
finally:
    # Always release the worker processes, even when a worker raised.
    pool.close()
    pool.join()

In [17]:
# Join the per-feature encodings side by side; every frame returned by
# get_te_df is indexed by user_id, so rows align on user.
kfold_df = pd.concat(dfs, axis=1)

In [32]:
# Persist the training target-encoding features.
kfold_df.to_pickle(TRAIN_DIR+'kfold_te.pkl')

In [18]:
### test set
##################################
def target_encode_test(feat):
    """Build target-encoding statistics per test user for ``feat``.

    The rows of ``grid_df_test`` are encoded with ``target_encode_df`` using
    all of ``grid_df`` as the source of the label shares, then reduced per user
    to mean / median / max / std of every encoded column.

    Returns a DataFrame indexed by ``user_id`` and sorted by it, with columns
    named ``<encoded column>_<stat>``.
    """
    encoded = target_encode_df(grid_df, grid_df_test, feat=feat)

    per_stat = []
    for stat in ('mean', 'median', 'max', 'std'):
        reduced = encoded.groupby(['user_id']).agg(stat)
        reduced.columns = [name + '_' + stat for name in reduced.columns]
        per_stat.append(reduced)

    combined = pd.concat(per_stat, axis=1).reset_index()
    return combined.sort_values(by=['user_id']).set_index('user_id')

In [19]:
# Encode each feature of the test set in its own worker process.
t_split = ['time', 'product_id', 'product_category', 'advertiser_id', 'industry']
pool = Pool(5)
try:
    dfs = pool.map(target_encode_test, t_split)
finally:
    # Always release the worker processes, even when a worker raised.
    pool.close()
    pool.join()

In [20]:
# Join the per-feature test encodings side by side; every frame returned by
# target_encode_test is indexed by user_id, so rows align on user.
kfold_df_test = pd.concat(dfs, axis=1)

In [33]:
# Persist the test target-encoding features.
kfold_df_test.to_pickle(TEST_DIR+'kfold_te.pkl')

#### 序列统计特征

In [22]:
## statistic feature
##########################
def get_nunique_feat(df, main_key, col):
    """Count the distinct values of ``col`` within each ``main_key`` group.

    Returns a Series indexed by ``main_key`` and named ``<col>_nunique``.
    """
    distinct_counts = df.groupby([main_key])[col].nunique()
    return distinct_counts.rename(col + '_nunique')

def get_count_feat(df, main_key, col):
    """Count the non-null entries of ``col`` within each ``main_key`` group.

    Returns a Series indexed by ``main_key``; it is always named ``log_cnt``,
    whatever ``col`` is.
    """
    row_counts = df.groupby([main_key])[col].count()
    return row_counts.rename('log_cnt')

def get_sum_mean_max_min_std_feat(df, main_key, col):
    """Summarise ``col`` per ``main_key`` group with sum, mean, max, min and std.

    Returns a DataFrame indexed by ``main_key`` whose columns are, in order,
    ``<col>_sum``, ``<col>_mean``, ``<col>_max``, ``<col>_min``, ``<col>_std``.
    """
    stats = ['sum', 'mean', 'max', 'min', 'std']
    summary = df.groupby([main_key])[col].agg(stats)
    summary.columns = [col + '_' + stat for stat in stats]
    return summary

In [23]:
# Per-user sequence statistics for the training set: distinct-value counts for
# each id-like column, the number of log rows, and click_times summaries.
feature_columns = ['creative_id', 'time', 'ad_id', 'product_id' ,'product_category', 'advertiser_id', 'industry']
dfs = [get_nunique_feat(grid_df, 'user_id', name) for name in feature_columns]
dfs += [
    get_count_feat(grid_df, 'user_id', 'creative_id'),
    get_sum_mean_max_min_std_feat(grid_df, 'user_id', 'click_times'),
]

In [24]:
# Combine the per-user statistics column-wise; every part is indexed by user_id.
seq_statistic_df = pd.concat(dfs, axis=1)

In [36]:
# Persist the training sequence statistics.
seq_statistic_df.to_pickle(TRAIN_DIR+'seq_statistic.pkl')

In [25]:
### test set
#################################
# Per-user sequence statistics for the test set: distinct-value counts for
# each id-like column, the number of log rows, and click_times summaries.
feature_columns = ['creative_id', 'time', 'ad_id', 'product_id' ,'product_category', 'advertiser_id', 'industry']
dfs = [get_nunique_feat(grid_df_test, 'user_id', name) for name in feature_columns]
dfs += [
    get_count_feat(grid_df_test, 'user_id', 'creative_id'),
    get_sum_mean_max_min_std_feat(grid_df_test, 'user_id', 'click_times'),
]

In [26]:
# Combine the per-user test statistics column-wise; every part is indexed by user_id.
seq_statistic_df_test = pd.concat(dfs, axis=1)

In [37]:
# Persist the test sequence statistics.
seq_statistic_df_test.to_pickle(TEST_DIR+'seq_statistic.pkl')

### 两两交叉统计特征

In [39]:
# Show which columns grid_df holds before the cross features are added.
grid_df.columns.tolist()

['user_id',
 'age',
 'gender',
 'time',
 'creative_id',
 'click_times',
 'ad_id',
 'product_id',
 'product_category',
 'advertiser_id',
 'industry']

In [None]:
# Pairwise cross features: for every pair of base columns, join their string
# forms into one combined column, then count per user how many distinct
# combinations occur.
#
# The base columns are listed explicitly (grid_df's columns minus user_id,
# age, gender, click_times, creative_id and ad_id) so that re-running this
# cell does not pick up the cross columns it has already added to grid_df.
CROSS_FEATURES = ['time', 'product_id', 'product_category', 'advertiser_id', 'industry']

# Names of the generated cross columns, in creation order.
cf_feat = []
for i in tqdm(range(len(CROSS_FEATURES)-1)):
    feat1 = CROSS_FEATURES[i]
    for feat2 in tqdm(CROSS_FEATURES[i+1:]):
        col_name = feat1+"_"+feat2
        grid_df[col_name] = grid_df[feat1].astype(str).values + '_' + grid_df[feat2].astype(str).values
        # The test frame is grid_df_test; `grid_df_ts` was never defined.
        grid_df_test[col_name] = grid_df_test[feat1].astype(str).values + '_' + grid_df_test[feat2].astype(str).values
        cf_feat.append(col_name)

# Distinct-combination counts per user, for train (dfs1) and test (dfs2).
dfs1 = [get_nunique_feat(grid_df, 'user_id', feat) for feat in cf_feat]
dfs2 = [get_nunique_feat(grid_df_test, 'user_id', feat) for feat in cf_feat]
cf_df = pd.concat(dfs1, axis=1)
cf_df_ts = pd.concat(dfs2, axis=1)

## Merge all features

In [None]:
# Assemble the training matrix: per-day clicks, sequence statistics,
# target-encoding features and the age label, all indexed by user_id and
# joined column-wise; reset_index then turns user_id into a regular column.
# NOTE(review): the cross-feature table cf_df is not part of this concat -- confirm that is intended.
train_df = pd.concat([tr_clk_time_per_day, seq_statistic_df, kfold_df, AGE], axis=1).reset_index()

In [None]:
# Assemble the test matrix from the same feature groups as the training one
# (there is no label column); reset_index turns user_id into a regular column.
test_df = pd.concat([ts_clk_time_per_day, seq_statistic_df_test, kfold_df_test], axis=1).reset_index()

# Keep the user ids for the submission file. Take an explicit copy so that
# adding the prediction column later writes to an independent frame and does
# not trigger pandas' SettingWithCopyWarning.
test_user = test_df[['user_id']].copy()

# user_id is an identifier, not a feature.
del test_df['user_id']

# Metric

In [None]:
def accuracy(y_true, y_pred):
    """Return the fraction of positions at which ``y_true`` equals ``y_pred``.

    Both arguments are sized iterables of the same length; an AssertionError
    is raised when the lengths differ.
    """
    assert len(y_true) == len(y_pred), "length of y_true and y_pred not equal"
    matches = sum(1 for truth, guess in zip(y_true, y_pred) if truth == guess)
    return matches / len(y_true)

# Model

In [None]:
TARGET = 'age'

# Every column except the user identifier and the label is a model feature.
FEATURE_COLUMNS = list(train_df.columns)
for excluded in ('user_id', TARGET):
    FEATURE_COLUMNS.remove(excluded)

X_train = train_df[FEATURE_COLUMNS]
# Shift the label down by one so the classes start at 0.
y_train = train_df[TARGET] - 1

In [None]:
# LightGBM parameters for the 10-class age model (labels are shifted to start
# at 0 before training).
# NOTE(review): 'n_estimators' is presumably read by lgb.train as the number
# of boosting rounds -- confirm against the installed LightGBM version.
param = { 
    'boosting_type': 'gbdt',  
    'objective': 'multiclass',  
    'num_class': 10,  
    'metric': ['multi_error'],  
    'num_leaves': 2**9,  
    'min_data_in_leaf': 500,  
    'learning_rate': 0.1,  
    'feature_fraction': 0.8,  
    'bagging_fraction': 0.8,  
    'bagging_freq': 5,  
    'lambda_l1': 0.4,  
    'lambda_l2': 0.5,  
    'min_gain_to_split': 0.2,  
    'verbose': -1,
    'num_threads':6,
    'n_estimators': 1000
}

In [None]:
# NOTE(review): the seed is drawn at random, so the split (and the reported
# accuracy) differs between runs -- fix the seed if reproducibility matters.
folds = KFold(n_splits=N_SPLITS, shuffle=True, random_state=np.random.randint(2020))
# 5-fold cross-validation split; only the first fold is trained (see `break`).
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    # KFold yields row positions, so select rows positionally with .iloc for
    # both features and labels; plain y_train[idx] would look up index labels
    # and only works while the index happens to be 0..n-1.
    trn_data = lgb.Dataset(X_train.iloc[trn_idx], y_train.iloc[trn_idx])
    val_data = lgb.Dataset(X_train.iloc[val_idx], y_train.iloc[val_idx])

    # NOTE(review): newer LightGBM releases expect early stopping / logging to
    # be passed as callbacks instead of these keyword arguments -- confirm
    # against the installed version.
    clf = lgb.train(param, 
                    trn_data, 
                    valid_sets = [trn_data, val_data], 
                    verbose_eval = 100, 
                    early_stopping_rounds = 200)
    
    # predict returns one score per class; the arg-max is the predicted class.
    y_val_pred = clf.predict(X_train.iloc[val_idx])
    y_val_pred = np.argmax(y_val_pred,axis=-1).tolist()
    acc = accuracy(y_train.iloc[val_idx].values.tolist(), y_val_pred)
    print("kfold: {:d}, accuracy: {:.4f}".format(fold_+1, acc))
    
    # Stop after the first fold; `clf` is the model used for the test set.
    break

In [None]:
# Write the trained booster to disk as a LightGBM text model.
# FIXME(review): absolute, user-specific path -- the notebook will not run on
# another machine; consider deriving it from a configurable directory.
SAVE_PATH = '/home/huangzc/competition/tencent/model_ckpt/lgb/model.txt'
clf.save_model(SAVE_PATH)

In [None]:
# Reload the booster from the saved model file (replaces the in-memory `clf`).
clf = lgb.Booster(model_file=SAVE_PATH)

In [None]:
# Score the test users; the arg-max over the last axis is taken afterwards.
# NOTE(review): test_df is passed as-is -- verify that its columns match the
# training FEATURE_COLUMNS in number and order (the per-`time` columns come
# from separate pivots of the train and test logs).
predicted_age = clf.predict(test_df)

In [None]:
# Pick the highest-scoring class and add 1 to undo the `- 1` shift applied to
# the label when y_train was built.
predicted_age = np.argmax(predicted_age, axis=-1) + 1

In [None]:
# Attach the predictions positionally and build a new frame: this avoids
# writing into a slice of test_df (SettingWithCopyWarning) and does not depend
# on test_user's index matching a fresh 0..n-1 Series index.
test_user = test_user.assign(predicted_age=predicted_age)

In [None]:
# Write the submission file without the row index.
# SUBMISSION_AGE_PATH is presumably defined in config -- TODO confirm.
test_user.to_csv(SUBMISSION_AGE_PATH, index=False)