In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import sys
sys.path.append('../')

from config import *
import pandas as pd
from tools import *
import numpy as np
from sklearn.model_selection import KFold
import lightgbm as lgb
from tqdm import tqdm
from scipy import sparse
import matplotlib.pyplot as plt
import scipy

In [None]:
# Label column to model. Besides selecting the column of `target` used for
# training, it is interpolated into the CLK_PATH_DICT keys of the
# target-specific feature files ('kfold_te_%s', 'tfidf_stack_%s').
TARGET = 'age'

In [None]:
### target
# Collapse the pickled user log to one row per user_id, keeping the first
# value of every other column, and turn user_id back into a regular column.
# NOTE(review): the grouped result is ordered by user_id; the feature matrices
# are presumably stored in that same order -- confirm before trusting labels.
target = (
    pd.read_pickle(TRAIN_DIR + USER_LOG_PATH)
    .groupby(['user_id'])
    .first()
    .reset_index()
)

## 统计特征

In [None]:
# Each feature family is stored under the same CLK_PATH_DICT key for both
# splits, so the train/test pair is read by iterating over the two roots
# (train first, then test).

### Per Day Click Times
tr_per_day_clk_times, ts_per_day_clk_times = (
    pd.read_pickle(split_dir + CLK_PATH_DICT['per_day_click'])
    for split_dir in (TRAIN_DIR, TEST_DIR)
)

### Kfold Target Encode
# The key depends on TARGET, so a different file is picked up per label.
tr_kfold_te, ts_kfold_te = (
    pd.read_pickle(split_dir + CLK_PATH_DICT['kfold_te_%s' % TARGET])
    for split_dir in (TRAIN_DIR, TEST_DIR)
)

### Sequence Statistic Feature
tr_seq_statistic, ts_seq_statistic = (
    pd.read_pickle(split_dir + CLK_PATH_DICT['seq_statistic'])
    for split_dir in (TRAIN_DIR, TEST_DIR)
)

## 点击list TFIDF

In [None]:
# Load the seven pre-computed TF-IDF sparse matrices (.npz) registered in
# CLK_PATH_DICT, in the same order as the names on the left-hand side.
# NOTE(review): every file is read from TRAIN_DIR and the merge cell later
# keeps only the leading rows of each matrix -- presumably train and test
# users are stacked in one file; confirm.
(
    tfidf_creative_id,
    tfidf_ad_id,
    tfidf_product_id,
    tfidf_product_category,
    tfidf_advertiser_id,
    tfidf_industry,
    tfidf_time,
) = (
    sparse.load_npz(TRAIN_DIR + CLK_PATH_DICT[feature_key])
    for feature_key in (
        'tfidf_creative_id',
        'tfidf_ad_id',
        'tfidf_product_id',
        'tfidf_product_category',
        'tfidf_advertiser_id',
        'tfidf_industry',
        'tfidf_time',
    )
)

In [None]:
# Target-specific stacking features built on the TF-IDF matrices; the
# CLK_PATH_DICT key is derived from TARGET and read for train, then test.
tr_tfidf_stacking, ts_tfidf_stacking = (
    pd.read_pickle(split_dir + CLK_PATH_DICT['tfidf_stack_%s' % TARGET])
    for split_dir in (TRAIN_DIR, TEST_DIR)
)

## 点击list Countvec

In [None]:
# Load the seven pre-computed count-vector sparse matrices (.npz) registered
# in CLK_PATH_DICT, in the same order as the names on the left-hand side.
# NOTE(review): as with the TF-IDF matrices, only TRAIN_DIR is read and the
# merge cell keeps just the leading rows -- confirm the row layout.
(
    creative_id_cntv_user,
    ad_id_cntv_user,
    product_id_cntv_user,
    product_category_cntv_user,
    advertiser_id_cntv_user,
    industry_cntv_user,
    time_cntv_user,
) = (
    sparse.load_npz(TRAIN_DIR + CLK_PATH_DICT[feature_key])
    for feature_key in (
        'creative_id_cntv_user',
        'ad_id_cntv_user',
        'product_id_cntv_user',
        'product_category_cntv_user',
        'advertiser_id_cntv_user',
        'industry_cntv_user',
        'time_cntv_user',
    )
)

### Merge

In [None]:
# Place the dense feature frames side by side, one wide frame per split.
# pd.concat aligns on the index when joining columns.
# NOTE(review): assumes all frames of a split share the same index -- confirm.
train_df = pd.concat(
    [
        tr_per_day_clk_times,
        tr_kfold_te,
        tr_seq_statistic,
        tr_tfidf_stacking,
    ],
    axis='columns',
)
test_df = pd.concat(
    [
        ts_per_day_clk_times,
        ts_kfold_te,
        ts_seq_statistic,
        ts_tfidf_stacking,
    ],
    axis='columns',
)

In [None]:
# Convert both dense frames to CSR so they can later be stacked horizontally
# with the sparse TF-IDF / count-vector matrices.
train_csr, test_csr = (
    sparse.csr_matrix(frame.values) for frame in (train_df, test_df)
)

In [None]:
# Build the final training matrix: dense statistic columns first, followed by
# the TF-IDF blocks and then the count-vector blocks. The order is fixed
# because it defines the column layout the models are trained on.
sparse_feature_blocks = [
    tfidf_creative_id,
    tfidf_ad_id,
    tfidf_product_id,
    tfidf_product_category,
    tfidf_advertiser_id,
    tfidf_industry,
    tfidf_time,

    creative_id_cntv_user,
    ad_id_cntv_user,
    product_id_cntv_user,
    product_category_cntv_user,
    advertiser_id_cntv_user,
    industry_cntv_user,
    time_cntv_user,
]

# Number of leading rows to keep from every sparse block. It is derived from
# the dense training frame instead of a repeated hard-coded literal, so the
# cell keeps working if the training set size changes.
# NOTE(review): assumes the training rows come first in each block -- confirm.
n_train_rows = train_df.shape[0]
assert all(block.shape[0] >= n_train_rows for block in sparse_feature_blocks), \
    "a sparse feature block has fewer rows than train_df"

# The dense part is rebuilt from train_df rather than read from train_csr so
# that re-running this cell does not append the sparse blocks a second time.
train_csr = sparse.hstack(
    [sparse.csr_matrix(train_df.values)]
    + [block[:n_train_rows] for block in sparse_feature_blocks]
).tocsr()

In [None]:
# Training inputs: the stacked sparse feature matrix.
X_train = train_csr
# Training labels: the TARGET column shifted down by one.
# NOTE(review): presumably the raw labels start at 1, so this yields the
# zero-based class ids expected by the multiclass objective -- confirm.
y_train = target[TARGET] - 1

# Metric

In [None]:
def accuracy(y_true, y_pred):
    """Return the fraction of positions at which the two sequences agree.

    Args:
        y_true: sized iterable of reference labels.
        y_pred: sized iterable of predicted labels, same length as ``y_true``.

    Returns:
        Number of equal pairs divided by ``len(y_true)``.

    Raises:
        AssertionError: if the two inputs differ in length.
        ZeroDivisionError: if the inputs are empty.
    """
    n_examples = len(y_true)
    assert n_examples == len(y_pred), "length of y_true and y_pred not equal"
    # Pairs are compared with `==` and counted only when the result is truthy.
    n_correct = sum(1 for truth, guess in zip(y_true, y_pred) if truth == guess)
    return n_correct / n_examples

# Model

In [None]:
# LightGBM training parameters handed to lgb.train for every fold.
param = dict(
    # Task definition: gradient-boosted trees on a 10-way classification
    # problem, evaluated with the multiclass error rate.
    # NOTE(review): num_class=10 is hard-coded; confirm it matches the number
    # of distinct values of the TARGET column.
    boosting_type='gbdt',
    objective='multiclass',
    num_class=10,
    metric=['multi_error'],
    # Tree shape and step size.
    num_leaves=512,
    min_data_in_leaf=500,
    learning_rate=0.1,
    # Column / row subsampling.
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    # Regularisation.
    lambda_l1=0.4,
    lambda_l2=0.5,
    min_gain_to_split=0.2,
    # Runtime settings.
    verbose=-1,
    num_threads=10,
    # Alias LightGBM accepts for the number of boosting iterations.
    n_estimators=1000,
)

In [None]:
# K-fold cross-validation: train one LightGBM booster per fold, keep it in
# `models`, and report the accuracy on that fold's held-out rows.
models = []
N_SPLITS = 5
# Fixed seed for the shuffled split. Drawing the seed at random made the fold
# assignment (and every reported accuracy) different on each run.
KFOLD_SEED = 2020
folds = KFold(n_splits=N_SPLITS, shuffle=True, random_state=KFOLD_SEED)
# Five-fold cross-validation
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))

    # KFold yields positional row numbers, so the labels are selected by
    # position (.iloc) rather than by index label. The validation rows are
    # sliced once and reused for both the Dataset and the prediction.
    X_trn, X_val = X_train[trn_idx], X_train[val_idx]
    y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx]

    trn_data = lgb.Dataset(X_trn, y_trn)
    val_data = lgb.Dataset(X_val, y_val)

    # NOTE(review): `verbose_eval` and `early_stopping_rounds` are keyword
    # arguments of older lgb.train signatures; newer LightGBM releases expect
    # callbacks instead -- confirm against the installed version.
    clf = lgb.train(param,
                    trn_data,
                    valid_sets = [trn_data, val_data],
                    verbose_eval = 100,
                    early_stopping_rounds = 200)

    models.append(clf)
    # predict() returns one score per class; the arg-max is the predicted label.
    y_val_pred = clf.predict(X_val)
    y_val_pred = np.argmax(y_val_pred,axis=-1).tolist()
    acc = accuracy(y_val.values.tolist(), y_val_pred)
    print("kfold: {:d}, accuracy: {:.4f}".format(fold_+1, acc))