In [31]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import sys
sys.path.append('../')

from config import *
import pandas as pd
from tools import *
import numpy as np
from sklearn.model_selection import KFold
import lightgbm as lgb
from tqdm import tqdm
from scipy import sparse
import matplotlib.pyplot as plt
import scipy

In [32]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn warnings raised by the
# cells below (e.g. chained-assignment warnings) — consider a narrower filter.
warnings.filterwarnings("ignore")

In [33]:
# Label column of `target` that the stacking models are trained on; it is also
# embedded in the generated column and file names. The stacking function has a
# separate 10-column branch for TARGET == 'age'.
TARGET = 'gender'

In [34]:
# Labels: read the training user log and collapse it to one row per user
# (first value of each column within the user's group), keeping `user_id`
# as an ordinary column.
target = (
    pd.read_pickle(TRAIN_DIR + USER_LOG_PATH)
    .groupby(['user_id'])
    .agg('first')
    .reset_index()
)

## Click-sequence TF-IDF

In [35]:
def get_tfidf(feat):
    """Build a float32 TF-IDF matrix for one click-sequence feature.

    The train and test pickles registered under ``CLK_PATH_DICT[feat]`` are
    stacked (train rows first, then test rows) and the ``feat`` column is fed
    to a single ``TfidfVectorizer`` fitted on both splits.

    Args:
        feat: Key into ``CLK_PATH_DICT`` and name of the sequence column.

    Returns:
        Sparse TF-IDF matrix cast to ``float32``; rows follow the
        concatenated train-then-test order.
    """
    split_frames = [
        pd.read_pickle(root + CLK_PATH_DICT[feat]) for root in (TRAIN_DIR, TEST_DIR)
    ]
    sequences = pd.concat(split_frames, axis=0)[feat].values.tolist()

    # Identity tokenizer: presumably every entry is already a list of tokens
    # (verify against the upstream preprocessing).
    vectorizer = TfidfVectorizer(
        tokenizer=lambda x: x,
        max_features=100000,
        max_df=0.9,
        min_df=30,
        lowercase=False,
    )
    return vectorizer.fit_transform(sequences).astype('float32')

In [36]:
# Click-sequence features that each get their own TF-IDF matrix and stacking columns.
feature_columns = [
    'time',
    'creative_id',
    'ad_id',
    'product_id',
    'product_category',
    'advertiser_id',
    'industry',
]

In [37]:
# One TF-IDF matrix per click-sequence feature, keyed by feature name.
tfidf_dict = {feat: get_tfidf(feat) for feat in feature_columns}

In [38]:
# Cache each TF-IDF matrix on disk. Note that the matrix holds train AND test
# rows even though it is written under TRAIN_DIR.
for feat in feature_columns:
    npz_path = TRAIN_DIR + 'tfidf_%s.npz' % feat
    sparse.save_npz(npz_path, tfidf_dict[feat])

### Stacking Feature

In [39]:
# ad_id click-sequence frames for train (temp1) and test (temp2); they supply
# the `user_id` column that seeds the stacking feature frames.
temp1, temp2 = (
    pd.read_pickle(root + CLK_PATH_DICT['ad_id']) for root in (TRAIN_DIR, TEST_DIR)
)

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.calibration import CalibratedClassifierCV

In [41]:
# Number of folds used to build the out-of-fold stacking features.
N_SPLITS = 5
# Shuffled K-fold splitter with a fixed seed, shared by every stacking model.
folds = KFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=2020,
)

In [42]:
# Containers for the stacking features, seeded with the user ids.
# `.copy()` detaches them from temp1/temp2 so that the columns added later are
# written to independent frames instead of to slices of the source frames
# (which raises pandas' SettingWithCopyWarning).
train_df = temp1[['user_id']].copy()
test_df = temp2[['user_id']].copy()

In [43]:
# First-level classifiers keyed by the short tag used in the generated column
# names. The SGD model is wrapped in CalibratedClassifierCV.
STACKING_MODEL = {
    'lr': LogisticRegression(n_jobs=6),
    'sgd': CalibratedClassifierCV(SGDClassifier(early_stopping=True, n_jobs=6)),
    'pac': PassiveAggressiveClassifier(early_stopping=True, n_jobs=6),
    'rc': RidgeClassifier(),
    'bnb': BernoulliNB(),
    'mnb': MultinomialNB(),
}
# Model tags in training order (dict insertion order).
STACKING_FEAT = list(STACKING_MODEL)

In [44]:
def _stacking_scores(model, X, multiclass):
    """Return per-row stacking scores from an already fitted classifier.

    ``predict_proba`` is used when the estimator exposes it, otherwise
    ``decision_function``. In the binary case only the first probability
    column (``[:, 0]``) is kept, while ``decision_function`` output is
    returned unchanged.
    """
    if hasattr(model, 'predict_proba'):
        proba = model.predict_proba(X)
        return proba if multiclass else proba[:, 0]
    return model.decision_function(X)


def get_stacking_tfidf_feat(feat):
    """Add out-of-fold stacking features for one TF-IDF feature.

    For every model in ``STACKING_FEAT`` the model is trained on each fold of
    ``folds``; its predictions on the held-out rows become the training
    feature, and the average of its per-fold predictions on the test rows
    becomes the test feature. Results are written as new columns into the
    module-level ``train_df`` and ``test_df`` (``<TARGET>_<model>_<feat>``,
    with a ``_<i>`` suffix for the 10 columns of the ``'age'`` target).

    Args:
        feat: Key into ``tfidf_dict``. The matrix must hold the training rows
            first, followed by the test rows.

    Raises:
        ValueError: If the matrix row count does not equal
            ``len(train_df) + len(test_df)``.
    """
    tfidf_user = tfidf_dict[feat]

    # Derive the split point from the frames instead of hard-coding 900000.
    n_train, n_test = len(train_df), len(test_df)
    if tfidf_user.shape[0] != n_train + n_test:
        raise ValueError(
            "tfidf matrix for %r has %d rows, expected %d train + %d test"
            % (feat, tfidf_user.shape[0], n_train, n_test)
        )

    X_train, X_test = tfidf_user[:n_train], tfidf_user[n_train:]
    # Plain array so that fold indices are always applied positionally.
    Y_train = np.asarray(target[TARGET]) - 1

    multiclass = TARGET == 'age'
    for mode in STACKING_FEAT:
        if multiclass:
            columns = ['%s_%s_%s_%d' % (TARGET, mode, feat, i) for i in range(10)]
        else:
            columns = ['%s_%s_%s' % (TARGET, mode, feat)]

        oof_pred = np.zeros((n_train, len(columns)))
        test_pred = np.zeros((n_test, len(columns)))
        model = STACKING_MODEL[mode]

        for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, Y_train)):
            print("Feature: {}, Model: {}, fold n°{}".format(feat, mode, fold_+1))
            x_val, y_val = X_train[val_idx], Y_train[val_idx]

            model.fit(X_train[trn_idx], Y_train[trn_idx])
            print("accuracy:", model.score(x_val, y_val))

            # Binary scores are 1-D; reshape so both cases fill a 2-D buffer.
            oof_pred[val_idx] = np.reshape(
                _stacking_scores(model, x_val, multiclass), (len(val_idx), -1))
            test_pred += np.reshape(
                _stacking_scores(model, X_test, multiclass), (n_test, -1))

        test_pred /= N_SPLITS

        # Assign by position: fold indices are row positions, so this stays
        # correct even if the frames are indexed by user_id.
        for j, column in enumerate(columns):
            train_df[column] = oof_pred[:, j]
            test_df[column] = test_pred[:, j]

In [45]:
for feat in feature_columns:
    get_stacking_tfidf_feat(feat)

Feature: time, Model: lr, fold n°1
accuracy: 0.6686222222222222
Feature: time, Model: lr, fold n°2
accuracy: 0.6710166666666667
Feature: time, Model: lr, fold n°3
accuracy: 0.6699944444444444
Feature: time, Model: lr, fold n°4
accuracy: 0.6692166666666667
Feature: time, Model: lr, fold n°5
accuracy: 0.6689833333333334
Feature: time, Model: sgd, fold n°1
accuracy: 0.6686222222222222
Feature: time, Model: sgd, fold n°2
accuracy: 0.6710166666666667
Feature: time, Model: sgd, fold n°3
accuracy: 0.6699944444444444
Feature: time, Model: sgd, fold n°4
accuracy: 0.6692166666666667
Feature: time, Model: sgd, fold n°5
accuracy: 0.6689833333333334
Feature: time, Model: pac, fold n°1
accuracy: 0.6161944444444445
Feature: time, Model: pac, fold n°2
accuracy: 0.6414333333333333
Feature: time, Model: pac, fold n°3
accuracy: 0.5895277777777778
Feature: time, Model: pac, fold n°4
accuracy: 0.5227166666666667
Feature: time, Model: pac, fold n°5
accuracy: 0.6196888888888888
Feature: time, Model: rc, fold

accuracy: 0.6757944444444445
Feature: product_category, Model: lr, fold n°4
accuracy: 0.6752944444444444
Feature: product_category, Model: lr, fold n°5
accuracy: 0.6755277777777777
Feature: product_category, Model: sgd, fold n°1
accuracy: 0.6699333333333334
Feature: product_category, Model: sgd, fold n°2
accuracy: 0.6725
Feature: product_category, Model: sgd, fold n°3
accuracy: 0.6712833333333333
Feature: product_category, Model: sgd, fold n°4
accuracy: 0.6706888888888889
Feature: product_category, Model: sgd, fold n°5
accuracy: 0.6710277777777778
Feature: product_category, Model: pac, fold n°1
accuracy: 0.6503277777777777
Feature: product_category, Model: pac, fold n°2
accuracy: 0.6503166666666667
Feature: product_category, Model: pac, fold n°3
accuracy: 0.6517944444444445
Feature: product_category, Model: pac, fold n°4
accuracy: 0.6725888888888889
Feature: product_category, Model: pac, fold n°5
accuracy: 0.6622888888888889
Feature: product_category, Model: rc, fold n°1
accuracy: 0.67

In [51]:
# Preview the stacking features produced for the training users.
train_df

Unnamed: 0_level_0,gender_lr_time,gender_sgd_time,gender_pac_time,gender_rc_time,gender_bnb_time,gender_mnb_time,gender_lr_creative_id,gender_sgd_creative_id,gender_pac_creative_id,gender_rc_creative_id,...,gender_pac_advertiser_id,gender_rc_advertiser_id,gender_bnb_advertiser_id,gender_mnb_advertiser_id,gender_lr_industry,gender_sgd_industry,gender_pac_industry,gender_rc_industry,gender_bnb_industry,gender_mnb_industry
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.648696,0.668930,0.944642,-0.297491,0.462687,0.671713,0.835503,0.892835,-1.015039,-0.688012,...,0.300353,-0.350326,1.262292e-01,0.631968,0.283545,0.283109,1.686654,0.270643,0.059902,0.495623
2,0.669548,0.664804,0.134110,-0.337093,0.762594,0.668370,0.909430,0.974811,-1.134497,-0.843744,...,-2.030304,-0.598583,9.995696e-01,0.875381,0.962025,0.970992,-2.838671,-0.836342,0.953354,0.792184
3,0.669105,0.672787,-0.619038,-0.337123,0.677538,0.665693,0.020077,0.042654,2.920288,1.209945,...,6.451700,1.565887,2.756222e-14,0.004059,0.156557,0.188042,0.532648,0.297195,0.120992,0.463241
4,0.653514,0.667255,-0.296040,-0.306282,0.556943,0.672132,0.846787,0.776120,-1.517223,-0.313161,...,-2.943552,-0.308591,9.708748e-01,0.801061,0.626361,0.592239,0.791519,-0.137615,0.778883,0.622790
5,0.654967,0.670437,-1.566085,-0.308868,0.714468,0.664008,0.984522,0.916422,-4.161588,-0.600071,...,-3.656198,-0.748906,9.999999e-01,0.949030,0.878941,0.868572,-0.194567,-0.662066,0.904946,0.731858
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899996,0.680420,0.668328,0.318026,-0.360206,0.524115,0.675844,0.994958,0.967098,-5.865822,-1.064993,...,-3.908812,-0.989946,9.999829e-01,0.986726,0.995254,0.992567,-3.503454,-1.055025,0.972449,0.933512
899997,0.668221,0.669131,0.134251,-0.335602,0.547137,0.671363,0.284075,0.127350,0.079271,0.223498,...,0.059045,0.289862,2.591802e-03,0.388146,0.795320,0.818283,-2.211360,-0.469287,0.425104,0.707043
899998,0.628080,0.663903,0.236978,-0.256817,0.457240,0.664080,0.001383,0.000007,5.494359,1.677639,...,1.179481,0.288408,3.063951e-03,0.247355,0.236660,0.209853,0.741282,0.317723,0.245038,0.508796
899999,0.698420,0.670689,-1.408075,-0.396776,0.623190,0.676225,0.693170,0.309457,-1.924441,-0.435933,...,-3.170398,-0.585541,9.936428e-01,0.848625,0.727672,0.755108,1.121421,-0.406496,0.724844,0.646590


In [52]:
# Preview the fold-averaged stacking features produced for the test users.
test_df

Unnamed: 0_level_0,gender_lr_time,gender_sgd_time,gender_pac_time,gender_rc_time,gender_bnb_time,gender_mnb_time,gender_lr_creative_id,gender_sgd_creative_id,gender_pac_creative_id,gender_rc_creative_id,...,gender_pac_advertiser_id,gender_rc_advertiser_id,gender_bnb_advertiser_id,gender_mnb_advertiser_id,gender_lr_industry,gender_sgd_industry,gender_pac_industry,gender_rc_industry,gender_bnb_industry,gender_mnb_industry
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3000001,0.611988,0.665523,-0.278999,-0.227355,0.401027,0.665306,0.961757,0.878129,-3.312299,-0.844437,...,-4.412609,-0.740402,9.999997e-01,0.991810,0.937451,0.918459,-1.553361,-0.477851,0.966735,0.811494
3000002,0.674952,0.670754,-0.676000,-0.348080,0.843889,0.662736,0.000504,0.000009,6.784782,2.270311,...,3.457347,1.131524,7.087798e-11,0.066742,0.174078,0.181643,1.077396,0.376419,0.004175,0.426089
3000003,0.673212,0.669087,-0.109545,-0.345335,0.687963,0.669517,0.787192,0.878595,-1.934049,-0.784078,...,-0.129640,-0.292728,5.231506e-01,0.644594,0.618586,0.665233,-0.238797,-0.254284,0.518327,0.643566
3000004,0.685008,0.669606,-0.314923,-0.369650,0.581657,0.677478,0.674343,0.879992,-0.621173,-0.276192,...,-1.473468,-0.685793,9.650879e-01,0.841881,0.728553,0.775987,-0.086203,-0.442695,0.723972,0.708266
3000005,0.696163,0.672596,-1.130813,-0.391934,0.721877,0.672531,0.925923,0.901521,-2.510141,-0.634478,...,-4.314632,-1.162091,9.999999e-01,0.974483,0.436384,0.393639,0.601582,0.068277,0.486531,0.621512
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3999996,0.667522,0.669935,-0.593991,-0.333733,0.774528,0.668007,0.981350,0.975382,-3.325256,-0.650442,...,-3.485233,-0.644348,1.000000e+00,0.980872,0.674566,0.665364,0.120904,-0.227713,0.881799,0.668348
3999997,0.678694,0.670770,-0.536313,-0.356681,0.596061,0.673025,0.936227,0.943127,-2.986435,-0.685637,...,-0.571536,-0.041303,9.957324e-01,0.869504,0.778683,0.788739,-0.721161,-0.303063,0.860273,0.666850
3999998,0.682950,0.669900,-0.304855,-0.364970,0.614546,0.673231,0.991494,0.975029,-3.090100,-0.571941,...,-4.558176,-1.070603,9.995602e-01,0.928692,0.918071,0.903062,-1.829606,-0.650254,0.974984,0.808529
3999999,0.739746,0.673128,-1.297512,-0.486179,0.913488,0.680087,0.998463,0.975919,-7.270050,-1.183701,...,-5.963000,-1.209639,1.000000e+00,0.998060,0.991472,0.986795,-3.502149,-1.074314,0.999982,0.912866


In [54]:
# Index both feature frames by user_id. The saved previews and the row-order
# asserts in the count-vector cell compare against `train_df.index` /
# `test_df.index`, so this step is required on a fresh Restart & Run All.
# The guards make the cell safe to re-run once the index is already set.
if 'user_id' in train_df.columns:
    train_df = train_df.set_index('user_id')
if 'user_id' in test_df.columns:
    test_df = test_df.set_index('user_id')

In [55]:
# Persist the training stacking features; the file name embeds TARGET.
train_df.to_pickle(TRAIN_DIR+'tfidf_stack_%s.pkl'%TARGET)

In [56]:
# Persist the test stacking features; the file name embeds TARGET.
test_df.to_pickle(TEST_DIR+'tfidf_stack_%s.pkl'%TARGET)

## Statistical features

In [None]:
# Each precomputed feature table is loaded for the train and the test split.

# Per-day click counts
tr_per_day_clk_times, ts_per_day_clk_times = (
    pd.read_pickle(root + CLK_PATH_DICT['per_day_click']) for root in (TRAIN_DIR, TEST_DIR)
)

# K-fold target encoding
tr_kfold_te, ts_kfold_te = (
    pd.read_pickle(root + CLK_PATH_DICT['kfold_te']) for root in (TRAIN_DIR, TEST_DIR)
)

# Sequence statistics
tr_seq_statistic, ts_seq_statistic = (
    pd.read_pickle(root + CLK_PATH_DICT['seq_statistic']) for root in (TRAIN_DIR, TEST_DIR)
)

### CountVectorizer features

In [None]:
# ad_id click sequences for both splits.
tr_click_seq, ts_click_seq = (
    pd.read_pickle(root + CLK_PATH_DICT['ad_id']) for root in (TRAIN_DIR, TEST_DIR)
)

# The count matrix is later stacked row-by-row next to train_df / test_df, so
# the user order of the sequences must equal the index order of those frames.
assert train_df.index.values.tolist() == tr_click_seq['user_id'].values.tolist()
assert test_df.index.values.tolist() == ts_click_seq['user_id'].values.tolist()

# Train rows first, then test rows.
click_seq = pd.concat([tr_click_seq, ts_click_seq], axis=0)['ad_id'].values.tolist()

# Identity tokenizer: presumably every entry is already a list of tokens
# (verify against the upstream preprocessing).
cntv = CountVectorizer(
    tokenizer=lambda x: x,
    max_df=0.9,
    min_df=30,
    lowercase=False,
)
cntv_user = cntv.fit_transform(click_seq).astype('float32')

In [None]:
# Cache the combined (train + test) count matrix on disk.
countvec_path = TRAIN_DIR + 'countvec.npz'
sparse.save_npz(countvec_path, cntv_user)

### Merge

In [None]:
# Column-wise merge of the statistic tables with the stacking features.
# NOTE(review): concat(axis=1) aligns on the index, so all four frames are
# presumably indexed by user_id — confirm. Re-running this cell concatenates
# the tables onto the already merged frame again.
train_df = pd.concat(
    [tr_per_day_clk_times, tr_kfold_te, tr_seq_statistic, train_df],
    axis=1,
)
test_df = pd.concat(
    [ts_per_day_clk_times, ts_kfold_te, ts_seq_statistic, test_df],
    axis=1,
)

In [None]:
# Convert the merged feature tables to CSR so they can be stacked with the
# sparse count matrix.
train_csr, test_csr = (
    sparse.csr_matrix(frame.values) for frame in (train_df, test_df)
)

In [None]:
# Append the count-vector columns to the feature matrices. The count matrix
# holds the training rows first, so it is split at the number of training rows
# (taken from train_csr instead of a hard-coded 900000).
n_train_rows = train_csr.shape[0]
train_csr = sparse.hstack((train_csr, cntv_user[:n_train_rows])).tocsr()
test_csr = sparse.hstack((test_csr, cntv_user[n_train_rows:])).tocsr()

In [None]:
# Final design matrices and zero-based labels for the LightGBM model.
# NOTE(review): the label is hard-coded to 'age' while the stacking features
# above were built for TARGET — confirm this mix is intended.
X_train = train_csr
y_train = target['age'] - 1
X_test = test_csr

# Metric

In [None]:
def accuracy(y_true, y_pred):
    """Return the fraction of positions where the two sequences agree.

    Args:
        y_true: Sequence of reference labels.
        y_pred: Sequence of predicted labels, same length as ``y_true``.

    Returns:
        Number of matching positions divided by the sequence length.

    Raises:
        AssertionError: If the two sequences differ in length.
    """
    n_examples = len(y_true)
    assert n_examples == len(y_pred), "length of y_true and y_pred not equal"
    n_correct = sum(1 for truth, guess in zip(y_true, y_pred) if truth == guess)
    return n_correct / n_examples

# Model

In [None]:
# LightGBM settings for a 10-class multiclass model, evaluated with
# multi_error.
param = dict(
    boosting_type='gbdt',
    objective='multiclass',
    num_class=10,
    metric=['multi_error'],
    num_leaves=2**9,
    min_data_in_leaf=500,
    learning_rate=0.1,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    lambda_l1=0.4,
    lambda_l2=0.5,
    min_gain_to_split=0.2,
    verbose=-1,
    num_threads=6,
    n_estimators=1000,
)

In [None]:
N_SPLITS = 5
# Fixed seed so the split (and therefore the reported accuracy) is
# reproducible across runs; it is the same seed and fold count that the
# stacking folds use.
folds = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2020)
# 5-fold cross-validation
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])

    # Train with early stopping, logging the metric every 100 rounds.
    clf = lgb.train(param,
                    trn_data,
                    valid_sets = [trn_data, val_data],
                    verbose_eval = 100,
                    early_stopping_rounds = 200)

    # Class probabilities -> predicted class id per validation row.
    y_val_pred = clf.predict(X_train[val_idx])
    y_val_pred = np.argmax(y_val_pred,axis=-1).tolist()
    acc = accuracy(y_train[val_idx].values.tolist(), y_val_pred)
    print("kfold: {:d}, accuracy: {:.4f}".format(fold_+1, acc))

    # Only the first fold is evaluated.
    break