In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import sys
sys.path.append('../')

from config import *
import pandas as pd
from tools import *
import numpy as np
from sklearn.model_selection import KFold
import lightgbm as lgb
from tqdm import tqdm
from scipy import sparse
import matplotlib.pyplot as plt
import scipy

In [2]:
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings (for example
# pandas chained-assignment or sklearn convergence warnings) that may point
# at real problems in the cells below.
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Label column used for the stacking features below: it selects the column
# read from `target`, prefixes every generated column name and is embedded in
# the output pickle names. The stacking code treats 'age' as a 10-output
# target and any other value as a single-output target.
TARGET = 'gender'

In [4]:
### target
# One row per user: group the user log by user_id and aggregate every other
# column with 'first', then turn user_id back into a regular column so the
# frame has a plain 0..n-1 index.
target = (
    pd.read_pickle(TRAIN_DIR + USER_LOG_PATH)
    .groupby(['user_id'])
    .agg('first')
    .reset_index()
)

## 点击list TFIDF

In [5]:
def get_tfidf(feat):
    """Fit a TF-IDF model on the train and test click sequences of ``feat``.

    The train and test pickles registered for ``feat`` in ``CLK_PATH_DICT``
    are read and stacked (train rows first), and the ``feat`` column is
    vectorised. The tokenizer is the identity function, so every entry of
    that column is consumed as an already-tokenised sequence.

    Parameters
    ----------
    feat : str
        Key into ``CLK_PATH_DICT`` and name of the column holding the sequences.

    Returns
    -------
    scipy sparse matrix (float32)
        TF-IDF weights, train rows followed by test rows.
    """
    frames = [
        pd.read_pickle(TRAIN_DIR + CLK_PATH_DICT[feat]),
        pd.read_pickle(TEST_DIR + CLK_PATH_DICT[feat]),
    ]
    sequences = pd.concat(frames, axis=0)[feat].values.tolist()

    vectorizer = TfidfVectorizer(
        tokenizer=lambda tokens: tokens,
        max_features=100000,
        max_df=0.9,
        min_df=30,
        lowercase=False,
    )
    return vectorizer.fit_transform(sequences).astype('float32')

In [6]:
# Click-sequence fields that each get their own TF-IDF matrix and stacking features.
feature_columns = [
    'time',
    'creative_id',
    'ad_id',
    'product_id',
    'product_category',
    'advertiser_id',
    'industry',
]

In [7]:
# Field name -> TF-IDF matrix over train + test rows, in feature_columns order.
tfidf_dict = {feat: get_tfidf(feat) for feat in feature_columns}

In [8]:
# Cache every TF-IDF matrix on disk; each file holds the combined train + test
# rows and is written under TRAIN_DIR.
for feat, matrix in tfidf_dict.items():
    sparse.save_npz(TRAIN_DIR+'tfidf_%s.npz'%feat, matrix)

### Stacking Feature

In [9]:
# ad_id click-sequence frames (train, then test); only their user_id column is
# used below, to seed the stacking feature frames.
temp1, temp2 = (
    pd.read_pickle(data_dir + CLK_PATH_DICT['ad_id'])
    for data_dir in (TRAIN_DIR, TEST_DIR)
)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.calibration import CalibratedClassifierCV

In [11]:
# Shuffled K-fold splitter with a fixed seed, shared by every stacking model.
N_SPLITS = 5
folds = KFold(N_SPLITS, shuffle=True, random_state=2020)

In [16]:
# Seed the stacking frames with the user ids only.
# - .copy(): the frames get new columns added and filled in place later, so
#   they must not be slices of temp1/temp2 (that is a chained-assignment
#   hazard).
# - reset_index(drop=True): the stacking code writes rows with
#   .loc[<positional fold indices>], which is only correct when the index is
#   0..n-1.
train_df = temp1[['user_id']].copy().reset_index(drop=True)
test_df = temp2[['user_id']].copy().reset_index(drop=True)

In [17]:
# Short model tag -> first-level linear / naive-Bayes classifier.
# The SGD classifier is wrapped in CalibratedClassifierCV; the other models
# are used as-is.
STACKING_MODEL = {
    'lr': LogisticRegression(n_jobs=6),
    'sgd': CalibratedClassifierCV(SGDClassifier(early_stopping=True, n_jobs=6)),
    'pac': PassiveAggressiveClassifier(early_stopping=True, n_jobs=6),
    'rc': RidgeClassifier(),
    'bnb': BernoulliNB(),
    'mnb': MultinomialNB(),
}
# Model tags in the order they are trained (dicts keep insertion order).
STACKING_FEAT = list(STACKING_MODEL)

In [18]:
def _stacking_scores(model, X, multi_output):
    """Return the per-row scores of a fitted ``model`` on ``X``.

    ``predict_proba`` is used when the estimator exposes it (every class
    column when ``multi_output`` is true, otherwise only column 0); estimators
    without probability output fall back to ``decision_function``.
    """
    if hasattr(model, 'predict_proba'):
        proba = model.predict_proba(X)
        return proba if multi_output else proba[:, 0]
    return model.decision_function(X)


def get_stacking_tfidf_feat(feat):
    """Add stacking features built on the TF-IDF matrix of ``feat``.

    For every model tag in ``STACKING_FEAT`` the model is fitted once per fold
    of ``folds``. Its scores on the held-out rows are written into
    ``train_df`` (out-of-fold predictions) and its scores on the test rows are
    averaged over the ``N_SPLITS`` folds and written into ``test_df``.

    Columns are named ``<TARGET>_<model>_<feat>``; when ``TARGET`` is 'age'
    ten columns with a trailing ``_<class index>`` are produced instead.

    Parameters
    ----------
    feat : str
        Key into ``tfidf_dict``; also embedded in the generated column names.

    Returns
    -------
    None
        ``train_df`` and ``test_df`` are modified in place.
    """
    tfidf_user = tfidf_dict[feat]

    # The TF-IDF matrix holds the train rows first and the test rows after
    # them. Take the boundary from train_df instead of a hard-coded row count
    # so the function keeps working when the data size changes.
    n_train = len(train_df)
    X_train = tfidf_user[:n_train]
    X_test = tfidf_user[n_train:]
    # Labels are shifted by one; a NumPy array makes the fold indices
    # positional regardless of the index carried by `target`.
    Y_train = (target[TARGET] - 1).values

    multi_output = TARGET == 'age'
    n_age_classes = 10

    for mode in STACKING_FEAT:
        if multi_output:
            cols = ['%s_%s_%s_%d' % (TARGET, mode, feat, i) for i in range(n_age_classes)]
        else:
            cols = ['%s_%s_%s' % (TARGET, mode, feat)]

        for col in cols:
            train_df[col] = 0.
            test_df[col] = 0.
        # Sum of the test-set scores over the folds, one column per output.
        test_pred = np.zeros((len(test_df), len(cols)))

        for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, Y_train)):
            print("Feature: {}, Model: {}, fold n°{}".format(feat, mode, fold_+1))
            x_val, y_val = X_train[val_idx], Y_train[val_idx]

            # The same estimator object is refitted on every fold.
            model = STACKING_MODEL[mode]
            model.fit(X_train[trn_idx], Y_train[trn_idx])
            print("accuracy:", model.score(x_val, y_val))

            # Reshape to (rows, columns) so the assignment is explicit for
            # both the single-column and the multi-column case.
            val_scores = _stacking_scores(model, x_val, multi_output)
            train_df.loc[val_idx, cols] = np.reshape(val_scores, (len(val_idx), len(cols)))
            test_pred += np.reshape(_stacking_scores(model, X_test, multi_output), test_pred.shape)

        test_df.loc[:, cols] = test_pred / N_SPLITS

In [19]:
# Build the stacking columns for every click-sequence field in turn; each call
# fits all models in STACKING_FEAT once per fold and fills train_df / test_df
# in place.
for feat in feature_columns:
    get_stacking_tfidf_feat(feat)

Feature: time, Model: lr, fold n°1
accuracy: 0.6686222222222222
Feature: time, Model: lr, fold n°2
accuracy: 0.6710166666666667
Feature: time, Model: lr, fold n°3
accuracy: 0.6699944444444444
Feature: time, Model: lr, fold n°4
accuracy: 0.6692166666666667
Feature: time, Model: lr, fold n°5
accuracy: 0.6689833333333334
Feature: time, Model: sgd, fold n°1
accuracy: 0.6686222222222222
Feature: time, Model: sgd, fold n°2
accuracy: 0.6710166666666667
Feature: time, Model: sgd, fold n°3
accuracy: 0.6699944444444444
Feature: time, Model: sgd, fold n°4
accuracy: 0.6692166666666667
Feature: time, Model: sgd, fold n°5
accuracy: 0.6689833333333334
Feature: time, Model: pac, fold n°1
accuracy: 0.5686944444444444
Feature: time, Model: pac, fold n°2
accuracy: 0.5838
Feature: time, Model: pac, fold n°3
accuracy: 0.5260833333333333
Feature: time, Model: pac, fold n°4
accuracy: 0.6474333333333333
Feature: time, Model: pac, fold n°5
accuracy: 0.6497944444444445
Feature: time, Model: rc, fold n°1
accurac

accuracy: 0.6757888888888889
Feature: product_category, Model: lr, fold n°4
accuracy: 0.6753
Feature: product_category, Model: lr, fold n°5
accuracy: 0.6755166666666667
Feature: product_category, Model: sgd, fold n°1
accuracy: 0.6699722222222222
Feature: product_category, Model: sgd, fold n°2
accuracy: 0.6725333333333333
Feature: product_category, Model: sgd, fold n°3
accuracy: 0.6712111111111111
Feature: product_category, Model: sgd, fold n°4
accuracy: 0.6708833333333334
Feature: product_category, Model: sgd, fold n°5
accuracy: 0.6711666666666667
Feature: product_category, Model: pac, fold n°1
accuracy: 0.6327611111111111
Feature: product_category, Model: pac, fold n°2
accuracy: 0.6425444444444445
Feature: product_category, Model: pac, fold n°3
accuracy: 0.62125
Feature: product_category, Model: pac, fold n°4
accuracy: 0.6692
Feature: product_category, Model: pac, fold n°5
accuracy: 0.6140222222222222
Feature: product_category, Model: rc, fold n°1
accuracy: 0.6743333333333333
Feature:

In [20]:
# Inspect the training-side stacking features (one column per model/field pair).
train_df

Unnamed: 0,user_id,gender_lr_time,gender_sgd_time,gender_pac_time,gender_rc_time,gender_bnb_time,gender_mnb_time,gender_lr_creative_id,gender_sgd_creative_id,gender_pac_creative_id,...,gender_pac_advertiser_id,gender_rc_advertiser_id,gender_bnb_advertiser_id,gender_mnb_advertiser_id,gender_lr_industry,gender_sgd_industry,gender_pac_industry,gender_rc_industry,gender_bnb_industry,gender_mnb_industry
0,1,0.648696,0.671708,-0.069803,-0.297491,0.462687,0.671713,0.850151,0.889977,-1.125041,...,-0.869602,-0.349708,1.262292e-01,0.631968,0.283545,0.284511,0.547670,0.268886,0.059902,0.495623
1,2,0.669548,0.670299,-1.440428,-0.337093,0.762594,0.668370,0.903168,0.975251,-0.696196,...,-0.574202,-0.597206,9.995696e-01,0.875381,0.962023,0.971308,-3.173565,-0.836132,0.953354,0.792184
2,3,0.669105,0.669386,-1.942986,-0.337123,0.677538,0.665693,0.020663,0.043166,3.844058,...,5.853920,1.565784,2.756222e-14,0.004059,0.156557,0.188017,1.464079,0.297514,0.120992,0.463241
3,4,0.653514,0.670786,-0.166691,-0.306282,0.556943,0.672132,0.835095,0.769629,-0.612942,...,-1.565851,-0.306954,9.708748e-01,0.801061,0.626359,0.587455,-0.140315,-0.138626,0.778883,0.622790
4,5,0.654967,0.665346,0.970220,-0.308868,0.714468,0.664008,0.985063,0.916860,-4.944677,...,-3.929734,-0.750335,9.999999e-01,0.949030,0.878941,0.869791,-1.721603,-0.665969,0.904946,0.731858
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899995,899996,0.680420,0.664727,0.622547,-0.360206,0.524115,0.675844,0.995903,0.966594,-5.710285,...,-2.550493,-0.991033,9.999829e-01,0.986726,0.995254,0.992511,-2.913036,-1.053355,0.972449,0.933512
899996,899997,0.668221,0.665938,-2.315649,-0.335602,0.547137,0.671363,0.296471,0.129484,0.911511,...,0.414493,0.288782,2.591802e-03,0.388146,0.795326,0.815838,-2.274160,-0.470554,0.425104,0.707043
899997,899998,0.628080,0.661724,1.079105,-0.256817,0.457240,0.664080,0.000650,0.000006,4.847863,...,1.359262,0.289542,3.063951e-03,0.247355,0.236660,0.213259,0.028888,0.319250,0.245038,0.508796
899998,899999,0.698420,0.674872,1.249494,-0.396776,0.623190,0.676225,0.751030,0.313170,-2.765417,...,-3.500067,-0.585937,9.936428e-01,0.848625,0.727671,0.747630,0.080758,-0.405357,0.724844,0.646590


In [21]:
# Inspect the fold-averaged stacking features for the test users.
test_df

Unnamed: 0,user_id,gender_lr_time,gender_sgd_time,gender_pac_time,gender_rc_time,gender_bnb_time,gender_mnb_time,gender_lr_creative_id,gender_sgd_creative_id,gender_pac_creative_id,...,gender_pac_advertiser_id,gender_rc_advertiser_id,gender_bnb_advertiser_id,gender_mnb_advertiser_id,gender_lr_industry,gender_sgd_industry,gender_pac_industry,gender_rc_industry,gender_bnb_industry,gender_mnb_industry
0,3000001,0.611988,0.667233,-0.522320,-0.227355,0.401027,0.665306,0.959986,0.877059,-3.398812,...,-4.803369,-0.740782,9.999997e-01,0.991810,0.937451,0.916903,-2.215063,-0.477894,0.966735,0.811494
1,3000002,0.674952,0.672262,-0.702131,-0.348080,0.843889,0.662736,0.000551,0.000009,5.866850,...,3.260930,1.132157,7.087798e-11,0.066742,0.174081,0.180758,1.016431,0.377296,0.004175,0.426089
2,3000003,0.673212,0.668312,-0.617968,-0.345335,0.687963,0.669517,0.790954,0.878939,-1.710998,...,-0.241653,-0.292085,5.231506e-01,0.644594,0.618587,0.665873,-0.410200,-0.253997,0.518327,0.643566
3,3000004,0.685008,0.671627,-0.622389,-0.369650,0.581657,0.677478,0.671032,0.877713,-0.401065,...,-1.563362,-0.686009,9.650879e-01,0.841881,0.728556,0.780075,-0.971156,-0.443229,0.723972,0.708266
4,3000005,0.696163,0.670534,-0.942388,-0.391934,0.721877,0.672531,0.929640,0.901753,-2.257529,...,-4.039577,-1.163245,9.999999e-01,0.974483,0.436387,0.396166,0.603080,0.067038,0.486531,0.621512
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,3999996,0.667522,0.668179,-0.303587,-0.333733,0.774528,0.668007,0.983500,0.975751,-3.425398,...,-3.139954,-0.643942,1.000000e+00,0.980872,0.674566,0.662508,-0.741157,-0.227469,0.881799,0.668348
999996,3999997,0.678694,0.670239,-0.608017,-0.356681,0.596061,0.673025,0.941530,0.943008,-2.807185,...,-0.720845,-0.041923,9.957324e-01,0.869504,0.778683,0.791458,-1.069608,-0.304009,0.860273,0.666850
999997,3999998,0.682950,0.670581,-0.624080,-0.364970,0.614546,0.673231,0.991795,0.974846,-2.631147,...,-4.509791,-1.071400,9.995602e-01,0.928692,0.918071,0.905205,-2.021474,-0.649684,0.974984,0.808529
999998,3999999,0.739746,0.672603,-1.536430,-0.486179,0.913488,0.680087,0.998232,0.975503,-7.094729,...,-6.590877,-1.209877,1.000000e+00,0.998060,0.991472,0.986796,-4.142944,-1.073331,0.999982,0.912866


In [22]:
# Key both stacking frames by user_id.
# NOTE: re-running this cell fails, because user_id is no longer a column
# after the first run.
train_df, test_df = (
    frame.set_index('user_id') for frame in (train_df, test_df)
)

In [23]:
# Persist the training-side stacking features; the file name carries TARGET.
train_df.to_pickle(TRAIN_DIR+'tfidf_stack_%s.pkl'%TARGET)

In [24]:
# Persist the test-side stacking features; the file name carries TARGET.
test_df.to_pickle(TEST_DIR+'tfidf_stack_%s.pkl'%TARGET)

## 统计特征

In [100]:
# Each precomputed feature table is read twice, from TRAIN_DIR and then from
# TEST_DIR, using the same CLK_PATH_DICT entry.

### Per Day Click Times
tr_per_day_clk_times, ts_per_day_clk_times = (
    pd.read_pickle(data_dir + CLK_PATH_DICT['per_day_click'])
    for data_dir in (TRAIN_DIR, TEST_DIR)
)

### Kfold Target Encode
tr_kfold_te, ts_kfold_te = (
    pd.read_pickle(data_dir + CLK_PATH_DICT['kfold_te'])
    for data_dir in (TRAIN_DIR, TEST_DIR)
)

### Sequence Statistic Feature
tr_seq_statistic, ts_seq_statistic = (
    pd.read_pickle(data_dir + CLK_PATH_DICT['seq_statistic'])
    for data_dir in (TRAIN_DIR, TEST_DIR)
)

### Countvec 特征

In [107]:
# Raw ad_id click sequences for train and test.
tr_click_seq = pd.read_pickle(TRAIN_DIR + CLK_PATH_DICT['ad_id'])
ts_click_seq = pd.read_pickle(TEST_DIR + CLK_PATH_DICT['ad_id'])

# The count matrix is joined to the stacking frames by row position further
# down, so the user order must be identical on both sides.
train_user_order = tr_click_seq['user_id'].values.tolist()
assert train_user_order == train_df.index.values.tolist()
test_user_order = ts_click_seq['user_id'].values.tolist()
assert test_user_order == test_df.index.values.tolist()

# Train rows first, then test rows.
click_seq = pd.concat([tr_click_seq, ts_click_seq], axis=0)['ad_id'].values.tolist()

# Identity tokenizer: every entry is consumed as an already-tokenised sequence.
cntv = CountVectorizer(
    tokenizer=lambda tokens: tokens,
    max_df=0.9,
    min_df=30,
    lowercase=False,
)

# Token counts over train + test, stored as float32.
cntv_user = cntv.fit_transform(click_seq).astype('float32')

In [121]:
# Cache the count matrix (train and test rows together) under TRAIN_DIR.
scipy.sparse.save_npz(TRAIN_DIR+'countvec.npz', cntv_user)

### Merge

In [109]:
# Put the dense feature tables side by side, with the stacking features last.
# concat aligns on the index, so this presumes every table is indexed like the
# stacking frame (user_id) -- TODO confirm.
# NOTE: the cell rebinds train_df / test_df, so re-running it appends the
# tables again.
train_df = pd.concat(
    [tr_per_day_clk_times, tr_kfold_te, tr_seq_statistic, train_df],
    axis='columns',
)
test_df = pd.concat(
    [ts_per_day_clk_times, ts_kfold_te, ts_seq_statistic, test_df],
    axis='columns',
)

In [113]:
# Convert the dense feature frames to CSR so they can be stacked with the
# sparse count matrix.
train_csr, test_csr = (
    sparse.csr_matrix(frame.values) for frame in (train_df, test_df)
)

In [114]:
# cntv_user holds the train rows first and the test rows after them. Take the
# boundary from the dense training block (hstack requires equal row counts)
# instead of a hard-coded number of training users.
n_train_rows = train_csr.shape[0]
train_csr = sparse.hstack((train_csr, cntv_user[:n_train_rows])).tocsr()
test_csr  = sparse.hstack((test_csr , cntv_user[n_train_rows:])).tocsr()

In [115]:
# Final design matrices for the LightGBM model.
X_train = train_csr
X_test = test_csr
# Labels are shifted by one (they presumably start at 1 -- confirm).
# NOTE(review): the label column is hard-coded to 'age', while TARGET (used
# for the stacking features above) is 'gender' -- confirm this is intended.
y_train = target['age'] - 1

# Metric

In [116]:
def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_true`` and ``y_pred`` agree.

    Both arguments must be sized sequences of equal length; an
    ``AssertionError`` is raised otherwise. Empty input raises
    ``ZeroDivisionError``.
    """
    assert len(y_true) == len(y_pred), "length of y_true and y_pred not equal"
    n_correct = sum(1 for truth, guess in zip(y_true, y_pred) if truth == guess)
    return n_correct / len(y_true)

# Model

In [117]:
# LightGBM training parameters, passed to lgb.train.
param = dict(
    # Task: 10-class GBDT, monitored with the multiclass error rate.
    boosting_type='gbdt',
    objective='multiclass',
    num_class=10,
    metric=['multi_error'],
    # Tree size and learning rate.
    num_leaves=2 ** 9,
    min_data_in_leaf=500,
    learning_rate=0.1,
    # Row / column subsampling.
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    # Regularisation.
    lambda_l1=0.4,
    lambda_l2=0.5,
    min_gain_to_split=0.2,
    # Runtime.
    verbose=-1,
    num_threads=6,
    n_estimators=1000,
)

In [118]:
N_SPLITS = 5
# Fixed seed so the split, and therefore the reported accuracy, is
# reproducible; the seed used to be drawn at random on every run. 2020 is the
# value used by the stacking KFold, so with the same number of rows the
# validation rows here coincide with the out-of-fold rows of the stacking
# features.
folds = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2020)
# 5-fold cross-validation split; only the first fold is trained (see `break`).
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    # y_train is a pandas Series: select by position so the fold indices are
    # applied the same way as on the sparse matrix.
    trn_data = lgb.Dataset(X_train[trn_idx], y_train.iloc[trn_idx])
    val_data = lgb.Dataset(X_train[val_idx], y_train.iloc[val_idx])

    clf = lgb.train(param, 
                    trn_data, 
                    valid_sets = [trn_data, val_data], 
                    verbose_eval = 100, 
                    early_stopping_rounds = 200)
    
    # Class probabilities -> predicted class index per validation row.
    y_val_pred = clf.predict(X_train[val_idx])
    y_val_pred = np.argmax(y_val_pred,axis=-1).tolist()
    acc = accuracy(y_train.iloc[val_idx].values.tolist(), y_val_pred)
    print("kfold: {:d}, accuracy: {:.4f}".format(fold_+1, acc))
    
    # Stop after the first fold: a single hold-out estimate is reported.
    break

fold n°1




Training until validation scores don't improve for 200 rounds
[100]	training's multi_error: 0.359246	valid_1's multi_error: 0.566222
[200]	training's multi_error: 0.194715	valid_1's multi_error: 0.565972
[300]	training's multi_error: 0.0985569	valid_1's multi_error: 0.566239
Early stopping, best iteration is:
[183]	training's multi_error: 0.217036	valid_1's multi_error: 0.565367
kfold: 1, accuracy: 0.4346
