In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import sys
sys.path.append('../../')

from config import *
import pandas as pd
from tools import *
import numpy as np
from sklearn.model_selection import KFold
import lightgbm as lgb
from tqdm import tqdm
from scipy import sparse
import matplotlib.pyplot as plt
import scipy

In [3]:
### target
# Load the train user log and collapse it to one row per user_id, keeping the
# first (non-null) value of every other column. groupby sorts the result by
# user_id; reset_index turns user_id back into an ordinary column.
target = (
    pd.read_pickle(TRAIN_DIR+USER_LOG_PATH)
    .groupby(['user_id'])
    .agg('first')
    .reset_index()
)

## Click-list TF-IDF

In [4]:
# Per-user ad_id click sequences for the train and test sets.
tr_click_seq = pd.read_pickle(TRAIN_DIR + CLK_PATH_DICT['ad_id'])
ts_click_seq = pd.read_pickle(TEST_DIR + CLK_PATH_DICT['ad_id'])

# Train rows first, then test rows: the matrix is later split back by position.
click_seq = pd.concat([tr_click_seq, ts_click_seq], axis=0)['ad_id'].values.tolist()

# Identity tokenizer: every entry of click_seq is handed to the vectorizer
# as-is (presumably each entry is already a list of ad_id tokens - confirm).
tfidf = TfidfVectorizer(
    tokenizer=lambda seq: seq,
    max_df=0.9,
    min_df=30,
    lowercase=False,
)

# Fit on train+test together and downcast the weights to float32.
tfidf_user = tfidf.fit_transform(click_seq).astype('float32')

In [5]:
# Split the stacked TF-IDF matrix back into its train and test rows.
# click_seq was built as [train rows, test rows], so the boundary is the
# number of train rows; deriving it from tr_click_seq avoids a hard-coded
# 900000 that silently mis-splits if the data size ever changes.
n_train_rows = len(tr_click_seq)
train_csr = tfidf_user[:n_train_rows]
test_csr = tfidf_user[n_train_rows:]

In [6]:
# Features: train rows of the TF-IDF matrix. Labels: gender minus one
# (presumably shifting 1-based labels to start at 0 - confirm).
X_train, Y_train = train_csr, target['gender']-1
X_test = test_csr

# X_train rows follow tr_click_seq while Y_train rows follow target; nothing
# else ties the two orders together, so fail fast if they ever diverge.
assert X_train.shape[0] == len(Y_train), "X_train and Y_train row counts differ"
assert np.array_equal(tr_click_seq['user_id'].values, target['user_id'].values), \
    "target rows are not in the same user_id order as the click sequences"

In [7]:
# # Test set: plot the predicted values against the actual values for comparison
# def test_validate(x_test, y_test, y_predict, classifier):
#     x = range(len(y_test))
#     plt.plot(x, y_test, "ro", markersize=5, zorder=3, label=u"true_v")
#     plt.plot(x, y_predict, "go", markersize=8, zorder=2, label=u"predict_v,$R^2$=%.3f" % classifier.score(x_test, y_test))
#     plt.legend(loc="upper left")
#     plt.xlabel("number")
#     plt.ylabel("true?")
#     plt.show()

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.calibration import CalibratedClassifierCV

In [9]:
N_SPLITS = 5
# Fixed seed: the fold assignment decides which rows receive which
# out-of-fold prediction, so a random seed would make the saved stacking
# features differ on every run.
FOLD_SEED = 2020
folds = KFold(n_splits=N_SPLITS, shuffle=True, random_state=FOLD_SEED)

In [21]:
# Frames that collect the stacking features, one row per user.
# .copy() detaches them from the click-sequence frames so that adding columns
# later does not raise SettingWithCopyWarning; reset_index(drop=True)
# guarantees a 0..n-1 index, which the fold loops rely on when they write
# with .loc[val_idx] using positional fold indices.
train_df = tr_click_seq[['user_id']].copy().reset_index(drop=True)
test_df = ts_click_seq[['user_id']].copy().reset_index(drop=True)

In [22]:
# Base learners for the stacking stage, keyed by the short name that is also
# used as the suffix of the generated 'gender_<name>' feature column.
STACKING_MODEL = {
    'lr': LogisticRegression(n_jobs=6),
    # SGDClassifier is wrapped in a calibrator; the fold loop reads
    # predict_proba from the wrapped model.
    'sgd': CalibratedClassifierCV(SGDClassifier(early_stopping=True, n_jobs=6)),
    'pac': PassiveAggressiveClassifier(early_stopping=True, n_jobs=6),
    'rc': RidgeClassifier(),
    'bnb': BernoulliNB(),
    'mnb': MultinomialNB(),
}

# Model names in registration order (dicts preserve insertion order).
STACKING_FEAT = list(STACKING_MODEL)

In [23]:
# Pre-create one float column per base model in both frames, filled with 0.0,
# so the fold loops can write their predictions into existing columns.
for feat in STACKING_FEAT:
    for frame in (train_df, test_df):
        frame['gender_'+feat] = 0.

In [29]:
# Out-of-fold stacking feature for the first base model ('lr').
feat = STACKING_FEAT[0]
stack_cols = ['gender_%s'% feat]
# Running sum of the per-fold test predictions; averaged after the loop.
test_pred_sum = np.zeros((len(test_df), ))
for fold_no, (fit_idx, oof_idx) in enumerate(folds.split(X_train, Y_train), start=1):
    print("fold n°{}".format(fold_no))
    x_fit, y_fit = X_train[fit_idx], Y_train[fit_idx]
    x_oof, y_oof = X_train[oof_idx], Y_train[oof_idx]

    # The same estimator object is refitted on every fold.
    clf = STACKING_MODEL[feat]
    clf.fit(x_fit, y_fit)

    print("accuracy:", clf.score(x_oof, y_oof))

    # Column 0 of predict_proba: probability of the first class in clf.classes_.
    train_df.loc[oof_idx, stack_cols] = clf.predict_proba(x_oof)[:, 0]
    test_pred_sum += clf.predict_proba(X_test)[:, 0]

# Test feature = mean of the fold models' predictions.
test_df.loc[:, stack_cols] = test_pred_sum/N_SPLITS

fold n°1
accuracy: 0.9121111111111111
fold n°2
accuracy: 0.9106555555555556
fold n°3
accuracy: 0.9112388888888889
fold n°4
accuracy: 0.9108944444444445
fold n°5
accuracy: 0.9110388888888888


In [30]:
# Out-of-fold stacking feature for the second base model ('sgd', calibrated).
feat = STACKING_FEAT[1]
stack_cols = ['gender_%s'% feat]
# Running sum of the per-fold test predictions; averaged after the loop.
test_pred_sum = np.zeros((len(test_df), ))
for fold_no, (fit_idx, oof_idx) in enumerate(folds.split(X_train, Y_train), start=1):
    print("fold n°{}".format(fold_no))
    x_fit, y_fit = X_train[fit_idx], Y_train[fit_idx]
    x_oof, y_oof = X_train[oof_idx], Y_train[oof_idx]

    # The same estimator object is refitted on every fold.
    clf = STACKING_MODEL[feat]
    clf.fit(x_fit, y_fit)

    print("accuracy:", clf.score(x_oof, y_oof))

    # Column 0 of predict_proba: probability of the first class in clf.classes_.
    train_df.loc[oof_idx, stack_cols] = clf.predict_proba(x_oof)[:, 0]
    test_pred_sum += clf.predict_proba(X_test)[:, 0]

# Test feature = mean of the fold models' predictions.
test_df.loc[:, stack_cols] = test_pred_sum/N_SPLITS

fold n°1
accuracy: 0.8763888888888889
fold n°2
accuracy: 0.8762666666666666
fold n°3
accuracy: 0.8753944444444445
fold n°4
accuracy: 0.8764833333333333
fold n°5
accuracy: 0.8765722222222222


In [32]:
# Out-of-fold stacking feature for the third base model ('pac').
feat = STACKING_FEAT[2]
stack_cols = ['gender_%s'% feat]
# Running sum of the per-fold test scores; averaged after the loop.
test_pred_sum = np.zeros((len(test_df), ))
for fold_no, (fit_idx, oof_idx) in enumerate(folds.split(X_train, Y_train), start=1):
    print("fold n°{}".format(fold_no))
    x_fit, y_fit = X_train[fit_idx], Y_train[fit_idx]
    x_oof, y_oof = X_train[oof_idx], Y_train[oof_idx]

    # The same estimator object is refitted on every fold.
    clf = STACKING_MODEL[feat]
    clf.fit(x_fit, y_fit)

    print("accuracy:", clf.score(x_oof, y_oof))

    # This cell stores the raw decision_function score rather than a
    # probability, unlike the predict_proba-based cells.
    train_df.loc[oof_idx, stack_cols] = clf.decision_function(x_oof)
    test_pred_sum += clf.decision_function(X_test)

# Test feature = mean of the fold models' scores.
test_df.loc[:, stack_cols] = test_pred_sum/N_SPLITS

fold n°1
accuracy: 0.8856166666666667
fold n°2
accuracy: 0.8874555555555556
fold n°3
accuracy: 0.8874888888888889
fold n°4
accuracy: 0.8867055555555555
fold n°5
accuracy: 0.8895222222222222


In [33]:
# Out-of-fold stacking feature for the fourth base model ('rc').
feat = STACKING_FEAT[3]
stack_cols = ['gender_%s'% feat]
# Running sum of the per-fold test scores; averaged after the loop.
test_pred_sum = np.zeros((len(test_df), ))
for fold_no, (fit_idx, oof_idx) in enumerate(folds.split(X_train, Y_train), start=1):
    print("fold n°{}".format(fold_no))
    x_fit, y_fit = X_train[fit_idx], Y_train[fit_idx]
    x_oof, y_oof = X_train[oof_idx], Y_train[oof_idx]

    # The same estimator object is refitted on every fold.
    clf = STACKING_MODEL[feat]
    clf.fit(x_fit, y_fit)

    print("accuracy:", clf.score(x_oof, y_oof))

    # This cell stores the raw decision_function score rather than a
    # probability, unlike the predict_proba-based cells.
    train_df.loc[oof_idx, stack_cols] = clf.decision_function(x_oof)
    test_pred_sum += clf.decision_function(X_test)

# Test feature = mean of the fold models' scores.
test_df.loc[:, stack_cols] = test_pred_sum/N_SPLITS

fold n°1
accuracy: 0.8966388888888889
fold n°2
accuracy: 0.8964555555555556
fold n°3
accuracy: 0.8953888888888889
fold n°4
accuracy: 0.8957
fold n°5
accuracy: 0.8970222222222223


In [35]:
# Out-of-fold stacking feature for the fifth base model ('bnb').
feat = STACKING_FEAT[4]
stack_cols = ['gender_%s'% feat]
# Running sum of the per-fold test predictions; averaged after the loop.
test_pred_sum = np.zeros((len(test_df), ))
for fold_no, (fit_idx, oof_idx) in enumerate(folds.split(X_train, Y_train), start=1):
    print("fold n°{}".format(fold_no))
    x_fit, y_fit = X_train[fit_idx], Y_train[fit_idx]
    x_oof, y_oof = X_train[oof_idx], Y_train[oof_idx]

    # The same estimator object is refitted on every fold.
    clf = STACKING_MODEL[feat]
    clf.fit(x_fit, y_fit)

    print("accuracy:", clf.score(x_oof, y_oof))

    # Column 0 of predict_proba: probability of the first class in clf.classes_.
    train_df.loc[oof_idx, stack_cols] = clf.predict_proba(x_oof)[:, 0]
    test_pred_sum += clf.predict_proba(X_test)[:, 0]

# Test feature = mean of the fold models' predictions.
test_df.loc[:, stack_cols] = test_pred_sum/N_SPLITS

fold n°1
accuracy: 0.9115666666666666
fold n°2
accuracy: 0.9114111111111111
fold n°3
accuracy: 0.9095888888888889
fold n°4
accuracy: 0.9111611111111111
fold n°5
accuracy: 0.91175


In [34]:
# Out-of-fold stacking feature for the sixth base model ('mnb').
feat = STACKING_FEAT[5]
stack_cols = ['gender_%s'% feat]
# Running sum of the per-fold test predictions; averaged after the loop.
test_pred_sum = np.zeros((len(test_df), ))
for fold_no, (fit_idx, oof_idx) in enumerate(folds.split(X_train, Y_train), start=1):
    print("fold n°{}".format(fold_no))
    x_fit, y_fit = X_train[fit_idx], Y_train[fit_idx]
    x_oof, y_oof = X_train[oof_idx], Y_train[oof_idx]

    # The same estimator object is refitted on every fold.
    clf = STACKING_MODEL[feat]
    clf.fit(x_fit, y_fit)

    print("accuracy:", clf.score(x_oof, y_oof))

    # Column 0 of predict_proba: probability of the first class in clf.classes_.
    train_df.loc[oof_idx, stack_cols] = clf.predict_proba(x_oof)[:, 0]
    test_pred_sum += clf.predict_proba(X_test)[:, 0]

# Test feature = mean of the fold models' predictions.
test_df.loc[:, stack_cols] = test_pred_sum/N_SPLITS

fold n°1
accuracy: 0.9117222222222222
fold n°2
accuracy: 0.9114833333333333
fold n°3
accuracy: 0.9099055555555555
fold n°4
accuracy: 0.9102888888888889
fold n°5
accuracy: 0.91165


In [36]:
# Re-index both stacking frames by user_id. set_index removes the column from
# the frame, so running this cell a second time on the already re-indexed
# frames raises a KeyError.
train_df = train_df.set_index('user_id')
test_df = test_df.set_index('user_id')

In [37]:
# Persist the train-side gender stacking features (indexed by user_id).
train_df.to_pickle(TRAIN_DIR+'tfidf_stack_gender.pkl')

In [38]:
# Persist the test-side gender stacking features (fold-averaged predictions).
test_df.to_pickle(TEST_DIR+'tfidf_stack_gender.pkl')

## Statistical features

In [100]:
### Per Day Click Times
# Precomputed feature frames, loaded as train/test pairs from the paths
# registered in CLK_PATH_DICT.
# NOTE(review): the merge cell concatenates these column-wise with the
# stacking frames, so they are presumably indexed by user_id in the same
# order - confirm against the code that wrote the pickles.

### Per Day Click Times
tr_per_day_clk_times = pd.read_pickle(TRAIN_DIR+CLK_PATH_DICT['per_day_click'])
ts_per_day_clk_times = pd.read_pickle(TEST_DIR+CLK_PATH_DICT['per_day_click'])

### Kfold Target Encode
tr_kfold_te = pd.read_pickle(TRAIN_DIR+CLK_PATH_DICT['kfold_te'])
ts_kfold_te = pd.read_pickle(TEST_DIR+CLK_PATH_DICT['kfold_te'])

### Sequence Statistic Feature
tr_seq_statistic = pd.read_pickle(TRAIN_DIR+CLK_PATH_DICT['seq_statistic'])
ts_seq_statistic = pd.read_pickle(TEST_DIR+CLK_PATH_DICT['seq_statistic'])

### CountVectorizer features

In [107]:
# Reload the per-user ad_id click sequences for train and test.
tr_click_seq = pd.read_pickle(TRAIN_DIR + CLK_PATH_DICT['ad_id'])
ts_click_seq = pd.read_pickle(TEST_DIR + CLK_PATH_DICT['ad_id'])

# The stacking frames are indexed by user_id at this point; their row order
# must match the freshly loaded click sequences exactly.
assert tr_click_seq['user_id'].values.tolist() == train_df.index.values.tolist()
assert ts_click_seq['user_id'].values.tolist() == test_df.index.values.tolist()

# Train rows first, then test rows: the matrix is later split back by position.
click_seq = pd.concat([tr_click_seq, ts_click_seq], axis=0)['ad_id'].values.tolist()

# Identity tokenizer: every entry of click_seq is handed to the vectorizer
# as-is (presumably each entry is already a list of ad_id tokens - confirm).
cntv = CountVectorizer(
    tokenizer=lambda seq: seq,
    max_df=0.9,
    min_df=30,
    lowercase=False,
)

# Raw term counts over train+test together, stored as float32.
cntv_user = cntv.fit_transform(click_seq).astype('float32')

In [121]:
# Save the count matrix to disk. cntv_user was fitted on the concatenated
# train+test sequences, so this single file under TRAIN_DIR holds both sets.
scipy.sparse.save_npz(TRAIN_DIR+'countvec.npz', cntv_user)

### Merge

In [109]:
# Join the statistic features and the gender stacking features column-wise.
# pd.concat(axis=1) aligns rows on the index.
# NOTE(review): assumes every frame shares the same user_id index - confirm.
# Both names are rebound to the merged result, so re-running this cell
# concatenates the already merged frames again.
train_df = pd.concat([tr_per_day_clk_times, tr_kfold_te, tr_seq_statistic, train_df], axis=1)
test_df = pd.concat([ts_per_day_clk_times, ts_kfold_te, ts_seq_statistic, test_df], axis=1)

In [113]:
# Convert the merged dense feature frames to CSR matrices so they can be
# stacked horizontally with the sparse count matrix in the next cell.
train_csr = sparse.csr_matrix(train_df.values)
test_csr = sparse.csr_matrix(test_df.values)

In [114]:
# Append the count-vector columns to the dense features. cntv_user holds the
# train rows first and the test rows after them; the boundary is taken from
# the train feature matrix instead of a hard-coded 900000, so the split stays
# correct if the data size changes (hstack requires equal row counts anyway).
n_train_rows = train_csr.shape[0]
train_csr = sparse.hstack((train_csr, cntv_user[:n_train_rows])).tocsr()
test_csr = sparse.hstack((test_csr, cntv_user[n_train_rows:])).tocsr()

In [115]:
# Features: merged dense + count-vector matrix. Labels: age minus one
# (presumably shifting 1-based labels to start at 0 - confirm).
X_train, y_train = train_csr, target['age']-1
X_test = test_csr

# The feature rows follow train_df (indexed by user_id) while the labels
# follow target; fail fast if the two orders ever diverge.
assert X_train.shape[0] == len(y_train), "X_train and y_train row counts differ"
assert np.array_equal(train_df.index.values, target['user_id'].values), \
    "target rows are not in the same user_id order as the feature rows"

# Metric

In [116]:
def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_true`` and ``y_pred`` agree.

    Both arguments must be sized sequences of equal length; an AssertionError
    is raised when the lengths differ. Elements are compared pairwise with
    ``==``.
    """
    assert len(y_true) == len(y_pred), "length of y_true and y_pred not equal"
    hits = sum(1 for truth, guess in zip(y_true, y_pred) if truth == guess)
    return hits / len(y_true)

# Model

In [117]:
# LightGBM settings handed to lgb.train for the 10-class model.
param = dict(
    # Task definition and evaluation metric.
    boosting_type='gbdt',
    objective='multiclass',
    num_class=10,
    metric=['multi_error'],
    # Tree size and learning rate.
    num_leaves=2**9,
    min_data_in_leaf=500,
    learning_rate=0.1,
    # Row / column subsampling.
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    # Regularisation.
    lambda_l1=0.4,
    lambda_l2=0.5,
    min_gain_to_split=0.2,
    # Runtime settings and boosting budget.
    verbose=-1,
    num_threads=6,
    n_estimators=1000,
)

In [118]:
N_SPLITS = 5
# Fixed seed so the validation fold, and therefore the reported accuracy, is
# the same on every run instead of depending on np.random.randint.
FOLD_SEED = 2020
folds = KFold(n_splits=N_SPLITS, shuffle=True, random_state=FOLD_SEED)
# Five-fold split; only the first fold is actually trained and scored because
# of the `break` at the end of the loop body.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])

    # Log every 100 rounds; stop once the validation metric has not improved
    # for 200 rounds.
    clf = lgb.train(param,
                    trn_data,
                    valid_sets = [trn_data, val_data],
                    verbose_eval = 100,
                    early_stopping_rounds = 200)

    # predict returns one row of class scores per sample; argmax picks the class.
    y_val_pred = clf.predict(X_train[val_idx])
    y_val_pred = np.argmax(y_val_pred,axis=-1).tolist()
    acc = accuracy(y_train[val_idx].values.tolist(), y_val_pred)
    print("kfold: {:d}, accuracy: {:.4f}".format(fold_+1, acc))

    # Quick check on a single fold only; remove to run all N_SPLITS folds.
    break

fold n°1




Training until validation scores don't improve for 200 rounds
[100]	training's multi_error: 0.359246	valid_1's multi_error: 0.566222
[200]	training's multi_error: 0.194715	valid_1's multi_error: 0.565972
[300]	training's multi_error: 0.0985569	valid_1's multi_error: 0.566239
Early stopping, best iteration is:
[183]	training's multi_error: 0.217036	valid_1's multi_error: 0.565367
kfold: 1, accuracy: 0.4346
