### Changes in v1:
- Removed the GBDT model training code

In [1]:
import seaborn as sns # for correlation heatmap
import numpy as np
import pandas as pd
import tensorflow as tf
import xgboost as xgb
import pushranker
import datetime
import snrf
import importlib
import random
import pickle
import collections
import concurrent.futures
from matplotlib import pyplot as plt
from copy import deepcopy
from pathlib import Path
from xgboost import XGBClassifier, XGBRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score
from typing import List, Dict, Tuple, DefaultDict
from snrf.feature_column import PairEmbeddingsFusionFeature

In [9]:
importlib.reload(pushranker.targeted_push_gbdt)
importlib.reload(pushranker.scheduled_push_gbdt)
importlib.reload(pushranker.scheduled_push)
importlib.reload(snrf.sparse_util)
importlib.reload(snrf.gbdt)

<module 'snrf.gbdt' from '/home/ec2-user/john.wu/push/sn-ranker-framework/container/snrf/gbdt.py'>

In [3]:
# Notebook-wide switches: `push_type` selects the data sub-directory and the
# pushranker module / hyperparameters; `model_type` selects which branch below
# builds the model and how `prepare` shapes the dataset.
push_type = 'scheduled' # local, targeted, scheduled
model_type = 'GBDT' # GBDT, DNN

# NOTE(review): `train_days` is only referenced in a commented-out line of a
# later cell.
train_days = 2
train_day = '0622'
test_day = '0623'

# Local data layout: <root>/<split>/edition=en_US/push_type=<push_type>/<day>
LOCAL_DATA_ROOT = Path('./data/')
TRAIN_DATA_ROOT = str(LOCAL_DATA_ROOT / 'train' / 'edition=en_US' / f'push_type={push_type}' / train_day ) # / 'dt=2021-04-30-00')
TEST_DATA_ROOT = str(LOCAL_DATA_ROOT / 'test' / 'edition=en_US' / f'push_type={push_type}' / test_day) # / 'dt=2021-05-01-00')

# Resolved by dotted name inside the pushranker package; passed as
# `training_format` when `prepare` builds a DNN dataset.
training_format = snrf.package.get_obj_from_name(pushranker, 'survival_feature_spec.binarized_format_devicetoken')

if model_type == 'DNN':
    # Build the DNN assembly for this push type, restore the hazard model's
    # weights from the local ./model directory, then initialize for training.
    # input_module = pushranker.local_push
    input_module = importlib.import_module(f'pushranker.{push_type}_push')
    hps = getattr(pushranker.hyperparam, push_type)
    push_ranker_model = pushranker.model.make_model_assembly(hps, input_module.input_spec)
    push_ranker_model.hazard_model.load_weights(f'./model/{push_type}-push/push-{model_type}/{train_day}/pushranker')
    pushranker.model.initialize_for_training(push_ranker_model, hps)
    
elif model_type == 'GBDT':
    # LightGBM ranker architecture from snrf; the GBDT variant of the input
    # module and the `<push_type>_us` hyperparameters are used.
    architecture = snrf.architectures.ranker_architectures.LightGbmRankerArchitecture
    
    input_module = importlib.import_module(f'pushranker.{push_type}_push_gbdt')
    hps = getattr(pushranker.gbdt_hyperparam, f'{push_type}_us')
    # Feature columns handed to both the sparsifier and the model assembly.
    # The two PairEmbeddingsFusionFeature entries combine 'u_tt_emb' and
    # 'a_tt_emb' using the mode named by their last argument.
    features = [
        'u_cate',
        'u_catev2',
        'u_hhs',
        'u_tt_emb',
        'uf_dense_gbdt',
        'a_catev2',
        # 'a_site_id',
        'a_stats_gbdt',
        'pub_stats_gbdt',
        'af_dense_gbdt',
        'a_tt_emb',
        'cgScores',
        'a_u_dense_gbdt',
        PairEmbeddingsFusionFeature('tt_emb_em', ['u_tt_emb', 'a_tt_emb'], 'elementwise_multiply'),
        PairEmbeddingsFusionFeature('tt_emb_dot', ['u_tt_emb', 'a_tt_emb'], 'dot_product'),
    ]
    # NOTE(review): `sparsifier` and `total_dim` are not used by any other
    # visible cell; `assembly` is what `prepare`, `fit` and `evaluate` consume.
    sparsifier, total_dim = snrf.sparse_util.make_csr_sparsifier(input_module.input_spec, hps, features)
    build_state = snrf.model_components_common.ModelBuildState.build(spec=input_module.input_spec, hyperparams=hps)
    assembly = architecture.make_model_assembly(build_state, hps, features)
    

# Echo the resolved paths and input module as a sanity check.
print(TRAIN_DATA_ROOT)
print(TEST_DATA_ROOT)
print(input_module)

data/train/edition=en_US/push_type=scheduled/0622
data/test/edition=en_US/push_type=scheduled/0623
<module 'pushranker.scheduled_push_gbdt' from '/home/ec2-user/john.wu/push/push-ranker/container/pushranker/scheduled_push_gbdt.py'>


In [20]:
def prepare(root, shuffle=None):
    """Read the TFRecord files under ``root`` and shape them for ``model_type``.

    Args:
        root: Location passed to ``snrf.tfrecord.read_dataset_from_files``.
        shuffle: Forwarded as ``shuffle=`` to
            ``snrf.tfrecord.prepare_dataset_for_use`` in DNN mode; not used
            in GBDT mode.

    Returns:
        The dataset prepared for the active model type. When ``model_type``
        is neither ``'DNN'`` nor ``'GBDT'`` the raw dataset is returned
        unchanged.
    """
    raw = snrf.tfrecord.read_dataset_from_files(root)

    if model_type == 'DNN':
        return snrf.tfrecord.prepare_dataset_for_use(
            raw,
            input_module.input_spec,
            shuffle=shuffle,
            training_format=training_format)

    if model_type == 'GBDT':
        # Only a 100_000-element prefix is handed to the LightGBM wrapper
        # (presumably tf.data `take` semantics -- confirm).
        subset = raw.take(100_000)
        wrapper = snrf.gbdt.LightGbmFrameworkWrapper
        return wrapper.prepare_dataset_for_use(
            subset,
            spec=input_module.input_spec,
            assembly=assembly,
            training_format=input_module.default_format_fun)

    return raw

# train_ds = prepare(TRAIN_DATA_ROOT, 10000)
test_ds = prepare(TEST_DATA_ROOT)
type(test_ds)

lightgbm.basic.Dataset

## Check data

In [22]:
snrf.gbdt.LightGbmFrameworkWrapper.fit(assembly, ds=test_ds)



[LightGBM] [Info] Number of positive: 6076, number of negative: 93924
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 117058
[LightGBM] [Info] Number of data points in the train set: 100000, number of used features: 603
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.060760 -> initscore=-2.738139
[LightGBM] [Info] Start training from score -2.738139


In [24]:
importlib.reload(snrf.gbdt)
eval_result = snrf.gbdt.LightGbmFrameworkWrapper.evaluate(assembly=assembly, ds=test_ds)

NotImplementedError: 

In [19]:
g = tds.get_group()
g

In [8]:
## GBDT dataset
len(test_ds)

TypeError: object of type 'Dataset' has no len()

In [5]:
## DNN dataset
devtok1 = list()
label = list()
devtok2 = list()

for feature_batch, label_batch, weight_batch, devtok_batch in test_ds.as_numpy_iterator():
#     print(len(feature_batch['device_token']))
#     print(len(label_batch))
#     print(len(devtok_batch))
#     print()
    for dtk1, l, dtk2 in zip(feature_batch['device_token'], label_batch, devtok_batch):
#         print(f'{dtk1}, {l}, {dtk2}')
        devtok1.append(dtk1)
        label.append(l)
        devtok2.append(dtk2)
# devtok = np.array(devtok)
# label = np.array(label)
print(len(devtok1))
print(len(label))
print(len(devtok2))

15958186
15958186
15958186


## push-GBDT ranker

In [11]:
## push-ranker
model_name = 'pushranker'

# Both branch bodies are empty in this version of the notebook; `pass` keeps
# the cell syntactically valid until the model-specific code is filled in.
if model_type == 'DNN':
    pass
elif model_type == 'GBDT':
    pass



In [None]:
%%time
dnn_pred_1 = dnn_model_1.training_adapter.predict(test_ds)

In [None]:
dnn_pred_ctr_1 = np.array([s[0] for s in dnn_pred_1])

In [None]:
## DNN push-ranker
# dnn_model_name = f'{train_days}-day'
dnn_model_name = f'6-day--{train_day}'

hps = getattr(pushranker.hyperparam, push_type)
dnn_model_2 = pushranker.model.make_model_assembly(hps, input_module.less_ids_input_spec)
dnn_model_2.hazard_model.load_weights(f'./model/{push_type}-push/DNN/{dnn_model_name}/pushranker')
pushranker.model.initialize_for_training(dnn_model_2, hps)

In [None]:
%%time
dnn_pred_2 = dnn_model_2.training_adapter.predict(test_ds)

In [None]:
dnn_pred_ctr_2 = np.array([s[0] for s in dnn_pred_2])

In [None]:
devtok = list()
label = list()
fy_gbdt_ctr = list()

for feature_batch, label_batch, devtok_batch in test_ds.as_numpy_iterator():
    for f, l, dtk in zip(feature_batch['predicted_ctr'], label_batch, devtok_batch):
        fy_gbdt_ctr.append(f)
        label.append(l)
        devtok.append(dtk[0])
devtok = np.array(devtok)
label = np.array(label)
fy_gbdt_ctr = np.array(fy_gbdt_ctr)

## Predicted CTR Distribution

In [None]:
plt.figure(figsize=(10, 5), dpi=300)
# _ = plt.hist(fy_gbdt_ctr, bins=1000, color='r', label='FY GBDT')
# _ = plt.hist(gbdt_pred_ctr, bins=1000, color='g', label='GBDT push ranker')
_ = plt.hist(dnn_pred_ctr_1, bins=1000, color='b', label='DNN push ranker (1 day)')
_ = plt.hist(dnn_pred_ctr_2, bins=1000, color='y', label='DNN push ranker (6 days)')


plt.xlabel('CTR')
plt.ylabel('Count')
plt.title("Predicted CTR Distribution")
plt.legend(loc="upper right")
plt.show()

## ROC AUC

In [None]:
fpr_gbdt, tpr_gbdt, thresholds_gbdt = roc_curve(test_y, gbdt_pred_ctr)
fpr_fy_gbdt, tpr_fy_gbdt, thresholds_fy_gbdt = roc_curve(label, fy_gbdt_ctr)
fpr_dnn1, tpr_dnn1, thresholds_dnn1 = roc_curve(label, dnn_pred_ctr_1)
fpr_dnn2, tpr_dnn2, thresholds_dnn2 = roc_curve(label, dnn_pred_ctr_2)

gbdt_auc = auc(fpr_gbdt, tpr_gbdt)
fy_gbdt_auc = auc(fpr_fy_gbdt, tpr_fy_gbdt)
dnn1_auc = auc(fpr_dnn1, tpr_dnn1)
dnn2_auc = auc(fpr_dnn2, tpr_dnn2)

print('GBDT push ranker AUC: ', gbdt_auc)
print('FY GBDT AUC: ', fy_gbdt_auc)
print('DNN1 push ranker AUC: ', dnn1_auc)
print('DNN2 push ranker AUC: ', dnn2_auc)


In [None]:
# Plot ROC
plt.figure(figsize=(8, 8), dpi=300)
lw = 2
plt.plot(fpr_fy_gbdt, tpr_fy_gbdt, color='r',
         lw=lw, label='FY GBDT (area = %0.4f)' % fy_gbdt_auc)
plt.plot(fpr_gbdt, tpr_gbdt, color='g',
         lw=lw, label='GBDT push ranker (area = %0.4f)' % gbdt_auc)
plt.plot(fpr_dnn1, tpr_dnn1, color='b',
         lw=lw, label='DNN 1 push ranker (area = %0.4f)' % dnn1_auc)
plt.plot(fpr_dnn2, tpr_dnn2, color='y',
         lw=lw, label='DNN 2 push ranker (area = %0.4f)' % dnn2_auc)
plt.plot([0, 1], [0, 1], color='k', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic curve')
plt.legend(loc="lower right")
plt.show()

## Average Precision

In [None]:
pre_gbdt, rec_gbdt, thresholds_gbdt = precision_recall_curve(test_y, gbdt_pred_ctr)
pre_fy_gbdt, rec_fy_gbdt, thresholds_fy_gbdt = precision_recall_curve(label, fy_gbdt_ctr)
pre_dnn1, rec_dnn1, thresholds_dnn1 = precision_recall_curve(label, dnn_pred_ctr_1)
pre_dnn2, rec_dnn2, thresholds_dnn2 = precision_recall_curve(label, dnn_pred_ctr_2)

gbdt_ap = average_precision_score(test_y, gbdt_pred_ctr)
fy_gbdt_ap = average_precision_score(label, fy_gbdt_ctr)
dnn1_ap = average_precision_score(label, dnn_pred_ctr_1)
dnn2_ap = average_precision_score(label, dnn_pred_ctr_2)

print('GBDT AP: {0:0.4f}'.format(gbdt_ap))
print('FY GBDT AP: {0:0.4f}'.format(fy_gbdt_ap))
print('DNN 1 AP: {0:0.4f}'.format(dnn1_ap))
print('DNN 2 AP: {0:0.4f}'.format(dnn2_ap))

In [None]:
# Plot PR
plt.figure(figsize=(8, 8), dpi=300)
lw = 2
plt.plot(rec_fy_gbdt, pre_fy_gbdt, color='r',
         lw=lw, label='FY GBDT (area = %0.4f)' % fy_gbdt_ap)
plt.plot(rec_gbdt, pre_gbdt, color='g',
         lw=lw, label='GBDT push ranker (area = %0.4f)' % gbdt_ap)
plt.plot(rec_dnn1, pre_dnn1, color='b',
         lw=lw, label='DNN 1 push ranker (area = %0.4f)' % dnn1_ap)
plt.plot(rec_dnn2, pre_dnn2, color='y',
         lw=lw, label='DNN 2 push ranker (area = %0.4f)' % dnn2_ap)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve')
plt.legend(loc="upper right")
plt.show()

## Overall top-k Precision

In [None]:
gbdt_score_label = list()
fygbdt_score_label = list()
dnn1_score_label = list()
dnn2_score_label = list()
length = len(label)

for i in range(length):
    gbdt_score_label.append((gbdt_pred_ctr[i], label[i]))
    fygbdt_score_label.append((fy_gbdt_ctr[i], label[i]))
    dnn1_score_label.append((dnn_pred_ctr_1[i], label[i]))
    dnn2_score_label.append((dnn_pred_ctr_2[i], label[i]))    

gbdt_score_label.sort(reverse=True)
fygbdt_score_label.sort(reverse=True)
dnn1_score_label.sort(reverse=True)
dnn2_score_label.sort(reverse=True)

In [None]:
topk = [1, 2, 5, 10, 20, 50, 100, 1000, 10000, 100000]

for i in topk:
    print(f'top-{i} precision:')
    print('fy gbdt: {0:0.4f}'.format(sum((l for _, l in fygbdt_score_label[:i]))/i))
    print('gbdt: {0:0.4f}'.format(sum((l for _, l in gbdt_score_label[:i]))/i))    
    print('dnn1: {0:0.4f}'.format(sum((l for _, l in dnn1_score_label[:i]))/i))
    print('dnn2: {0:0.4f}'.format(sum((l for _, l in dnn2_score_label[:i]))/i))    

In [None]:
topk = [10000]

for i in topk:
    print(f'top-{i} precision:')
    print('fy gbdt: {0:0.4f}'.format(sum((l for _, l in fygbdt_score_label[:i]))/i))
    print('gbdt: {0:0.4f}'.format(sum((l for _, l in gbdt_score_label[:i]))/i))
    print('dnn1: {0:0.4f}'.format(sum((l for _, l in dnn1_score_label[:i]))/i))
    print('dnn2: {0:0.4f}'.format(sum((l for _, l in dnn2_score_label[:i]))/i))    

## gAUC

In [None]:
# len_samples = len(devtok)
# user_fy_gbdt = collections.defaultdict(list)
# user_push_gbdt = collections.defaultdict(list)
# user_dnn1 = collections.defaultdict(list)
# user_dnn2 = collections.defaultdict(list)
# for i in range(len_samples):
#     user_fy_gbdt[devtok[i]].append((fy_gbdt_ctr[i], label[i]))
#     user_push_gbdt[devtok[i]].append((gbdt_pred_ctr[i], label[i]))
#     user_dnn1[devtok[i]].append((dnn_pred_ctr_1[i], label[i]))
#     user_dnn2[devtok[i]].append((dnn_pred_ctr_2[i], label[i]))
# assert len(user_fy_gbdt) == len(user_push_gbdt) == len(user_dnn1) == len(user_dnn2)

In [None]:
%%time
len_samples = len(devtok)
user_fy_gbdt = collections.defaultdict(lambda: collections.defaultdict(list))
# user_push_gbdt = collections.defaultdict(lambda: collections.defaultdict(list))
user_dnn1 = collections.defaultdict(lambda: collections.defaultdict(list))
user_dnn2 = collections.defaultdict(lambda: collections.defaultdict(list))
for i in range(len_samples):
    user_fy_gbdt[devtok[i]]['score'].append(fy_gbdt_ctr[i])
    user_fy_gbdt[devtok[i]]['label'].append(label[i])
#     user_push_gbdt[devtok[i]]['score'].append(gbdt_pred_ctr[i])
#     user_push_gbdt[devtok[i]]['label'].append(label[i])
    user_dnn1[devtok[i]]['score'].append(dnn_pred_ctr_1[i])
    user_dnn1[devtok[i]]['label'].append(label[i])
    user_dnn2[devtok[i]]['score'].append(dnn_pred_ctr_2[i])
    user_dnn2[devtok[i]]['label'].append(label[i])    
# assert len(user_fy_gbdt) == len(user_push_gbdt) == len(user_dnn1) == len(user_dnn2)

In [None]:
# number of users
len(user_fy_gbdt)

In [None]:
%%time

## get to-be-evaluated users
# dicts = [user_fy_gbdt, user_push_gbdt, user_dnn1, user_dnn2]
dicts = [user_fy_gbdt, user_dnn1, user_dnn2]

def getTBEUsers(d) -> Dict:
    """Return the to-be-evaluated users: those whose labels contain both classes.

    A user is dropped when the sum of its ``'label'`` list is 0 or equals the
    list length (all negative, all positive, or empty), because a per-user
    ROC AUC cannot be computed for a single class.

    Args:
        d: Mapping of user key -> mapping with at least a ``'label'`` list.

    Returns:
        A new mapping of the same type as ``d`` (``d.copy()`` is the starting
        point) holding deep copies of the retained users' entries. ``d``
        itself is not modified.
    """
    # Start from a shallow copy so the mapping type (e.g. a defaultdict and
    # its factory) is kept, then deep-copy only the users that survive the
    # filter instead of deep-copying everything and discarding most of it.
    rd = d.copy()
    for user, candidates in d.items():
        labels = candidates['label']
        positives = sum(labels)
        if positives == 0 or positives == len(labels):
            del rd[user]
        else:
            rd[user] = deepcopy(candidates)
    return rd

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(getTBEUsers, param) for param in dicts]
user_fy_gbdt = futures[0].result()
# user_push_gbdt = futures[1].result()
user_dnn1 = futures[1].result()
user_dnn2 = futures[2].result()

In [None]:
# number of users to be evaluated
print(len(user_fy_gbdt))
# print(len(user_push_gbdt))
print(len(user_dnn1))
print(len(user_dnn2))

In [None]:
user_fy_gbdt == user_dnn1

In [None]:
def gAUC(userNews: DefaultDict) -> Tuple[List[float], List[int], List[int], List[int]]:
    """Compute a ROC AUC per user and bucket the degenerate results.

    Users whose labels are all negative or all positive (or empty) are
    skipped, since ROC AUC needs both classes.

    Args:
        userNews: Mapping of user key -> mapping with ``'score'`` and
            ``'label'`` lists of equal length.

    Returns:
        A 4-tuple ``(gauc, zero, half, one)``:
        ``gauc`` holds one AUC per evaluated user; ``zero``, ``half`` and
        ``one`` hold the candidate counts of the users whose AUC is exactly
        0, 0.5 and 1 respectively.
    """
    gauc = []
    zero = []
    half = []
    one = []
    for d in userNews.values():
        score = d['score']
        label = d['label']
        n_candidates = len(label)
        positives = sum(label)
        if positives > 0 and positives != n_candidates:
            fpr, tpr, _ = roc_curve(label, score)
            a = auc(fpr, tpr)
            # Record how many candidates the user had for each exact AUC
            # value of interest.
            if a == 0:
                zero.append(n_candidates)
            elif a == 0.5:
                half.append(n_candidates)
            elif a == 1:
                one.append(n_candidates)
            gauc.append(a)
    return gauc, zero, half, one
#     return sum(gauc)/len(gauc)

In [None]:
%%time
# models = [user_fy_gbdt, user_push_gbdt, user_dnn1, user_dnn2]
models = [user_fy_gbdt, user_dnn1, user_dnn2]
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(gAUC, param) for param in models]

In [None]:
gauc_fy_gbdt, fygbdt_z, fygbdt_h, fygbdt_o = futures[0].result()
# gauc_push_gbdt, pushgbdt_z, pushgbdt_h, pushgbdt_o = futures[1].result()
gauc_dnn1, dnn1_z, dnn1_h, dnn1_o = futures[1].result()
gauc_dnn2, dnn2_z, dnn2_h, dnn2_o = futures[2].result()

In [None]:
print(len(fygbdt_z), len(fygbdt_h), len(fygbdt_o))
# print(len(pushgbdt_z), len(pushgbdt_h), len(pushgbdt_o))
print(len(dnn1_z), len(dnn1_h), len(dnn1_o))
print(len(dnn2_z), len(dnn2_h), len(dnn2_o))

In [None]:
plt.figure(figsize=(10, 5), dpi=200)
_ = plt.hist(fygbdt_z, bins=50, alpha=0.5, color='r', label='gAUC == 0')
_ = plt.hist(fygbdt_o, bins=50, alpha=0.5, color='b', label='gAUC == 1')
_ = plt.hist(fygbdt_h, bins=50, alpha=0.5, color='g', label='gAUC == 0.5')

plt.xlabel('#candidates')
plt.ylabel('Count')
plt.title("#candidates Distribution")
plt.legend(loc="upper right")
plt.show()

In [None]:
plt.figure(figsize=(10, 5), dpi=200)
_ = plt.hist(dnn2_z, bins=50, alpha=0.5, color='r', label='gAUC == 0')

_ = plt.hist(dnn2_o, bins=50, alpha=0.5, color='b', label='gAUC == 1')
_ = plt.hist(dnn2_h, bins=50, alpha=0.5, color='g', label='gAUC == 0.5')

plt.xlabel('#candidates')
plt.ylabel('Count')
plt.title("#candidates Distribution")
plt.legend(loc="upper right")
plt.show()

In [None]:
# print(len(gauc_fy_gbdt), len(gauc_push_gbdt), len(gauc_dnn1))#, len(gauc_dnn2))
print(len(gauc_fy_gbdt), len(gauc_dnn1), len(gauc_dnn2))

In [None]:
# plt.figure(figsize=(10, 5), dpi=300)
# _ = plt.hist(gauc_fy_gbdt, bins=100, color='r', alpha=0.5, label='fy-gbdt')
# _ = plt.hist(gauc_push_gbdt, bins=100, color='g', alpha=0.5, label='push-gbdt')

# plt.xlabel('gAUC')
# plt.ylabel('Count')
# plt.title("gAUC Distribution")
# plt.legend(loc="best")
# plt.show()

In [None]:
plt.figure(figsize=(10, 5), dpi=300)
_ = plt.hist(gauc_fy_gbdt, bins=100, color='r', alpha=0.5, label='fy-gbdt')
_ = plt.hist(gauc_dnn1, bins=100, color='g', alpha=0.5, label='dnn1')
_ = plt.hist(gauc_dnn2, bins=100, color='b', alpha=0.5, label='dnn2')

plt.xlabel('gAUC')
plt.ylabel('Count')
plt.title("gAUC Distribution")
plt.legend(loc="best")
plt.show()

In [None]:
print(sum(gauc_fy_gbdt)/len(gauc_fy_gbdt))
# print(sum(gauc_push_gbdt)/len(gauc_push_gbdt))
print(sum(gauc_dnn1)/len(gauc_dnn1))
print(sum(gauc_dnn2)/len(gauc_dnn2))

## Distribution of the number of candidates for different gAUC values (0, 0.5, 1)

In [None]:
%%time

## users filtering out: all candidates are True or all candidates are False
def _per_user_aucs(user_news):
    """Return one ROC AUC per user whose labels contain both classes."""
    aucs = []
    for candidates in user_news.values():
        # Names are kept function-local so the notebook-level `label` and
        # `score` arrays (e.g. the one used by the positive-sample-rate cell)
        # are not overwritten with a single user's values.
        user_scores = candidates['score']
        user_labels = candidates['label']
        positives = sum(user_labels)
        if positives > 0 and positives != len(user_labels):
            fpr, tpr, _ = roc_curve(user_labels, user_scores)
            aucs.append(auc(fpr, tpr))
    return aucs


gauc_fy_gbdt = _per_user_aucs(user_fy_gbdt)
print(sum(gauc_fy_gbdt)/len(gauc_fy_gbdt))

# `user_push_gbdt` is never built in this notebook (its construction is
# commented out), so the push-GBDT evaluation is disabled as well.
# gauc_push_gbdt = _per_user_aucs(user_push_gbdt)
# print(sum(gauc_push_gbdt)/len(gauc_push_gbdt))

gauc_dnn1 = _per_user_aucs(user_dnn1)
print(sum(gauc_dnn1)/len(gauc_dnn1))

gauc_dnn2 = _per_user_aucs(user_dnn2)
print(sum(gauc_dnn2)/len(gauc_dnn2))

In [None]:
%%time
# Concurrent process

def gAUC(userNews: DefaultDict) -> float:
    """Return the mean per-user ROC AUC (gAUC).

    Users whose labels are all negative or all positive (or empty) are
    skipped, since ROC AUC needs both classes.

    Args:
        userNews: Mapping of user key -> mapping with ``'score'`` and
            ``'label'`` lists of equal length.

    Returns:
        The unweighted mean of the per-user AUCs, or ``nan`` when no user
        has both classes (instead of raising ``ZeroDivisionError``).
    """
    gauc = []
    for d in userNews.values():
        score = d['score']
        label = d['label']
        s = sum(label)
        if s > 0 and s != len(label):
            fpr, tpr, _ = roc_curve(label, score)
            gauc.append(auc(fpr, tpr))
    if not gauc:
        # Nothing to average: report "undefined" rather than dividing by zero.
        return float('nan')
    return sum(gauc)/len(gauc)
       
# `user_push_gbdt` is never built in this notebook (its construction is
# commented out), so only the three available models are evaluated.
# models = [user_fy_gbdt, user_push_gbdt, user_dnn1, user_dnn2]
models = [user_fy_gbdt, user_dnn1, user_dnn2]
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(gAUC, param) for param in models]
# Results are printed in the same order as `models`.
for f in futures:
    print(f.result())

In [None]:
# topk = [1, 2, 3]

# def getMeanTopkPrecision(d):
#     for k in topk:
#         p = 0
#         for _, scores_labels in d.items():
#             scores_labels.sort(reverse=True)
#             p += sum(l for _, l in scores_labels[:k])/k
#         p /= len(d)
#         yield p
       
# print(f'fy-gbdt:', list(getMeanTopkPrecision(user_fy_gbdt)))
# print(f'push-gbdt:', list(getMeanTopkPrecision(user_push_gbdt)))
# print(f'dnn:', list(getMeanTopkPrecision(user_dnn)))

In [None]:
# positive sample rate
sum(label)/len(label)

In [None]:
# average number of samples per user
# Each value is a {'score': [...], 'label': [...]} mapping, so count its
# labels; len() of the mapping itself would only count its keys.
sum(len(candidates['label']) for candidates in user_fy_gbdt.values())/len(user_fy_gbdt)

In [None]:
# how many users have more than 1 article
# Each value of user_fy_gbdt is a {'score': [...], 'label': [...]} mapping;
# the number of articles for a user is the length of its label list.
labels_per_user = [candidates['label'] for candidates in user_fy_gbdt.values()]
print(f'#total users: {len(user_fy_gbdt)}')
print(f'#users have 1 article: {sum(1 for labels in labels_per_user if len(labels) == 1)}')
print(f'#users have 1 article and it is pos: {sum(1 for labels in labels_per_user if len(labels) == 1 and sum(labels) > 0)}')
print(f'#users have 2 article: {sum(1 for labels in labels_per_user if len(labels) == 2)}')
print(f'#users have more than 2 article: {sum(1 for labels in labels_per_user if len(labels) > 2)}')

In [None]:
# filter out users 