In [9]:
import importlib
import snrf
import pushranker
import numpy as np
from pathlib import Path
from typing import List, Dict, Tuple

In [10]:
# --- Run configuration -------------------------------------------------------
# Which push flavour to convert; also selects the `pushranker.<push_type>_push`
# module imported below.
push_type = 'scheduled'  # one of: local, targeted, scheduled
# Name of the sub-directory (under the push_type directory) holding the export.
day = '0424'

# --- Dataset locations (relative to the notebook's working directory) --------
LOCAL_DATA_ROOT = Path('../data/')
TRAIN_DATA_ROOT = str(
    LOCAL_DATA_ROOT.joinpath('train', 'edition=en_US', f'push_type={push_type}', day)
)  # can be narrowed further with a partition such as 'dt=2021-04-30-00'
# TEST_DATA_ROOT = str(LOCAL_DATA_ROOT / 'test' / 'edition=en_US' / f'push_type={push_type}') # / 'dt=2021-05-01-00')

# Training format object, resolved by dotted name inside the pushranker package.
training_format = snrf.package.get_obj_from_name(
    pushranker,
    'survival_feature_spec.binarized_format',
)

# Module that provides `input_spec` for the selected push type.
# input_module = pushranker.local_push
input_module = importlib.import_module(f'pushranker.{push_type}_push')

print(TRAIN_DATA_ROOT)
# print(TEST_DATA_ROOT)

data/train/edition=en_US/push_type=scheduled/0424


In [11]:
def prepare(root, shuffle=None):
    """Load the TFRecord files under ``root`` and prepare them for consumption.

    Args:
        root: Location passed to ``snrf.tfrecord.read_dataset_from_files``.
        shuffle: Forwarded unchanged to ``snrf.tfrecord.prepare_dataset_for_use``
            (presumably a shuffle buffer size; ``None`` is the default here --
            confirm the exact semantics against snrf).

    Returns:
        Whatever ``snrf.tfrecord.prepare_dataset_for_use`` returns for the
        notebook-level ``input_module.input_spec`` and ``training_format``.
    """
    raw_records = snrf.tfrecord.read_dataset_from_files(root)
    return snrf.tfrecord.prepare_dataset_for_use(
        raw_records,
        input_module.input_spec,
        shuffle=shuffle,
        training_format=training_format,
    )

# Build the dataset WITHOUT shuffling.
# This notebook walks `train_ds` twice: once to collect the feature matrix and
# once more to collect the labels. A shuffled tf.data pipeline draws a new
# order on every iteration by default, so the rows of train_X and train_y would
# no longer refer to the same examples. Row order is irrelevant for the
# exported .npy files, so shuffling is simply dropped here.
# NOTE(review): this assumes the un-shuffled read order is deterministic across
# iterations -- confirm that snrf does not read files with non-deterministic
# parallelism.
train_ds = prepare(TRAIN_DATA_ROOT)
# test_ds = prepare(TEST_DATA_ROOT)
type(train_ds)

tensorflow.python.data.ops.dataset_ops.PrefetchDataset

In [12]:
# Transform push-ranker data for GBDT
# Scalar features: one value per example.
numeric_features = ['predicted_ctr']

# Vector features and the per-example width recorded for each of them.
# Both public lists are derived from this single mapping so that names and
# widths stay in the same order.
_dense_feature_widths = {
    'cgScores': 10,
    'a_stats': 36,
    'af_dense': 9,
    'uf_dense': 10,
    'u_hhs': 24,
}
dense_features = list(_dense_feature_widths.keys())
dense_features_len = list(_dense_feature_widths.values())

# Sparse features are currently disabled. `build_sparse_feature_dict` reads
# `sparse_features`, so this list must be restored before that function is used.
# sparse_features = [
#     'push_id', 'u_cate','u_catev2', 'u_hist_ids', 'u_pub', 'u_pub_ctr', 'u_kw',
#     'a_catev2', 'a_features', 'a_site_id', 'a_kw', 'a_tw'
# ]

def build_numeric_feature(batch, features=None) -> List[List[float]]:
    """Collect the scalar features of every example in a batch.

    Args:
        batch: Indexable whose element 0 maps feature name -> per-example
            sequence (e.g. a 1-D array with one value per example).
        features: Feature names to extract, in output column order. When
            ``None`` the notebook-level ``numeric_features`` list is used.

    Returns:
        One list per example, holding that example's value for each requested
        feature. If the feature sequences differ in length, rows are produced
        only up to the shortest one. An empty feature list yields ``[]``.

    Raises:
        KeyError: If a requested feature is missing from ``batch[0]``.
    """
    names = numeric_features if features is None else features
    columns = [batch[0][name] for name in names]
    # zip() walks all columns in lock-step and stops at the shortest one. Unlike
    # probing indices until IndexError, it also terminates when `names` is empty
    # and does not hide IndexErrors raised for unrelated reasons.
    return [list(row) for row in zip(*columns)]
    

def build_dense_feature(batch, features=None) -> List[List[float]]:
    """Concatenate the vector features of every example in a batch.

    Args:
        batch: Indexable whose element 0 maps feature name -> per-example
            sequence of vectors (e.g. a 2-D array, one row per example).
        features: Feature names to extract, in concatenation order. When
            ``None`` the notebook-level ``dense_features`` list is used.

    Returns:
        One flat list per example: the example's vectors for all requested
        features, joined end to end. If the feature sequences differ in
        length, rows are produced only up to the shortest one. An empty
        feature list yields ``[]``.

    Raises:
        KeyError: If a requested feature is missing from ``batch[0]``.
    """
    names = dense_features if features is None else features
    columns = [batch[0][name] for name in names]
    # zip() stops at the shortest column and also terminates for an empty
    # `names`, which the previous probe-until-IndexError loop did not.
    return [
        [value for vector in row for value in vector]
        for row in zip(*columns)
    ]
    

def build_sparse_feature_dict(batch, features=None) -> List[Dict]:
    """Build one ``{feature name: value}`` dict per example in a batch.

    ``'push_id'`` is stored as ``str(value)``. Every other feature is stored
    as a list with ``str()`` of each truthy element of the example's value
    (falsy elements such as 0 or an empty string are dropped).

    Args:
        batch: Indexable whose element 0 maps feature name -> per-example
            sequence of values.
        features: Feature names to extract. When ``None`` the notebook-level
            ``sparse_features`` list is used; that list has to be defined
            first, otherwise a ``NameError`` is raised.

    Returns:
        One dict per example. If the feature sequences differ in length,
        dicts are produced only up to the shortest one. An empty feature list
        yields ``[]``.

    Raises:
        KeyError: If a requested feature is missing from ``batch[0]``.
    """
    names = sparse_features if features is None else features
    columns = [batch[0][name] for name in names]
    samples = []
    # zip() stops at the shortest column and terminates for an empty `names`,
    # which the previous probe-until-IndexError loop did not.
    for row in zip(*columns):
        sample = {}
        for name, value in zip(names, row):
            if name == 'push_id':
                sample[name] = str(value)
            else:
                # ndenumerate flattens values of any rank; only the elements
                # themselves are needed, not their indices.
                sample[name] = [str(item) for _, item in np.ndenumerate(value) if item]
        samples.append(sample)
    return samples

In [13]:
# prepare training data
# Each batch is converted to an ndarray as soon as it is built, so only one
# batch worth of Python scalar objects is alive at a time. Accumulating every
# scalar in a Python list and converting once at the end needs far more memory
# than the final array itself.
numeric_feat = list()   # per-batch 2-D arrays of numeric features
dense_feat = list()     # per-batch 2-D arrays of dense features
sparse_samples = list()

for batch in train_ds.as_numpy_iterator():
    numeric_rows = build_numeric_feature(batch)
    dense_rows = build_dense_feature(batch)
    # Skip empty batches: np.array([]) is 1-D and could not be concatenated
    # with the 2-D arrays of the other batches.
    if numeric_rows:
        numeric_feat.append(np.array(numeric_rows))
    if dense_rows:
        dense_feat.append(np.array(dense_rows))
#     sparse_samples.extend(build_sparse_feature_dict(batch))

# An empty dataset yields the same empty array that np.array([]) would give.
numeric_feature = np.concatenate(numeric_feat) if numeric_feat else np.array([])
numeric_feat.clear()
dense_feature = np.concatenate(dense_feat) if dense_feat else np.array([])
dense_feat.clear()
# dense_feature_column = [f'{df}_{i}' for df, l in zip(dense_features, dense_features_len) for i in range(l)]

# build sparse features transformer
# vec = DictVectorizer()
# sparse_feature = vec.fit_transform(sparse_samples).toarray()
# sparse_samples.clear()
# sparse_feature_column = vec.feature_names_
# sparse_feature.shape

In [14]:
# merge training data
# Place the feature groups side by side, numeric columns first.
# feature_columns = numeric_features + dense_feature_column# + sparse_feature_column
feature_blocks = (numeric_feature, dense_feature)  # , sparse_feature)
train_X = np.hstack(feature_blocks)
train_X.shape

(32984524, 90)

In [15]:
# Persist the feature matrix. np.save does not create missing parent
# directories, so make sure the output directory exists first.
gbdt_dir = LOCAL_DATA_ROOT / 'gbdt'
gbdt_dir.mkdir(parents=True, exist_ok=True)
np.save(str(gbdt_dir / f'{push_type}-trainX-{day}.npy'), train_X)

In [16]:
# prepare labels
# NOTE: this is a separate pass over `train_ds`. The labels only line up with
# the rows of the feature matrix if `train_ds` yields examples in the same
# order on every iteration (i.e. it must not reshuffle between passes).

# Create the output directory up front so a path problem surfaces before the
# long pass over the dataset rather than after it.
labels_dir = LOCAL_DATA_ROOT / 'gbdt'
labels_dir.mkdir(parents=True, exist_ok=True)

# Keep one ndarray per batch and join them once, instead of growing a Python
# list with one scalar object per example.
label_batches = [np.asarray(batch[1]) for batch in train_ds]
# An empty dataset yields the same empty array that np.array([]) would give.
train_y = np.concatenate(label_batches) if label_batches else np.array([])
print(len(train_y))
np.save(str(labels_dir / f'{push_type}-trainy-{day}.npy'), train_y)


32984524
