In [13]:
import tensorflow as tf
from pathlib import Path
import numpy as np

batch_size = 256
hist_size = 30
data_dict = {}
feed_dict = {}
batch_idx = 0
feature_size = 1048573
epoch = 25

def data_set(data_dict, feature, string):
        if string not in data_dict:
             data_dict[string] =[[feature]]
        else:
             if(len(data_dict[string]) < batch_idx + 1):
                 data_dict[string].append([feature])
             else:
                 data_dict[string][batch_idx].append(feature)

def input_data_set(data_dict, features, prefix=""):
    for feature in features:
        feature = feature.split(":")
        feature = int(feature[0])
        group_id = feature >> 48
        feature = feature % feature_size 
        data_set(data_dict, feature, prefix+str(group_id))

def input_hist_data_set(data_dict, hist_features, hist_group_ids, pos_group_ids, hist_size, prefix=""):
    hist_len = len(hist_features)
    if hist_features[0] == '\n' or hist_features[0] == '' or hist_features[0] == ' ':
          hist_len = 0
    for i in range(0, hist_size):
        if i < hist_len:
            features = hist_features[i].split()
            for feature in features:
                 feature = feature.split(":")
                 feature = int(feature[0])
                 group_id = feature >> 48
                 feature = feature % feature_size
                 if group_id in pos_group_ids:
                       data_set(data_dict, feature, prefix+"position_"+str(i)+"_"+str(group_id))
                 else:
                       data_set(data_dict, feature, prefix+str(i)+"_"+str(group_id))
        else:
            for group_id in hist_group_ids:
                 data_set(data_dict, 0, prefix+str(i)+"_"+str(group_id))
            for group_id in pos_group_ids:
                 data_set(data_dict, 0, prefix+"position_"+str(i)+"_"+str(group_id))
             
    if prefix+"histLen" not in data_dict:
            data_dict[prefix+"histLen"] = [hist_len]
    else:
            data_dict[prefix+"histLen"].append(hist_len)

def data_dict_sparse_feature(data_dict, string):
    index, value = [], []
    for i in range(batch_size):
           for k in range(len(data_dict[string][i])):
                index.append(np.array([i, k], dtype = np.int64))
                value.append(data_dict[string][i][k])
    iv = tf.sparse.SparseTensor(index, value, [len(data_dict[string]), feature_size])
    data_dict[string] = iv


def train_data_process(data, main_group_ids, candidate_group_ids, clicked_group_ids, unclick_group_ids, feedback_group_ids, pos_group_ids):
    global data_dict, feed_dict, batch_idx, batch_size
    data = data.split('\t')
    label = float(data[0])
    weight = float(data[1])
    features = data[2].split('|')
    main_features = features[0].split()
    candidate_features = features[1].split()
    clicked_features = features[2].split(';')
    unclick_features = features[3].split(';')
    feedback_features = features[4].split(';')
    if "label" not in data_dict:
        data_dict["label"] = [label]
    else:
        data_dict["label"].append(label)
    
    if "weight" not in data_dict:
        data_dict["weight"] = [weight]
    else:
        data_dict["weight"].append(weight)
    
    input_data_set(data_dict, main_features, "main_")
    input_data_set(data_dict, candidate_features, "candidate_")
    input_hist_data_set(data_dict, clicked_features, clicked_group_ids, pos_group_ids, hist_size, "clicked_")
    input_hist_data_set(data_dict, unclick_features, unclick_group_ids, pos_group_ids, hist_size, "unclick_")
    input_hist_data_set(data_dict, feedback_features, feedback_group_ids, pos_group_ids, hist_size, "feedback_")


def data_gen(path):
    global batch_idx, batch_size, main_group_ids, candidate_group_ids, clicked_group_ids, unclick_group_ids, feedback_group_ids, pos_group_ids
    while True:
        f = path.open(mode='r')
        line = f.readline()
        while line:
            train_data_process(line, main_group_ids, candidate_group_ids, clicked_group_ids, unclick_group_ids, feedback_group_ids, pos_group_ids)
            if batch_idx < batch_size -1: 
                batch_idx += 1
            else:
                for group_id in main_group_ids:
                    data_name = "main_" + str(group_id)
                    data_dict_sparse_feature(data_dict, data_name)
                for group_id in candidate_group_ids:
                    data_name = "candidate_" + str(group_id)
                    data_dict_sparse_feature(data_dict, data_name)
                for i in range(hist_size):
                    for group_id in clicked_group_ids:
                        data_name = "clicked_" + str(i) + "_" + str(group_id)
                        data_dict_sparse_feature(data_dict, data_name) 
                    for group_id in unclick_group_ids:
                        data_name = "unclick_" + str(i) + "_" + str(group_id)
                        data_dict_sparse_feature(data_dict, data_name) 
                    for group_id in feedback_group_ids:
                        data_name = "feedback_" + str(i) + "_" + str(group_id)
                        data_dict_sparse_feature(data_dict, data_name)
                    for group_id in pos_group_ids:   
                        data_name = "clicked_position_" + str(i) + "_" + str(group_id)
                        data_dict_sparse_feature(data_dict, data_name)
                        data_name = "unclick_position_" + str(i) + "_" + str(group_id)
                        data_dict_sparse_feature(data_dict, data_name)
                        data_name = "feedback_position_" + str(i) + "_" + str(group_id)
                        data_dict_sparse_feature(data_dict, data_name)
                data_input = {k: v for k, v in data_dict.items() if k != "label" and k != "weight"}
                yield (data_input, data_dict["label"], data_dict["weight"])
            line = f.readline()
        f.close()

In [25]:
main_group_ids=[16,10001,10002,10003,21,10006,10019,10034,20147,20148,10035,20156,61,10047,10048,10049,10050,10055,10056,60, 46, 48, 50, 122]
candidate_group_ids=[3060,3061,3062,3063,3064]
clicked_group_ids=[3060,3061,3062,3063,3064]
unclick_group_ids=[3060,3061,3062,3063,3064]
feedback_group_ids=[3060,3061,3063,3064]
pos_group_ids=[3065]

path = Path("/Volumes/D/guohao/resys/dfn/example")
a = next(data_gen(path,))

FileNotFoundError: [Errno 2] No such file or directory: '\\Volumes\\D\\guohao\\resys\\dfn\\example'

In [16]:
list(a[0].keys())

['main_16',
 'main_21',
 'main_60',
 'main_61',
 'main_20147',
 'main_46',
 'main_48',
 'main_50',
 'main_122',
 'main_10002',
 'main_10001',
 'main_10003',
 'main_10006',
 'main_10055',
 'main_10056',
 'main_10048',
 'main_20156',
 'main_10050',
 'main_10034',
 'main_10035',
 'main_10019',
 'main_20148',
 'main_10033',
 'main_10065',
 'main_10049',
 'main_10047',
 'candidate_3060',
 'candidate_3061',
 'candidate_3062',
 'candidate_3063',
 'candidate_3064',
 'candidate_3065',
 'clicked_0_3060',
 'clicked_0_3061',
 'clicked_0_3062',
 'clicked_0_3063',
 'clicked_0_3064',
 'clicked_position_0_3065',
 'clicked_1_3060',
 'clicked_1_3061',
 'clicked_1_3062',
 'clicked_1_3063',
 'clicked_1_3064',
 'clicked_position_1_3065',
 'clicked_2_3060',
 'clicked_2_3061',
 'clicked_2_3062',
 'clicked_2_3063',
 'clicked_2_3064',
 'clicked_position_2_3065',
 'clicked_3_3060',
 'clicked_3_3061',
 'clicked_3_3062',
 'clicked_3_3063',
 'clicked_3_3064',
 'clicked_position_3_3065',
 'clicked_4_3060',
 'clicke

In [21]:
tf.sparse.to_dense(a[0]["clicked_position_17_3065"])

<tf.Tensor: shape=(256, 1048573), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>

In [11]:
b = tf.sparse.SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])

In [25]:
attQ_w = tf.Variable(tf.keras.initializers.GlorotNormal()(shape=[100, 4]), name="a" + "attQ_w" + str(1), dtype=tf.float32)

In [1]:
import tensorflow as tf

In [74]:
import tensorflow as tf
from pathlib import Path
import numpy as np

batch_size = 256
hist_size = 30
data_dict = {}
feed_dict = {}
batch_idx = 0
feature_size = 1048573
epoch = 25

def data_set(data_dict, feature, string):
        if string not in data_dict:
             data_dict[string] =[[feature]]
        else:
             if(len(data_dict[string]) < batch_idx + 1):
                 data_dict[string].append([feature])
             else:
                 data_dict[string][batch_idx].append(feature)

def input_data_set(data_dict, features, prefix=""):
    global main_group_ids
    for feature in features:
        feature = feature.split(":")
        feature = int(feature[0])
        group_id = feature >> 48
        feature = feature % feature_size
        if prefix == "main_":
            if group_id not in main_group_ids:
                continue             
        data_set(data_dict, feature, prefix+str(group_id))

def input_hist_data_set(data_dict, hist_features, hist_group_ids, pos_group_ids, hist_size, prefix=""):
    hist_len = len(hist_features)
    if hist_features[0] == '\n' or hist_features[0] == '' or hist_features[0] == ' ':
          hist_len = 0
    for i in range(0, hist_size):
        if i < hist_len:
            features = hist_features[i].split()
            for feature in features:
                 feature = feature.split(":")
                 feature = int(feature[0])
                 group_id = feature >> 48
                 feature = feature % feature_size
                 if group_id in pos_group_ids:
                       data_set(data_dict, feature, prefix+"position_"+str(i)+"_"+str(group_id))
                 else:
                       data_set(data_dict, feature, prefix+str(i)+"_"+str(group_id))
        else:
            for group_id in hist_group_ids:
                 data_set(data_dict, 0, prefix+str(i)+"_"+str(group_id))
            for group_id in pos_group_ids:
                 data_set(data_dict, 0, prefix+"position_"+str(i)+"_"+str(group_id))
             
    if prefix+"histLen" not in data_dict:
            data_dict[prefix+"histLen"] = [hist_len]
    else:
            data_dict[prefix+"histLen"].append(hist_len)

def data_dict_sparse_feature(data_dict, string, dtype):
    index, value = [], []
    for i in range(batch_size):
           for k in range(len(data_dict[string][i])):
                index.append(np.array([i, k], dtype = np.int64))
                value.append(data_dict[string][i][k])
    iv = tf.sparse.SparseTensor(index, value, [len(data_dict[string]), feature_size])
    iv = tf.cast(iv, dtype=dtype)
    data_dict[string] = iv


def train_data_process(data, data_dict, main_group_ids, candidate_group_ids, clicked_group_ids, unclick_group_ids, feedback_group_ids, pos_group_ids):
    data = data.split('\t')
    label = float(data[0])
    weight = float(data[1])
    features = data[2].split('|')
    main_features = features[0].split()
    candidate_features = features[1].split()
    clicked_features = features[2].split(';')
    unclick_features = features[3].split(';')
    feedback_features = features[4].split(';')
    if "label" not in data_dict:
        data_dict["label"] = [label]
    else:
        data_dict["label"].append(label)
    
    if "weight" not in data_dict:
        data_dict["weight"] = [weight]
    else:
        data_dict["weight"].append(weight)
    
    input_data_set(data_dict, main_features, "main_")
    input_data_set(data_dict, candidate_features, "candidate_")
    input_hist_data_set(data_dict, clicked_features, clicked_group_ids, pos_group_ids, hist_size, "clicked_")
    input_hist_data_set(data_dict, unclick_features, unclick_group_ids, pos_group_ids, hist_size, "unclick_")
    input_hist_data_set(data_dict, feedback_features, feedback_group_ids, pos_group_ids, hist_size, "feedback_")


def data_gen(path):
    global batch_idx, data_dict, batch_size, main_group_ids, candidate_group_ids, clicked_group_ids, unclick_group_ids, feedback_group_ids, pos_group_ids
    while True:
        f = path.open(mode='r')
        line = f.readline()
        while line:
            train_data_process(line, data_dict, main_group_ids, candidate_group_ids, clicked_group_ids, unclick_group_ids, feedback_group_ids, pos_group_ids)
            if batch_idx < batch_size -1: 
                batch_idx += 1
            else:
                for group_id in main_group_ids:
                    data_name = "main_" + str(group_id)
                    data_dict_sparse_feature(data_dict, data_name, tf.int32)
                for group_id in candidate_group_ids:
                    data_name = "candidate_" + str(group_id)
                    data_dict_sparse_feature(data_dict, data_name, tf.int32)
                for i in range(hist_size):
                    for group_id in clicked_group_ids:
                        data_name = "clicked_" + str(i) + "_" + str(group_id)
                        data_dict_sparse_feature(data_dict, data_name, tf.int32) 
                    for group_id in unclick_group_ids:
                        data_name = "unclick_" + str(i) + "_" + str(group_id)
                        data_dict_sparse_feature(data_dict, data_name, tf.int32) 
                    for group_id in feedback_group_ids:
                        data_name = "feedback_" + str(i) + "_" + str(group_id)
                        data_dict_sparse_feature(data_dict, data_name, tf.int32)
                    for group_id in pos_group_ids:   
                        data_name = "clicked_position_" + str(i) + "_" + str(group_id)
                        data_dict_sparse_feature(data_dict, data_name, tf.int32)
                        data_name = "unclick_position_" + str(i) + "_" + str(group_id)
                        data_dict_sparse_feature(data_dict, data_name, tf.int32)
                        data_name = "feedback_position_" + str(i) + "_" + str(group_id)
                        data_dict_sparse_feature(data_dict, data_name, tf.int32)
                    data_dict["clicked_histLen"] = tf.convert_to_tensor(data_dict["clicked_histLen"], dtype=tf.float32)
                    data_dict["unclick_histLen"] = tf.convert_to_tensor(data_dict["unclick_histLen"], dtype=tf.float32)
                    data_dict["feedback_histLen"] = tf.convert_to_tensor(data_dict["feedback_histLen"], dtype=tf.float32)
                    data_dict["label"] = tf.convert_to_tensor(data_dict["label"], dtype=tf.float32)
                    data_dict["weight"] = tf.convert_to_tensor(data_dict["weight"], dtype=tf.float32)
                data_input = {k: v for k, v in data_dict.items() if k != "label" and k != "weight"}
                yield (data_input, data_dict["label"], data_dict["weight"])
                batch_idx = 0
                data_dict = {}
            line = f.readline()
        f.close()

In [75]:
main_group_ids=[16,10001,10002,10003,21,10006,10019,10034,20147,20148,10035,20156,61,10047,10048,10049,10050,10055,10056,60]
candidate_group_ids=[3060,3061,3062,3063,3064]
clicked_group_ids=[3060,3061,3062,3063,3064]
unclick_group_ids=[3060,3061,3062,3063,3064]
feedback_group_ids=[3060,3061,3063,3064]
pos_group_ids=[3065]
path = Path(r"E:\ML_study\deepctr\dfn_tf2\example")
# a = next(data_gen(path))

In [76]:
i = 0
for v in data_gen(path):
    if i <= 2:
#         print(i, v)
        i += 1
    else:
        break

In [39]:
b = next(data_gen(path))
b

TypeError: object of type 'SparseTensor' has no len()

In [56]:
def gen():
    data = []
    while 1:
        i = 0
        while i < 103:
            if len(data) < 10:
                data.append(i)
                i += 1
            else:
                yield data
                data = []

In [57]:
i = 0
for v in gen():
    if i <= 12:
        print(i, v)
        i += 1
    else:
        break

0 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
1 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
2 [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
3 [30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
4 [40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
5 [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]
6 [60, 61, 62, 63, 64, 65, 66, 67, 68, 69]
7 [70, 71, 72, 73, 74, 75, 76, 77, 78, 79]
8 [80, 81, 82, 83, 84, 85, 86, 87, 88, 89]
9 [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]
10 [100, 101, 102, 0, 1, 2, 3, 4, 5, 6]
11 [7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
12 [17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
