In [1]:
import tensorflow as tf


# Schema handed to tf.io.parse_single_example: it tells the parser the type of every feature.
# FixedLenFeature([], dtype) is a single scalar value; VarLenFeature(dtype) is a
# variable-length list of values.
feature_description = { # feature name -> parsing spec
    'id': tf.io.FixedLenFeature([], tf.string),  # scalar string
    'tag_id': tf.io.VarLenFeature(tf.int64),  # variable-length list of int64
    'category_id': tf.io.FixedLenFeature([], tf.int64),  # scalar int64
    'title': tf.io.FixedLenFeature([], tf.string),  # scalar string
    'asr_text': tf.io.FixedLenFeature([], tf.string),  # scalar string
    'frame_feature': tf.io.VarLenFeature(tf.string)  # variable-length list of byte strings
}


def read_and_decode(example_string):
    '''
    Parse one serialized training example read from a TFRecord file.

    The example is decoded against the module-level ``feature_description`` and
    every field is converted with ``.numpy()``. The two variable-length fields
    ('tag_id' and 'frame_feature') are densified before conversion.

    Args:
        example_string: one serialized example, as yielded by a TFRecordDataset.

    Returns:
        Tuple ``(id, tag_id, category_id, frame_feature, title, asr_text)``.
    '''
    parsed = tf.io.parse_single_example(example_string, feature_description)

    def scalar_value(key):
        # Fixed-length scalar field -> plain NumPy/Python value.
        return parsed[key].numpy()

    def dense_value(key):
        # Variable-length field: convert the sparse tensor to dense first.
        return tf.sparse.to_dense(parsed[key]).numpy()

    return (
        scalar_value('id'),
        dense_value('tag_id'),
        scalar_value('category_id'),
        dense_value('frame_feature'),
        scalar_value('title'),
        scalar_value('asr_text'),
    )

import glob
def get_all_data(path):
    '''
    Load every record of the TFRecord file(s) matching ``path`` into memory.

    Args:
        path: glob pattern (or plain file path) of the TFRecord file(s),
            e.g. 'data/pairwise/pairwise.tfrecords'.

    Returns:
        dict mapping each decoded id (str) to a dict with the keys 'tag_id',
        'category_id', 'frame_feature', 'title' and 'asr_text'. If an id occurs
        more than once, the record read last wins.
    '''
    # glob.glob returns matches in arbitrary order; sorting makes the read order
    # (and therefore which duplicate id wins) identical on every run.
    filenames = sorted(glob.glob(path))
    print(filenames)
    dataset = tf.data.TFRecordDataset(filenames)
    datas = {}
    for record in dataset:
        raw_id, tag_id, category_id, frame_feature, title, asr_text = read_and_decode(record)
        # The id is parsed as bytes; decode it so it can be looked up with str keys.
        datas[raw_id.decode()] = {
            'tag_id': tag_id,
            'category_id': category_id,
            'frame_feature': frame_feature,
            'title': title,
            'asr_text': asr_text,
        }
    return datas

# Load the whole pairwise dataset into memory as a dict keyed by id.
datas = get_all_data('data/pairwise/pairwise.tfrecords')

2021-09-18 12:58:37.463120: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


['data/pairwise/pairwise.tfrecords']


2021-09-18 12:58:38.492991: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-09-18 12:58:38.498273: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-18 12:58:38.498667: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce GTX 1080 Ti computeCapability: 6.1
coreClock: 1.6325GHz coreCount: 28 deviceMemorySize: 10.92GiB deviceMemoryBandwidth: 451.17GiB/s
2021-09-18 12:58:38.498703: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-09-18 12:58:38.501770: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2021-09-18 12:58:38.501993: I tensorflow/stream_executor/platf

In [2]:
# Number of distinct ids loaded from the TFRecord file(s).
len(datas)

63613

In [7]:
# Read the labelled pairs: every non-empty line is "<id_1>\t<id_2>\t<similarity>".
label_path = 'data/pairwise/label.tsv'
all_pair_data = []
# `with` guarantees the file is closed, even if a malformed line raises while parsing.
with open(label_path, encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. extra newlines at the end of the file).
            continue
        id_1, id_2, sim = line.split('\t')
        sim = float(sim)
        all_pair_data.append([id_1, id_2, sim])

In [4]:
# label_path_sup = 'data/pairwise/label_sup_0.9'
# f = open(label_path_sup)
# all_pair_data_sup = []
# for line in f:
#     id_1, id_2, sim = line.strip().split('\t')
#     sim = float(sim)
#     all_pair_data_sup.append([id_1, id_2, sim])

In [8]:
# Shuffle the labelled pairs in place with a fixed seed, so the train/validation
# split taken from this list is reproducible.
# NOTE(review): this comment used to say the "top 6000" pairs become the validation
# set, but the split cell slices [12000:18000] - confirm which is intended.
import random

random.seed(42)
# print(all_pair_data[:10])
# NOTE: shuffle mutates all_pair_data. Re-running this cell without reloading the
# labels shuffles an already-shuffled list and produces a different order.
random.shuffle(all_pair_data)
# print(all_pair_data[:10])


In [9]:
# Hold out the shuffled pairs at indices [start, end) for validation; every other
# pair goes to the training set.
# NOTE(review): write_tfrecord writes into 'data/pairwise/6000-11999val/', a name
# that does not match this 12000-18000 index range - confirm which one is intended.
start, end = 12000, 18000
val_pair_data = all_pair_data[start:end]
train_pair_data = all_pair_data[:start]+all_pair_data[end:]#+all_pair_data_sup

In [10]:
# Number of training pairs left after removing the validation slice.
len(train_pair_data)

61899

In [11]:
# Shuffle the training pairs in place. No re-seeding happens here, so the result
# depends on the RNG state left by the earlier random.seed(42) / shuffle cell.
random.shuffle(train_pair_data)

In [14]:
from tqdm import tqdm

def _bytes_feature(values):
    '''Wrap a list of bytes objects in a tf.train.Feature.'''
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))


def _int64_feature(values):
    '''Wrap a list of integers in a tf.train.Feature.'''
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))


def _item_features(item_id, suffix):
    '''Build the features of one pair member; every key is given the ending ``suffix``.'''
    record = datas[item_id]
    return {
        'id' + suffix: _bytes_feature([item_id.encode()]),
        'tag_id' + suffix: _int64_feature(list(record['tag_id'])),
        'frame_feature' + suffix: _bytes_feature(record['frame_feature'].tolist()),
        'category_id' + suffix: _int64_feature([record['category_id']]),
        'title' + suffix: _bytes_feature([record['title']]),
        'asr_text' + suffix: _bytes_feature([record['asr_text']]),
    }


def write_tfrecord(pair_datas, split, out_dir='data/pairwise/6000-11999val/'):
    '''
    Serialize labelled pairs into ``<out_dir>/<split>.tfrecord``.

    Each pair becomes one tf.train.Example holding the features of both members
    (keys ending in '_1' and '_2') plus the float feature 'sim'. Member features
    are looked up by id in the module-level ``datas`` dict.

    Args:
        pair_datas: iterable of ``[id_1, id_2, sim]`` items (str, str, float).
        split: file name stem of the output, e.g. 'train' or 'val'.
        out_dir: directory to write into; created if it does not exist. The
            default keeps the location this notebook originally wrote to.

    Raises:
        KeyError: if an id of a pair is not present in ``datas``.
    '''
    import os

    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    write_path = os.path.join(out_dir, split + '.tfrecord')
    # The context manager closes (and flushes) the writer even if a lookup or
    # write fails part-way through, so no dangling file handle is left behind.
    with tf.io.TFRecordWriter(write_path) as writer:
        for id_1, id_2, sim in tqdm(pair_datas): # [id_1, id_2, sim] [str, str, float]
            feature = {}  # tf.train.Feature dict for this pair
            feature.update(_item_features(id_1, '_1'))
            feature.update(_item_features(id_2, '_2'))
            feature['sim'] = tf.train.Feature(float_list=tf.train.FloatList(value=[sim]))
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())



In [15]:
# Serialize both splits; write_tfrecord names the files 'val.tfrecord' and 'train.tfrecord'.
write_tfrecord(val_pair_data, 'val')
write_tfrecord(train_pair_data, 'train')

100%|██████████| 6000/6000 [00:01<00:00, 3726.50it/s]
100%|██████████| 61899/61899 [00:17<00:00, 3534.92it/s]


In [16]:
# Sanity check: decode and print only the first record of the validation file.
filename = 'data/pairwise/6000-11999val/val.tfrecord'
dataset = tf.data.TFRecordDataset(filename)
for line in dataset.take(1):
    example_proto = tf.train.Example.FromString(line.numpy())
    print(example_proto)

features {
  feature {
    key: "asr_text_1"
    value {
      bytes_list {
        value: "nan"
      }
    }
  }
  feature {
    key: "asr_text_2"
    value {
      bytes_list {
        value: "nan"
      }
    }
  }
  feature {
    key: "category_id_1"
    value {
      int64_list {
        value: 12202
      }
    }
  }
  feature {
    key: "category_id_2"
    value {
      int64_list {
        value: 12202
      }
    }
  }
  feature {
    key: "frame_feature_1"
    value {
      bytes_list {
        value: "\221$\376\255b\260\332\255]2s6\306\260\360\246\237\254\346\253>44\261\246\260\032\250\253=\216\251!\260\003\253\343\257\3429\300\261\t\257|\256|4\006\262\024\260\272\230O)\n(\020$\034\262\017A\2049--\3331T0\323\253`0\202\256\215 \014.\335-\031\261p8+\260\202\262!\255^\261\330\250@\254\003\2613\262\301\254.\260\307\261\254;\025\260\271+\000\255\370\240O\260G\244\243\261C\262[\260\344.\326\260\337\260\0001\016+\2614\247\261\332\234\001\257T-\315\251\276\260\375\260<\261R\260\257