In [16]:
import tensorflow as tf


In [17]:
# Index(['userid', 'albumid', 'id', 'actor_x', 'area_x', 'channelid_x', 'cpid_x',
#        'director_x', 'language_x', 'paytype_x', 'score', 'tag_x', 'year',
#        'updatetype', 'actor_y', 'director_y', 'area_y', 'channelid_y',
#        'cpid_y', 'tag_y', 'paytype_y', 'language_y', 'score_x', 'score_y',
#        'year_x', 'year_y'],
#       dtype='object')

_FEATURE_LEN = {
    'language': 20,
    'channelid': 16,
    'paytype': 2,
    'updatetype': 2,
    'cpid': 6,
    'area': 187,
    'director': 15124,
    'actor': 15231,
    'tag': 9430
}

_FEATURE_EMBEDDING_SIZE = {
    'language': 2,
    'channelid': 2,
    'paytype': 2,
    'updatetype': 2,
    'cpid': 4,
    'area': 16,
    'director': 32,
    'actor': 32,
    'tag': 32
}


updatetype = tf.feature_column.categorical_column_with_vocabulary_list(
            'updatetype', range(_FEATURE_LEN['updatetype']))
channelid = tf.feature_column.categorical_column_with_vocabulary_list(
            'channelid_x', range(_FEATURE_LEN['channelid']))
area = tf.feature_column.categorical_column_with_vocabulary_list(
            'area_x', range(_FEATURE_LEN['area']))
cpid = tf.feature_column.categorical_column_with_vocabulary_list(
            'cpid_x', range(_FEATURE_LEN['cpid']))
actor = tf.feature_column.categorical_column_with_vocabulary_list(
            'actor_x', range(_FEATURE_LEN['actor']))
director = tf.feature_column.categorical_column_with_vocabulary_list(
            'director_x', range(_FEATURE_LEN['director']))
language = tf.feature_column.categorical_column_with_vocabulary_list(
            'language_x', range(_FEATURE_LEN['language']))
paytype = tf.feature_column.categorical_column_with_vocabulary_list(
            'paytype_x', range(_FEATURE_LEN['paytype']))

tag = tf.feature_column.categorical_column_with_vocabulary_list(
            'tag_x', range(_FEATURE_LEN['tag']))
year = tf.feature_column.numeric_column('year')


channelid1 = tf.feature_column.categorical_column_with_vocabulary_list(
            'channelid_y', range(_FEATURE_LEN['channelid']))
area1 = tf.feature_column.categorical_column_with_vocabulary_list(
            'area_y', range(_FEATURE_LEN['area']))
cpid1 = tf.feature_column.categorical_column_with_vocabulary_list(
            'cpid_y', range(_FEATURE_LEN['cpid']))
actor1 = tf.feature_column.categorical_column_with_vocabulary_list(
            'actor_y', range(_FEATURE_LEN['actor']))
director1 = tf.feature_column.categorical_column_with_vocabulary_list(
            'director_y', range(_FEATURE_LEN['director']))
language1 = tf.feature_column.categorical_column_with_vocabulary_list(
            'language_y', range(_FEATURE_LEN['language']))
paytype1 = tf.feature_column.categorical_column_with_vocabulary_list(
            'paytype_y', range(_FEATURE_LEN['paytype']))

tag1 = tf.feature_column.categorical_column_with_vocabulary_list(
            'tag_y', range(_FEATURE_LEN['tag']))

score = tf.feature_column.numeric_column('score')
score_x = tf.feature_column.numeric_column('score_x')
score_y = tf.feature_column.numeric_column('score_y')

year = tf.feature_column.numeric_column('year')
year_x = tf.feature_column.numeric_column('year_x')
year_y = tf.feature_column.numeric_column('year_y')


deep_columns = [    
    score,
    score_x,
    score_y,
    year,
    year_x,
    year_y
]

emb_columns = [
    tf.feature_column.shared_embedding_columns
        ([channelid, channelid1], dimension=_FEATURE_EMBEDDING_SIZE['channelid']),
    tf.feature_column.shared_embedding_columns
        ([area, area1], dimension=_FEATURE_EMBEDDING_SIZE['area']),
    tf.feature_column.shared_embedding_columns
        ([cpid, cpid1], dimension=_FEATURE_EMBEDDING_SIZE['cpid']),
    tf.feature_column.shared_embedding_columns
        ([actor, actor1], dimension=_FEATURE_EMBEDDING_SIZE['actor']),
    tf.feature_column.shared_embedding_columns
        ([director, director1], dimension=_FEATURE_EMBEDDING_SIZE['director']),
    tf.feature_column.shared_embedding_columns
        ([language, language1], dimension=_FEATURE_EMBEDDING_SIZE['language']),
    tf.feature_column.shared_embedding_columns
        ([paytype, paytype1], dimension=_FEATURE_EMBEDDING_SIZE['paytype']),
    tf.feature_column.shared_embedding_columns
        ([tag, tag1], dimension=_FEATURE_EMBEDDING_SIZE['tag'])
]

for col in emb_columns:
    deep_columns.extend(col)


base_columns = [
    channelid,
    area,
    cpid,
    actor,
    director,
    language,
    paytype,
    channelid1,
    area1,
    cpid1,
    actor1,
    director1,
    language1,
    paytype1,
]

_HASH_BUCKET_SIZE = 1024

cross_columns = [
    tf.feature_column.crossed_column(
    ['actor_x', 'actor_y'], hash_bucket_size=_HASH_BUCKET_SIZE),
    tf.feature_column.crossed_column(
    ['director_x', 'director_y'], hash_bucket_size=_HASH_BUCKET_SIZE),
    tf.feature_column.crossed_column(
    ['cpid_x', 'cpid_y'], hash_bucket_size=_HASH_BUCKET_SIZE),
    tf.feature_column.crossed_column(
    ['tag_x', 'tag_y'], hash_bucket_size=_HASH_BUCKET_SIZE)
]

wide_columns = base_columns + cross_columns 




In [18]:
import pandas as pd
train_data = pd.read_json('train_data')

In [19]:
# Index(['userid', 'albumid', 'id', 'actor_x', 'area_x', 'channelid_x', 'cpid_x',
#        'director_x', 'language_x', 'paytype_x', 'score', 'tag_x', 'year',
#        'updatetype', 'actor_y', 'director_y', 'area_y', 'channelid_y',
#        'cpid_y', 'tag_y', 'paytype_y', 'language_y', 'score_x', 'score_y',
#        'year_x', 'year_y'],
#       dtype='object')

In [20]:
MULTI_FEATURE_COLUMNS = {'actor_x', 'area_x', 'channelid_x', 'cpid_x', 
                         'director_x', 'language_x', 'paytype_x', 'tag_x', 'year',
                         'updatetype', 'actor_y', 'director_y', 'area_y', 'channelid_y',
                         'cpid_y', 'tag_y', 'paytype_y', 'language_y'}
VALUE_FEATURE_COLUMNS = {'score', 'year', 'score_x', 'score_y','year_x', 'year_y', 'label'}


In [21]:
train_data.drop_duplicates()
train_data = train_data.head(3000)

In [22]:
data_dict = {}

In [23]:
for key in MULTI_FEATURE_COLUMNS:
    l = [str(v).split(',') for v in train_data[key].tolist()]
    l = [[int(v) for v in vv] for vv in l]
    maxl = max([len(word) for word in l])
    print('{} + {}'.format(key, maxl))
    for v in l:
        while len(v) < maxl:
            v.append(-1)
    data_dict[key] = l

director_y + 598
actor_x + 20
area_x + 2
tag_y + 1170
tag_x + 13
cpid_x + 1
cpid_y + 598
channelid_x + 1
language_x + 1
director_x + 2
paytype_x + 1
year + 1
actor_y + 598
channelid_y + 598
area_y + 598
paytype_y + 598
updatetype + 1
language_y + 598


In [24]:
for key in VALUE_FEATURE_COLUMNS:
    data_dict[key] = train_data[key].tolist()

In [25]:
hidden_units = [16, 20]

  # Create a tf.estimator.RunConfig to ensure the model is run on CPU, which
  # trains faster than GPU for this model.
run_config = tf.estimator.RunConfig().replace(
  session_config=tf.ConfigProto(device_count={'GPU': 0}))
model_dir = './modeldir'


model = tf.estimator.DNNLinearCombinedClassifier(
    model_dir=model_dir,
    n_classes=2,
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=hidden_units,
    config=run_config)

INFO:tensorflow:Using config: {'_model_dir': './modeldir', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': device_count {
  key: "GPU"
  value: 1
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001AC1464C080>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [26]:
test_samples = len(data_dict['actor_x'])

In [27]:
test_samples = 18465
label = [1 for _ in range(3000)]

In [28]:
# dataset = tf.data.Dataset.from_tensors((data_dict, label))

In [29]:
def input_fn_train():
    dataset = tf.data.Dataset.from_tensor_slices((data_dict, label))
    dataset = dataset.batch(30).repeat()
    print(dataset)
    return dataset.make_one_shot_iterator().get_next()

In [31]:
import gc
gc.collect()

model.train(input_fn=input_fn_train, steps=2000)

<RepeatDataset shapes: ({director_y: (?, 598), actor_x: (?, 20), area_x: (?, 2), tag_y: (?, 1170), tag_x: (?, 13), cpid_x: (?, 1), cpid_y: (?, 598), channelid_x: (?, 1), language_x: (?, 1), director_x: (?, 2), paytype_x: (?, 1), year: (?,), actor_y: (?, 598), channelid_y: (?, 598), area_y: (?, 598), paytype_y: (?, 598), updatetype: (?, 1), language_y: (?, 598), year_y: (?,), year_x: (?,), score_y: (?,), score: (?,), score_x: (?,), label: (?,)}, (?,)), types: ({director_y: tf.int32, actor_x: tf.int32, area_x: tf.int32, tag_y: tf.int32, tag_x: tf.int32, cpid_x: tf.int32, cpid_y: tf.int32, channelid_x: tf.int32, language_x: tf.int32, director_x: tf.int32, paytype_x: tf.int32, year: tf.int32, actor_y: tf.int32, channelid_y: tf.int32, area_y: tf.int32, paytype_y: tf.int32, updatetype: tf.int32, language_y: tf.int32, year_y: tf.int32, year_x: tf.float32, score_y: tf.float32, score: tf.int32, score_x: tf.float32, label: tf.int32}, tf.int32)>
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:D

KeyboardInterrupt: 

In [3]:
dataset = tf.data.Dataset.from_tensors({'key': [[1], [2]], 'value': [[2], [3]]})

In [4]:
dataset.repeat(32)

<RepeatDataset shapes: {key: (2, 1), value: (2, 1)}, types: {key: tf.int32, value: tf.int32}>

In [5]:
dataset.batch(32)

<BatchDataset shapes: {key: (?, 2, 1), value: (?, 2, 1)}, types: {key: tf.int32, value: tf.int32}>

In [6]:
import pandas as pd

In [30]:
data = pd.read_json('/Users/qianjay/video_parsed.dat')
data.head()



Unnamed: 0,Updatetype,actor,albumname,area,channelid,cpid,director,id,language,paytype,score,tag,year
0,0,9850,庆元明代银矿探秘,65,0,0,9675,02|3294913,0,0,7.6,7676,2017
1,0,9850,习近平就俄罗斯军机坠毁事件向普京致慰问电,65,1,0,9675,02|3164963,0,0,8.2,6478875,2016
2,0,9850,走一线看经济 大有可为的甜蜜事业,65,1,0,9675,02|3561132,0,0,8.0,0,2018
3,0,9850,壹起去旅行,65,1,0,9675,02|12271,1,0,8.0,58917379,2016
4,0,9850,德云社：桃花女破周公（郭德纲）,65,1,0,4128,02|966626,2,0,6.1,8976,2013


In [31]:
data = data[['id', 'actor', 'Updatetype', 'area', 'channelid', 'cpid', 'director', 'language', 'paytype', 'score', 'tag', 'year']]
data['updatetype'] = data['Updatetype']
data = data.drop(['Updatetype'], axis=1)

In [32]:
data.head()

Unnamed: 0,id,actor,area,channelid,cpid,director,language,paytype,score,tag,year,updatetype
0,02|3294913,9850,65,0,0,9675,0,0,7.6,7676,2017,0
1,02|3164963,9850,65,1,0,9675,0,0,8.2,6478875,2016,0
2,02|3561132,9850,65,1,0,9675,0,0,8.0,0,2018,0
3,02|12271,9850,65,1,0,9675,1,0,8.0,58917379,2016,0
4,02|966626,9850,65,1,0,4128,2,0,6.1,8976,2013,0


In [33]:
SINGLE_FEATURE_COLUMN = {'channelid', 'cpid', 'language', 'paytype'}
data_dict = {}
for key in SINGLE_FEATURE_COLUMN:
    l = [[v] for v in data[key].tolist()]
    data_dict[key] = l

In [34]:
dataset = tf.data.Dataset.from_tensors(data_dict)

In [35]:
dataset

<TensorDataset shapes: {paytype: (171962, 1), language: (171962, 1), cpid: (171962, 1), channelid: (171962, 1)}, types: {paytype: tf.int32, language: tf.int32, cpid: tf.int32, channelid: tf.int32}>

In [36]:
MULTI_FEATURE_COLUMNS = {'actor', 'area', 'director', 'tag'}
for key in MULTI_FEATURE_COLUMNS:
    l = [str(v).split(',') for v in data[key].tolist()]
    l = [[int(v) for v in vv] for vv in l]
    maxl = max([len(word) for word in l])
    print(maxl)
    for v in l:
        while len(v) < maxl:
            v.append(-1)
    data_dict[key] = l




54
63
8
23


In [38]:
data_dict.keys()

dict_keys(['paytype', 'language', 'cpid', 'channelid', 'actor', 'director', 'area', 'tag'])

In [28]:
dataset = tf.data.Dataset.from_tensors(data_dict)

In [29]:
dataset

<TensorDataset shapes: {paytype: (171962, 1), language: (171962, 1), cpid: (171962, 1), channelid: (171962, 1), actor: (171962, 54), director: (171962, 63), area: (171962, 8), tag: (171962, 23)}, types: {paytype: tf.int32, language: tf.int32, cpid: tf.int32, channelid: tf.int32, actor: tf.int32, director: tf.int32, area: tf.int32, tag: tf.int32}>

In [25]:
data.head()

Unnamed: 0,id,actor,area,channelid,cpid,director,language,paytype,score,tag,year,updatetype
0,02|3294913,9850,65,0,0,9675,0,0,7.6,7676,2017,0
1,02|3164963,9850,65,1,0,9675,0,0,8.2,6478875,2016,0
2,02|3561132,9850,65,1,0,9675,0,0,8.0,0,2018,0
3,02|12271,9850,65,1,0,9675,1,0,8.0,58917379,2016,0
4,02|966626,9850,65,1,0,4128,2,0,6.1,8976,2013,0


In [22]:
[len(str(v).split(',')) for v in data['actor']]

for v in data['actor']:
    if len(str(v).split(',')) == 54:
        print(v)


9940,6485,4376,1209,14021,8288,13115,9043,6068,6717,1899,5097,12724,8364,3038,895,383,9632,14989,5135,4738,4601,13202,2021,13121,5084,6393,2396,2793,5392,2372,12470,9154,4031,865,12190,7232,9569,3144,14654,13306,9136,9905,1003,6204,974,9887,9581,559,9300,3814,12608,10994,4992
