In [1]:
import tensorflow as tf
from tensorflow import feature_column

In [25]:
"""
numeric_column(
    key,
    shape=(1,),
    default_value=None,
    dtype=tf.float32,
    normalizer_fn=None
)
通过key对特征进行区别，产生能够将数值特征直接转化成tensor的feature_column
对数值特征可以进行自定义的normalizer_fn映射

对于default_value这个参数不甚理解
"""
samples = {
    'price':[[1.], [2.], [3.], [4.]],
    'other':[[6.], [7.], [10.], [1.]]
}

price_column = feature_column.numeric_column('price', normalizer_fn=lambda x: x*2)
other_column = feature_column.numeric_column('other', default_value=-1., normalizer_fn=lambda x: x-2)

columns=[price_column, other_column]
tensor = feature_column.input_layer(samples, columns)

with tf.Session() as sess:
    print(sess.run([tensor]))

[array([[ 4.,  2.],
       [ 5.,  4.],
       [ 8.,  6.],
       [-1.,  8.]], dtype=float32)]


In [28]:
"""
tf.feature_column.bucketized_column(
    source_column,
    boundaries
)
分桶之后将维度从1升到了2
"""
samples['years'] = [1967, 1988, 1999, 2017]
years_numeric_column = feature_column.numeric_column('years')
years_bucketized_column = feature_column.bucketized_column(years_numeric_column, boundaries=[1980, 1990, 2000])

columns = [price_column, other_column, years_bucketized_column]
tensor = feature_column.input_layer(samples, columns)

with tf.Session() as sess:
    print(sess.run([tensor]))

[array([[ 4.,  2.,  1.,  0.,  0.,  0.],
       [ 5.,  4.,  0.,  1.,  0.,  0.],
       [ 8.,  6.,  0.,  0.,  1.,  0.],
       [-1.,  8.,  0.,  0.,  0.,  1.]], dtype=float32)]


In [35]:
"""
categorical_column_with_identity(
    key,
    num_buckets,
    default_value=None
)
"""

samples['pets']=[[2,3], [3, 3], [0, 2], [1, 1]]

pets_column = feature_column.categorical_column_with_identity(key='pets', num_buckets=5)
# indicator_column接受categorical_column作为参数，直接传categorical_column会报错
# multi-hot representation of given categorical column
pets_indicator_column = feature_column.indicator_column(pets_column)

columns = [pets_indicator_column]
tensor = feature_column.input_layer(samples, columns)

with tf.Session() as sess:
    print(sess.run([tensor]))

[array([[0., 0., 1., 1., 0.],
       [0., 0., 0., 2., 0.],
       [1., 0., 1., 0., 0.],
       [0., 2., 0., 0., 0.]], dtype=float32)]


In [40]:
"""
categorical_column_with_vocabulary_list(
    key,
    vocabulary_list,
    dtype=None,
    default_value=-1,
    num_oov_buckets=0
)
这个方法就是将一个单词列表生成为分类词汇特征列的
对于oov，采用hash方式觉得位置
"""
samples['pets']=[['rabbit','pig','dog','mouse','cat'], ['rabbit','dog','dog','mouse','cat']]

pets_vl_column = tf.feature_column.categorical_column_with_vocabulary_list(
    key='pets',
    vocabulary_list=['cat','dog','rabbit','pig'], 
    dtype=tf.string, 
    default_value=-1,
    num_oov_buckets=3)

indicator = tf.feature_column.indicator_column(pets_vl_column)
tensor = tf.feature_column.input_layer(samples, [indicator])

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    print(session.run([tensor]))

[array([[1., 1., 1., 1., 0., 1., 0.],
       [1., 2., 1., 0., 0., 1., 0.]], dtype=float32)]


In [42]:
"""
categorical_column_with_hash_bucket(
    key,
    hash_bucket_size,
    dtype=tf.string
)
仍然是分箱，但是这一次我们更加关心“我希望有多少分类？”，也许我们有150个单词，但我们只希望分成100个分类，多下来50个的怎么处理？
取余数！101除以100余1，我们就把第101种单词也标记为1，和我们的第1种单词变成了同一类，如此类推，第102种和2种同属第2类,第103种和3种同属第3类...
这看起来是错误的，不过很多时候tensorflow还是能够利用其他的特征列把它们区分开。所以，为了有效减少内存和计算时间，可以这么做。
"""
samples['colors'] = ['green','red','blue','yellow','pink','blue','red','indigo']

colors_column = tf.feature_column.categorical_column_with_hash_bucket(
        key='colors',
        hash_bucket_size=5,
    )

indicator = tf.feature_column.indicator_column(colors_column)
tensor = tf.feature_column.input_layer(samples, [indicator])

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    print(session.run([tensor]))


[array([[0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]], dtype=float32)]


In [48]:
# NOTE: 'longtitude' (sic) is kept as-is because it is both the feature key and
# the name of the variables defined in this cell.
samples['longtitude'] = [19, 61, 30, 9, 45]
samples['latitude'] = [45, 40, 72, 81, 24]
longtitude = tf.feature_column.numeric_column(key='longtitude')
latitude = tf.feature_column.numeric_column(key='latitude')

# Two boundaries -> three buckets per coordinate.
longtitude_b_c = tf.feature_column.bucketized_column(
    source_column=longtitude, boundaries=[33, 66])
latitude_b_c = tf.feature_column.bucketized_column(
    source_column=latitude, boundaries=[33, 66])

# crossed_column combines the two bucketized columns into a single categorical
# feature with 12 hash buckets. In the recorded output, rows that share the
# same (longitude bucket, latitude bucket) pair - the 3rd and 4th rows - light
# up the same crossed column, while rows with different pairs light up
# different ones.
ll_column = tf.feature_column.crossed_column(
    keys=[longtitude_b_c, latitude_b_c], hash_bucket_size=12)

indicator = tf.feature_column.indicator_column(ll_column)
tensor = tf.feature_column.input_layer(samples, [indicator])
only_longtitude = tf.feature_column.input_layer(samples, [longtitude_b_c])
only_latitude = tf.feature_column.input_layer(samples, [latitude_b_c])

with tf.Session() as session:
    for init_op in (tf.global_variables_initializer(), tf.tables_initializer()):
        session.run(init_op)
    # Print the crossed one-hot first, then each bucketized coordinate on its own.
    for fetched in (tensor, only_longtitude, only_latitude):
        print(session.run([fetched]))


[array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]], dtype=float32)]
[array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.]], dtype=float32)]
[array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)]


In [51]:
"""
embedding_column(
    categorical_column,
    dimension,
    combiner='mean',
    initializer=None,
    ckpt_to_load_from=None,
    tensor_name_in_ckpt=None,
    max_norm=None,
    trainable=True
)
"""

pets_f_c = tf.feature_column.categorical_column_with_vocabulary_list(
    'pets',
    ['cat','dog','rabbit','pig'], 
    dtype=tf.string, 
    default_value=-1)

column = tf.feature_column.embedding_column(pets_f_c, 5)

indicator = tf.feature_column.indicator_column(pets_f_c)

tensor = tf.feature_column.input_layer(samples, [column])
one_hot_tensor = tf.feature_column.input_layer(samples, [indicator])

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())

    print(session.run([tensor]))
    print(session.run([one_hot_tensor]))

[array([[-0.00882747, -0.05484571, -0.13822883, -0.04347168,  0.00400347],
       [ 0.22994633, -0.1435978 ,  0.18737102, -0.21302213,  0.25726706]],
      dtype=float32)]
[array([[1., 1., 1., 1.],
       [1., 2., 1., 0.]], dtype=float32)]
