## TensorFlow 特征工程演示

In [1]:
import tensorflow as tf
from tensorflow.python.estimator.inputs import numpy_io
import numpy as np
import collections
from tensorflow.python.framework import errors
from tensorflow.python.platform import test
from tensorflow.python.training import coordinator
from tensorflow import feature_column

from tensorflow.python.feature_column.feature_column import _LazyBuilder

### bucketized_column

In [2]:
def test_bucketized_column():
    """Demonstrate ``bucketized_column`` on a numeric ``price`` feature.

    Builds a numeric column, buckets it with the boundaries
    ``[0, 10, 20, 30, 40]``, feeds four sample rows through ``input_layer``
    and prints the evaluated dense tensor (wrapped in a one-element list).

    Returns:
        None. The result is only printed.
    """
    boundaries = [0, 10, 20, 30, 40]
    features = {'price': [[5.], [15.], [25.], [35.]]}  # 4 sample rows

    numeric_price = feature_column.numeric_column('price')
    bucketized_price = feature_column.bucketized_column(numeric_price, boundaries)

    # input_layer turns the bucketized column into a dense tensor.
    dense_buckets = feature_column.input_layer(features, [bucketized_price])

    with tf.Session() as sess:
        evaluated = sess.run([dense_buckets])
        print(evaluated)

In [3]:
# Run the bucketized_column demo; it prints the evaluated dense tensor.
test_bucketized_column()

[array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.]], dtype=float32)]


### categorical_column_with_vocabulary_list

In [4]:
def test_categorical_column_with_vocabulary_list():
    """Demonstrate ``categorical_column_with_vocabulary_list``.

    Two things are printed for a ``color`` feature with two values per row:

    1. The sparse id tensor produced by the column for the vocabulary
       ``['R', 'G', 'B']`` with ``default_value=-1``.
    2. After the line ``'use input_layer' + '_' * 40``, the dense tensor
       obtained by wrapping the column in ``indicator_column`` and passing
       it through ``input_layer``.

    Returns:
        None. The results are only printed.
    """
    features = {'color': [['R', 'R'], ['G', 'R'], ['B', 'G'], ['A', 'A']]}  # 4 sample rows
    lazy_builder = _LazyBuilder(features)

    vocab_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1
    )
    sparse_tensors = vocab_column._get_sparse_tensors(lazy_builder)

    with tf.Session() as sess:
        # Variables first, then the vocabulary lookup table.
        for init_op in (tf.global_variables_initializer(), tf.tables_initializer()):
            sess.run(init_op)

        sparse_ids = sess.run([sparse_tensors.id_tensor])
        print(sparse_ids)

    # Convert the sparse representation to dense, i.e. one-hot form,
    # except that here it is multi-hot (several values per row).
    indicator = feature_column.indicator_column(vocab_column)
    dense_tensor = feature_column.input_layer(features, [indicator])

    with tf.Session() as sess:
        for init_op in (tf.global_variables_initializer(), tf.tables_initializer()):
            sess.run(init_op)

        print('use input_layer' + '_' * 40)
        dense_values = sess.run([dense_tensor])
        print(dense_values)

In [5]:
# Run the vocabulary-list demo; it prints the sparse ids and then the dense tensor.
test_categorical_column_with_vocabulary_list()

[SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [1, 0],
       [1, 1],
       [2, 0],
       [2, 1],
       [3, 0],
       [3, 1]]), values=array([ 0,  0,  1,  0,  2,  1, -1, -1]), dense_shape=array([4, 2]))]
use input_layer________________________________________
[array([[2., 0., 0.],
       [1., 1., 0.],
       [0., 1., 1.],
       [0., 0., 0.]], dtype=float32)]


### categorical_column_with_hash_bucket

In [6]:
def test_categorical_column_with_hash_bucket():
    """Demonstrate ``categorical_column_with_hash_bucket``.

    Hashes a single-valued ``color`` feature into 7 buckets and prints:

    1. The sparse id tensor produced by the column.
    2. After the line ``'use input_layer' + '_' * 40``, the dense tensor
       obtained through ``indicator_column`` and ``input_layer``.

    Returns:
        None. The results are only printed.
    """
    features = {'color': [['R'], ['G'], ['B'], ['A']]}  # 4 sample rows
    lazy_builder = _LazyBuilder(features)

    hashed_column = feature_column.categorical_column_with_hash_bucket('color', 7)
    sparse_tensors = hashed_column._get_sparse_tensors(lazy_builder)

    with tf.Session() as sess:
        for init_op in (tf.global_variables_initializer(), tf.tables_initializer()):
            sess.run(init_op)

        sparse_ids = sess.run([sparse_tensors.id_tensor])
        print(sparse_ids)

    # Convert the sparse representation to dense, i.e. one-hot form
    # (it would be multi-hot if a row carried several values).
    indicator = feature_column.indicator_column(hashed_column)
    dense_tensor = feature_column.input_layer(features, [indicator])

    with tf.Session() as sess:
        for init_op in (tf.global_variables_initializer(), tf.tables_initializer()):
            sess.run(init_op)

        print('use input_layer' + '_' * 40)
        dense_values = sess.run([dense_tensor])
        print(dense_values)

In [7]:
# Run the hash-bucket demo; it prints the sparse ids and then the dense tensor.
test_categorical_column_with_hash_bucket()

[SparseTensorValue(indices=array([[0, 0],
       [1, 0],
       [2, 0],
       [3, 0]]), values=array([5, 2, 6, 3]), dense_shape=array([4, 1]))]
use input_layer________________________________________
[array([[0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0.]], dtype=float32)]


### embedding_column

In [8]:
def test_embedding():
    """Demonstrate ``embedding_column`` on a vocabulary-based ``color`` column.

    Wraps a vocabulary-list categorical column (vocabulary
    ``['R', 'G', 'B']``, ``default_value=-1``) in an ``embedding_column``
    with 8 as its second argument, evaluates it through ``input_layer``
    and prints the result after the line ``'embeding' + '_' * 40``.

    Returns:
        None. The result is only printed.
    """
    features = {'color': [['R'], ['G'], ['B'], ['A']]}  # 4 sample rows

    vocab_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1
    )
    embedded_column = feature_column.embedding_column(vocab_column, 8)
    embedded_tensor = feature_column.input_layer(features, [embedded_column])

    with tf.Session() as sess:
        # The embedding weights are variables and the vocabulary is a table,
        # so both initializers are run before evaluation.
        for init_op in (tf.global_variables_initializer(), tf.tables_initializer()):
            sess.run(init_op)

        print('embeding' + '_' * 40)
        embedded_values = sess.run([embedded_tensor])
        print(embedded_values)

In [9]:
# Run the embedding_column demo; it prints the evaluated embedding tensor.
test_embedding()

embeding________________________________________
[array([[-0.48452854,  0.00072224,  0.47728175,  0.58868843, -0.29272023,
        -0.24645145, -0.11571419, -0.0120658 ],
       [-0.08538716,  0.0464833 , -0.04951673, -0.07914135,  0.31585735,
         0.29965273, -0.01657117,  0.07479684],
       [ 0.13528614, -0.07837002, -0.5130413 , -0.4060412 , -0.21372862,
        -0.12371349,  0.38646778, -0.1806267 ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ]], dtype=float32)]


### weighted_categorical_column

In [10]:
def test_weighted_categorical_column():
    """Demonstrate ``weighted_categorical_column``.

    Pairs a vocabulary-based ``color`` column with the ``weight`` feature,
    then prints the sparse id tensor and the sparse weight tensor, separated
    by a line of 40 dashes.

    Returns:
        None. The results are only printed.
    """
    features = {
        'color': [['R'], ['G'], ['B'], ['A']],
        'weight': [[1.0], [2.0], [4.0], [8.0]],
    }  # 4 sample rows

    vocab_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1
    )
    weighted_column = feature_column.weighted_categorical_column(vocab_column, 'weight')

    lazy_builder = _LazyBuilder(features)

    # The column yields a pair: the category ids and their weights.
    ids, weights = weighted_column._get_sparse_tensors(lazy_builder)

    with tf.Session() as sess:
        for init_op in (tf.global_variables_initializer(), tf.tables_initializer()):
            sess.run(init_op)

        print('weighted categorical' + '-' * 40)

        print(sess.run([ids]))
        print('-' * 40)
        print(sess.run([weights]))

In [11]:
# Run the weighted-categorical demo; it prints the id tensor and then the weight tensor.
test_weighted_categorical_column()

weighted categorical----------------------------------------
[SparseTensorValue(indices=array([[0, 0],
       [1, 0],
       [2, 0],
       [3, 0]]), values=array([ 0,  1,  2, -1]), dense_shape=array([4, 1]))]
----------------------------------------
[SparseTensorValue(indices=array([[0, 0],
       [1, 0],
       [2, 0],
       [3, 0]]), values=array([1., 2., 4., 8.], dtype=float32), dense_shape=array([4, 1]))]


### crossed_column

In [12]:
def test_crossed_column():
    """Demonstrate ``crossed_column`` on two categorical features.

    Crosses the vocabulary-based ``price`` and ``color`` columns with 16 as
    the second argument to ``crossed_column``, wraps the cross in
    ``indicator_column``, evaluates it through ``input_layer`` and prints
    the result after the line ``'use input_layer' + '_' * 40``.

    Returns:
        None. The result is only printed.
    """
    features = {
        'price': [['A', 'A'], ['B', 'D'], ['C', 'A']],
        'color': [['R', 'R'], ['G', 'G'], ['B', 'B']]
    }

    price_column = feature_column.categorical_column_with_vocabulary_list(
        'price', ['A', 'B', 'C', 'D'])
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'])

    price_x_color = feature_column.crossed_column([price_column, color_column], 16)
    crossed_indicator = feature_column.indicator_column(price_x_color)
    crossed_dense = feature_column.input_layer(features, [crossed_indicator])

    with tf.Session() as sess:
        for init_op in (tf.global_variables_initializer(), tf.tables_initializer()):
            sess.run(init_op)

        print('use input_layer' + '_' * 40)
        crossed_values = sess.run([crossed_dense])
        print(crossed_values)

In [13]:
# Run the crossed_column demo; it prints the evaluated dense tensor.
test_crossed_column()

use input_layer________________________________________
[array([[0., 0., 0., 0., 0., 0., 0., 0., 4., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 2., 0., 0., 0., 0., 2., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0., 0., 2., 0.]],
      dtype=float32)]
