In [4]:
import glob
import os

import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
from sklearn import datasets
# Show the installed TensorFlow version so the run environment is recorded in the output.
print(tf.__version__)

1.2.0


In [None]:
# Generate a synthetic 4-class problem: 100,000 samples with 20 features, of
# which 15 are informative, and 2 clusters per class.
# random_state is fixed so that re-running the notebook regenerates the same
# dataset instead of a different random draw each time.
X, y = datasets.make_classification(
    n_classes=4,
    n_features=20,
    n_clusters_per_class=2,
    n_informative=15,
    n_samples=100000,
    random_state=42,
)

In [None]:
# Wrap the generated feature matrix in a DataFrame so it can be sampled and written to CSV.
data = pd.DataFrame(X)

In [None]:
# Attach the class labels as a 'target' column (this mutates `data` in place).
data['target'] = y

In [None]:
# Column names: the 20 feature names followed by 'target'.
# NOTE(review): `header` is not referenced by any other cell visible here —
# confirm whether it is still needed.
header = 'f01,f02,f03,f04,f05,f06,f07,f08,f09,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,target'.split(',')
# Cell output: the number of column names.
len(header)

In [None]:
def save_data_files(partitions_count, pattern):
    """Write random samples of the global ``data`` frame as CSV partitions.

    Each partition is an independent 15% sample of ``data``. It is written
    without a header row to ``data/gen-<pattern>-<NN>.csv``, where ``NN`` is
    the 1-based partition number zero-padded to two digits. The DataFrame
    index is written as the first column of every row.

    Args:
        partitions_count: Number of partition files to write.
        pattern: Tag embedded in the file names (for example "train").
    """
    output_dir = "data"
    # to_csv does not create missing directories, so make sure the target
    # directory exists before the first partition is written.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    for i in range(partitions_count):
        partition = data.sample(frac=0.15)
        partition_path = "{}/gen-{}-{}.csv".format(output_dir, pattern, str(i + 1).zfill(2))
        partition.to_csv(partition_path, header=False)
        

In [None]:
# Write 10 "train" and 5 "test" partitions. Every partition is an independent
# 15% sample of the same `data` frame, so rows can repeat across partitions
# and between the train and test sets.
save_data_files(10,"train")
save_data_files(5,"test")

In [None]:
%%bash

# List the contents of the data directory.
ls 'data'

In [3]:
# Schema of the CSV partitions: a leading row key, the numeric features, then
# the class label. The per-column lists below are derived from the feature
# names so they cannot drift out of sync with each other.
KEY_COLUMN_NAME = 'key'
TARGET_NAME = 'target'
NUMERIC_FEATURE_NAMES = 'f01,f02,f03,f04,f05,f06,f07,f08,f09,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20'.split(',')
FEATURE_NAMES = NUMERIC_FEATURE_NAMES

# Column names in file order.
HEADER = [KEY_COLUMN_NAME] + NUMERIC_FEATURE_NAMES + [TARGET_NAME]
# Per-column type tags, used to pick the TFRecord feature encoding.
DATA_TYPES = ['int'] + (['float'] * len(NUMERIC_FEATURE_NAMES)) + ['string']
# Per-column defaults for tf.decode_csv; the default's type sets the column dtype.
DEFAULTS = [[1]] + [[0.0] for _ in NUMERIC_FEATURE_NAMES] + [['NA']]

# Label vocabulary, in class-id order. make_classification(n_classes=4) emits
# the labels 0..3, so the vocabulary must list exactly those strings; with
# '1,2,3,4' class 0 was out of vocabulary and every other class was shifted.
TARGET_LABELS = '0,1,2,3'.split(',')

def _bytes_feature(value):
    """Wrap a single bytes value in a ``tf.train.Feature`` backed by a ``BytesList``."""
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)

def _int_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` backed by an ``Int64List``."""
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)

def _float_feature(value):
    """Wrap a single float in a ``tf.train.Feature`` backed by a ``FloatList``."""
    payload = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=payload)

def get_tfrecord_feature(i,value):
    """Convert one column value into a ``tf.train.Feature``.

    Args:
        i: Position of the column; ``DATA_TYPES[i]`` selects the encoding.
        value: The cell value. For "int" and "float" columns it is coerced
            with ``int()`` / ``float()``. For any other column type it is
            stored as bytes: ``bytes`` values are used as-is, ``str`` values
            are UTF-8 encoded, and anything else must provide ``tobytes()``
            (NumPy scalars and arrays do).

    Returns:
        The feature built by ``_int_feature``, ``_float_feature`` or
        ``_bytes_feature``.
    """
    data_type = DATA_TYPES[i]
    if data_type == "int":
        return _int_feature(int(value))
    if data_type == "float":
        return _float_feature(float(value))

    if isinstance(value, bytes):
        raw = value
    elif isinstance(value, str):
        raw = value.encode("utf-8")
    else:
        # tobytes() returns the same bytes as the deprecated tostring() alias.
        raw = value.tobytes()
    return _bytes_feature(raw)
    
def get_tfrecord_features(df_row):
    """Build the ``{column name: feature}`` mapping for one data row.

    The value at position ``i`` of ``df_row`` is paired with ``HEADER[i]``
    and encoded by ``get_tfrecord_feature(i, value)``.
    """
    features_by_name = {}
    for position, column_name in enumerate(HEADER):
        features_by_name[column_name] = get_tfrecord_feature(position, df_row[position])
    return features_by_name


In [1]:
def load_dataframe(pattern):
    """Load every ``data/gen-<pattern>-*.csv`` partition into one DataFrame.

    The files are read without a header row and concatenated in file-name
    order, so the row order of the result is reproducible across runs.

    Args:
        pattern: Tag embedded in the partition file names (for example "train").

    Returns:
        A DataFrame holding the rows of all matching files. Each file's own
        row index is kept, so index labels repeat across partitions.

    Raises:
        ValueError: If no file matches the pattern.
    """
    file_glob = "data/gen-{}-*.csv".format(pattern)
    # glob returns paths in arbitrary order; sort them for a stable row order.
    files = sorted(glob.glob(file_glob))
    if not files:
        raise ValueError("No CSV files match {!r}".format(file_glob))

    return pd.concat(
        [pd.read_csv(file_path, index_col=None, header=None) for file_path in files]
    )

In [2]:
def convert_to_tfrecords(dataframe,pattern,partition):
    """Serialize every row of ``dataframe`` into a single TFRecord file.

    Prints the row count, then writes one ``tf.train.Example`` per row to
    ``data/tf-<pattern>-<partition>.tfr``.
    """
    row_count = len(dataframe)
    print(row_count)

    output_path = "data/tf-{}-{}.tfr".format(pattern,partition)
    with tf.python_io.TFRecordWriter(output_path) as record_writer:
        for row_values in dataframe.values:
            feature_map = tf.train.Features(feature=get_tfrecord_features(row_values))
            record = tf.train.Example(features=feature_map)
            record_writer.write(record.SerializeToString())

In [5]:
# Read every "train" CSV partition back into one DataFrame.
train_data = load_dataframe("train")
# Cell output: the number of loaded rows.
len(train_data.values)


150000

In [6]:
def save_data_as_tfrecords(dataframe, pattern, partition_count):
    """Split ``dataframe`` into consecutive row ranges and write each as TFRecords.

    The rows are divided into ``partition_count`` disjoint, contiguous slices
    that together cover the whole frame; slice sizes differ by at most one
    row. Slice ``i`` (0-based) is passed to ``convert_to_tfrecords`` with the
    partition label ``str(i + 1).zfill(2)``, and ``i`` is printed after each
    slice is written.

    Args:
        dataframe: The frame to split.
        pattern: Tag forwarded to ``convert_to_tfrecords`` for the file name.
        partition_count: Number of partitions to write; must be at least 1.

    Raises:
        ValueError: If ``partition_count`` is smaller than 1.
    """
    if partition_count < 1:
        raise ValueError("partition_count must be at least 1, got {}".format(partition_count))

    total_rows = len(dataframe)
    for i in range(partition_count):
        # Integer boundaries proportional to i, so that consecutive slices
        # neither overlap nor leave gaps, and the last one ends at total_rows.
        start_index = (i * total_rows) // partition_count
        end_index = ((i + 1) * total_rows) // partition_count

        convert_to_tfrecords(dataframe.iloc[start_index:end_index, :], pattern, str(i + 1).zfill(2))
        print(i)

In [42]:
# Write the loaded training frame out as 10 TFRecord partitions.
save_data_as_tfrecords(train_data,"train", 10)

15000
0
15000
1
15000
2
15000
3
15000
4
15000
5
15000


KeyboardInterrupt: 

In [None]:
# Read every "test" CSV partition back into one DataFrame.
test_data = load_dataframe("test")
# Cell output: the number of loaded rows.
len(test_data)

In [None]:
def parse_label_column(label_string_tensor):
    """Look up each label string in a table built from ``TARGET_LABELS``.

    Returns the result of the table lookup for ``label_string_tensor``.
    """
    vocabulary = tf.constant(TARGET_LABELS)
    label_table = tf.contrib.lookup.index_table_from_tensor(vocabulary)
    label_ids = label_table.lookup(label_string_tensor)
    return label_ids

def csv_input_fn(filename, num_epochs=None, batch_size=512):
    """Input function that reads batches of CSV lines through a filename queue.

    Args:
        filename: File pattern handed to ``tf.train.match_filenames_once``.
        num_epochs: Forwarded to ``tf.train.string_input_producer``.
        batch_size: Maximum number of lines read per call to ``read_up_to``.

    Returns:
        A ``(features, labels)`` pair: ``features`` maps each ``HEADER``
        column name except the key and target columns to its decoded tensor,
        and ``labels`` is the target column passed through
        ``parse_label_column``.
    """
    matched_files = tf.train.match_filenames_once(filename)
    file_queue = tf.train.string_input_producer(
        matched_files, num_epochs=num_epochs, shuffle=False)

    line_reader = tf.TextLineReader()
    lines = line_reader.read_up_to(file_queue, num_records=batch_size)[1]

    # Add a trailing dimension to the batch of lines before decoding.
    lines_column = tf.expand_dims(lines, -1)
    decoded_columns = tf.decode_csv(lines_column, record_defaults=DEFAULTS)

    parsed = {name: tensor for name, tensor in zip(HEADER, decoded_columns)}

    # The row key is not a model input; the target becomes the label.
    del parsed[KEY_COLUMN_NAME]
    raw_labels = parsed.pop(TARGET_NAME)

    label_ids = parse_label_column(raw_labels)
    return parsed, label_ids

In [None]:
def create_classifier(run_config,hparams):
    """Build the DNN classifier used by the experiment.

    Args:
        run_config: Run configuration passed to the estimator as ``config``.
        hparams: Hyper-parameters; ``hparams.hidden_units`` gives the layer sizes.

    Returns:
        A ``tf.contrib.learn.DNNClassifier`` over the numeric feature columns,
        trained with an Adam optimizer.
    """
    numeric_columns = [tf.feature_column.numeric_column(feature_name)
                       for feature_name in NUMERIC_FEATURE_NAMES]

    dnn_optimizer = tf.train.AdamOptimizer()

    estimator = tf.contrib.learn.DNNClassifier(
                feature_columns = numeric_columns,
                # DNNClassifier defaults to binary classification (n_classes=2);
                # size the output layer from the label vocabulary instead.
                n_classes=len(TARGET_LABELS),
                optimizer=dnn_optimizer,
                hidden_units=hparams.hidden_units,
                config = run_config
                )

    return estimator

In [None]:
# Glob patterns for the CSV partitions read by the training and evaluation input functions.
train_data_files = "data/gen-train-*.csv"
eval_data_files = "data/gen-test-*.csv"

def create_experiment(run_config,hparams):
    """Assemble the ``tf.contrib.learn.Experiment`` for the classifier.

    Training reads ``train_data_files`` with the epoch count and batch size
    from ``hparams``; evaluation reads ``eval_data_files`` with the input
    function's defaults. Accuracy over the "classes" prediction key is the
    only evaluation metric.
    """

    def train_input_fn():
        return csv_input_fn(train_data_files,
                            num_epochs=hparams.num_epochs,
                            batch_size=hparams.batch_size)

    def eval_input_fn():
        return csv_input_fn(eval_data_files)

    classifier = create_classifier(run_config, hparams)

    accuracy_spec = tf.contrib.learn.MetricSpec(
        metric_fn=tf.metrics.accuracy,
        prediction_key="classes",
        label_key=None)

    return tf.contrib.learn.Experiment(
        estimator=classifier,
        train_steps=hparams.training_steps,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        eval_metrics={'accuracy': accuracy_spec},
    )

In [None]:
import shutil

# Hyper-parameters read by create_experiment / create_classifier.
# NOTE(review): training_steps=None presumably leaves the 10 input epochs as
# the only bound on training — confirm against the Experiment documentation.
hparams  = tf.contrib.training.HParams(training_steps=None,
                                       num_epochs = 10,
                                       batch_size = 5000,
                                       hidden_units=[16, 8])


# Start from a clean model directory: any existing one is deleted, and
# deletion errors (such as the directory not existing) are ignored.
local_model_dir = "trained_models/demo_classifier"
shutil.rmtree(local_model_dir, ignore_errors=True)

run_config = tf.contrib.learn.RunConfig(
    model_dir=local_model_dir
)

# Run the experiment with INFO-level TensorFlow logging, using the
# "train_and_evaluate" schedule.
tf.logging.set_verbosity(tf.logging.INFO)
tf.contrib.learn.learn_runner.run(experiment_fn = create_experiment, 
                               run_config = run_config,
                               schedule="train_and_evaluate",
                               hparams=hparams)