## Getting tensorflow libraries

In [1]:
from __future__ import print_function
import numpy as np
import sklearn
import pandas as pd
import tensorflow as tf
from tensorflow.contrib.tensor_forest.python import tensor_forest
from tensorflow.python.ops import resources


## Import & examine the data

In [2]:
data = pd.read_csv('../sample10k.csv')
data.describe()

Unnamed: 0,target,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,0.2442,0.009014,0.011106,-0.013367,0.001166,0.019157,0.047894,-0.025862,0.04038,0.034424,0.039972,0.018248,-0.042597
std,0.429633,3.001119,3.016907,2.972178,2.996838,2.992047,3.019129,2.989788,2.963178,3.015606,3.024072,2.980559,3.010707
min,0.0,-11.404385,-10.949152,-11.462833,-10.447853,-10.523343,-10.766031,-11.593821,-10.483454,-11.57927,-11.482517,-11.317906,-12.083288
25%,0.0,-1.971925,-2.011275,-2.005898,-1.994404,-1.980998,-2.008118,-2.018688,-2.030944,-1.96992,-2.016859,-2.001497,-2.039749
50%,0.0,0.025727,-0.01085,0.003183,0.025553,0.026929,0.051229,-0.007793,0.046498,0.037163,0.047354,0.006015,-0.041865
75%,0.0,1.979395,2.079668,1.942752,2.016147,2.014459,2.086942,2.007133,2.069759,2.080392,2.060793,2.035454,1.976383
max,1.0,12.344934,12.945843,11.292419,11.284082,11.0931,10.605325,12.38468,10.470135,11.059739,10.356222,11.403303,11.760443


## Split the Data

In [3]:
from sklearn.model_selection import train_test_split
input_x = data.iloc[:, 2:].values
input_y = data.iloc[:, 1].values


X_train, X_test, y_train, y_test = train_test_split(input_x, input_y, test_size = 0.40, random_state = 0)
print("Size of training:   ", len(X_train))
print("Size of evaluation: ",len(X_test))
print("\nFirst row of training predictors (numpy) array:")
print(X_train[1])

Size of training:    6000
Size of evaluation:  4000

First line of training predictors:
[ 4.35532638  7.69822162  4.05488623 -0.05420009 -0.86838503  2.85506864
  4.23377396 -0.38211895 -1.38040518  5.71197681  4.73818419 -0.99729902]


## Set up the tensorflow RF graph 

In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Parameters
num_steps = 100 # Total steps to train
num_classes = 2 
num_features = 12
num_trees = 500 
max_nodes = 20 

# Input and Target placeholders 
X = tf.placeholder(tf.float32, shape=[None, num_features])
Y = tf.placeholder(tf.int64, shape=[None])

# Random Forest Parameters
hparams = tensor_forest.ForestHParams(num_classes=num_classes, 
  num_features=num_features, num_trees=num_trees, max_nodes=max_nodes).fill()

# Build the Random Forest
#tf.reset_default_graph()
forest_graph = tensor_forest.RandomForestGraphs(hparams)

# Get training graph and loss
train_op = forest_graph.training_graph(X, Y)
loss_op = forest_graph.training_loss(X, Y)



INFO:tensorflow:Constructing forest with params = 
INFO:tensorflow:{'num_trees': 500, 'max_nodes': 20, 'bagging_fraction': 1.0, 'feature_bagging_fraction': 1.0, 'num_splits_to_consider': 10, 'max_fertile_nodes': 0, 'split_after_samples': 250, 'valid_leaf_threshold': 1, 'dominate_method': 'bootstrap', 'dominate_fraction': 0.99, 'model_name': 'all_dense', 'split_finish_name': 'basic', 'split_pruning_name': 'none', 'collate_examples': False, 'checkpoint_stats': False, 'use_running_stats_method': False, 'initialize_average_splits': False, 'inference_tree_paths': False, 'param_file': None, 'split_name': 'less_or_equal', 'early_finish_check_every_samples': 0, 'prune_every_samples': 0, 'num_classes': 2, 'num_features': 12, 'bagged_num_features': 12, 'bagged_features': None, 'regression': False, 'num_outputs': 1, 'num_output_columns': 3, 'base_random_seed': 0, 'leaf_model_type': 0, 'stats_model_type': 0, 'finish_type': 0, 'pruning_type': 0, 'split_type': 0}


In [15]:
# Measure the accuracy
infer_op, _, _ = forest_graph.inference_graph(X)
correct_prediction = tf.equal(tf.argmax(infer_op, 1), tf.cast(Y, tf.int64))
accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))


Tensor("probabilities_5:0", shape=(?, ?), dtype=float32, device=/device:CPU:0)
Tensor("strided_slice:0", shape=(?,), dtype=float32)


## Training the Model

In [6]:
# Initialize the variables (i.e. assign their default value) and forest resources
init_vars = tf.group(tf.global_variables_initializer(),
    resources.initialize_resources(resources.shared_resources()))

# Start TensorFlow session
sess = tf.Session()

# Run the initializer
sess.run(init_vars)

for i in range(1, num_steps + 1):
    _, l = sess.run([train_op, loss_op], feed_dict={X: X_train, Y: y_train})
    if i % 50 == 0 or i == 1:
        acc = sess.run(accuracy_op, feed_dict={X: X_train, Y: y_train})
        print('Step %i, Loss: %f, Acc: %f' % (i, l, acc))


Step 1, Loss: -1.000000, Acc: 0.750167
Step 50, Loss: -21.000000, Acc: 0.750167
Step 100, Loss: -21.000000, Acc: 0.750167


## Evaluating AUC

In [16]:
# Test Model
probs, _, _ = forest_graph.inference_graph(X_test)

print("Test Accuracy:", sess.run(accuracy_op, feed_dict={X: X_test, Y: y_test}))
#print("Test AUC": ,sess.run(auc_op, feed_dict={X: X_test, Y: y_test}))



TypeError: Input 'input_data' of 'TreePredictionsV4' Op has type float64 that does not match expected type of float32.