In [46]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

tf.random.set_seed(123)


In [47]:
raw_dataset = pd.read_csv("tier6.csv")
df = raw_dataset.copy()
df = df.dropna(subset=["divsTime"])
df = df[["qualified", "isRelay", "clipped_divsRank", "normed_divsSpeed", "normed_divsTimePctOfMean", "stroke", "points"]]
df.isna().sum()


qualified                   0
isRelay                     0
clipped_divsRank            0
normed_divsSpeed            0
normed_divsTimePctOfMean    0
stroke                      0
points                      0
dtype: int64

In [48]:

# df.qualified = df.qualified.astype(int)
# df.isRelay = df.isRelay.astype(int)
df.dtypes

qualified                      bool
isRelay                        bool
clipped_divsRank            float64
normed_divsSpeed            float64
normed_divsTimePctOfMean    float64
stroke                       object
points                        int64
dtype: object

In [49]:
dftrain = df.sample(frac=0.8,random_state=0)
dfeval = df.drop(dftrain.index)
y_train = dftrain.pop('points')
y_eval = dfeval.pop('points')

In [50]:
fc = tf.feature_column
CATEGORICAL_COLUMNS = ['stroke', 'qualified', 'isRelay']
NUMERIC_COLUMNS = ['clipped_divsRank', 'normed_divsSpeed', 'normed_divsTimePctOfMean']

def one_hot_cat_column(feature_name, vocab):
    return fc.indicator_column(
      fc.categorical_column_with_vocabulary_list(feature_name, vocab)
    )

feature_columns = []
for feature_name in CATEGORICAL_COLUMNS:
    # Need to one-hot encode categorical features.
    vocabulary = df[feature_name].unique()
    feature_columns.append(one_hot_cat_column(feature_name, vocabulary))

for feature_name in NUMERIC_COLUMNS:
    feature_columns.append(fc.numeric_column(feature_name,
                                              dtype=tf.float32))

ValueError: column_name: qualified vocabulary dtype must be string or integer. dtype: <dtype: 'bool'>.

In [0]:
# Use entire batch since this is such a small dataset.
NUM_EXAMPLES = len(y_train)

def make_input_fn(X, y, n_epochs=None, shuffle=True):
  def input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((X.to_dict(orient='list'), y))
    if shuffle:
      dataset = dataset.shuffle(NUM_EXAMPLES)
    # For training, cycle thru dataset as many times as need (n_epochs=None).
    dataset = (dataset
      .repeat(n_epochs)
      .batch(NUM_EXAMPLES))
    return dataset
  return input_fn

# Training and evaluation input functions.
train_input_fn = make_input_fn(dftrain, y_train)
eval_input_fn = make_input_fn(dfeval, y_eval, shuffle=False, n_epochs=1)

In [0]:
params = {
  'n_trees': 50,
  'max_depth': 3,
  'n_batches_per_layer': 1,
  # You must enable center_bias = True to get DFCs. This will force the model to
  # make an initial prediction before using any features (e.g. use the mean of
  # the training labels for regression or log odds for classification when
  # using cross entropy loss).
  'center_bias': True
}

est = tf.estimator.BoostedTreesClassifier(feature_columns, **params)
# Train model.
est.train(train_input_fn, max_steps=100)

# Evaluation.
results = est.evaluate(eval_input_fn)
pd.Series(results).to_frame()