In [19]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time
import tempfile

In [20]:
df = pd.read_parquet("chocolate_ratings.parquet")
df.head()

Unnamed: 0,Company,SpecificBeanOriginOrBarName,REF,ReviewDate,CocoaPercent,CompanyLocation,Rating,BeanType,BroadBeanOrigin
0,A. Morin,Agua Grande,1876,2016,0.63,France,3.75,unknown,Sao Tome
1,A. Morin,Kpime,1676,2015,0.7,France,2.75,unknown,Togo
2,A. Morin,Atsane,1676,2015,0.7,France,3.0,unknown,Togo
3,A. Morin,Akata,1680,2015,0.7,France,3.5,unknown,Togo
4,A. Morin,Quilla,1704,2015,0.7,France,3.5,unknown,Peru


In [21]:
df = df.drop(['REF','ReviewDate'], axis=1)
df.head(10)

Unnamed: 0,Company,SpecificBeanOriginOrBarName,CocoaPercent,CompanyLocation,Rating,BeanType,BroadBeanOrigin
0,A. Morin,Agua Grande,0.63,France,3.75,unknown,Sao Tome
1,A. Morin,Kpime,0.7,France,2.75,unknown,Togo
2,A. Morin,Atsane,0.7,France,3.0,unknown,Togo
3,A. Morin,Akata,0.7,France,3.5,unknown,Togo
4,A. Morin,Quilla,0.7,France,3.5,unknown,Peru
5,A. Morin,Carenero,0.7,France,2.75,Criollo,Venezuela
6,A. Morin,Cuba,0.7,France,3.5,unknown,Cuba
7,A. Morin,Sur del Lago,0.7,France,3.5,Criollo,Venezuela
8,A. Morin,Puerto Cabello,0.7,France,3.75,Criollo,Venezuela
9,A. Morin,Pablino,0.7,France,4.0,unknown,Peru


In [126]:
min(df["Rating"]), max(df['Rating'])
df["RatingLabel"] = [int(x<=2.5) for x in df["Rating"]]
df.head(10)
min(df["RatingLabel"]), max(df['RatingLabel'])

(0, 1)

In [141]:
COLUMNS = ['CocoaPercent', 'RatingLabel', 'BeanType', 'Company']
LABEL_COLUMN = 'RatingLabel'
CATEGORICAL_COLUMNS = ['BeanType', 'Company']
CONTINUOUS_COLUMNS = ['CocoaPercent']

company = tf.feature_column.categorical_column_with_hash_bucket('Company', hash_bucket_size=1000)
bean_type = tf.feature_column.categorical_column_with_hash_bucket('BeanType', hash_bucket_size=1000)
cocoa_percent = tf.contrib.layers.real_valued_column("CocoaPercent")
cocoa_buckets = tf.contrib.layers.bucketized_column(cocoa_percent, boundaries=[40, 70, 100])

# use pandas functions to split data into training and test sets
df_train = df.sample(frac=0.8, random_state=int(time.time()))

# df.index: row labels
df_test = df.drop(df_train.index)

def input_fn(df):
    continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}
    categorical_cols = {k: tf.SparseTensor(
        indices=[[i, 0] for i in range(df[k].size)],
        values=df[k].values,
        dense_shape=[df[k].size, 1]) for k in CATEGORICAL_COLUMNS}
    
    # Merges the two dictionaries into one.
    feature_cols = continuous_cols.copy()
    feature_cols.update(categorical_cols)
    # Converts the label column into a constant Tensor.
    label = tf.constant(df[LABEL_COLUMN].values)
    # Returns the feature columns and the label.
    return feature_cols, label
    

def train_input_fn():
    return input_fn(df_train)

def eval_input_fn():
    return input_fn(df_test)

In [142]:
model_dir = tempfile.mkdtemp()

steps = 10000
e = tf.contrib.learn.LinearClassifier(model_dir=model_dir, feature_columns=[bean_type, company, cocoa_percent])
e.fit(input_fn=train_input_fn, steps=steps)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_master': '', '_save_checkpoints_steps': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f169d421358>, '_session_config': None, '_keep_checkpoint_every_n_hours': 10000, '_save_summary_steps': 100, '_evaluation_master': '', '_task_type': None, '_model_dir': '/tmp/tmp58ettckp', '_num_ps_replicas': 0, '_save_checkpoints_secs': 600, '_environment': 'local', '_keep_checkpoint_max': 5, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_is_chief': True, '_num_worker_replicas': 0, '_log_step_count_steps': 100, '_task_id': 0}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp58ettckp/model.ckpt.
INFO:tensorflow:loss = 0.6931462, step = 1
INFO:tensorflow:global_step/sec: 659.20

INFO:tensorflow:global_step/sec: 1460.63
INFO:tensorflow:loss = 0.2087123, step = 7301 (0.069 sec)
INFO:tensorflow:global_step/sec: 1478.97
INFO:tensorflow:loss = 0.20830975, step = 7401 (0.068 sec)
INFO:tensorflow:global_step/sec: 1546.72
INFO:tensorflow:loss = 0.20791562, step = 7501 (0.064 sec)
INFO:tensorflow:global_step/sec: 1580.12
INFO:tensorflow:loss = 0.2075296, step = 7601 (0.065 sec)
INFO:tensorflow:global_step/sec: 1292.5
INFO:tensorflow:loss = 0.20715152, step = 7701 (0.075 sec)
INFO:tensorflow:global_step/sec: 1226.22
INFO:tensorflow:loss = 0.20678113, step = 7801 (0.082 sec)
INFO:tensorflow:global_step/sec: 1071.82
INFO:tensorflow:loss = 0.20641825, step = 7901 (0.093 sec)
INFO:tensorflow:global_step/sec: 1362.82
INFO:tensorflow:loss = 0.20606261, step = 8001 (0.073 sec)
INFO:tensorflow:global_step/sec: 1594.86
INFO:tensorflow:loss = 0.20571406, step = 8101 (0.063 sec)
INFO:tensorflow:global_step/sec: 1435.99
INFO:tensorflow:loss = 0.20537238, step = 8201 (0.070 sec)
INF

LinearClassifier(params={'gradient_clip_norm': None, 'optimizer': None, 'feature_columns': [_HashedCategoricalColumn(key='BeanType', hash_bucket_size=1000, dtype=tf.string), _HashedCategoricalColumn(key='Company', hash_bucket_size=1000, dtype=tf.string), _RealValuedColumn(column_name='CocoaPercent', dimension=1, default_value=None, dtype=tf.float32, normalizer=None)], 'joint_weights': False, 'head': <tensorflow.contrib.learn.python.learn.estimators.head._BinaryLogisticHead object at 0x7f169d4212b0>})

In [143]:
results = e.evaluate(input_fn=eval_input_fn, steps=1)

INFO:tensorflow:Starting evaluation at 2018-04-01-17:04:59
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp58ettckp/model.ckpt-10000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2018-04-01-17:04:59
INFO:tensorflow:Saving dict for global step 10000: accuracy = 0.89415044, accuracy/baseline_label_mean = 0.11142062, accuracy/threshold_0.500000_mean = 0.89415044, auc = 0.7555643, auc_precision_recall = 0.29881096, global_step = 10000, labels/actual_label_mean = 0.11142062, labels/prediction_mean = 0.10320163, loss = 0.3141799, precision/positive_threshold_0.500000_mean = 0.6, recall/positive_threshold_0.500000_mean = 0.15


In [137]:
for key in sorted(results):
    print("%s: %s" % (key, results[key]))

accuracy: 0.88579386
accuracy/baseline_label_mean: 0.11977716
accuracy/threshold_0.500000_mean: 0.88579386
auc: 0.6007507
auc_precision_recall: 0.23996232
global_step: 10000
labels/actual_label_mean: 0.11977716
labels/prediction_mean: 0.106004186
loss: 0.35910296
precision/positive_threshold_0.500000_mean: 1.0
recall/positive_threshold_0.500000_mean: 0.046511628


In [54]:
import tensorboard as tb

In [121]:
e.get_variable_value('BeanType')

NotFoundError: Key BeanType not found in checkpoint

In [62]:
results.items

<function dict.items>