In [172]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time
import tempfile

In [173]:
df = pd.read_parquet("chocolate_ratings.parquet")
df.head()

Unnamed: 0,Company,SpecificBeanOriginOrBarName,REF,ReviewDate,CocoaPercent,CompanyLocation,Rating,BeanType,BroadBeanOrigin
0,A. Morin,Agua Grande,1876,2016,0.63,France,3.75,unknown,Sao Tome
1,A. Morin,Kpime,1676,2015,0.7,France,2.75,unknown,Togo
2,A. Morin,Atsane,1676,2015,0.7,France,3.0,unknown,Togo
3,A. Morin,Akata,1680,2015,0.7,France,3.5,unknown,Togo
4,A. Morin,Quilla,1704,2015,0.7,France,3.5,unknown,Peru


In [174]:
df = df.drop(['REF'], axis=1)
df.head(10)

Unnamed: 0,Company,SpecificBeanOriginOrBarName,ReviewDate,CocoaPercent,CompanyLocation,Rating,BeanType,BroadBeanOrigin
0,A. Morin,Agua Grande,2016,0.63,France,3.75,unknown,Sao Tome
1,A. Morin,Kpime,2015,0.7,France,2.75,unknown,Togo
2,A. Morin,Atsane,2015,0.7,France,3.0,unknown,Togo
3,A. Morin,Akata,2015,0.7,France,3.5,unknown,Togo
4,A. Morin,Quilla,2015,0.7,France,3.5,unknown,Peru
5,A. Morin,Carenero,2014,0.7,France,2.75,Criollo,Venezuela
6,A. Morin,Cuba,2014,0.7,France,3.5,unknown,Cuba
7,A. Morin,Sur del Lago,2014,0.7,France,3.5,Criollo,Venezuela
8,A. Morin,Puerto Cabello,2014,0.7,France,3.75,Criollo,Venezuela
9,A. Morin,Pablino,2014,0.7,France,4.0,unknown,Peru


In [213]:
min(df["ReviewDate"]), max(df['ReviewDate'])
df['ReviewDateFloat'] = [(float(x)-2006)/11.0 for x in df['ReviewDate']]
df.head()

Unnamed: 0,Company,SpecificBeanOriginOrBarName,ReviewDate,CocoaPercent,CompanyLocation,Rating,BeanType,BroadBeanOrigin,ReviewDateFloat
0,A. Morin,Agua Grande,2016,0.63,France,3.75,unknown,Sao Tome,0.909091
1,A. Morin,Kpime,2015,0.7,France,2.75,unknown,Togo,0.818182
2,A. Morin,Atsane,2015,0.7,France,3.0,unknown,Togo,0.818182
3,A. Morin,Akata,2015,0.7,France,3.5,unknown,Togo,0.818182
4,A. Morin,Quilla,2015,0.7,France,3.5,unknown,Peru,0.818182


In [218]:
COLUMNS = ['CocoaPercent', 'Rating', 'BeanType', 'Company', 'SpecificBeanOriginOrBarName','BroadBeanOrigin','ReviewDateFloat']
LABEL_COLUMN = 'Rating'
CATEGORICAL_COLUMNS = ['BeanType', 'Company', 'BroadBeanOrigin','SpecificBeanOriginOrBarName']
CONTINUOUS_COLUMNS = ['CocoaPercent', 'ReviewDateFloat']

company = tf.feature_column.categorical_column_with_hash_bucket('Company', hash_bucket_size=1000)
bean_type = tf.feature_column.categorical_column_with_hash_bucket('BeanType', hash_bucket_size=1000)
broad_origin = tf.feature_column.categorical_column_with_hash_bucket('BroadBeanOrigin', hash_bucket_size=1000)
specific_origin = tf.feature_column.categorical_column_with_hash_bucket('SpecificBeanOriginOrBarName', hash_bucket_size=1000)
cocoa_percent = tf.contrib.layers.real_valued_column("CocoaPercent")
review_date = tf.contrib.layers.real_valued_column("ReviewDateFloat")
cocoa_buckets = tf.contrib.layers.bucketized_column(cocoa_percent, boundaries=[40, 70, 100])

# use pandas functions to split data into training and test sets
df_train = df.sample(frac=0.9, random_state=int(time.time()))

# df.index: row labels
df_test = df.drop(df_train.index)

def input_fn(df):
    continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}
    categorical_cols = {k: tf.SparseTensor(
        indices=[[i, 0] for i in range(df[k].size)],
        values=df[k].values,
        dense_shape=[df[k].size, 1]) for k in CATEGORICAL_COLUMNS}
    
    # Merges the two dictionaries into one.
    feature_cols = continuous_cols.copy()
    feature_cols.update(categorical_cols)
    # Converts the label column into a constant Tensor.
    label = tf.constant(df[LABEL_COLUMN].values)
    # Returns the feature columns and the label.
    return feature_cols, label
    

def train_input_fn():
    return input_fn(df_train)

def eval_input_fn():
    return input_fn(df_test)

In [219]:
model_dir = tempfile.mkdtemp()

steps = 5000
e = tf.contrib.learn.LinearRegressor(model_dir=model_dir, feature_columns=[specific_origin, cocoa_percent,review_date])
e.fit(input_fn=train_input_fn, steps=steps)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_master': '', '_save_checkpoints_steps': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f1680ddbb38>, '_session_config': None, '_keep_checkpoint_every_n_hours': 10000, '_save_summary_steps': 100, '_evaluation_master': '', '_task_type': None, '_model_dir': '/tmp/tmpmb24zrj5', '_num_ps_replicas': 0, '_save_checkpoints_secs': 600, '_environment': 'local', '_keep_checkpoint_max': 5, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_is_chief': True, '_num_worker_replicas': 0, '_log_step_count_steps': 100, '_task_id': 0}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpmb24zrj5/model.ckpt.
INFO:tensorflow:loss = 10.374729, step = 1
INFO:tensorflow:global_step/sec: 484.91

LinearRegressor(params={'gradient_clip_norm': None, 'optimizer': None, 'feature_columns': [_HashedCategoricalColumn(key='SpecificBeanOriginOrBarName', hash_bucket_size=1000, dtype=tf.string), _RealValuedColumn(column_name='CocoaPercent', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='ReviewDateFloat', dimension=1, default_value=None, dtype=tf.float32, normalizer=None)], 'joint_weights': False, 'head': <tensorflow.contrib.learn.python.learn.estimators.head._RegressionHead object at 0x7f1680ddb9b0>})

In [220]:
results = e.evaluate(input_fn=eval_input_fn, steps=10)

INFO:tensorflow:Starting evaluation at 2018-04-01-19:50:17
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpmb24zrj5/model.ckpt-5000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/10]
INFO:tensorflow:Evaluation [2/10]
INFO:tensorflow:Evaluation [3/10]
INFO:tensorflow:Evaluation [4/10]
INFO:tensorflow:Evaluation [5/10]
INFO:tensorflow:Evaluation [6/10]
INFO:tensorflow:Evaluation [7/10]
INFO:tensorflow:Evaluation [8/10]
INFO:tensorflow:Evaluation [9/10]
INFO:tensorflow:Evaluation [10/10]
INFO:tensorflow:Finished evaluation at 2018-04-01-19:50:17
INFO:tensorflow:Saving dict for global step 5000: global_step = 5000, loss = 0.29960418


In [222]:
for key in sorted(results):
    print("%s: %s" % (key, results[key]))

global_step: 5000
loss: 0.29960418


# Questions

1. How to visualize the parameters and the model? "e.get_variable_names" / "e.get_variable_value" / e.get_params gives access to that but how can print it out?
2. How to look at predictions?

In [62]:
results.items

<function dict.items>