In [1]:
import pandas as pn
import tempfile
import tensorflow as tf
from sklearn.cross_validation import train_test_split

## Loading the data into a pandas DataFrame


In [2]:
# Load the cleaned dataset from a local pickle file.
# NOTE: unpickling executes arbitrary code, so only load files you trust.
df = pn.read_pickle(
    '/home/leopoloco/Documents/INTELIMETRICA/df_clean2_femsa')
df.info()
# 'time' is used as a categorical feature further down, so cast it to strings.
df['time'] = df['time'].astype('string')
# Number of distinct values in the 'stop' column.
len(df['stop'].unique())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8649 entries, 40 to 45806
Data columns (total 18 columns):
client_id               8649 non-null object
client_role_des         8649 non-null object
id_movil                8649 non-null object
stop                    8649 non-null object
stop_time               8649 non-null float64
order_cfs               8649 non-null float64
delivered               8649 non-null float64
fechahora               8649 non-null datetime64[ns]
client_distance         8649 non-null float64
total_distance          8649 non-null float64
head_count              8649 non-null int64
stop_clients            8649 non-null int64
op_travel_time          8649 non-null float64
fecha                   8649 non-null object
time                    8649 non-null object
op_total_travel_time    8649 non-null float64
slope                   8649 non-null float64
ATPC                    8649 non-null float64
dtypes: datetime64[ns](1), float64(9), int64(2), object(6)
memory 

62

### Using the train_test_split function to randomly create our train and test subsets.

In [3]:
# Feature matrix: every model input column (categorical and continuous).
X = df[[
    'client_id', 'client_role_des', 'id_movil', 'stop',
    'stop_time', 'order_cfs', 'delivered',
    'fecha', 'time',
    'client_distance', 'total_distance',
    'head_count', 'stop_clients',
    'op_travel_time', 'op_total_travel_time',
]]
# Target kept as a one-column DataFrame so it can be joined back below.
y = df[['ATPC']]

# 67% / 33% split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=42)

# Re-attach the label to each subset (joined on the index); input_fn reads
# features and label from a single DataFrame.
df_train = X_train.join(y_train)
df_test = X_test.join(y_test)

#### Defining the categorical and continuous columns

In [4]:
# Columns fed to the model as sparse (string) features.
CATEGORICAL_COLUMNS = [
    'client_id',
    'client_role_des',
    'id_movil',
    'stop',
    'fecha',
    'time',
]

# Columns fed to the model as dense numeric features.
CONTINUOUS_COLUMNS = [
    'stop_time',
    'order_cfs',
    'delivered',
    'client_distance',
    'total_distance',
    'head_count',
    'stop_clients',
    'op_travel_time',
    'op_total_travel_time',
]

# Regression target.
LABEL_COLUMN = 'ATPC'

#### Input functions that convert the data into Tensors

In [5]:
def input_fn(df):
  """Converts a DataFrame into a (features, label) pair of Tensors.

  Args:
    df: DataFrame containing every column named in CONTINUOUS_COLUMNS,
      CATEGORICAL_COLUMNS and LABEL_COLUMN.

  Returns:
    A tuple (feature_cols, label). feature_cols is a dict mapping each
    feature name to a constant Tensor (continuous columns) or a
    tf.SparseTensor (categorical columns); label is a constant Tensor with
    the values of LABEL_COLUMN.
  """
  # Continuous features: one constant Tensor per column.
  continuous_cols = {k: tf.constant(df[k].values)
                     for k in CONTINUOUS_COLUMNS}

  # Categorical features: one SparseTensor per column with a single value per
  # row, i.e. entry (i, 0) for row i. The index list is the same for every
  # column, so it is built once.
  # NOTE(review): newer TensorFlow releases appear to call the `shape`
  # argument `dense_shape` -- confirm against the installed version.
  num_rows = len(df)
  row_indices = [[i, 0] for i in range(num_rows)]
  categorical_cols = {k: tf.SparseTensor(
      indices=row_indices,
      values=df[k].values,
      shape=[num_rows, 1])
                      for k in CATEGORICAL_COLUMNS}

  # Merge both dictionaries. Copy-then-update works on Python 2 and 3, whereas
  # `a.items() + b.items()` raises TypeError on Python 3 (dict views cannot
  # be concatenated).
  feature_cols = dict(continuous_cols)
  feature_cols.update(categorical_cols)

  # Label column as a constant Tensor.
  label = tf.constant(df[LABEL_COLUMN].values)
  return feature_cols, label

def train_input_fn():
  """Returns the (features, label) Tensors built from the training subset."""
  features, label = input_fn(df_train)
  return features, label

def eval_input_fn():
  """Returns the (features, label) Tensors built from the test subset."""
  features, label = input_fn(df_test)
  return features, label

#### Base categorical feature columns

In [6]:
# Each categorical feature is hashed into a fixed number of buckets; the
# bucket count is chosen per column.
client_id = tf.contrib.layers.sparse_column_with_hash_bucket(
    column_name="client_id",
    hash_bucket_size=5000)
client_role_des = tf.contrib.layers.sparse_column_with_hash_bucket(
    column_name="client_role_des",
    hash_bucket_size=100)
id_movil = tf.contrib.layers.sparse_column_with_hash_bucket(
    column_name="id_movil",
    hash_bucket_size=100)
stop = tf.contrib.layers.sparse_column_with_hash_bucket(
    column_name="stop",
    hash_bucket_size=100)
fecha = tf.contrib.layers.sparse_column_with_hash_bucket(
    column_name="fecha",
    hash_bucket_size=100)
time = tf.contrib.layers.sparse_column_with_hash_bucket(
    column_name="time",
    hash_bucket_size=5000)

#### Base continuous feature columns

In [7]:
# One real-valued feature column per entry in CONTINUOUS_COLUMNS.
stop_time = tf.contrib.layers.real_valued_column(
    column_name="stop_time")
order_cfs = tf.contrib.layers.real_valued_column(
    column_name="order_cfs")
delivered = tf.contrib.layers.real_valued_column(
    column_name="delivered")
client_distance = tf.contrib.layers.real_valued_column(
    column_name="client_distance")
total_distance = tf.contrib.layers.real_valued_column(
    column_name="total_distance")
head_count = tf.contrib.layers.real_valued_column(
    column_name="head_count")
stop_clients = tf.contrib.layers.real_valued_column(
    column_name="stop_clients")
op_travel_time = tf.contrib.layers.real_valued_column(
    column_name="op_travel_time")
op_total_travel_time = tf.contrib.layers.real_valued_column(
    column_name="op_total_travel_time")

#### Continuous features bucketization

In [8]:
# Discretize order_cfs into ranges delimited by the boundaries below.
orders_buckets = tf.contrib.layers.bucketized_column(
    source_column=order_cfs,
    boundaries=[2, 4, 6, 8, 10, 12, 24, 36])

#### Defining crossed columns

In [9]:
# Cross of client role with the bucketized order size, hashed into 10,000 buckets.
client_role_x_order = tf.contrib.layers.crossed_column(
    columns=[client_role_des, orders_buckets],
    hash_bucket_size=int(1e4))

In [10]:
# Three-way cross: id_movil combined with the role-by-order cross defined in
# client_role_x_order, hashed into 1,000,000 buckets.
id_movil_x_client_role_x_order = tf.contrib.layers.crossed_column(
    columns=[id_movil, client_role_x_order],
    hash_bucket_size=int(1e6))

#### Defining the linear regressor model

In [11]:
# Fresh temporary directory handed to the estimator as its model_dir.
model_dir = tempfile.mkdtemp()

# Linear regressor over all base, bucketized and crossed feature columns,
# trained with FTRL using both L1 and L2 regularization.
m = tf.contrib.learn.LinearRegressor(
    feature_columns=[
        # Hashed categorical columns.
        client_id, client_role_des, id_movil, stop, fecha, time,
        # Real-valued columns.
        stop_time, order_cfs, delivered,
        client_distance, total_distance,
        head_count, stop_clients,
        op_travel_time, op_total_travel_time,
        # Bucketized and crossed columns.
        orders_buckets,
        client_role_x_order,
        id_movil_x_client_role_x_order,
    ],
    optimizer=tf.train.FtrlOptimizer(
        learning_rate=0.1,
        l1_regularization_strength=1.0,
        l2_regularization_strength=1.0),
    model_dir=model_dir)

#### Training the model

In [12]:
# Train for 200 steps, drawing inputs from the training subset.
m.fit(input_fn=train_input_fn, steps=200)



LinearRegressor()

In [13]:
# Evaluate the trained model on the test subset for a single step.
results = m.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
    # The parenthesized form is valid both as a Python 2 print statement and
    # as a Python 3 function call; the bare statement is a SyntaxError on
    # Python 3.
    print("%s: %s" % (key, results[key]))



global_step: 200
loss: 97.9187


In [14]:
# Display the raw metrics dictionary returned by m.evaluate.
results

{'global_step': 200, 'loss': 97.918739}