In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd 
import numpy as np 
import tensorflow as tf 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import time
import itertools

tf.logging.set_verbosity(tf.logging.INFO)

In [13]:
BATCH_SIZE          = 128
TRAIN_EPOCHS        = 7000
BIN_GRANULARITY     = 100
HIDDEN_LAYER_1_SIZE = 256
HIDDEN_LAYER_2_SIZE = 256
HIDDEN_LAYER_3_SIZE = 16
lr                  = 1e-3

In [3]:
def normalize_column(col):
    return (col - np.mean(col)) / np.std(col)

def get_score(model, X, y_true):
    y_pred = model.predict(X)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def scale_minmax(col):
    return (col-col.min())/(col.max()-col.min())

def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

Take a look at the following link:
https://www.kaggle.com/mmmarcy/tensorflow-dnn-regressor-with-feature-engineering

# Data gathering 

In [4]:
data      = pd.read_csv('/Users/mohsenkiskani/.kaggle/competitions/ubaar-competition/train.csv')
test_data = pd.read_csv('/Users/mohsenkiskani/.kaggle/competitions/ubaar-competition/test.csv')

data      = data.dropna(axis = 0)

test_data.loc[12577, 'distanceKM']      = 52
test_data.loc[12577, 'taxiDurationMin'] = 50
test_data.loc[13853, 'distanceKM']      = 500
test_data.loc[13853, 'taxiDurationMin'] = 380

all_data = pd.concat((data, test_data)) 

min_price = min(all_data['price'])

ntrain = data.shape[0]
ntest  = test_data.shape[0]

BUCKET_LATI = BIN_GRANULARITY
BUCKET_LONG = BIN_GRANULARITY

min_source_lat  = min(all_data['sourceLatitude'])
min_destin_lat  = min(all_data['destinationLatitude'])
min_lat         = min(min_destin_lat, min_source_lat)

min_source_long = min(all_data['sourceLongitude'])
min_destin_long = min(all_data['destinationLongitude'])
min_long        = min(min_destin_long, min_source_long)

max_source_lat  = max(all_data['sourceLatitude'])
max_destin_lat  = max(all_data['destinationLatitude'])
max_lat         = max(max_destin_lat, max_source_lat)

max_source_long = max(all_data['sourceLongitude'])
max_destin_long = max(all_data['destinationLongitude'])
max_long        = max(max_destin_long, max_source_long)

d_lati = (max_lat - min_lat)/BUCKET_LATI
d_long = (max_long - min_long)/BUCKET_LONG

destin_lati_bucket = (all_data['destinationLatitude']  // d_lati).as_matrix().astype(int)
destin_long_bucket = (all_data['destinationLongitude'] // d_long).as_matrix().astype(int)

all_data['destination_tuple'] = tuple(zip(destin_lati_bucket,destin_long_bucket))

source_lati_bucket = (all_data['sourceLatitude']  // d_lati).as_matrix().astype(int)
source_long_bucket = (all_data['sourceLongitude'] // d_long).as_matrix().astype(int)

all_data['source_tuple'] = tuple(zip(source_lati_bucket,source_long_bucket))

categorical_vars = ['date', 'SourceState', 'destinationState', 'vehicleType', 
                    'vehicleOption', 'source_tuple', 'destination_tuple']

# The following two new features are required to achive the best current model 
all_data['source']      = all_data['sourceLatitude']*all_data['sourceLongitude']
all_data['destination'] = all_data['destinationLatitude']*all_data['destinationLongitude'] 

#continues_vars   = ['sourceLatitude', 'sourceLongitude', 'destinationLatitude', 'destinationLongitude',
#                    'distanceKM', 'taxiDurationMin', 'weight', 'source', 'destination']    
    
#for cont in continues_vars:
#    all_data[cont] = all_data[cont].astype('float32')

all_data = all_data.copy()
categorical_var_encoders = {}
for var in categorical_vars:
    le = preprocessing.LabelEncoder().fit(all_data[var])
    all_data[var + '_ids']  = le.transform(all_data[var])
    all_data[var + '_ids']  = all_data[var + '_ids'].astype(int)
    all_data.pop(var)
    categorical_var_encoders[var] = le
    
#all_data["distanceKM"]      = normalize_column(all_data["distanceKM"].values)
#all_data["taxiDurationMin"] = normalize_column(all_data["taxiDurationMin"].values)
#all_data["weight"]          = normalize_column(all_data["weight"].values)

train    = all_data[:ntrain]
test     = all_data[ntrain:]

train['price']  = train['price'].astype('float32')

X_train, X_val, y_train, y_val = train_test_split(train, train['price'], test_size=0.33, random_state=42)
X_train.head()

Unnamed: 0,ID,destinationLatitude,destinationLongitude,distanceKM,price,sourceLatitude,sourceLongitude,taxiDurationMin,weight,source,destination,date_ids,SourceState_ids,destinationState_ids,vehicleType_ids,vehicleOption_ids,source_tuple_ids,destination_tuple_ids
23990,18426018643,32.669957,51.670529,1122.0,12500000.0,39.344206,45.064755,766.0,5.0,1773.037004,1688.073961,88,1,3,3,7,767,374
8729,68800274778,31.318349,48.681923,467.0,9000000.0,32.325773,50.847471,404.0,22.0,1643.683805,1524.637455,17,23,11,3,4,273,285
3451,80477779395,38.550434,44.95354,0.0,1000000.0,38.550434,44.95354,0.0,2.0,1732.978477,1732.978477,178,1,1,1,5,746,920
2628,28328675917,28.967032,50.843032,165.0,1775000.0,29.614637,51.65493,138.0,10.0,1529.742001,1472.771735,135,15,6,2,3,128,130
38351,14704494197,34.092157,49.693226,280.0,5880000.0,35.693633,51.407555,201.0,18.0,1834.922402,1694.149263,22,7,20,3,7,512,485


In [5]:
all_longitudes = set(all_data['sourceLongitude'].tolist() + all_data['destinationLongitude'].tolist())
all_latitude   = set(all_data['sourceLatitude'].tolist() + all_data['destinationLatitude'].tolist())

binned_long = np.linspace(min(all_longitudes), max(all_longitudes), BIN_GRANULARITY).tolist()
binned_lat  = np.linspace(min(all_latitude), max(all_latitude), BIN_GRANULARITY).tolist()

source_lat_feat         = tf.feature_column.numeric_column("sourceLatitude") 
source_long_feat        = tf.feature_column.numeric_column("sourceLongitude") 
destin_lat_feat         = tf.feature_column.numeric_column("destinationLatitude") 
destin_long_feat        = tf.feature_column.numeric_column("destinationLongitude") 

binned_source_lat_feat  = tf.feature_column.bucketized_column(
                              source_column=source_lat_feat,
                              boundaries= binned_lat)
binned_source_long_feat = tf.feature_column.bucketized_column(
                              source_column=source_long_feat,
                              boundaries= binned_long)
binned_destin_lat_feat  = tf.feature_column.bucketized_column(
                              source_column=destin_lat_feat,
                              boundaries= binned_lat)
binned_destin_long_feat = tf.feature_column.bucketized_column(
                              source_column=destin_long_feat,
                              boundaries= binned_long)

source_lat_x_long = tf.feature_column.embedding_column(tf.feature_column.crossed_column(
                    keys=[binned_source_lat_feat, binned_source_long_feat], 
                    hash_bucket_size=BIN_GRANULARITY *BIN_GRANULARITY),dimension=BIN_GRANULARITY)

destin_lat_x_long = tf.feature_column.embedding_column(tf.feature_column.crossed_column(
                    keys=[binned_destin_lat_feat, binned_destin_long_feat], 
                    hash_bucket_size=BIN_GRANULARITY *BIN_GRANULARITY),dimension=BIN_GRANULARITY)

distance_feat = tf.feature_column.numeric_column("distanceKM")
taximin_feat  = tf.feature_column.numeric_column("taxiDurationMin")
weight_feat   = tf.feature_column.numeric_column("weight")

date_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("date_ids", 186),8)

source_state_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("SourceState_ids", 31),5)

destin_state_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("destinationState_ids", 31),5)

veh_type_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("vehicleType_ids", 4),2)

veh_option_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("vehicleOption_ids", 9),4)

source_feat   = tf.feature_column.numeric_column("source")
destin_feat   = tf.feature_column.numeric_column("destination")

destination_tuple_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("destination_tuple_ids", 2191),11)
 
source_tuple_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("source_tuple_ids", 2191),11)

feature_columns = {source_lat_x_long, destin_lat_x_long, distance_feat, taximin_feat,
                   weight_feat, date_feat, source_state_feat, destin_state_feat,
                   veh_type_feat, veh_option_feat, source_feat, destin_feat,
                   destination_tuple_feat, source_tuple_feat}

#feature_columns = {distance_feat, taximin_feat, weight_feat}

In [6]:
def make_model(features, labels, mode, params, config):
    input_layer = tf.feature_column.input_layer(features=features, 
                                                feature_columns=feature_columns)
    
    global_step = tf.train.get_or_create_global_step()

    x = tf.layers.dense(inputs=input_layer,
                        units=HIDDEN_LAYER_1_SIZE,
                        activation=tf.nn.relu,
                        name="first_fully_connected_layer")

    x = tf.layers.dropout(inputs=x,name="first_dropout")

    x = tf.layers.dense(inputs=x,
                        units=HIDDEN_LAYER_2_SIZE,
                        activation=tf.nn.relu,
                        name="second_fully_connected_layer")

    x = tf.layers.dense(inputs=x,
                        units=HIDDEN_LAYER_3_SIZE,
                        activation=tf.nn.relu,
                        name="third_fully_connected_layer")

    predictions = tf.contrib.layers.fully_connected(inputs=x, num_outputs=1)

    if mode == tf.estimator.ModeKeys.PREDICT :
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    elif mode == tf.estimator.ModeKeys.EVAL:
        loss  = tf.reduce_mean(tf.abs(tf.divide(predictions-labels,labels))) 
        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions,
                                          loss=loss)
    else:
        #loss = tf.losses.absolute_difference(labels=labels,
        #                                    predictions=predictions)
        loss  = tf.reduce_mean(tf.abs(tf.divide(predictions-labels,labels))) 
        tf.summary.scalar("Loss", loss)
        optimizer = tf.train.AdamOptimizer(learning_rate=params.learning_rate)
        train_op = optimizer.minimize(loss, 
                                      global_step=global_step)
        return tf.estimator.EstimatorSpec(mode=mode, 
                                          predictions=predictions,
                                          loss=loss, 
                                          train_op=train_op)

In [7]:
def input_fn(df, pred = False):
        
    useful_fueatures = [
        np.array(df["sourceLatitude"].values, dtype=np.float32),
        np.array(df["sourceLongitude"].values, dtype=np.float32),
        np.array(df["destinationLatitude"].values, dtype=np.float32),
        np.array(df["destinationLongitude"].values, dtype=np.float32),
        np.array(df["distanceKM"].values, dtype=np.float32),
        np.array(df["taxiDurationMin"].values, dtype=np.float32),
        np.array(df["weight"].values, dtype=np.float32),
        np.array(df["date_ids"].values, dtype=np.int32),
        np.array(df["SourceState_ids"].values, dtype=np.int32),
        np.array(df["destinationState_ids"].values, dtype=np.int32),
        np.array(df["vehicleType_ids"].values, dtype=np.int32),
        np.array(df["vehicleOption_ids"].values, dtype=np.int32),
        np.array(df["source"].values, dtype=np.float32),
        np.array(df["destination"].values, dtype=np.float32),
        np.array(df["destination_tuple_ids"].values, dtype=np.int32),
        np.array(df["source_tuple_ids"].values, dtype=np.int32)
    ]

    if pred: 
        train_number = 1
        batch_number = 1
    else:
        useful_fueatures.append(np.array(df["price"].values, dtype=np.float32))
        train_number = TRAIN_EPOCHS
        batch_number = BATCH_SIZE
        
    A = tf.train.slice_input_producer(
        tensor_list=useful_fueatures,
        num_epochs=train_number,
        shuffle= not pred,
        capacity=BATCH_SIZE * 5
    )
    
    sourceLatitude        = A[0]
    sourceLongitude       = A[1]
    destinationLatitude   = A[2]
    destinationLongitude  = A[3]
    distanceKM            = A[4]
    taxiDurationMin       = A[5] 
    weight                = A[6]
    date_ids              = A[7]
    SourceState_ids       = A[8]
    destinationState_ids  = A[9]
    vehicleType_ids       = A[10]
    vehicleOption_ids     = A[11]
    source                = A[12]
    destination           = A[13] 
    destination_tuple_ids = A[14] 
    source_tuple_ids      = A[15] 
    
    # Created a dict out of sliced input producers
    dataset_dict = dict(
        sourceLatitude=sourceLatitude,
        sourceLongitude=sourceLongitude,
        destinationLatitude=destinationLatitude,
        destinationLongitude=destinationLongitude, 
        distanceKM=distanceKM,
        taxiDurationMin=taxiDurationMin,
        weight=weight,
        date_ids=date_ids,
        SourceState_ids=SourceState_ids,
        destinationState_ids=destinationState_ids,
        vehicleType_ids=vehicleType_ids,
        vehicleOption_ids=vehicleOption_ids,
        source=source, 
        destination=destination,
        destination_tuple_ids=destination_tuple_ids,
        source_tuple_ids=source_tuple_ids,
    )

    if not pred:
        dataset_dict['labels'] = A[16]
            
    batch_dict = tf.train.batch(
        dataset_dict,
        batch_number,
   )

    if pred == False:
        batch_labels = batch_dict.pop('labels')
        return batch_dict, tf.reshape(batch_labels, [-1, 1]) 
    else:
        return batch_dict 

In [8]:
hparams = tf.contrib.training.HParams(learning_rate=lr)
estimator_val = tf.estimator.Estimator(model_fn=make_model, params=hparams)
estimator_val.train(input_fn=lambda: input_fn(X_train), steps=TRAIN_EPOCHS)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmp1fde86vk', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1c2210cd68>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/hy

<tensorflow.python.estimator.estimator.Estimator at 0x1072abf60>

In [10]:
ev = estimator_val.evaluate(input_fn=lambda: input_fn(X_val), steps=1)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-06-08-07:42:09
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmp1fde86vk/model.ckpt-6000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2018-06-08-07:42:09
INFO:tensorflow:Saving dict for global step 6000: global_step = 6000, loss = 0.20563829


In [12]:
predictions_val  = list(estimator_val.predict(input_fn = lambda: input_fn(X_val, pred=True)))
preds_val        = [int(x) for x in predictions_val]
score            = mean_absolute_percentage_error(preds_val, y_val)
score

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmp1fde86vk/model.ckpt-6000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


25.53934096988676

In [14]:
estimator = tf.estimator.Estimator(model_fn=make_model, params=hparams)
estimator.train(input_fn=lambda: input_fn(train), steps=TRAIN_EPOCHS)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmpldih6ys4', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a1f579278>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/hy

INFO:tensorflow:Loss for final step: 0.20151755.


<tensorflow.python.estimator.estimator.Estimator at 0x1a1f579128>

In [15]:
predictions   = list(estimator.predict(input_fn = lambda: input_fn(test, pred=True)))
y_preds_test   = [int(x) for x in predictions]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmpldih6ys4/model.ckpt-7000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


# Save to file

In [16]:
filename = "/Users/mohsenkiskani/Downloads/Ubaar/submissions/submission18.csv"
with open(filename,"w+") as outputfile:
    outputfile.write("ID,price\n")
    for i in range(len(y_preds_test)):
        outputfile.write(str(test_data.ID[i])+","+str(int(np.ceil(y_preds_test[i])))+"\n")