In [1]:
import warnings 
warnings.filterwarnings('ignore')
import time 
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
import tensorflow as tf 
tf.logging.set_verbosity(tf.logging.INFO)

In [2]:
def get_score(model, X, y_true):
    y_pred = model.predict(X)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def mean_absolute_precision_error(y_pred, y_true):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Loading augmented data

In [3]:
# Optimal which_model(100, number =30, batch_size=128, l1=512, l2=128, l3=16, lr=1e-4, step_log=10)

In [4]:
#test  = pd.read_pickle('dataFrames/test_OneHotEncoding.pkl')
#train = pd.read_pickle('dataFrames/train_OneHotEncoding.pkl')
test   = pd.read_pickle('dataFrames/test_OneHotEncoding_new_June14th.pkl')
train  = pd.read_pickle('dataFrames/train_OneHotEncoding_new_June14th.pkl')

continuous_cols = ['destinationLatitude', 'destinationLongitude', 'distanceKM', 'sourceLatitude', 
                   'sourceLongitude', 'taxiDurationMin', 'weight', 'source', 'destination', 'y_avg_lgb_xgb',
                   'y_gboost', 'y_xgb', 'y_bag', 'y_knn', 'y_dec', 'y_lgb' ]

continuous_cols = ['y_avg_lgb_xgb', 'y_gboost', 'y_xgb', 'y_lgb']

categorical_cols = train.columns.drop(continuous_cols + ['ID', 'price']).tolist()

NOM = train[categorical_cols].shape[1]
renaming_dict = dict(zip(train[categorical_cols].columns, [str(x) for x in list(range(NOM)) ]))

train_renamed = train[categorical_cols].rename(columns=renaming_dict)
test_renamed  = test[categorical_cols].rename(columns=renaming_dict)

for column in continuous_cols:
    train_renamed[column] = train[column]
    test_renamed[column] = test[column]
    
test_renamed['ID']   = test['ID']
train_renamed['ID'] = train['ID']
test_renamed['price'] = test['price']
train_renamed['price'] = train['price']

X_train, X_val = train_test_split(train_renamed, test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,83,84,85,86,y_avg_lgb_xgb,y_gboost,y_xgb,y_lgb,ID,price
6533,38.075775,46.289152,0.0,38.075775,46.289152,0.0,2.0,1762.495336,1762.495336,1,...,0,691299.5,650000.0,978516.7,748229.6,691645.0,753880.6,702605.9,93320733747,650000.0
5555,36.293682,59.603236,1150.0,28.666628,57.740398,805.0,10.31,1655.22251,2163.220894,0,...,0,13257631.0,8300000.0,14394380.0,16223610.0,14231860.0,12243700.0,15073810.0,33053951098,16533000.0
49803,31.887975,54.364762,365.0,30.246658,57.067322,245.0,19.0,1726.095772,1733.582172,0,...,0,5951497.0,2000000.0,6354838.0,6009568.0,5262343.0,6206480.0,6076392.0,86943006373,4000000.0
7053,35.705444,51.398015,905.0,35.272156,59.215521,632.0,2.0,2088.659094,1835.188946,0,...,0,3199928.0,3400000.0,3281494.0,3504943.0,3413185.0,3405081.0,3381870.0,55089031153,4000000.0
10115,31.320085,48.680376,797.0,36.142911,49.22036,611.0,20.0,1778.967091,1524.673514,0,...,0,11256109.0,8150000.0,10270950.0,10810260.0,10916210.0,10827560.0,11049260.0,39178389730,11172000.0


# TensorFlow combination  

In [5]:
BATCH_SIZE          = 128
TRAIN_EPOCHS        = 500
HIDDEN_LAYER_1_SIZE = 512
HIDDEN_LAYER_2_SIZE = 512
HIDDEN_LAYER_3_SIZE = 16
lr                  = 5e-5
USE_ALL_FEATURES    = False 

In [6]:
def make_model(features, labels, mode, params, config):
    input_layer = tf.feature_column.input_layer(features=features, feature_columns=feature_columns)
    global_step = tf.train.get_or_create_global_step()
    x = tf.layers.dense(inputs=input_layer, units=HIDDEN_LAYER_1_SIZE, activation=tf.nn.relu, name="first_layer")
    x = tf.layers.dropout(inputs=x,name="first_dropout")
    x = tf.layers.dense(inputs=x, units=HIDDEN_LAYER_2_SIZE, activation=tf.nn.relu, name="second_layer")
    x = tf.layers.dense(inputs=x, units=HIDDEN_LAYER_3_SIZE, activation=tf.nn.relu, name="third_layer")
    predictions = tf.contrib.layers.fully_connected(inputs=x, num_outputs=1)
    if mode == tf.estimator.ModeKeys.PREDICT :
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    elif mode == tf.estimator.ModeKeys.EVAL:
        loss  = tf.reduce_mean(tf.abs(tf.divide(predictions-labels,labels))) 
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, loss=loss)
    else:
        loss  = tf.reduce_mean(tf.abs(tf.divide(predictions-labels,labels))) 
        tf.summary.scalar("Loss", loss)
        optimizer = tf.train.AdamOptimizer(learning_rate=params.learning_rate)
        train_op = optimizer.minimize(loss, global_step=global_step)
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, loss=loss, train_op=train_op)

In [7]:
feature_columns = set()

if USE_ALL_FEATURES:
    for col in categorical_cols:
        col_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity(renaming_dict[col], 2),2)
        feature_columns.add(col_feat)

for cont in continuous_cols:
    col_feat = tf.feature_column.numeric_column(cont)
    feature_columns.add(col_feat)

In [8]:
def input_fn(df, pred = False, use_all_features = USE_ALL_FEATURES):
        
    useful_fueatures = list()
    
    if use_all_features:
        for col in categorical_cols:
            useful_fueatures.append(np.array(df[renaming_dict[col]].values, dtype=np.int32))

    for cont in continuous_cols:
        useful_fueatures.append(np.array(df[cont].values, dtype=np.float32))    
    
    if pred: 
        train_number = 1
        batch_number = 1
    else:
        useful_fueatures.append(np.array(df["price"].values, dtype=np.float32))
        train_number = TRAIN_EPOCHS
        batch_number = BATCH_SIZE
        
    A = tf.train.slice_input_producer(
        tensor_list=useful_fueatures,
        num_epochs=train_number,
        shuffle= not pred,
        capacity=BATCH_SIZE * 5
    )

    dataset_dict = dict()
    
    if use_all_features:
        for i in range(len(A)):
            if i < len(categorical_cols):
                dataset_dict[renaming_dict[categorical_cols[i]]] = A[i]
            elif i < len(categorical_cols) + len(continuous_cols):
                dataset_dict[continuous_cols[i-len(categorical_cols)]] = A[i]
    else:
        for i in range(len(A)):
            if i < len(continuous_cols):
                dataset_dict[continuous_cols[i]] = A[i]
            
    if not pred:
        dataset_dict['labels'] = A[-1]
            
    batch_dict = tf.train.batch(
        dataset_dict,
        batch_number,
   )

    if pred == False:
        batch_labels = batch_dict.pop('labels')
        return batch_dict, tf.reshape(batch_labels, [-1, 1]) 
    else:
        return batch_dict 

In [10]:
hparams = tf.contrib.training.HParams(learning_rate=lr)
rconfig = tf.estimator.RunConfig(log_step_count_steps = 10)
estimator_val = tf.estimator.Estimator(model_fn=make_model, params=hparams, config = rconfig)
estimator_val.train(input_fn=lambda: input_fn(X_train), steps=TRAIN_EPOCHS)

INFO:tensorflow:Using config: {'_model_dir': '/var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmpfo6992_d', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 10, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x10b4f0fd0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmpfo6

<tensorflow.python.estimator.estimator.Estimator at 0x1c2a743358>

In [11]:
predictions_val   = list(estimator_val.predict(input_fn = lambda: input_fn(X_val, pred=True)))
y_preds_val       = [int(x) for x in predictions_val]
mean_absolute_precision_error(y_preds_val, X_val.price)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmpfo6992_d/model.ckpt-500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


16.27686666709461

In [12]:
estimator = tf.estimator.Estimator(model_fn=make_model, params=hparams, config = rconfig)
estimator.train(input_fn=lambda: input_fn(train_renamed), steps=TRAIN_EPOCHS)

INFO:tensorflow:Using config: {'_model_dir': '/var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmptslf4za5', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 10, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x10b4f0dd8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmptsl

<tensorflow.python.estimator.estimator.Estimator at 0x10b4f0cc0>

In [13]:
predictions   = list(estimator.predict(input_fn = lambda: input_fn(test_renamed, pred=True)))
y_preds_test   = [int(x) for x in predictions]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmptslf4za5/model.ckpt-500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


# Save to File

In [14]:
filename = "/Users/mohsenkiskani/Downloads/Ubaar/submissions/submission43.csv"
with open(filename,"w+") as outputfile:
    outputfile.write("ID,price\n")
    for i in range(len(y_preds_test)):
        outputfile.write(str(test.ID[i])+","+str(int(np.ceil(y_preds_test[i])))+"\n")