In [1]:
import warnings 
warnings.filterwarnings('ignore')
import time 
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
import tensorflow as tf 
tf.logging.set_verbosity(tf.logging.ERROR)

In [2]:
def get_score(model, X, y_true):
    y_pred = model.predict(X)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def mean_absolute_precision_error(y_pred, y_true):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Loading augmented data

In [10]:
#test  = pd.read_pickle('dataFrames/test_OneHotEncoding.pkl')
#train = pd.read_pickle('dataFrames/train_OneHotEncoding.pkl')
test   = pd.read_pickle('dataFrames/test_OneHotEncoding_new_June14th.pkl')
train  = pd.read_pickle('dataFrames/train_OneHotEncoding_new_June14th.pkl')


continuous_cols = ['destinationLatitude', 'destinationLongitude', 'distanceKM', 'sourceLatitude', 
                   'sourceLongitude', 'taxiDurationMin', 'weight', 'source', 'destination', 
                   'y_gboost', 'y_xgb', 'y_bag', 'y_knn', 'y_dec', 'y_lgb' ]
categorical_cols = train.columns.drop(continuous_cols + ['ID', 'price']).tolist()

NOM = train[categorical_cols].shape[1]
renaming_dict = dict(zip(train[categorical_cols].columns, [str(x) for x in list(range(NOM)) ]))

train_renamed = train[categorical_cols].rename(columns=renaming_dict)
test_renamed  = test[categorical_cols].rename(columns=renaming_dict)

for column in continuous_cols:
    train_renamed[column] = train[column]
    test_renamed[column] = test[column]
    
test_renamed['ID']   = test['ID']
train_renamed['ID'] = train['ID']
test_renamed['price'] = test['price']
train_renamed['price'] = train['price']

X_train, X_val = train_test_split(train_renamed, test_size=0.2, random_state=42)

# TensorFlow combination  

In [4]:
BATCH_SIZE          = 128
TRAIN_EPOCHS        = 400
HIDDEN_LAYER_1_SIZE = 64
HIDDEN_LAYER_2_SIZE = 64
HIDDEN_LAYER_3_SIZE = 16
lr                  = 5e-5
USE_ALL_FEATURES    = False 

In [5]:
def make_model(features, labels, mode, params, config):
    input_layer = tf.feature_column.input_layer(features=features, feature_columns=feature_columns)
    global_step = tf.train.get_or_create_global_step()
    x = tf.layers.dense(inputs=input_layer, units=HIDDEN_LAYER_1_SIZE, activation=tf.nn.relu, name="first_layer")
    x = tf.layers.dropout(inputs=x,name="first_dropout")
    x = tf.layers.dense(inputs=x, units=HIDDEN_LAYER_2_SIZE, activation=tf.nn.relu, name="second_layer")
    x = tf.layers.dense(inputs=x, units=HIDDEN_LAYER_3_SIZE, activation=tf.nn.relu, name="third_layer")
    predictions = tf.contrib.layers.fully_connected(inputs=x, num_outputs=1)
    if mode == tf.estimator.ModeKeys.PREDICT :
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    elif mode == tf.estimator.ModeKeys.EVAL:
        loss  = tf.reduce_mean(tf.abs(tf.divide(predictions-labels,labels))) 
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, loss=loss)
    else:
        loss  = tf.reduce_mean(tf.abs(tf.divide(predictions-labels,labels))) 
        tf.summary.scalar("Loss", loss)
        optimizer = tf.train.AdamOptimizer(learning_rate=params.learning_rate)
        train_op = optimizer.minimize(loss, global_step=global_step)
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, loss=loss, train_op=train_op)

In [6]:
feature_columns = set()

if USE_ALL_FEATURES:
    for col in categorical_cols:
        col_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity(renaming_dict[col], 2),2)
        feature_columns.add(col_feat)

for cont in continuous_cols:
    col_feat = tf.feature_column.numeric_column(cont)
    feature_columns.add(col_feat)

In [7]:
def input_fn(df, pred = False, use_all_features = USE_ALL_FEATURES):
        
    useful_fueatures = list()
    
    if use_all_features:
        for col in categorical_cols:
            useful_fueatures.append(np.array(df[renaming_dict[col]].values, dtype=np.int32))

    for cont in continuous_cols:
        useful_fueatures.append(np.array(df[cont].values, dtype=np.float32))    
    
    if pred: 
        train_number = 1
        batch_number = 1
    else:
        useful_fueatures.append(np.array(df["price"].values, dtype=np.float32))
        train_number = TRAIN_EPOCHS
        batch_number = BATCH_SIZE
        
    A = tf.train.slice_input_producer(
        tensor_list=useful_fueatures,
        num_epochs=train_number,
        shuffle= not pred,
        capacity=BATCH_SIZE * 5
    )

    dataset_dict = dict()
    
    if use_all_features:
        for i in range(len(A)):
            if i < len(categorical_cols):
                dataset_dict[renaming_dict[categorical_cols[i]]] = A[i]
            elif i < len(categorical_cols) + len(continuous_cols):
                dataset_dict[continuous_cols[i-len(categorical_cols)]] = A[i]
    else:
        for i in range(len(A)):
            if i < len(continuous_cols):
                dataset_dict[continuous_cols[i]] = A[i]
            
    if not pred:
        dataset_dict['labels'] = A[-1]
            
    batch_dict = tf.train.batch(
        dataset_dict,
        batch_number,
   )

    if pred == False:
        batch_labels = batch_dict.pop('labels')
        return batch_dict, tf.reshape(batch_labels, [-1, 1]) 
    else:
        return batch_dict 

In [8]:
hparams = tf.contrib.training.HParams(learning_rate=lr)
rconfig = tf.estimator.RunConfig(log_step_count_steps = 10)
estimator_val = tf.estimator.Estimator(model_fn=make_model, params=hparams, config = rconfig)
estimator_val.train(input_fn=lambda: input_fn(X_train), steps=TRAIN_EPOCHS)

<tensorflow.python.estimator.estimator.Estimator at 0x1a21ee0a90>

In [9]:
predictions_val   = list(estimator_val.predict(input_fn = lambda: input_fn(X_val, pred=True)))
y_preds_val       = [int(x) for x in predictions_val]
mean_absolute_precision_error(y_preds_val, X_val.price)

17.215470338632542

In [None]:
estimator = tf.estimator.Estimator(model_fn=make_model, params=hparams, config = rconfig)
estimator.train(input_fn=lambda: input_fn(train_renamed), steps=400)

In [None]:
predictions   = list(estimator.predict(input_fn = lambda: input_fn(test_renamed, pred=True)))
y_preds_test   = [int(x) for x in predictions]

# Save to File

In [None]:
filename = "/Users/mohsenkiskani/Downloads/Ubaar/submissions/submission39.csv"
with open(filename,"w+") as outputfile:
    outputfile.write("ID,price\n")
    for i in range(len(y_preds_test)):
        outputfile.write(str(test.ID[i])+","+str(int(np.ceil(y_preds_test[i])))+"\n")