In [1]:
import warnings 
warnings.filterwarnings('ignore')
import time 
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor, BaggingRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
import lightgbm as lgb
import tensorflow as tf 

In [2]:
def get_score(model, X, y_true):
    y_pred = model.predict(X)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def mean_absolute_precision_error(y_pred, y_true):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Loading augmented data

In [3]:
test  = pd.read_pickle('dataFrames/test_OneHotEncoding.pkl')
train = pd.read_pickle('dataFrames/train_OneHotEncoding.pkl')

continuous_cols = ['destinationLatitude', 'destinationLongitude', 'distanceKM', 'sourceLatitude', 
                   'sourceLongitude', 'taxiDurationMin', 'weight', 'source', 'destination', 
                   'y_gboost', 'y_xgb', 'y_bag', 'y_knn', 'y_dec', 'y_lgb' ]
categorical_cols = train.columns.drop(continuous_cols + ['ID', 'price']).tolist()

NOM = train[categorical_cols].shape[1]
renaming_dict = dict(zip(train[categorical_cols].columns, [str(x) for x in list(range(NOM)) ]))

train_renamed = train[categorical_cols].rename(columns=renaming_dict)
test_renamed  = test[categorical_cols].rename(columns=renaming_dict)

for column in continuous_cols:
    train_renamed[column] = train[column]
    test_renamed[column] = test[column]
    
test_renamed['ID']   = test['ID']
train_renamed['ID'] = train['ID']
test_renamed['price'] = test['price']
train_renamed['price'] = train['price']

X_train, X_val = train_test_split(train_renamed, test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,source,destination,y_gboost,y_xgb,y_bag,y_knn,y_dec,y_lgb,ID,price
40689,0,0,0,0,0,0,1,0,0,0,...,1445.240621,1615.715576,20013150.0,20377512.0,20693972.0,4290000.0,18835250.0,22143660.0,10602550191,19000000.0
28663,0,0,0,0,0,0,0,0,0,0,...,1529.81074,1835.03115,5602654.0,7494018.5,4657211.0,6000000.0,4240625.0,7224867.0,59022077023,8000000.0
19042,0,0,0,1,0,0,0,0,0,0,...,1695.49848,1529.684118,9197673.0,9348608.0,9439628.0,8305000.0,9854376.0,10296610.0,74752147720,10338000.0
21837,0,0,0,0,0,0,0,0,0,0,...,1726.162961,1688.236841,2536652.0,2578700.75,2509670.0,2320000.0,2347938.0,3293571.0,76223312658,2320000.0
35006,0,0,0,0,0,0,0,0,0,0,...,1494.72031,1834.786874,17376430.0,17874336.0,17712615.0,16650000.0,18786820.0,16928930.0,74609642925,18000000.0


In [4]:
X_train.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24',
       '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36',
       '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48',
       '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60',
       '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72',
       '73', '74', 'destinationLatitude', 'destinationLongitude', 'distanceKM',
       'sourceLatitude', 'sourceLongitude', 'taxiDurationMin', 'weight',
       'source', 'destination', 'y_gboost', 'y_xgb', 'y_bag', 'y_knn', 'y_dec',
       'y_lgb', 'ID', 'price'],
      dtype='object')

# TensorFlow combination  

In [5]:
BATCH_SIZE          = 128
TRAIN_EPOCHS        = 1200

HIDDEN_LAYER_1_SIZE = 512
HIDDEN_LAYER_2_SIZE = 512
HIDDEN_LAYER_3_SIZE = 128
lr                  = 1e-4

In [37]:
feature_columns = set()

#for col in categorical_cols:
#    col_feat = tf.feature_column.embedding_column(
#        tf.feature_column.categorical_column_with_identity(renaming_dict[col], 1),2)
#    feature_columns.add(col_feat)

for cont in continuous_cols:
    col_feat = tf.feature_column.numeric_column(cont)
    feature_columns.add(col_feat)

In [38]:
def make_model(features, labels, mode, params, config):
    input_layer = tf.feature_column.input_layer(features=features, feature_columns=feature_columns)
    global_step = tf.train.get_or_create_global_step()
    x = tf.layers.dense(inputs=input_layer, units=HIDDEN_LAYER_1_SIZE, activation=tf.nn.relu, name="first_layer")
    x = tf.layers.dropout(inputs=x,name="first_dropout")
    x = tf.layers.dense(inputs=x, units=HIDDEN_LAYER_2_SIZE, activation=tf.nn.relu, name="second_layer")
    x = tf.layers.dense(inputs=x, units=HIDDEN_LAYER_3_SIZE, activation=tf.nn.relu, name="third_layer")
    predictions = tf.contrib.layers.fully_connected(inputs=x, num_outputs=1)
    if mode == tf.estimator.ModeKeys.PREDICT :
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    elif mode == tf.estimator.ModeKeys.EVAL:
        loss  = tf.reduce_mean(tf.abs(tf.divide(predictions-labels,labels))) 
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, loss=loss)
    else:
        loss  = tf.reduce_mean(tf.abs(tf.divide(predictions-labels,labels))) 
        tf.summary.scalar("Loss", loss)
        optimizer = tf.train.AdamOptimizer(learning_rate=params.learning_rate)
        train_op = optimizer.minimize(loss, global_step=global_step)
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, loss=loss, train_op=train_op)

In [39]:
def input_fn(df, pred = False):
        
    useful_fueatures = list()
    for col in categorical_cols:
        useful_fueatures.append(np.array(df[renaming_dict[col]].values, dtype=np.int32))

    for cont in continuous_cols:
        useful_fueatures.append(np.array(df[cont].values, dtype=np.float32))    
    
    if pred: 
        train_number = 1
        batch_number = 1
    else:
        useful_fueatures.append(np.array(df["price"].values, dtype=np.float32))
        train_number = TRAIN_EPOCHS
        batch_number = BATCH_SIZE
        
    A = tf.train.slice_input_producer(
        tensor_list=useful_fueatures,
        num_epochs=train_number,
        shuffle= not pred,
        capacity=BATCH_SIZE * 5
    )

    dataset_dict = dict()
    for i in range(len(A)):
        if i < len(categorical_cols):
            #dataset_dict[renaming_dict[categorical_cols[i]]] = A[i]
            pass
        elif i < len(categorical_cols) + len(continuous_cols):
            dataset_dict[continuous_cols[i-len(categorical_cols)]] = A[i]

    if not pred:
        dataset_dict['labels'] = A[-1]
            
    batch_dict = tf.train.batch(
        dataset_dict,
        batch_number,
   )

    if pred == False:
        batch_labels = batch_dict.pop('labels')
        return batch_dict, tf.reshape(batch_labels, [-1, 1]) 
    else:
        print(batch_dict)
        return batch_dict 

In [40]:
hparams = tf.contrib.training.HParams(learning_rate=lr)
estimator_val = tf.estimator.Estimator(model_fn=make_model, params=hparams)
estimator_val.train(input_fn=lambda: input_fn(X_train), steps=TRAIN_EPOCHS)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmpzmjwse4w', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1c2c219e48>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/hy

<tensorflow.python.estimator.estimator.Estimator at 0x1c2c219cc0>

In [41]:
predictions_val   = list(estimator_val.predict(input_fn = lambda: input_fn(X_val, pred=True)))
y_preds_val       = [int(x) for x in predictions_val]
mean_absolute_precision_error(y_preds_val, X_val.price)

{'destination': <tf.Tensor 'batch:0' shape=(1,) dtype=float32>, 'destinationLatitude': <tf.Tensor 'batch:1' shape=(1,) dtype=float32>, 'destinationLongitude': <tf.Tensor 'batch:2' shape=(1,) dtype=float32>, 'distanceKM': <tf.Tensor 'batch:3' shape=(1,) dtype=float32>, 'source': <tf.Tensor 'batch:4' shape=(1,) dtype=float32>, 'sourceLatitude': <tf.Tensor 'batch:5' shape=(1,) dtype=float32>, 'sourceLongitude': <tf.Tensor 'batch:6' shape=(1,) dtype=float32>, 'taxiDurationMin': <tf.Tensor 'batch:7' shape=(1,) dtype=float32>, 'weight': <tf.Tensor 'batch:8' shape=(1,) dtype=float32>, 'y_bag': <tf.Tensor 'batch:9' shape=(1,) dtype=float32>, 'y_dec': <tf.Tensor 'batch:10' shape=(1,) dtype=float32>, 'y_gboost': <tf.Tensor 'batch:11' shape=(1,) dtype=float32>, 'y_knn': <tf.Tensor 'batch:12' shape=(1,) dtype=float32>, 'y_lgb': <tf.Tensor 'batch:13' shape=(1,) dtype=float32>, 'y_xgb': <tf.Tensor 'batch:14' shape=(1,) dtype=float32>}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling mo

16.510907101630124

In [43]:
estimator = tf.estimator.Estimator(model_fn=make_model, params=hparams)
estimator.train(input_fn=lambda: input_fn(train_renamed), steps=TRAIN_EPOCHS)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmpdcis1vro', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1c29d169e8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/hy

<tensorflow.python.estimator.estimator.Estimator at 0x1c29d16908>

In [44]:
predictions   = list(estimator.predict(input_fn = lambda: input_fn(test_renamed, pred=True)))
y_preds_test   = [int(x) for x in predictions]

{'destination': <tf.Tensor 'batch:0' shape=(1,) dtype=float32>, 'destinationLatitude': <tf.Tensor 'batch:1' shape=(1,) dtype=float32>, 'destinationLongitude': <tf.Tensor 'batch:2' shape=(1,) dtype=float32>, 'distanceKM': <tf.Tensor 'batch:3' shape=(1,) dtype=float32>, 'source': <tf.Tensor 'batch:4' shape=(1,) dtype=float32>, 'sourceLatitude': <tf.Tensor 'batch:5' shape=(1,) dtype=float32>, 'sourceLongitude': <tf.Tensor 'batch:6' shape=(1,) dtype=float32>, 'taxiDurationMin': <tf.Tensor 'batch:7' shape=(1,) dtype=float32>, 'weight': <tf.Tensor 'batch:8' shape=(1,) dtype=float32>, 'y_bag': <tf.Tensor 'batch:9' shape=(1,) dtype=float32>, 'y_dec': <tf.Tensor 'batch:10' shape=(1,) dtype=float32>, 'y_gboost': <tf.Tensor 'batch:11' shape=(1,) dtype=float32>, 'y_knn': <tf.Tensor 'batch:12' shape=(1,) dtype=float32>, 'y_lgb': <tf.Tensor 'batch:13' shape=(1,) dtype=float32>, 'y_xgb': <tf.Tensor 'batch:14' shape=(1,) dtype=float32>}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling mo

# Save to File

In [45]:
filename = "/Users/mohsenkiskani/Downloads/Ubaar/submissions/submission37.csv"
with open(filename,"w+") as outputfile:
    outputfile.write("ID,price\n")
    for i in range(len(y_preds_test)):
        outputfile.write(str(test.ID[i])+","+str(int(np.ceil(y_preds_test[i])))+"\n")

##### Submission 19 with loss of 15.9 

In [None]:
    y_gboost              = A[0]
    y_xgb                 = A[1]
    y_bag                 = A[2]
    y_lgb                 = A[3]
    y_knn                 = A[4]
    y_dec                 = A[5]
    sourceLatitude        = A[6]
    sourceLongitude       = A[7]
    destinationLatitude   = A[8]
    destinationLongitude  = A[9]
    distanceKM            = A[10]
    taxiDurationMin       = A[11] 
    weight                = A[12]
    source                = A[13]
    destination           = A[14] 
    
    
    # Created a dict out of sliced input producers
    dataset_dict = dict(
        y_gboost=y_gboost,
        y_xgb=y_xgb,
        y_bag=y_bag,
        y_lgb=y_lgb,
        y_knn=y_knn,
        y_dec=y_dec,
        sourceLatitude=sourceLatitude,
        sourceLongitude=sourceLongitude,
        destinationLatitude=destinationLatitude,
        destinationLongitude=destinationLongitude, 
        distanceKM=distanceKM,
        taxiDurationMin=taxiDurationMin,
        weight=weight,
        source=source, 
        destination=destination
    )

In [None]:
    
    
    [
        np.array(df["y_gboost"].values, dtype=np.float32),
        np.array(df["y_xgb"].values, dtype=np.float32),
        np.array(df["y_bag"].values, dtype=np.float32),
        np.array(df["y_lgb"].values, dtype=np.float32),
        np.array(df["y_knn"].values, dtype=np.float32),
        np.array(df["y_dec"].values, dtype=np.float32),
        np.array(df["sourceLatitude"].values, dtype=np.float32),
        np.array(df["sourceLongitude"].values, dtype=np.float32),
        np.array(df["destinationLatitude"].values, dtype=np.float32),
        np.array(df["destinationLongitude"].values, dtype=np.float32),
        np.array(df["distanceKM"].values, dtype=np.float32),
        np.array(df["taxiDurationMin"].values, dtype=np.float32),
        np.array(df["weight"].values, dtype=np.float32),
        np.array(df["source"].values, dtype=np.float32),
        np.array(df["destination"].values, dtype=np.float32),
    ]

In [None]:
y_gboost_feat = tf.feature_column.numeric_column("y_gboost")
y_xgb_feat    = tf.feature_column.numeric_column("y_xgb")
y_bag_feat    = tf.feature_column.numeric_column("y_bag")
y_knn_feat    = tf.feature_column.numeric_column("y_knn")
y_dec_feat    = tf.feature_column.numeric_column("y_dec")
y_lgb_feat    = tf.feature_column.numeric_column("y_lgb")

source_lat_feat         = tf.feature_column.numeric_column("sourceLatitude") 
source_long_feat        = tf.feature_column.numeric_column("sourceLongitude") 
destin_lat_feat         = tf.feature_column.numeric_column("destinationLatitude") 
destin_long_feat        = tf.feature_column.numeric_column("destinationLongitude") 

distance_feat = tf.feature_column.numeric_column("distanceKM")
taximin_feat  = tf.feature_column.numeric_column("taxiDurationMin")
weight_feat   = tf.feature_column.numeric_column("weight")

source_feat   = tf.feature_column.numeric_column("source")
destin_feat   = tf.feature_column.numeric_column("destination")

feature_columns = {y_gboost_feat, y_xgb_feat, y_bag_feat, y_lgb_feat, y_knn_feat, y_dec_feat,
                   source_lat_feat, source_long_feat , destin_lat_feat, destin_long_feat, 
                   distance_feat, taximin_feat, weight_feat, source_feat, destin_feat}