In [1]:
import warnings 
warnings.filterwarnings('ignore')
import time 
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor, BaggingRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
import lightgbm as lgb
import tensorflow as tf 

In [2]:
def get_score(model, X, y_true):
    y_pred = model.predict(X)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def mean_absolute_precision_error(y_pred, y_true):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Loading augmented data

In [3]:
test  = pd.read_pickle('dataFrames/test_OneHotEncoding.pkl')
train = pd.read_pickle('dataFrames/train_OneHotEncoding.pkl')
X_train, X_val = train_test_split(train, test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,ID,destinationLatitude,destinationLongitude,distanceKM,price,sourceLatitude,sourceLongitude,taxiDurationMin,weight,source,...,vehicleOption_mosaghaf_chadori,vehicleOption_mosaghaf_felezi,vehicleOption_transit_chadori,vehicleOption_yakhchali,y_gboost,y_xgb,y_bag,y_knn,y_dec,y_lgb
40689,10602550191,34.319566,47.078555,1172.0,19000000.0,27.474105,52.603738,859.0,22.0,1445.240621,...,0,0,0,0,20013150.0,20377512.0,20693972.0,4290000.0,18835250.0,22143660.0
28663,59022077023,35.699056,51.402792,911.0,8000000.0,29.617603,51.652078,661.0,5.35,1529.81074,...,0,0,0,0,5602654.0,7494018.5,4657211.0,6000000.0,4240625.0,7224867.0
19042,74752147720,27.182853,56.273862,980.0,10338000.0,32.801625,51.689466,667.0,23.61,1695.49848,...,0,0,0,0,9197673.0,9348608.0,9439628.0,8305000.0,9854376.0,10296610.0
21837,76223312658,32.673139,51.670482,200.0,2320000.0,34.136752,50.566116,142.0,11.3,1726.162961,...,0,0,0,0,2536652.0,2578700.75,2509670.0,2320000.0,2347938.0,3293571.0
35006,74609642925,35.699332,51.395552,935.0,18000000.0,30.435378,49.111278,659.0,22.0,1494.72031,...,0,0,0,0,17376430.0,17874336.0,17712615.0,16650000.0,18786820.0,16928930.0


# TensorFlow combination  

In [4]:
BATCH_SIZE          = 128
TRAIN_EPOCHS        = 1200
HIDDEN_LAYER_1_SIZE = 512
HIDDEN_LAYER_2_SIZE = 512
HIDDEN_LAYER_3_SIZE = 16
lr                  = 1e-4

In [5]:
y_gboost_feat = tf.feature_column.numeric_column("y_gboost")
y_xgb_feat    = tf.feature_column.numeric_column("y_xgb")
y_bag_feat    = tf.feature_column.numeric_column("y_bag")
y_knn_feat    = tf.feature_column.numeric_column("y_knn")
y_dec_feat    = tf.feature_column.numeric_column("y_dec")
y_lgb_feat    = tf.feature_column.numeric_column("y_lgb")

feature_columns = {y_gboost_feat, y_xgb_feat, y_bag_feat, y_lgb_feat}

In [6]:
def make_model(features, labels, mode, params, config):
    input_layer = tf.feature_column.input_layer(features=features, feature_columns=feature_columns)
    global_step = tf.train.get_or_create_global_step()
    x = tf.layers.dense(inputs=input_layer, units=HIDDEN_LAYER_1_SIZE, activation=tf.nn.relu, name="first_layer")
    x = tf.layers.dropout(inputs=x,name="first_dropout")
    x = tf.layers.dense(inputs=x, units=HIDDEN_LAYER_2_SIZE, activation=tf.nn.relu, name="second_layer")
    x = tf.layers.dense(inputs=x, units=HIDDEN_LAYER_3_SIZE, activation=tf.nn.relu, name="third_layer")
    predictions = tf.contrib.layers.fully_connected(inputs=x, num_outputs=1)
    if mode == tf.estimator.ModeKeys.PREDICT :
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    elif mode == tf.estimator.ModeKeys.EVAL:
        loss  = tf.reduce_mean(tf.abs(tf.divide(predictions-labels,labels))) 
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, loss=loss)
    else:
        loss  = tf.reduce_mean(tf.abs(tf.divide(predictions-labels,labels))) 
        tf.summary.scalar("Loss", loss)
        optimizer = tf.train.AdamOptimizer(learning_rate=params.learning_rate)
        train_op = optimizer.minimize(loss, global_step=global_step)
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, loss=loss, train_op=train_op)

In [7]:
def input_fn(df, pred = False):
    useful_fueatures = [np.array(df["y_gboost"].values, dtype=np.float32),
                        np.array(df["y_xgb"].values, dtype=np.float32),
                        np.array(df["y_bag"].values, dtype=np.float32),
                        np.array(df["y_lgb"].values, dtype=np.float32),
                        np.array(df["y_knn"].values, dtype=np.float32),
                        np.array(df["y_dec"].values, dtype=np.float32)]
    if pred: 
        train_number = 1
        batch_number = 1
    else:
        useful_fueatures.append(np.array(df["price"].values, dtype=np.float32))
        train_number = TRAIN_EPOCHS
        batch_number = BATCH_SIZE
        
    A = tf.train.slice_input_producer(tensor_list=useful_fueatures, num_epochs=train_number, 
                                      shuffle= not pred,capacity=BATCH_SIZE * 5)
    
    y_gboost              = A[0]
    y_xgb                 = A[1]
    y_bag                 = A[2]
    y_lgb                 = A[3]
    y_knn                 = A[4]
    y_dec                 = A[5]
    
    dataset_dict = dict(
        y_gboost=y_gboost,
        y_xgb=y_xgb,
        y_bag=y_bag,
        y_lgb=y_lgb,
        y_knn=y_knn,
        y_dec=y_dec,
    )

    if not pred:
        dataset_dict['labels'] = A[6]
            
    batch_dict = tf.train.batch(
        dataset_dict,
        batch_number,
   )

    if pred == False:
        batch_labels = batch_dict.pop('labels')
        return batch_dict, tf.reshape(batch_labels, [-1, 1]) 
    else:
        return batch_dict 

In [8]:
hparams = tf.contrib.training.HParams(learning_rate=lr)
estimator_val = tf.estimator.Estimator(model_fn=make_model, params=hparams)
estimator_val.train(input_fn=lambda: input_fn(X_train), steps=TRAIN_EPOCHS)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmpq3x9563v', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1c2ac6f710>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/hy

<tensorflow.python.estimator.estimator.Estimator at 0x1c2ac03e80>

In [9]:
predictions_val   = list(estimator_val.predict(input_fn = lambda: input_fn(X_val, pred=True)))
y_preds_val       = [int(x) for x in predictions_val]
mean_absolute_precision_error(y_preds_val, X_val.price)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmpq3x9563v/model.ckpt-1200
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


100.0

In [10]:
estimator = tf.estimator.Estimator(model_fn=make_model, params=hparams)
estimator.train(input_fn=lambda: input_fn(train), steps=TRAIN_EPOCHS)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmpxjex5670', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a22026cc0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/hy

<tensorflow.python.estimator.estimator.Estimator at 0x1a22026c88>

In [11]:
predictions   = list(estimator.predict(input_fn = lambda: input_fn(test, pred=True)))
y_preds_test   = [int(x) for x in predictions]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmpxjex5670/model.ckpt-1200
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


# Save to File

In [15]:
#filename = "/Users/mohsenkiskani/Downloads/Ubaar/submissions/submission36.csv"
#with open(filename,"w+") as outputfile:
#    outputfile.write("ID,price\n")
#    for i in range(len(y_preds_test)):
#        outputfile.write(str(test.ID[i])+","+str(int(np.ceil(y_preds_test[i])))+"\n")

##### Submission 19 with loss of 15.9 

In [13]:
#source_lat_feat         = tf.feature_column.numeric_column("sourceLatitude") 
#source_long_feat        = tf.feature_column.numeric_column("sourceLongitude") 
#destin_lat_feat         = tf.feature_column.numeric_column("destinationLatitude") 
#destin_long_feat        = tf.feature_column.numeric_column("destinationLongitude") 

#distance_feat = tf.feature_column.numeric_column("distanceKM")
#taximin_feat  = tf.feature_column.numeric_column("taxiDurationMin")
#weight_feat   = tf.feature_column.numeric_column("weight")

#source_feat   = tf.feature_column.numeric_column("source")
#destin_feat   = tf.feature_column.numeric_column("destination")

#feature_columns = {y_gboost_feat, y_xgb_feat, y_bag_feat, y_lgb_feat, y_knn_feat, y_dec_feat,
#                   source_lat_feat, source_long_feat , destin_lat_feat, destin_long_feat, 
#                   distance_feat, taximin_feat, weight_feat, source_feat, destin_feat}

#feature_columns = {y_gboost_feat, y_xgb_feat, y_bag_feat, y_lgb_feat, y_knn_feat, y_dec_feat}

In [14]:
#        np.array(df["sourceLatitude"].values, dtype=np.float32),
#        np.array(df["sourceLongitude"].values, dtype=np.float32),
#        np.array(df["destinationLatitude"].values, dtype=np.float32),
#        np.array(df["destinationLongitude"].values, dtype=np.float32),
#        np.array(df["distanceKM"].values, dtype=np.float32),
#        np.array(df["taxiDurationMin"].values, dtype=np.float32),
#        np.array(df["weight"].values, dtype=np.float32),
#        np.array(df["source"].values, dtype=np.float32),
#        np.array(df["destination"].values, dtype=np.float32),

#    sourceLatitude        = A[6]
#    sourceLongitude       = A[7]
#    destinationLatitude   = A[8]
#    destinationLongitude  = A[9]
#    distanceKM            = A[10]
#    taxiDurationMin       = A[11] 
#    weight                = A[12]
#    source                = A[13]
#    destination           = A[14] 

#        sourceLatitude=sourceLatitude,
#        sourceLongitude=sourceLongitude,
#        destinationLatitude=destinationLatitude,
#        destinationLongitude=destinationLongitude, 
#        distanceKM=distanceKM,
#        taxiDurationMin=taxiDurationMin,
#        weight=weight,
#        source=source, 
#        destination=destination