In [11]:
import warnings 
warnings.filterwarnings('ignore')
import time 
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor, BaggingRegressor
import xgboost as xgb
import tensorflow as tf 

In [68]:
BATCH_SIZE          = 128
TRAIN_EPOCHS        = 3000
BIN_GRANULARITY     = 10
HIDDEN_LAYER_1_SIZE = 16
HIDDEN_LAYER_2_SIZE = 16
HIDDEN_LAYER_3_SIZE = 16
lr                  = 1e-3

In [69]:
def normalize_column(col):
    return (col - np.mean(col)) / np.std(col)

def get_score(model, X, y_true):
    y_pred = model.predict(X)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def scale_minmax(col):
    return (col-col.min())/(col.max()-col.min())

def mean_absolute_precision_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Data gathering

In [70]:
data      = pd.read_csv('/Users/mohsenkiskani/.kaggle/competitions/ubaar-competition/train.csv')
test_data = pd.read_csv('/Users/mohsenkiskani/.kaggle/competitions/ubaar-competition/test.csv')
data      = data.dropna(axis = 0)
test_data.loc[12577, 'distanceKM']      = 52
test_data.loc[12577, 'taxiDurationMin'] = 50
test_data.loc[13853, 'distanceKM']      = 500
test_data.loc[13853, 'taxiDurationMin'] = 380
all_data = pd.concat((data, test_data)) 
min_price = min(all_data['price'])
ntrain = data.shape[0]
ntest  = test_data.shape[0]
BUCKET_LATI = 1000
BUCKET_LONG = 1000

min_source_lat  = min(all_data['sourceLatitude'])
min_destin_lat  = min(all_data['destinationLatitude'])
min_lat         = min(min_destin_lat, min_source_lat)
min_source_long = min(all_data['sourceLongitude'])
min_destin_long = min(all_data['destinationLongitude'])
min_long        = min(min_destin_long, min_source_long)
max_source_lat  = max(all_data['sourceLatitude'])
max_destin_lat  = max(all_data['destinationLatitude'])
max_lat         = max(max_destin_lat, max_source_lat)
max_source_long = max(all_data['sourceLongitude'])
max_destin_long = max(all_data['destinationLongitude'])
max_long        = max(max_destin_long, max_source_long)
d_lati          = (max_lat - min_lat)/BUCKET_LATI
d_long          = (max_long - min_long)/BUCKET_LONG
destin_lati_bucket = (all_data['destinationLatitude']  // d_lati).as_matrix().astype(int)
destin_long_bucket = (all_data['destinationLongitude'] // d_long).as_matrix().astype(int)
source_lati_bucket = (all_data['sourceLatitude']  // d_lati).as_matrix().astype(int)
source_long_bucket = (all_data['sourceLongitude'] // d_long).as_matrix().astype(int)

all_data['destination_tuple'] = tuple(zip(destin_lati_bucket,destin_long_bucket))
all_data['source_tuple'] = tuple(zip(source_lati_bucket,source_long_bucket))

categorical_vars = ['date', 'SourceState', 'destinationState', 'vehicleType', 
                    'vehicleOption', 'source_tuple', 'destination_tuple']

all_data = all_data.copy()
categorical_var_encoders = {}
for var in categorical_vars:
    le = preprocessing.LabelEncoder().fit(all_data[var])
    all_data[var + '_ids']  = le.transform(all_data[var])
    all_data[var + '_ids']  = all_data[var + '_ids'].astype('int32')
    all_data.pop(var)
    categorical_var_encoders[var] = le

all_data['source']           = all_data['sourceLatitude']*all_data['sourceLongitude']
all_data['destination']      = all_data['destinationLatitude']*all_data['destinationLongitude']

train    = all_data[:ntrain]
test     = all_data[ntrain:]

train_1, train_2 = train_test_split(train, test_size=0.5)

X = train.drop(['ID','price'],axis=1)
y = train.price

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,destinationLatitude,destinationLongitude,distanceKM,sourceLatitude,sourceLongitude,taxiDurationMin,weight,date_ids,SourceState_ids,destinationState_ids,vehicleType_ids,vehicleOption_ids,source_tuple_ids,destination_tuple_ids,source,destination
39085,36.473089,52.349822,184.0,35.700109,51.399743,199.0,21.0,124,7,19,3,2,1410,1774,1834.976428,1909.359717
30892,35.704176,51.40028,331.0,37.275731,49.584392,254.0,1.67,118,29,7,1,5,1926,1515,1848.294458,1835.204644
45277,35.699924,51.396715,447.0,32.665899,51.663805,285.0,19.0,83,3,7,3,7,801,1515,1687.644636,1834.858819
16398,30.199563,53.182966,809.0,35.699078,51.401589,525.0,4.0,151,7,15,1,5,1410,463,1834.989335,1606.102332
13653,27.180941,56.277756,1144.0,34.643252,50.877469,750.0,2.0,85,17,21,1,5,1176,72,1762.56098,1529.682365


# Initial models 

In [4]:
start_time = time.time()
GBoost_1 = GradientBoostingRegressor(n_estimators=2200, learning_rate=0.05,
                                     max_depth=10, max_features='sqrt',
                                     min_samples_leaf=15, min_samples_split=10, loss='huber')

GBoost_2 = GradientBoostingRegressor(n_estimators=2200, learning_rate=0.05,
                                     max_depth=10, max_features='sqrt',
                                     min_samples_leaf=15, min_samples_split=10, loss='huber')

GBoost_1.fit(train_1.drop(['ID','price'],axis=1), train_1.price)
GBoost_2.fit(train_2.drop(['ID','price'],axis=1), train_2.price)

y_gboost_1 = GBoost_2.predict(train_1.drop(['ID','price'],axis=1))
y_gboost_2 = GBoost_1.predict(train_2.drop(['ID','price'],axis=1))

train_1['y_gboost'] = y_gboost_1
train_2['y_gboost'] = y_gboost_2

train_gboost = pd.concat([train_1, train_2])
print( '%.2f' % float((time.time() - start_time)/60 ) )
train_gboost.head()

3.33


Unnamed: 0,ID,destinationLatitude,destinationLongitude,distanceKM,price,sourceLatitude,sourceLongitude,taxiDurationMin,weight,date_ids,SourceState_ids,destinationState_ids,vehicleType_ids,vehicleOption_ids,source_tuple_ids,destination_tuple_ids,source,destination,y_gboost
13789,74350561550,32.419834,47.826776,191.0,800000.0,33.119891,46.166214,139.0,2.0,155,5,5,1,6,886,825,1529.019976,1550.536139,1168977.0
35708,83373090511,36.303322,59.609054,820.0,7800000.0,35.344187,52.063917,532.0,10.0,181,13,9,2,0,1329,1738,1840.156818,2164.006681,4313321.0
45169,16346579545,27.47131,52.611697,1288.0,7700000.0,35.700071,51.39564,880.0,3.5,96,7,6,1,6,1410,126,1834.827997,1445.312238,9379299.0
11521,61928501573,32.670303,51.660395,487.0,3500000.0,29.610508,52.531104,348.0,10.0,18,15,3,2,0,339,874,1555.472675,1687.760758,3822512.0
1564,13309545979,30.239987,57.065322,1185.0,6000000.0,36.120016,48.592136,799.0,1.4,82,12,25,1,6,1533,468,1755.14873,1725.654595,6046491.0


In [5]:
train_gboost_1, train_gboost_2 = train_test_split(train_gboost, test_size=0.5)

In [6]:
start_time = time.time()
xgb_1 = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                         learning_rate=0.05, max_depth=10, 
                         min_child_weight=1.7817, n_estimators=2200,
                         reg_alpha=0.4640, reg_lambda=0.8571,
                         subsample=0.5213, silent=1, nthread = -1)

xgb_2 = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                         learning_rate=0.05, max_depth=10, 
                         min_child_weight=1.7817, n_estimators=2200,
                         reg_alpha=0.4640, reg_lambda=0.8571,
                         subsample=0.5213, silent=1, nthread = -1)

xgb_1.fit(train_gboost_1.drop(['ID','price','y_gboost'],axis=1), train_gboost_1.price)
xgb_2.fit(train_gboost_2.drop(['ID','price','y_gboost'],axis=1), train_gboost_2.price)

y_xgb_1 = xgb_2.predict(train_gboost_1.drop(['ID','price','y_gboost'],axis=1))
y_xgb_2 = xgb_1.predict(train_gboost_2.drop(['ID','price','y_gboost'],axis=1))

train_gboost_1['y_xgb'] = y_xgb_1
train_gboost_2['y_xgb'] = y_xgb_2

train_xgb = pd.concat([train_gboost_1, train_gboost_2])
print( '%.2f' % float((time.time() - start_time)/60 ) )
train_xgb.head()

77.49


Unnamed: 0,ID,destinationLatitude,destinationLongitude,distanceKM,price,sourceLatitude,sourceLongitude,taxiDurationMin,weight,date_ids,SourceState_ids,destinationState_ids,vehicleType_ids,vehicleOption_ids,source_tuple_ids,destination_tuple_ids,source,destination,y_gboost,y_xgb
5425,15985271267,35.70211,51.396252,604.0,3314000.0,37.33224,46.052747,412.0,3.0,139,0,7,1,6,1936,1515,1719.252204,1834.954642,2912466.0,2720363.5
33894,38974457764,36.558926,53.062677,43.0,1915000.0,36.680226,53.423271,44.0,14.81,138,19,19,0,3,1722,1804,1959.577654,1939.914482,2246340.0,2590922.5
40184,61390983337,32.666259,51.6685,448.0,4379000.0,35.699081,51.40121,288.0,13.75,60,7,3,0,0,1410,874,1834.975959,1687.816603,4445857.0,4806778.5
26266,88774668208,30.361588,48.260514,282.0,3000000.0,32.463244,48.347229,215.0,3.0,172,11,11,2,0,755,488,1569.507892,1465.265843,2285547.0,2536633.25
19200,37459526665,32.864512,59.213398,986.0,2400000.0,35.342857,52.074544,675.0,3.0,180,13,8,1,5,1326,923,1840.463162,1946.019429,3540089.0,2985460.25


In [7]:
train_xgb_1, train_xgb_2 = train_test_split(train_xgb, test_size=0.5)

In [8]:
start_time = time.time()

bag_1 = BaggingRegressor(n_estimators=1000, max_samples=1.0, max_features=1.0, verbose=1)
bag_2 = BaggingRegressor(n_estimators=1000, max_samples=1.0, max_features=1.0, verbose=1)

bag_1.fit(train_xgb_1.drop(['ID','price','y_gboost','y_xgb'],axis=1), train_xgb_1.price)
bag_2.fit(train_xgb_2.drop(['ID','price','y_gboost','y_xgb'],axis=1), train_xgb_2.price)

y_bag_1 = bag_2.predict(train_xgb_1.drop(['ID','price','y_gboost','y_xgb'],axis=1))
y_bag_2 = bag_1.predict(train_xgb_2.drop(['ID','price','y_gboost','y_xgb'],axis=1))

train_xgb_1['y_bag'] = y_bag_1
train_xgb_2['y_bag'] = y_bag_2

train_bag = pd.concat([train_xgb_1, train_xgb_2])
print( '%.2f' % float((time.time() - start_time)/60 ) )
train_bag.head()

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.9min finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.9min finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.8s finished


8.12


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.0s finished


Unnamed: 0,ID,destinationLatitude,destinationLongitude,distanceKM,price,sourceLatitude,sourceLongitude,taxiDurationMin,weight,date_ids,...,destinationState_ids,vehicleType_ids,vehicleOption_ids,source_tuple_ids,destination_tuple_ids,source,destination,y_gboost,y_xgb,y_bag
47700,18610264925,31.318638,48.673657,785.0,2200000.0,28.752108,54.544384,611.0,2.0,33,...,11,1,6,227,639,1568.26602,1524.392644,3872536.0,3423346.0,3542874.0
6510,97244800568,32.665483,51.659719,177.0,3500000.0,33.550219,50.552827,123.0,7.0,54,...,3,2,0,947,874,1696.058417,1687.489673,3198221.0,3125487.0,3282378.0
1187,80389797076,34.645603,50.877117,737.0,8000000.0,30.408413,55.991083,471.0,24.04,112,...,17,3,3,424,1239,1702.599976,1762.668397,9878752.0,9980152.0,9928883.0
7140,56802397411,34.097074,49.691927,126.0,1950000.0,33.550791,50.560535,105.0,1.9,40,...,20,1,8,947,1129,1696.345943,1694.349312,2446974.0,2347857.75,1673857.0
5120,31320349114,35.578482,53.384016,829.0,4800000.0,37.344314,46.055906,563.0,4.0,139,...,13,1,6,1938,1477,1719.926215,1899.322252,4255027.0,4452989.5,4823793.0


# TensorFlow combination  

In [71]:
all_longitudes = set(all_data['sourceLongitude'].tolist() + all_data['destinationLongitude'].tolist())
all_latitude   = set(all_data['sourceLatitude'].tolist() + all_data['destinationLatitude'].tolist())

binned_long = np.linspace(min(all_longitudes), max(all_longitudes), BIN_GRANULARITY).tolist()
binned_lat  = np.linspace(min(all_latitude), max(all_latitude), BIN_GRANULARITY).tolist()

y_gboost_feat = tf.feature_column.numeric_column("y_gboost")
y_xgb_feat    = tf.feature_column.numeric_column("y_xgb")
y_bag_feat    = tf.feature_column.numeric_column("y_bag")

source_lat_feat         = tf.feature_column.numeric_column("sourceLatitude") 
source_long_feat        = tf.feature_column.numeric_column("sourceLongitude") 
destin_lat_feat         = tf.feature_column.numeric_column("destinationLatitude") 
destin_long_feat        = tf.feature_column.numeric_column("destinationLongitude") 

binned_source_lat_feat  = tf.feature_column.bucketized_column(
                              source_column=source_lat_feat,
                              boundaries= binned_lat)
binned_source_long_feat = tf.feature_column.bucketized_column(
                              source_column=source_long_feat,
                              boundaries= binned_long)
binned_destin_lat_feat  = tf.feature_column.bucketized_column(
                              source_column=destin_lat_feat,
                              boundaries= binned_lat)
binned_destin_long_feat = tf.feature_column.bucketized_column(
                              source_column=destin_long_feat,
                              boundaries= binned_long)

source_lat_x_long = tf.feature_column.embedding_column(tf.feature_column.crossed_column(
                    keys=[binned_source_lat_feat, binned_source_long_feat], 
                    hash_bucket_size=BIN_GRANULARITY *BIN_GRANULARITY),dimension=BIN_GRANULARITY)

destin_lat_x_long = tf.feature_column.embedding_column(tf.feature_column.crossed_column(
                    keys=[binned_destin_lat_feat, binned_destin_long_feat], 
                    hash_bucket_size=BIN_GRANULARITY *BIN_GRANULARITY),dimension=BIN_GRANULARITY)

distance_feat = tf.feature_column.numeric_column("distanceKM")
taximin_feat  = tf.feature_column.numeric_column("taxiDurationMin")
weight_feat   = tf.feature_column.numeric_column("weight")

date_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("date_ids", 186),8)

source_state_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("SourceState_ids", 31),5)

destin_state_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("destinationState_ids", 31),5)

veh_type_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("vehicleType_ids", 4),2)

veh_option_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("vehicleOption_ids", 9),4)

source_feat   = tf.feature_column.numeric_column("source")
destin_feat   = tf.feature_column.numeric_column("destination")

destination_tuple_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("destination_tuple_ids", 2191),20)
 
source_tuple_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("source_tuple_ids", 2191),20)

feature_columns = {y_gboost_feat, y_xgb_feat, y_bag_feat, source_lat_x_long, destin_lat_x_long, 
                   distance_feat, taximin_feat,
                   weight_feat, date_feat, source_state_feat, destin_state_feat,
                   veh_type_feat, veh_option_feat, source_feat, destin_feat}#,

#feature_columns = {y_gboost_feat, y_xgb_feat, y_bag_feat}


#                   destination_tuple_feat, source_tuple_feat}

In [72]:
def make_model(features, labels, mode, params, config):
    input_layer = tf.feature_column.input_layer(features=features, 
                                                feature_columns=feature_columns)
    
    global_step = tf.train.get_or_create_global_step()

    x = tf.layers.dense(inputs=input_layer,
                        units=HIDDEN_LAYER_1_SIZE,
                        activation=tf.nn.relu,
                        name="first_fully_connected_layer")

    x = tf.layers.dropout(inputs=x,name="first_dropout")

    x = tf.layers.dense(inputs=x,
                        units=HIDDEN_LAYER_2_SIZE,
                        activation=tf.nn.relu,
                        name="second_fully_connected_layer")

    x = tf.layers.dense(inputs=x,
                        units=HIDDEN_LAYER_3_SIZE,
                        activation=tf.nn.relu,
                        name="third_fully_connected_layer")

    predictions = tf.contrib.layers.fully_connected(inputs=x, num_outputs=1)

    if mode == tf.estimator.ModeKeys.PREDICT :
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    elif mode == tf.estimator.ModeKeys.EVAL:
        loss  = tf.reduce_mean(tf.abs(tf.divide(predictions-labels,labels))) 
        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions,
                                          loss=loss)
    else:
        #loss = tf.losses.absolute_difference(labels=labels,
        #                                    predictions=predictions)
        loss  = tf.reduce_mean(tf.abs(tf.divide(predictions-labels,labels))) 
        tf.summary.scalar("Loss", loss)
        optimizer = tf.train.AdamOptimizer(learning_rate=params.learning_rate)
        train_op = optimizer.minimize(loss, 
                                      global_step=global_step)
        return tf.estimator.EstimatorSpec(mode=mode, 
                                          predictions=predictions,
                                          loss=loss, 
                                          train_op=train_op)

In [73]:
def input_fn(df, pred = False):
        
    useful_fueatures = [
        np.array(df["y_gboost"].values, dtype=np.float32),
        np.array(df["y_xgb"].values, dtype=np.float32),
        np.array(df["y_bag"].values, dtype=np.float32),
        np.array(df["sourceLatitude"].values, dtype=np.float32),
        np.array(df["sourceLongitude"].values, dtype=np.float32),
        np.array(df["destinationLatitude"].values, dtype=np.float32),
        np.array(df["destinationLongitude"].values, dtype=np.float32),
        np.array(df["distanceKM"].values, dtype=np.float32),
        np.array(df["taxiDurationMin"].values, dtype=np.float32),
        np.array(df["weight"].values, dtype=np.float32),
        np.array(df["date_ids"].values, dtype=np.int32),
        np.array(df["SourceState_ids"].values, dtype=np.int32),
        np.array(df["destinationState_ids"].values, dtype=np.int32),
        np.array(df["vehicleType_ids"].values, dtype=np.int32),
        np.array(df["vehicleOption_ids"].values, dtype=np.int32),
        np.array(df["source"].values, dtype=np.float32),
        np.array(df["destination"].values, dtype=np.float32),
        np.array(df["destination_tuple_ids"].values, dtype=np.int32),
        np.array(df["source_tuple_ids"].values, dtype=np.int32)
    ]

    if pred: 
        train_number = 1
        batch_number = 1
    else:
        useful_fueatures.append(np.array(df["price"].values, dtype=np.float32))
        train_number = TRAIN_EPOCHS
        batch_number = BATCH_SIZE
        
    A = tf.train.slice_input_producer(
        tensor_list=useful_fueatures,
        num_epochs=train_number,
        shuffle= not pred,
        capacity=BATCH_SIZE * 5
    )
    
    y_gboost              = A[0]
    y_xgb                 = A[1]
    y_bag                 = A[2]
    sourceLatitude        = A[3]
    sourceLongitude       = A[4]
    destinationLatitude   = A[5]
    destinationLongitude  = A[6]
    distanceKM            = A[7]
    taxiDurationMin       = A[8] 
    weight                = A[9]
    date_ids              = A[10]
    SourceState_ids       = A[11]
    destinationState_ids  = A[12]
    vehicleType_ids       = A[13]
    vehicleOption_ids     = A[14]
    source                = A[15]
    destination           = A[16] 
    destination_tuple_ids = A[17] 
    source_tuple_ids      = A[18] 
    
    # Created a dict out of sliced input producers
    dataset_dict = dict(
        y_gboost=y_gboost,
        y_xgb=y_xgb,
        y_bag=y_bag,
        sourceLatitude=sourceLatitude,
        sourceLongitude=sourceLongitude,
        destinationLatitude=destinationLatitude,
        destinationLongitude=destinationLongitude, 
        distanceKM=distanceKM,
        taxiDurationMin=taxiDurationMin,
        weight=weight,
        date_ids=date_ids,
        SourceState_ids=SourceState_ids,
        destinationState_ids=destinationState_ids,
        vehicleType_ids=vehicleType_ids,
        vehicleOption_ids=vehicleOption_ids,
        source=source, 
        destination=destination,
        #destination_tuple_ids=destination_tuple_ids,
        #source_tuple_ids=source_tuple_ids,
    )

    if not pred:
        dataset_dict['labels'] = A[19]
            
    batch_dict = tf.train.batch(
        dataset_dict,
        batch_number,
   )

    if pred == False:
        batch_labels = batch_dict.pop('labels')
        return batch_dict, tf.reshape(batch_labels, [-1, 1]) 
    else:
        return batch_dict 

In [74]:
hparams = tf.contrib.training.HParams(learning_rate=lr)
estimator_val = tf.estimator.Estimator(model_fn=make_model, params=hparams)
estimator_val.train(input_fn=lambda: input_fn(train_bag), steps=TRAIN_EPOCHS)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmp7etlu69a', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1d238480b8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/hy

<tensorflow.python.estimator.estimator.Estimator at 0x1d23848320>

# Test dataset augmentation

In [77]:
start_time = time.time()
GBoost = GradientBoostingRegressor(n_estimators=2200, learning_rate=0.05,
                                   max_depth=10, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, loss='huber')

xgb_model = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=10, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1, nthread = -1)

bag = BaggingRegressor(n_estimators=1000, max_samples=1.0, max_features=1.0, verbose=1)

GBoost.fit(train.drop(['ID','price'],axis=1), train.price)
xgb_model.fit(train.drop(['ID','price'],axis=1), train.price)
bag.fit(train.drop(['ID','price'],axis=1), train.price)

test['y_gboost'] = GBoost.predict(test.drop(['ID','price'],axis=1))
test['y_xgb']    = xgb_model.predict(test.drop(['ID','price','y_gboost'],axis=1))
test['y_bag']    = bag.predict(test.drop(['ID','price','y_gboost','y_xgb'],axis=1))

print( '%.2f' % float((time.time() - start_time)/60 ) )
test.head()

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  8.5min finished


15.49


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.3s finished


Unnamed: 0,ID,destinationLatitude,destinationLongitude,distanceKM,price,sourceLatitude,sourceLongitude,taxiDurationMin,weight,date_ids,...,destinationState_ids,vehicleType_ids,vehicleOption_ids,source_tuple_ids,destination_tuple_ids,source,destination,y_gboost,y_xgb,y_bag
0,10010571124,35.579635,53.38499,684.0,,36.297213,59.60797,446.0,2.33,184,...,13,1,5,1594,1477,2163.603184,1899.418459,2394293.0,2257574.0,2666167.0
1,10031704713,29.605761,52.533588,931.0,,35.704695,51.405194,614.0,19.14,77,...,15,3,2,1410,380,1835.406773,1555.296851,9998914.0,9769796.0,9434639.0
2,10040911649,36.299593,59.61201,1469.0,,26.94849,55.583875,1009.0,22.0,51,...,9,3,2,33,1734,1497.9015,2163.891701,20039060.0,14994570.0,18011204.0
3,10047106840,35.248298,58.457567,745.0,,35.339066,52.07597,496.0,2.5,176,...,9,1,6,1326,1358,1840.316141,2060.529742,1754239.0,2022942.0,2639174.0
4,10050126039,34.636832,50.874888,281.0,,35.579577,53.394403,181.0,23.5,142,...,17,3,4,1387,1239,1899.750273,1762.144949,6099514.0,5995565.0,7125671.0


In [78]:
predictions   = list(estimator_val.predict(input_fn = lambda: input_fn(test, pred=True)))
y_preds_test   = [int(x) for x in predictions]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmp7etlu69a/model.ckpt-3000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


# Save to File

In [79]:
filename = "/Users/mohsenkiskani/Downloads/Ubaar/submissions/submission19.csv"
with open(filename,"w+") as outputfile:
    outputfile.write("ID,price\n")
    for i in range(len(y_preds_test)):
        outputfile.write(str(test_data.ID[i])+","+str(int(np.ceil(y_preds_test[i])))+"\n")

##### Submission 19 with loss of 15.9 