In [1]:
import warnings 
warnings.filterwarnings('ignore')
import time 
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor, BaggingRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
import lightgbm as lgb
import tensorflow as tf 

In [2]:
def get_score(model, X, y_true):
    y_pred = model.predict(X)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def mean_absolute_precision_error(y_pred, y_true):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Data gathering

In [3]:
data      = pd.read_csv('/Users/mohsenkiskani/.kaggle/competitions/ubaar-competition/train.csv')
test_data = pd.read_csv('/Users/mohsenkiskani/.kaggle/competitions/ubaar-competition/test.csv')

data      = data.dropna(axis = 0)

test_data.loc[12577, 'distanceKM']      = 52
test_data.loc[12577, 'taxiDurationMin'] = 50
test_data.loc[13853, 'distanceKM']      = 500
test_data.loc[13853, 'taxiDurationMin'] = 380

all_data = pd.concat((data, test_data)) 
all_data['source']           = all_data['sourceLatitude']*all_data['sourceLongitude']
all_data['destination']      = all_data['destinationLatitude']*all_data['destinationLongitude']

min_price = min(all_data['price'])

ntrain = data.shape[0]
ntest  = test_data.shape[0]

categorical_vars = ['date', 'SourceState', 'destinationState', 'vehicleType', 'vehicleOption']

dummies_data = pd.get_dummies(all_data[categorical_vars])
all_data[dummies_data.columns] = dummies_data[dummies_data.columns]
all_data.drop(categorical_vars, axis=1, inplace=True)

train    = all_data[:ntrain]
test     = all_data[ntrain:]

train_1, train_2 = train_test_split(train, test_size=0.5)

X = train.drop(['ID','price'],axis=1)
y = train.price

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,destinationLatitude,destinationLongitude,distanceKM,sourceLatitude,sourceLongitude,taxiDurationMin,weight,source,destination,SourceState_آذربایجان شرقی,...,vehicleType_treili,vehicleOption_bari,vehicleOption_hichkodam,vehicleOption_kafi,vehicleOption_kompressi,vehicleOption_labehdar,vehicleOption_mosaghaf_chadori,vehicleOption_mosaghaf_felezi,vehicleOption_transit_chadori,vehicleOption_yakhchali
39085,36.473089,52.349822,184.0,35.700109,51.399743,199.0,21.0,1834.976428,1909.359717,0,...,1,0,0,1,0,0,0,0,0,0
30892,35.704176,51.40028,331.0,37.275731,49.584392,254.0,1.67,1848.294458,1835.204644,0,...,0,0,0,0,0,0,1,0,0,0
45277,35.699924,51.396715,447.0,32.665899,51.663805,285.0,19.0,1687.644636,1834.858819,0,...,1,0,0,0,0,0,0,0,1,0
16398,30.199563,53.182966,809.0,35.699078,51.401589,525.0,4.0,1834.989335,1606.102332,0,...,0,0,0,0,0,0,1,0,0,0
13653,27.180941,56.277756,1144.0,34.643252,50.877469,750.0,2.0,1762.56098,1529.682365,0,...,0,0,0,0,0,0,1,0,0,0


# Initial models 

In [4]:
start_time = time.time()
GBoost_1 = GradientBoostingRegressor(n_estimators=3200, learning_rate=0.05,
                                     max_depth=10, max_features='sqrt',
                                     min_samples_leaf=15, min_samples_split=10, loss='huber')

GBoost_2 = GradientBoostingRegressor(n_estimators=3200, learning_rate=0.05,
                                     max_depth=10, max_features='sqrt',
                                     min_samples_leaf=15, min_samples_split=10, loss='huber')

GBoost_1.fit(train_1.drop(['ID','price'],axis=1), train_1.price)
GBoost_2.fit(train_2.drop(['ID','price'],axis=1), train_2.price)

train_1['y_gboost'] = GBoost_2.predict(train_1.drop(['ID','price'],axis=1))
train_2['y_gboost'] = GBoost_1.predict(train_2.drop(['ID','price'],axis=1))

train_gboost = pd.concat([train_1, train_2])
print( '%.2f' % float((time.time() - start_time)/60 )  + ' mins')
train_gboost.head()

# 5.85 mins

5.85 mins


Unnamed: 0,ID,destinationLatitude,destinationLongitude,distanceKM,price,sourceLatitude,sourceLongitude,taxiDurationMin,weight,source,...,vehicleOption_bari,vehicleOption_hichkodam,vehicleOption_kafi,vehicleOption_kompressi,vehicleOption_labehdar,vehicleOption_mosaghaf_chadori,vehicleOption_mosaghaf_felezi,vehicleOption_transit_chadori,vehicleOption_yakhchali,y_gboost
49768,23297266068,34.320769,47.075424,504.0,11000000.0,35.7048,51.399356,365.0,25.0,1835.203726,...,0,0,1,0,0,0,0,0,0,9406621.0
26257,24666836676,37.273008,49.580559,985.0,4360000.0,36.207511,57.680956,679.0,2.5,2088.483849,...,0,0,0,0,0,0,1,0,0,4506976.0
12083,88720247959,36.47101,52.356618,855.0,7990000.0,37.762406,45.978292,622.0,10.65,1736.25093,...,1,0,0,0,0,0,0,0,0,8703010.0
4248,48875502547,36.598663,52.669759,366.0,4350000.0,36.276806,49.999858,331.0,3.0,1813.835149,...,0,0,0,0,0,0,1,0,0,4493476.0
43710,63810005613,34.797502,48.514603,420.0,3000000.0,37.342321,46.065287,376.0,4.5,1720.184734,...,0,0,0,0,0,0,1,0,0,2803532.0


In [5]:
train_gboost_1, train_gboost_2 = train_test_split(train_gboost, test_size=0.5)

In [6]:
start_time = time.time()
xgb_1 = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                         learning_rate=0.05, max_depth=10, 
                         min_child_weight=1.7817, n_estimators=2200,
                         reg_alpha=0.4640, reg_lambda=0.8571,
                         subsample=0.5213, silent=1, nthread = -1)

xgb_2 = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                         learning_rate=0.05, max_depth=10, 
                         min_child_weight=1.7817, n_estimators=2200,
                         reg_alpha=0.4640, reg_lambda=0.8571,
                         subsample=0.5213, silent=1, nthread = -1)

xgb_1.fit(train_gboost_1.drop(['ID','price','y_gboost'],axis=1), train_gboost_1.price)
xgb_2.fit(train_gboost_2.drop(['ID','price','y_gboost'],axis=1), train_gboost_2.price)

train_gboost_1['y_xgb'] = xgb_2.predict(train_gboost_1.drop(['ID','price','y_gboost'],axis=1))
train_gboost_2['y_xgb'] = xgb_1.predict(train_gboost_2.drop(['ID','price','y_gboost'],axis=1))

train_xgb = pd.concat([train_gboost_1, train_gboost_2])
print( '%.2f' % float((time.time() - start_time)/60 )  + ' mins')
train_xgb.head()

# 8.20 mins

8.20 mins


Unnamed: 0,ID,destinationLatitude,destinationLongitude,distanceKM,price,sourceLatitude,sourceLongitude,taxiDurationMin,weight,source,...,vehicleOption_hichkodam,vehicleOption_kafi,vehicleOption_kompressi,vehicleOption_labehdar,vehicleOption_mosaghaf_chadori,vehicleOption_mosaghaf_felezi,vehicleOption_transit_chadori,vehicleOption_yakhchali,y_gboost,y_xgb
23015,83049470900,35.80863,51.001842,637.0,9000000.0,37.753519,45.977046,407.0,20.0,1735.79528,...,0,0,0,0,0,0,0,1,8961475.0,10434036.0
32647,83127357213,35.9878,45.89099,318.0,3000000.0,38.082268,46.295973,301.0,2.5,1763.055651,...,0,0,0,0,1,0,0,0,3323744.0,3579260.5
19443,20780793497,34.322574,47.069063,98.0,1300000.0,34.506334,47.966206,80.0,10.0,1655.137925,...,0,0,1,0,0,0,0,0,1258236.0,1331070.75
25298,35477235742,31.887375,54.361779,538.0,3800000.0,34.478216,50.455482,344.0,6.0,1739.615007,...,0,0,0,0,0,0,0,0,3944955.0,3364734.5
22301,41986159735,34.798985,48.520619,199.0,2076000.0,35.016052,50.352218,141.0,3.0,1763.135884,...,0,0,0,0,0,1,0,0,2156841.0,1965191.25


In [7]:
train_xgb_1, train_xgb_2 = train_test_split(train_xgb, test_size=0.5)

In [8]:
start_time = time.time()

bag_1 = BaggingRegressor(n_estimators=1000, max_samples=1.0, max_features=1.0, verbose=1)
bag_2 = BaggingRegressor(n_estimators=1000, max_samples=1.0, max_features=1.0, verbose=1)

bag_1.fit(train_xgb_1.drop(['ID','price','y_gboost','y_xgb'],axis=1), train_xgb_1.price)
bag_2.fit(train_xgb_2.drop(['ID','price','y_gboost','y_xgb'],axis=1), train_xgb_2.price)

train_xgb_1['y_bag'] = bag_2.predict(train_xgb_1.drop(['ID','price','y_gboost','y_xgb'],axis=1))
train_xgb_2['y_bag'] = bag_1.predict(train_xgb_2.drop(['ID','price','y_gboost','y_xgb'],axis=1))

train_bag = pd.concat([train_xgb_1, train_xgb_2])
print( '%.2f' % float((time.time() - start_time)/60 )  + ' mins' )
train_bag.head()

# 10.23 mins

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.9min finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.9min finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.3s finished


10.23 mins


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.5s finished


Unnamed: 0,ID,destinationLatitude,destinationLongitude,distanceKM,price,sourceLatitude,sourceLongitude,taxiDurationMin,weight,source,...,vehicleOption_kafi,vehicleOption_kompressi,vehicleOption_labehdar,vehicleOption_mosaghaf_chadori,vehicleOption_mosaghaf_felezi,vehicleOption_transit_chadori,vehicleOption_yakhchali,y_gboost,y_xgb,y_bag
40069,86593168407,34.638805,50.881637,236.0,6000000.0,34.296169,48.814855,179.0,10.0,1674.162517,...,0,0,0,0,0,1,0,5339283.0,6432316.0,6039017.0
48788,62001128890,35.697016,51.399206,656.0,2800000.0,36.211458,57.686632,450.0,2.0,2088.917052,...,0,0,0,1,0,0,0,2519239.0,2662429.5,2656585.0
30153,60933250542,36.287262,50.001567,197.0,2000000.0,34.781497,50.511619,146.0,3.0,1756.869725,...,0,0,0,0,1,0,0,1867591.0,1767595.25,2004030.0
21496,41480151810,37.101866,58.504261,358.0,2430000.0,37.233528,55.194441,284.0,2.5,2055.083764,...,0,0,0,0,1,0,0,2114702.0,2304076.5,2244695.0
22324,89164203564,31.328759,48.673584,546.0,7300000.0,34.091192,49.684713,400.0,10.0,1693.81109,...,0,0,0,0,0,0,0,6684340.0,7142756.5,5408889.0


In [9]:
train_bag.to_pickle('dataFrames/train_OneHotEncoding.pkl')

In [10]:
train_bag = pd.read_pickle('dataFrames/train_OneHotEncoding.pkl')
train_bag_1, train_bag_2 = train_test_split(train_bag, test_size=0.5)

In [11]:
start_time = time.time()

knn_1 = KNeighborsClassifier(2)
knn_2 = KNeighborsClassifier(2)

knn_1.fit(train_bag_1.drop(['ID','price','y_gboost','y_xgb', 'y_bag'],axis=1), train_bag_1.price)
knn_2.fit(train_bag_2.drop(['ID','price','y_gboost','y_xgb', 'y_bag'],axis=1), train_bag_2.price)

train_bag_1['y_knn'] = knn_2.predict(train_bag_1.drop(['ID','price','y_gboost','y_xgb', 'y_bag'],axis=1))
train_bag_2['y_knn'] = knn_1.predict(train_bag_2.drop(['ID','price','y_gboost','y_xgb', 'y_bag'],axis=1))

train_knn = pd.concat([train_bag_1, train_bag_2])
print( '%.2f' % float((time.time() - start_time)/60 )  + ' mins' )

train_knn.to_pickle('dataFrames/train_OneHotEncoding.pkl')
train_knn.head()

# 0.11 mins

0.11 mins


Unnamed: 0,ID,destinationLatitude,destinationLongitude,distanceKM,price,sourceLatitude,sourceLongitude,taxiDurationMin,weight,source,...,vehicleOption_kompressi,vehicleOption_labehdar,vehicleOption_mosaghaf_chadori,vehicleOption_mosaghaf_felezi,vehicleOption_transit_chadori,vehicleOption_yakhchali,y_gboost,y_xgb,y_bag,y_knn
3463,66761395252,28.5416,57.790231,561.0,4680000.0,30.045671,53.751941,431.0,10.37,1615.013135,...,0,0,0,0,0,0,5300222.0,5265694.0,5886188.0,9925000.0
14228,62810333053,27.179198,56.277415,1403.0,13730000.0,36.303287,59.601389,964.0,3.99,2163.72633,...,0,0,0,0,0,0,12746660.0,11290270.0,12401123.0,9500000.0
813,72946753657,35.245778,58.462993,227.0,1200000.0,33.718299,59.180034,163.0,2.5,1995.450081,...,0,0,1,0,0,0,1483658.0,1579684.0,1698176.0,4110000.0
15367,44850947654,38.280847,45.974397,33.0,950000.0,38.432965,45.777126,31.0,2.0,1759.350681,...,0,0,1,0,0,0,872491.1,990291.9,904230.5,950000.0
478,44340802284,36.302202,49.991152,156.0,4800000.0,35.703083,51.404467,121.0,22.0,1835.297952,...,0,0,0,0,1,0,4577497.0,4467607.0,4297389.0,2800000.0


In [12]:
train_knn = pd.read_pickle('dataFrames/train_OneHotEncoding.pkl')
train_knn_1, train_knn_2 = train_test_split(train_knn, test_size=0.5)

In [13]:
start_time = time.time()

dec_1 = DecisionTreeRegressor(max_depth=10)
dec_2 = DecisionTreeRegressor(max_depth=10)

dec_1.fit(train_knn_1.drop(['ID','price','y_gboost','y_xgb', 'y_bag', 'y_knn'],axis=1), train_knn_1.price)
dec_2.fit(train_knn_2.drop(['ID','price','y_gboost','y_xgb', 'y_bag', 'y_knn'],axis=1), train_knn_2.price)

train_knn_1['y_dec'] = dec_2.predict(train_knn_1.drop(['ID','price','y_gboost','y_xgb', 'y_bag', 'y_knn'],axis=1))
train_knn_2['y_dec'] = dec_1.predict(train_knn_2.drop(['ID','price','y_gboost','y_xgb', 'y_bag', 'y_knn'],axis=1))

train_dec = pd.concat([train_knn_1, train_knn_2])
print( '%.2f' % float((time.time() - start_time)/60 )  + ' mins' )

train_dec.to_pickle('dataFrames/train_OneHotEncoding.pkl')
train_dec.head()

# 0.01 mins

0.01 mins


Unnamed: 0,ID,destinationLatitude,destinationLongitude,distanceKM,price,sourceLatitude,sourceLongitude,taxiDurationMin,weight,source,...,vehicleOption_labehdar,vehicleOption_mosaghaf_chadori,vehicleOption_mosaghaf_felezi,vehicleOption_transit_chadori,vehicleOption_yakhchali,y_gboost,y_xgb,y_bag,y_knn,y_dec
4291,86534025270,35.696345,51.40326,109.0,3100000.0,35.298584,50.502557,93.0,10.0,1782.66875,...,0,0,0,0,0,2130541.0,2514197.0,2311128.0,1815000.0,2269144.0
43436,12576546864,34.100813,49.691077,280.0,3900000.0,35.698071,51.39899,201.0,5.5,1834.844794,...,0,0,0,0,0,3028757.0,3655643.25,3510464.0,3000000.0,2312097.0
17822,24672393815,31.038968,52.837088,246.0,1700000.0,29.611477,52.540679,182.0,2.0,1555.807108,...,0,0,1,0,0,1756237.0,1872448.5,1664921.0,1315000.0,1562029.0
9926,24850294040,34.830196,50.916326,622.0,2000000.0,38.253059,48.293025,477.0,2.5,1847.355935,...,0,0,0,0,0,3765231.0,3968792.5,2732921.0,3600000.0,2853197.0
10491,86742767229,29.610717,52.533788,1521.0,11400000.0,37.554418,45.065357,1034.0,10.0,1692.403254,...,0,0,0,0,0,9972822.0,10271302.0,11028207.0,7860000.0,9393616.0


In [14]:
train_dec = pd.read_pickle('dataFrames/train_OneHotEncoding.pkl')
train_dec_1, train_dec_2 = train_test_split(train_dec, test_size=0.5)

In [15]:
start_time = time.time()

lgb_1 = lgb.LGBMRegressor(objective='regression',num_leaves=15,
                          learning_rate=0.05, n_estimators=15000,
                          max_bin = 1000, bagging_fraction = 0.6,
                          bagging_freq = 5, feature_fraction = 0.25,
                          feature_fraction_seed=9, bagging_seed=20,
                          min_data_in_leaf = 11, min_sum_hessian_in_leaf = 11)

lgb_2 = lgb.LGBMRegressor(objective='regression',num_leaves=15,
                          learning_rate=0.05, n_estimators=15000,
                          max_bin = 1000, bagging_fraction = 0.6,
                          bagging_freq = 5, feature_fraction = 0.25,
                          feature_fraction_seed=9, bagging_seed=20,
                          min_data_in_leaf = 11, min_sum_hessian_in_leaf = 11)

lgb_1.fit(train_dec_1.drop(['ID','price','y_gboost','y_xgb', 'y_bag', 'y_knn', 'y_dec'],axis=1), train_dec_1.price)
lgb_2.fit(train_dec_2.drop(['ID','price','y_gboost','y_xgb', 'y_bag', 'y_knn', 'y_dec'],axis=1), train_dec_2.price)

train_dec_1['y_lgb'] = lgb_2.predict(train_dec_1.drop(['ID','price','y_gboost','y_xgb', 'y_bag', 'y_knn', 'y_dec'],axis=1))
train_dec_2['y_lgb'] = lgb_1.predict(train_dec_2.drop(['ID','price','y_gboost','y_xgb', 'y_bag', 'y_knn', 'y_dec'],axis=1))

train_lgb = pd.concat([train_dec_1, train_dec_2])
print( '%.2f' % float((time.time() - start_time)/60 ) + ' mins' )

train_lgb.to_pickle('dataFrames/train_OneHotEncoding.pkl')
train_lgb.head()

# 2.26 mins

2.26 mins


Unnamed: 0,ID,destinationLatitude,destinationLongitude,distanceKM,price,sourceLatitude,sourceLongitude,taxiDurationMin,weight,source,...,vehicleOption_mosaghaf_chadori,vehicleOption_mosaghaf_felezi,vehicleOption_transit_chadori,vehicleOption_yakhchali,y_gboost,y_xgb,y_bag,y_knn,y_dec,y_lgb
3238,90407729621,34.792949,48.521547,822.0,13200000.0,31.590012,54.448335,525.0,22.0,1720.023556,...,0,0,0,0,12056570.0,11398180.0,11380463.0,10200000.0,11225810.0,12570730.0
17981,68264782570,31.022605,61.494939,212.0,1980000.0,29.491796,60.852201,147.0,2.5,1794.640698,...,1,0,0,0,1831582.0,2177749.75,2162954.0,1980000.0,1993650.0,1470242.0
17610,87807487380,36.294183,59.600583,314.0,3020000.0,37.563871,56.925695,249.0,10.0,2138.349464,...,0,0,0,0,2841686.0,3100074.5,3015542.0,1910000.0,3983793.0,2951464.0
15820,86424538832,38.080342,46.291429,296.0,2700000.0,36.668224,48.476322,196.0,3.33,1777.540634,...,0,1,0,0,2835178.0,2729188.5,2763547.0,2700000.0,2135786.0,2907176.0
381,12039250410,35.694433,51.403848,82.0,1900000.0,35.338534,52.069324,84.0,14.0,1840.053577,...,0,0,0,0,1847225.0,1626582.75,2056184.0,1500000.0,2818182.0,1498263.0


# Test dataset augmentation

In [16]:
start_time = time.time()
GBoost = GradientBoostingRegressor(n_estimators=2200, learning_rate=0.05,
                                   max_depth=10, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, loss='huber')

xgb_model = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=10, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1, nthread = -1)

bag   = BaggingRegressor(n_estimators=1000, max_samples=1.0, max_features=1.0, verbose=1)
knn   = KNeighborsClassifier(2)
dec   = DecisionTreeRegressor(max_depth=10)
lgb_m = lgb.LGBMRegressor(objective='regression',num_leaves=15,
                       learning_rate=0.05, n_estimators=15000,
                       max_bin = 1000, bagging_fraction = 0.6,
                       bagging_freq = 5, feature_fraction = 0.25,
                       feature_fraction_seed=9, bagging_seed=20,
                       min_data_in_leaf = 11, min_sum_hessian_in_leaf = 11)

GBoost.fit(train.drop(['ID','price'],axis=1), train.price)
xgb_model.fit(train.drop(['ID','price'],axis=1), train.price)
bag.fit(train.drop(['ID','price'],axis=1), train.price)
knn.fit(train.drop(['ID','price'],axis=1), train.price)
dec.fit(train.drop(['ID','price'],axis=1), train.price)
lgb_m.fit(train.drop(['ID','price'],axis=1), train.price)



test['y_gboost'] = GBoost.predict(test.drop(['ID','price'],axis=1))
test['y_xgb']    = xgb_model.predict(test.drop(['ID','price','y_gboost'],axis=1))
test['y_bag']    = bag.predict(test.drop(['ID','price','y_gboost','y_xgb'],axis=1))
test['y_knn']    = knn.predict(test.drop(['ID','price','y_gboost','y_xgb', 'y_bag'],axis=1))
test['y_dec']    = dec.predict(test.drop(['ID','price','y_gboost','y_xgb', 'y_bag', 'y_knn'],axis=1))
test['y_lgb']    = lgb_m.predict(test.drop(['ID','price','y_gboost','y_xgb', 'y_bag', 'y_knn', 'y_dec'],axis=1))

print( '%.2f' % float((time.time() - start_time)/60 ), "mins" )


test.to_pickle('dataFrames/test_OneHotEncoding.pkl')
test.head()

# 25.04 mins

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 10.8min finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.2s finished


25.04


Unnamed: 0,ID,destinationLatitude,destinationLongitude,distanceKM,price,sourceLatitude,sourceLongitude,taxiDurationMin,weight,source,...,vehicleOption_mosaghaf_chadori,vehicleOption_mosaghaf_felezi,vehicleOption_transit_chadori,vehicleOption_yakhchali,y_gboost,y_xgb,y_bag,y_knn,y_dec,y_lgb
0,10010571124,35.579635,53.38499,684.0,,36.297213,59.60797,446.0,2.33,2163.603184,...,1,0,0,0,2516722.0,2403949.0,2603010.0,2200000.0,3997050.0,2469241.0
1,10031704713,29.605761,52.533588,931.0,,35.704695,51.405194,614.0,19.14,1835.406773,...,0,0,0,0,10035440.0,10273780.0,9555599.0,8767000.0,10223290.0,10770800.0
2,10040911649,36.299593,59.61201,1469.0,,26.94849,55.583875,1009.0,22.0,1497.9015,...,0,0,0,0,21285780.0,18779210.0,17803149.0,8687000.0,13000000.0,21634610.0
3,10047106840,35.248298,58.457567,745.0,,35.339066,52.07597,496.0,2.5,1840.316141,...,0,1,0,0,2052077.0,1795059.0,2396458.0,2170000.0,2772043.0,1919848.0
4,10050126039,34.636832,50.874888,281.0,,35.579577,53.394403,181.0,23.5,1899.750273,...,0,0,0,0,5822093.0,5748156.0,7099862.0,1900000.0,5984402.0,5194802.0


# Loading augmented data

In [25]:
test  = pd.read_pickle('dataFrames/test_OneHotEncoding.pkl')
train = pd.read_pickle('dataFrames/train_OneHotEncoding.pkl')

# TensorFlow combination  

In [44]:
BATCH_SIZE          = 128
TRAIN_EPOCHS        = 1200
BIN_GRANULARITY     = 100
HIDDEN_LAYER_1_SIZE = 512
HIDDEN_LAYER_2_SIZE = 512
HIDDEN_LAYER_3_SIZE = 16
lr                  = 1e-4

In [45]:
all_longitudes = set(all_data['sourceLongitude'].tolist() + all_data['destinationLongitude'].tolist())
all_latitude   = set(all_data['sourceLatitude'].tolist() + all_data['destinationLatitude'].tolist())

binned_long = np.linspace(min(all_longitudes), max(all_longitudes), BIN_GRANULARITY).tolist()
binned_lat  = np.linspace(min(all_latitude), max(all_latitude), BIN_GRANULARITY).tolist()

y_gboost_feat = tf.feature_column.numeric_column("y_gboost")
y_xgb_feat    = tf.feature_column.numeric_column("y_xgb")
y_bag_feat    = tf.feature_column.numeric_column("y_bag")
y_knn_feat    = tf.feature_column.numeric_column("y_knn")
y_dec_feat    = tf.feature_column.numeric_column("y_dec")
y_lgb_feat    = tf.feature_column.numeric_column("y_lgb")

source_lat_feat         = tf.feature_column.numeric_column("sourceLatitude") 
source_long_feat        = tf.feature_column.numeric_column("sourceLongitude") 
destin_lat_feat         = tf.feature_column.numeric_column("destinationLatitude") 
destin_long_feat        = tf.feature_column.numeric_column("destinationLongitude") 

binned_source_lat_feat  = tf.feature_column.bucketized_column(
                              source_column=source_lat_feat,
                              boundaries= binned_lat)
binned_source_long_feat = tf.feature_column.bucketized_column(
                              source_column=source_long_feat,
                              boundaries= binned_long)
binned_destin_lat_feat  = tf.feature_column.bucketized_column(
                              source_column=destin_lat_feat,
                              boundaries= binned_lat)
binned_destin_long_feat = tf.feature_column.bucketized_column(
                              source_column=destin_long_feat,
                              boundaries= binned_long)

source_lat_x_long = tf.feature_column.embedding_column(tf.feature_column.crossed_column(
                    keys=[binned_source_lat_feat, binned_source_long_feat], 
                    hash_bucket_size=BIN_GRANULARITY *BIN_GRANULARITY),dimension=BIN_GRANULARITY)

destin_lat_x_long = tf.feature_column.embedding_column(tf.feature_column.crossed_column(
                    keys=[binned_destin_lat_feat, binned_destin_long_feat], 
                    hash_bucket_size=BIN_GRANULARITY *BIN_GRANULARITY),dimension=BIN_GRANULARITY)

distance_feat = tf.feature_column.numeric_column("distanceKM")
taximin_feat  = tf.feature_column.numeric_column("taxiDurationMin")
weight_feat   = tf.feature_column.numeric_column("weight")

date_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("date_ids", 186),8)

source_state_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("SourceState_ids", 31),5)

destin_state_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("destinationState_ids", 31),5)

veh_type_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("vehicleType_ids", 4),2)

veh_option_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("vehicleOption_ids", 9),4)

source_feat   = tf.feature_column.numeric_column("source")
destin_feat   = tf.feature_column.numeric_column("destination")

destination_tuple_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("destination_tuple_ids", 2191),20)
 
source_tuple_feat = tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity("source_tuple_ids", 2191),20)

feature_columns = {y_gboost_feat, y_xgb_feat, y_bag_feat, y_knn_feat, y_dec_feat, y_lgb_feat, 
                   source_lat_x_long, destin_lat_x_long, distance_feat, taximin_feat,
                   weight_feat, date_feat, source_state_feat, destin_state_feat,
                   veh_type_feat, veh_option_feat, source_feat, destin_feat}#,

feature_columns = {y_gboost_feat, y_xgb_feat, y_bag_feat, y_knn_feat, y_dec_feat, y_lgb_feat}

In [46]:
def make_model(features, labels, mode, params, config):
    input_layer = tf.feature_column.input_layer(features=features, 
                                                feature_columns=feature_columns)
    
    global_step = tf.train.get_or_create_global_step()

    x = tf.layers.dense(inputs=input_layer,
                        units=HIDDEN_LAYER_1_SIZE,
                        activation=tf.nn.relu,
                        name="first_fully_connected_layer")

    x = tf.layers.dropout(inputs=x,name="first_dropout")

    x = tf.layers.dense(inputs=x,
                        units=HIDDEN_LAYER_2_SIZE,
                        activation=tf.nn.relu,
                        name="second_fully_connected_layer")

    x = tf.layers.dense(inputs=x,
                        units=HIDDEN_LAYER_3_SIZE,
                        activation=tf.nn.relu,
                        name="third_fully_connected_layer")

    predictions = tf.contrib.layers.fully_connected(inputs=x, num_outputs=1)

    if mode == tf.estimator.ModeKeys.PREDICT :
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    elif mode == tf.estimator.ModeKeys.EVAL:
        loss  = tf.reduce_mean(tf.abs(tf.divide(predictions-labels,labels))) 
        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions,
                                          loss=loss)
    else:
        #loss = tf.losses.absolute_difference(labels=labels,
        #                                    predictions=predictions)
        loss  = tf.reduce_mean(tf.abs(tf.divide(predictions-labels,labels))) 
        tf.summary.scalar("Loss", loss)
        optimizer = tf.train.AdamOptimizer(learning_rate=params.learning_rate)
        train_op = optimizer.minimize(loss, 
                                      global_step=global_step)
        return tf.estimator.EstimatorSpec(mode=mode, 
                                          predictions=predictions,
                                          loss=loss, 
                                          train_op=train_op)

In [47]:
def input_fn(df, pred = False):
        
    useful_fueatures = [
        np.array(df["y_gboost"].values, dtype=np.float32),
        np.array(df["y_xgb"].values, dtype=np.float32),
        np.array(df["y_bag"].values, dtype=np.float32),
        np.array(df["y_knn"].values, dtype=np.float32),
        np.array(df["y_dec"].values, dtype=np.float32),
        np.array(df["y_lgb"].values, dtype=np.float32),
        np.array(df["sourceLatitude"].values, dtype=np.float32),
        np.array(df["sourceLongitude"].values, dtype=np.float32),
        np.array(df["destinationLatitude"].values, dtype=np.float32),
        np.array(df["destinationLongitude"].values, dtype=np.float32),
        np.array(df["distanceKM"].values, dtype=np.float32),
        np.array(df["taxiDurationMin"].values, dtype=np.float32),
        np.array(df["weight"].values, dtype=np.float32),
        #np.array(df["date_ids"].values, dtype=np.int32),
        #np.array(df["SourceState_ids"].values, dtype=np.int32),
        #np.array(df["destinationState_ids"].values, dtype=np.int32),
        #np.array(df["vehicleType_ids"].values, dtype=np.int32),
        #np.array(df["vehicleOption_ids"].values, dtype=np.int32),
        #np.array(df["source"].values, dtype=np.float32),
        #np.array(df["destination"].values, dtype=np.float32),
        #np.array(df["destination_tuple_ids"].values, dtype=np.int32),
        #np.array(df["source_tuple_ids"].values, dtype=np.int32)
    ]

    if pred: 
        train_number = 1
        batch_number = 1
    else:
        useful_fueatures.append(np.array(df["price"].values, dtype=np.float32))
        train_number = TRAIN_EPOCHS
        batch_number = BATCH_SIZE
        
    A = tf.train.slice_input_producer(
        tensor_list=useful_fueatures,
        num_epochs=train_number,
        shuffle= not pred,
        capacity=BATCH_SIZE * 5
    )
    
    y_gboost              = A[0]
    y_xgb                 = A[1]
    y_bag                 = A[2]
    y_knn                 = A[3]
    y_dec                 = A[4]
    y_lgb                 = A[5]
    sourceLatitude        = A[6]
    sourceLongitude       = A[7]
    destinationLatitude   = A[8]
    destinationLongitude  = A[9]
    distanceKM            = A[10]
    taxiDurationMin       = A[11] 
    weight                = A[12]
    #date_ids              = A[13]
    #SourceState_ids       = A[14]
    #destinationState_ids  = A[15]
    #vehicleType_ids       = A[16]
    #vehicleOption_ids     = A[17]
    #source                = A[18]
    #destination           = A[19] 
    #destination_tuple_ids = A[20] 
    #source_tuple_ids      = A[21] 
    
    # Created a dict out of sliced input producers
    dataset_dict = dict(
        y_gboost=y_gboost,
        y_xgb=y_xgb,
        y_bag=y_bag,
        y_knn=y_knn,
        y_dec=y_dec,
        y_lgb=y_lgb,
        sourceLatitude=sourceLatitude,
        sourceLongitude=sourceLongitude,
        destinationLatitude=destinationLatitude,
        destinationLongitude=destinationLongitude, 
        distanceKM=distanceKM,
        taxiDurationMin=taxiDurationMin,
        weight=weight,
        #date_ids=date_ids,
        #SourceState_ids=SourceState_ids,
        #destinationState_ids=destinationState_ids,
        #vehicleType_ids=vehicleType_ids,
        #vehicleOption_ids=vehicleOption_ids,
        #source=source, 
        #destination=destination,
        #destination_tuple_ids=destination_tuple_ids,
        #source_tuple_ids=source_tuple_ids,
    )

    if not pred:
        dataset_dict['labels'] = A[13]
            
    batch_dict = tf.train.batch(
        dataset_dict,
        batch_number,
   )

    if pred == False:
        batch_labels = batch_dict.pop('labels')
        return batch_dict, tf.reshape(batch_labels, [-1, 1]) 
    else:
        return batch_dict 

In [48]:
X_train, X_val = train_test_split(train, test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,ID,destinationLatitude,destinationLongitude,distanceKM,price,sourceLatitude,sourceLongitude,taxiDurationMin,weight,source,...,vehicleOption_mosaghaf_chadori,vehicleOption_mosaghaf_felezi,vehicleOption_transit_chadori,vehicleOption_yakhchali,y_gboost,y_xgb,y_bag,y_knn,y_dec,y_lgb
40689,10602550191,34.319566,47.078555,1172.0,19000000.0,27.474105,52.603738,859.0,22.0,1445.240621,...,0,0,0,0,20013150.0,20377512.0,20693972.0,4290000.0,18835250.0,22143660.0
28663,59022077023,35.699056,51.402792,911.0,8000000.0,29.617603,51.652078,661.0,5.35,1529.81074,...,0,0,0,0,5602654.0,7494018.5,4657211.0,6000000.0,4240625.0,7224867.0
19042,74752147720,27.182853,56.273862,980.0,10338000.0,32.801625,51.689466,667.0,23.61,1695.49848,...,0,0,0,0,9197673.0,9348608.0,9439628.0,8305000.0,9854376.0,10296610.0
21837,76223312658,32.673139,51.670482,200.0,2320000.0,34.136752,50.566116,142.0,11.3,1726.162961,...,0,0,0,0,2536652.0,2578700.75,2509670.0,2320000.0,2347938.0,3293571.0
35006,74609642925,35.699332,51.395552,935.0,18000000.0,30.435378,49.111278,659.0,22.0,1494.72031,...,0,0,0,0,17376430.0,17874336.0,17712615.0,16650000.0,18786820.0,16928930.0


In [49]:
hparams = tf.contrib.training.HParams(learning_rate=lr)
estimator_val = tf.estimator.Estimator(model_fn=make_model, params=hparams)
estimator_val.train(input_fn=lambda: input_fn(X_train), steps=TRAIN_EPOCHS)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmp7rv2pc0f', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1e40345470>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/hy

<tensorflow.python.estimator.estimator.Estimator at 0x1e41c5d668>

In [50]:
predictions_val   = list(estimator_val.predict(input_fn = lambda: input_fn(X_val, pred=True)))
y_preds_val       = [int(x) for x in predictions_val]
mean_absolute_precision_error(y_preds_val, X_val.price)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmp7rv2pc0f/model.ckpt-1200
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


16.752548073280483

In [51]:
estimator = tf.estimator.Estimator(model_fn=make_model, params=hparams)
estimator.train(input_fn=lambda: input_fn(train), steps=TRAIN_EPOCHS)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmpb664dzxe', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1e40345710>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/hy

<tensorflow.python.estimator.estimator.Estimator at 0x1e3f0ba9b0>

In [52]:
predictions   = list(estimator.predict(input_fn = lambda: input_fn(test, pred=True)))
y_preds_test   = [int(x) for x in predictions]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmpb664dzxe/model.ckpt-1200
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


# Save to File

In [53]:
filename = "/Users/mohsenkiskani/Downloads/Ubaar/submissions/submission32.csv"
with open(filename,"w+") as outputfile:
    outputfile.write("ID,price\n")
    for i in range(len(y_preds_test)):
        outputfile.write(str(test_data.ID[i])+","+str(int(np.ceil(y_preds_test[i])))+"\n")

##### Submission 19 with loss of 15.9 