In [1]:
import warnings 
warnings.filterwarnings('ignore')
import time 
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor, BaggingRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
import lightgbm as lgb

In [2]:
def get_score(model, X, y_true):
    """Score ``model`` on ``X`` with the mean absolute percentage error.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)``.
    X : array-like
        Feature matrix handed to ``model.predict`` unchanged.
    y_true : array-like
        Ground-truth targets. They are used as the denominator, so zero
        entries are not guarded against.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|)`` expressed in percent.
    """
    predictions = model.predict(X)
    relative_error = (y_true - predictions) / y_true
    return 100 * np.mean(np.abs(relative_error))

def mean_absolute_precision_error(y_pred, y_true):
    """Return the mean absolute percentage error between two target vectors.

    Note the argument order: predictions first, ground truth second.

    Parameters
    ----------
    y_pred : array-like
        Predicted values.
    y_true : array-like
        Ground-truth values. They are used as the denominator, so zero
        entries are not guarded against.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|)`` expressed in percent.
    """
    deviation = y_true - y_pred
    return 100 * np.mean(np.abs(deviation / y_true))

# Data gathering

In [3]:
# Directory holding the competition CSVs. Kept as a single constant so the
# notebook can be pointed at another location by editing one line.
DATA_DIR = '/Users/mohsenkiskani/.kaggle/competitions/ubaar-competition'

data      = pd.read_csv(DATA_DIR + '/train.csv')
test_data = pd.read_csv(DATA_DIR + '/test.csv')

# Keep only fully populated training rows.
data      = data.dropna(axis = 0)

# Hand-entered distance / duration values for two test rows (index labels
# 12577 and 13853).
# NOTE(review): presumably these replace missing or implausible values in
# test.csv -- confirm against the raw file.
test_data.loc[12577, 'distanceKM']      = 52
test_data.loc[12577, 'taxiDurationMin'] = 50
test_data.loc[13853, 'distanceKM']      = 500
test_data.loc[13853, 'taxiDurationMin'] = 380

# Stack train on top of test so that the engineered features and the one-hot
# encoding below are computed identically for both. Rows keep their original
# index labels.
all_data = pd.concat((data, test_data)) 
all_data['source']           = all_data['sourceLatitude']*all_data['sourceLongitude']
all_data['destination']      = all_data['destinationLatitude']*all_data['destinationLongitude']

# Series.min skips NaN, so rows without a price cannot influence the result.
# The builtin min() is order-dependent when NaN is present and would return
# NaN if such a row ever came first.
min_price = float(all_data['price'].min())

ntrain = data.shape[0]
ntest  = test_data.shape[0]

categorical_vars = ['date', 'SourceState', 'destinationState', 'vehicleType', 'vehicleOption']

# One-hot encode the categorical columns, attach the indicator columns and
# then remove the raw categorical columns.
# NOTE(review): get_dummies only expands object/category columns; a numeric
# column in categorical_vars is passed through unchanged and then removed by
# the drop below -- confirm that is intended for 'date'.
dummies_data = pd.get_dummies(all_data[categorical_vars])
all_data[dummies_data.columns] = dummies_data[dummies_data.columns]
all_data.drop(categorical_vars, axis=1, inplace=True)

# Explicit copies: later cells add prediction columns to these frames, which
# must not be done on a slice that still refers back to all_data.
train    = all_data[:ntrain].copy()
test     = all_data[ntrain:].copy()

# Two halves used for out-of-fold predictions; seeded (same seed as the
# train/validation split below) so the fold assignment is reproducible.
train_1, train_2 = train_test_split(train, test_size=0.5, random_state=42)

X = train.drop(['ID','price'],axis=1)
y = train.price

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,destinationLatitude,destinationLongitude,distanceKM,sourceLatitude,sourceLongitude,taxiDurationMin,weight,source,destination,SourceState_آذربایجان شرقی,...,vehicleType_treili,vehicleOption_bari,vehicleOption_hichkodam,vehicleOption_kafi,vehicleOption_kompressi,vehicleOption_labehdar,vehicleOption_mosaghaf_chadori,vehicleOption_mosaghaf_felezi,vehicleOption_transit_chadori,vehicleOption_yakhchali
39085,36.473089,52.349822,184.0,35.700109,51.399743,199.0,21.0,1834.976428,1909.359717,0,...,1,0,0,1,0,0,0,0,0,0
30892,35.704176,51.40028,331.0,37.275731,49.584392,254.0,1.67,1848.294458,1835.204644,0,...,0,0,0,0,0,0,1,0,0,0
45277,35.699924,51.396715,447.0,32.665899,51.663805,285.0,19.0,1687.644636,1834.858819,0,...,1,0,0,0,0,0,0,0,1,0
16398,30.199563,53.182966,809.0,35.699078,51.401589,525.0,4.0,1834.989335,1606.102332,0,...,0,0,0,0,0,0,1,0,0,0
13653,27.180941,56.277756,1144.0,34.643252,50.877469,750.0,2.0,1762.56098,1529.682365,0,...,0,0,0,0,0,0,1,0,0,0


# Initial models 

In [None]:
start_time = time.time()

# Both fold models share one set of hyper-parameters.
gboost_params = dict(n_estimators=3200, learning_rate=0.05,
                     max_depth=10, max_features='sqrt',
                     min_samples_leaf=15, min_samples_split=10, loss='huber')

GBoost_1 = GradientBoostingRegressor(**gboost_params)
GBoost_2 = GradientBoostingRegressor(**gboost_params)

# Build the feature matrices once, before any prediction column is written.
# 'y_gboost' is dropped defensively: train_1 / train_2 are mutated in place
# below, so on a re-run of this cell the column already exists and would
# otherwise be fed to the model as a feature (target leakage).
gboost_features_1 = train_1.drop(['ID','price'],axis=1).drop(columns=['y_gboost'], errors='ignore')
gboost_features_2 = train_2.drop(['ID','price'],axis=1).drop(columns=['y_gboost'], errors='ignore')

GBoost_1.fit(gboost_features_1, train_1.price)
GBoost_2.fit(gboost_features_2, train_2.price)

# Out-of-fold predictions: each half is scored by the model fitted on the
# other half.
train_1['y_gboost'] = GBoost_2.predict(gboost_features_1)
train_2['y_gboost'] = GBoost_1.predict(gboost_features_2)

train_gboost = pd.concat([train_1, train_2])
print( '%.2f' % float((time.time() - start_time)/60 )  + ' mins')
train_gboost.head()

# 5.85 mins

In [None]:
# Fresh 50/50 split for the next stacking stage. Seeded so the fold
# assignment (and therefore the out-of-fold predictions) is the same on
# every run.
train_gboost_1, train_gboost_2 = train_test_split(train_gboost, test_size=0.5, random_state=42)

In [None]:
start_time = time.time()

# Both fold models share one set of hyper-parameters.
xgb_params = dict(colsample_bytree=0.4603, gamma=0.0468,
                  learning_rate=0.05, max_depth=10,
                  min_child_weight=1.7817, n_estimators=2200,
                  reg_alpha=0.4640, reg_lambda=0.8571,
                  subsample=0.5213, silent=1, nthread = -1)

xgb_1 = xgb.XGBRegressor(**xgb_params)
xgb_2 = xgb.XGBRegressor(**xgb_params)

# Raw features only. 'y_xgb' is dropped defensively: the two halves are
# mutated in place below, so on a re-run of this cell the column already
# exists and would otherwise leak into the feature matrix.
xgb_features_1 = train_gboost_1.drop(['ID','price','y_gboost'],axis=1).drop(columns=['y_xgb'], errors='ignore')
xgb_features_2 = train_gboost_2.drop(['ID','price','y_gboost'],axis=1).drop(columns=['y_xgb'], errors='ignore')

xgb_1.fit(xgb_features_1, train_gboost_1.price)
xgb_2.fit(xgb_features_2, train_gboost_2.price)

# Out-of-fold predictions: each half is scored by the model fitted on the
# other half.
train_gboost_1['y_xgb'] = xgb_2.predict(xgb_features_1)
train_gboost_2['y_xgb'] = xgb_1.predict(xgb_features_2)

train_xgb = pd.concat([train_gboost_1, train_gboost_2])
print( '%.2f' % float((time.time() - start_time)/60 )  + ' mins')
train_xgb.head()

# 8.20 mins

In [None]:
# Fresh 50/50 split for the bagging stage. Seeded so the fold assignment is
# the same on every run.
train_xgb_1, train_xgb_2 = train_test_split(train_xgb, test_size=0.5, random_state=42)

In [None]:
start_time = time.time()

# Both fold models share one set of hyper-parameters.
bag_params = dict(n_estimators=1000, max_samples=1.0, max_features=1.0, verbose=1)

bag_1 = BaggingRegressor(**bag_params)
bag_2 = BaggingRegressor(**bag_params)

# Raw features only. 'y_bag' is dropped defensively: the two halves are
# mutated in place below, so on a re-run of this cell the column already
# exists and would otherwise leak into the feature matrix.
bag_features_1 = train_xgb_1.drop(['ID','price','y_gboost','y_xgb'],axis=1).drop(columns=['y_bag'], errors='ignore')
bag_features_2 = train_xgb_2.drop(['ID','price','y_gboost','y_xgb'],axis=1).drop(columns=['y_bag'], errors='ignore')

bag_1.fit(bag_features_1, train_xgb_1.price)
bag_2.fit(bag_features_2, train_xgb_2.price)

# Out-of-fold predictions: each half is scored by the model fitted on the
# other half.
train_xgb_1['y_bag'] = bag_2.predict(bag_features_1)
train_xgb_2['y_bag'] = bag_1.predict(bag_features_2)

train_bag = pd.concat([train_xgb_1, train_xgb_2])
print( '%.2f' % float((time.time() - start_time)/60 )  + ' mins' )
train_bag.head()

# 10.23 mins

In [None]:
# Checkpoint the frame after the slow GBoost / XGB / bagging stages so the
# following cells can resume from disk. The 'dataFrames/' directory must
# already exist.
pd.to_pickle(train_bag, 'dataFrames/train_OneHotEncoding.pkl')

In [None]:
# Resume from the on-disk checkpoint. Only load pickles produced by this
# notebook: unpickling an untrusted file can execute arbitrary code.
# NOTE(review): later cells overwrite this same file with additional y_*
# columns, so the frame loaded here may already contain later-stage
# predictions when cells are re-run out of order.
train_bag = pd.read_pickle('dataFrames/train_OneHotEncoding.pkl')
# Seeded so the fold assignment is the same on every run.
train_bag_1, train_bag_2 = train_test_split(train_bag, test_size=0.5, random_state=42)

In [None]:
start_time = time.time()

# NOTE(review): a *classifier* is fitted on the price column here, so every
# distinct price is treated as a class label, while the other stages use
# regressors. Confirm this is intended; the test-set cell uses the same
# estimator, so both would have to change together.
knn_1 = KNeighborsClassifier(2)
knn_2 = KNeighborsClassifier(2)

# Raw features only. The checkpoint these halves were loaded from is
# overwritten by later stages, and the halves are mutated in place below, so
# any stacked-prediction column that happens to be present (including this
# stage's own on a re-run) is removed instead of being used as a feature.
knn_stale_cols = ['y_knn', 'y_dec', 'y_lgb']
knn_features_1 = train_bag_1.drop(['ID','price','y_gboost','y_xgb', 'y_bag'],axis=1).drop(columns=knn_stale_cols, errors='ignore')
knn_features_2 = train_bag_2.drop(['ID','price','y_gboost','y_xgb', 'y_bag'],axis=1).drop(columns=knn_stale_cols, errors='ignore')

knn_1.fit(knn_features_1, train_bag_1.price)
knn_2.fit(knn_features_2, train_bag_2.price)

# Out-of-fold predictions: each half is scored by the model fitted on the
# other half.
train_bag_1['y_knn'] = knn_2.predict(knn_features_1)
train_bag_2['y_knn'] = knn_1.predict(knn_features_2)

train_knn = pd.concat([train_bag_1, train_bag_2])
print( '%.2f' % float((time.time() - start_time)/60 )  + ' mins' )

# Overwrites the checkpoint written by the previous stage.
train_knn.to_pickle('dataFrames/train_OneHotEncoding.pkl')
train_knn.head()

# 0.11 mins

In [None]:
# Resume from the on-disk checkpoint. Only load pickles produced by this
# notebook: unpickling an untrusted file can execute arbitrary code.
train_knn = pd.read_pickle('dataFrames/train_OneHotEncoding.pkl')
# Seeded so the fold assignment is the same on every run.
train_knn_1, train_knn_2 = train_test_split(train_knn, test_size=0.5, random_state=42)

In [None]:
start_time = time.time()

dec_1 = DecisionTreeRegressor(max_depth=10)
dec_2 = DecisionTreeRegressor(max_depth=10)

# Raw features only. The checkpoint these halves were loaded from is
# overwritten by later stages, and the halves are mutated in place below, so
# any stacked-prediction column that happens to be present (including this
# stage's own on a re-run) is removed instead of being used as a feature.
dec_stale_cols = ['y_dec', 'y_lgb']
dec_features_1 = train_knn_1.drop(['ID','price','y_gboost','y_xgb', 'y_bag', 'y_knn'],axis=1).drop(columns=dec_stale_cols, errors='ignore')
dec_features_2 = train_knn_2.drop(['ID','price','y_gboost','y_xgb', 'y_bag', 'y_knn'],axis=1).drop(columns=dec_stale_cols, errors='ignore')

dec_1.fit(dec_features_1, train_knn_1.price)
dec_2.fit(dec_features_2, train_knn_2.price)

# Out-of-fold predictions: each half is scored by the model fitted on the
# other half.
train_knn_1['y_dec'] = dec_2.predict(dec_features_1)
train_knn_2['y_dec'] = dec_1.predict(dec_features_2)

train_dec = pd.concat([train_knn_1, train_knn_2])
print( '%.2f' % float((time.time() - start_time)/60 )  + ' mins' )

# Overwrites the checkpoint written by the previous stage.
train_dec.to_pickle('dataFrames/train_OneHotEncoding.pkl')
train_dec.head()

# 0.01 mins

In [None]:
# Resume from the on-disk checkpoint. Only load pickles produced by this
# notebook: unpickling an untrusted file can execute arbitrary code.
train_dec = pd.read_pickle('dataFrames/train_OneHotEncoding.pkl')
# Seeded so the fold assignment is the same on every run.
train_dec_1, train_dec_2 = train_test_split(train_dec, test_size=0.5, random_state=42)

In [None]:
start_time = time.time()

# Both fold models share one set of hyper-parameters.
lgbm_params = dict(objective='regression',num_leaves=15,
                   learning_rate=0.05, n_estimators=15000,
                   max_bin = 1000, bagging_fraction = 0.6,
                   bagging_freq = 5, feature_fraction = 0.25,
                   feature_fraction_seed=9, bagging_seed=20,
                   min_data_in_leaf = 11, min_sum_hessian_in_leaf = 11)

lgb_1 = lgb.LGBMRegressor(**lgbm_params)
lgb_2 = lgb.LGBMRegressor(**lgbm_params)

# Raw features only. 'y_lgb' is dropped defensively: the checkpoint these
# halves were loaded from is overwritten at the end of this cell and the
# halves are mutated in place below, so on a re-run the column already
# exists and would otherwise leak into the feature matrix.
lgb_features_1 = train_dec_1.drop(['ID','price','y_gboost','y_xgb', 'y_bag', 'y_knn', 'y_dec'],axis=1).drop(columns=['y_lgb'], errors='ignore')
lgb_features_2 = train_dec_2.drop(['ID','price','y_gboost','y_xgb', 'y_bag', 'y_knn', 'y_dec'],axis=1).drop(columns=['y_lgb'], errors='ignore')

lgb_1.fit(lgb_features_1, train_dec_1.price)
lgb_2.fit(lgb_features_2, train_dec_2.price)

# Out-of-fold predictions: each half is scored by the model fitted on the
# other half.
train_dec_1['y_lgb'] = lgb_2.predict(lgb_features_1)
train_dec_2['y_lgb'] = lgb_1.predict(lgb_features_2)

train_lgb = pd.concat([train_dec_1, train_dec_2])
print( '%.2f' % float((time.time() - start_time)/60 ) + ' mins' )

# Overwrites the checkpoint written by the previous stage.
train_lgb.to_pickle('dataFrames/train_OneHotEncoding.pkl')
train_lgb.head()

# 2.26 mins

# Test dataset augmentation

In [None]:
start_time = time.time()

# One model per stacking stage, each fitted on the complete training set.
# NOTE(review): n_estimators is 2200 here, whereas the out-of-fold
# GBoost_1 / GBoost_2 models earlier in this notebook use 3200 -- confirm
# whether the difference is intentional.
GBoost = GradientBoostingRegressor(n_estimators=2200, learning_rate=0.05,
                                   max_depth=10, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, loss='huber')

xgb_model = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=10, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1, nthread = -1)

bag   = BaggingRegressor(n_estimators=1000, max_samples=1.0, max_features=1.0, verbose=1)
knn   = KNeighborsClassifier(2)
dec   = DecisionTreeRegressor(max_depth=10)
lgb_m = lgb.LGBMRegressor(objective='regression',num_leaves=15,
                       learning_rate=0.05, n_estimators=15000,
                       max_bin = 1000, bagging_fraction = 0.6,
                       bagging_freq = 5, feature_fraction = 0.25,
                       feature_fraction_seed=9, bagging_seed=20,
                       min_data_in_leaf = 11, min_sum_hessian_in_leaf = 11)

# Prediction columns this cell writes; they must never be model inputs.
stacked_cols = ['y_gboost', 'y_xgb', 'y_bag', 'y_knn', 'y_dec', 'y_lgb']

# Work on an independent frame so that adding columns below does not write
# into a slice of all_data.
test = test.copy()

# Build both feature matrices once, up front. Removing any stacked column
# that is already present makes the cell re-runnable: on a second execution
# `test` already carries the y_* columns, and they would otherwise be passed
# to the first predict() as extra features.
full_train_features = train.drop(['ID','price'],axis=1).drop(columns=stacked_cols, errors='ignore')
test_features       = test.drop(['ID','price'],axis=1).drop(columns=stacked_cols, errors='ignore')

GBoost.fit(full_train_features, train.price)
xgb_model.fit(full_train_features, train.price)
bag.fit(full_train_features, train.price)
knn.fit(full_train_features, train.price)
dec.fit(full_train_features, train.price)
lgb_m.fit(full_train_features, train.price)

# Every model sees the same raw feature matrix; none of them consumes
# another model's prediction.
test['y_gboost'] = GBoost.predict(test_features)
test['y_xgb']    = xgb_model.predict(test_features)
test['y_bag']    = bag.predict(test_features)
test['y_knn']    = knn.predict(test_features)
test['y_dec']    = dec.predict(test_features)
test['y_lgb']    = lgb_m.predict(test_features)

print( '%.2f' % float((time.time() - start_time)/60 ), "mins" )


test.to_pickle('dataFrames/test_OneHotEncoding.pkl')
test.head()

# 25.04 mins