In [1]:
import warnings 
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
import keras.backend as K
import time

Using TensorFlow backend.


In [2]:
def mean_absolute_precision_error(y_pred, y_true):
    """Return the mean absolute percentage error (MAPE) of ``y_pred``, in percent.

    Note the argument order: predictions first, ground truth second.

    Both inputs are converted to flat NumPy float arrays before the
    arithmetic so the comparison is strictly positional. Without that,
    two pandas objects carrying different indexes would be aligned by label
    (producing NaN), and an (n, 1) prediction column would broadcast against
    an (n,) target into an (n, n) matrix.

    Parameters
    ----------
    y_pred : array-like
        Predicted values.
    y_true : array-like
        Reference values with the same number of elements as ``y_pred``.
        Zero entries make the ratio undefined (inf/NaN), as in the plain
        formula.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    return np.mean(np.abs((actual - predicted) / actual)) * 100

In [3]:
def MAPE_loss(y_true, y_pred):
    """Keras loss: mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    y_true : tensor
        Target values (Keras passes these first).
    y_pred : tensor
        Model outputs.

    Returns
    -------
    tensor
        ``mean(|y_true - y_pred| / max(|y_true|, epsilon)) * 100``.

    Notes
    -----
    ``|a / b| == |a| / |b|``, so for any target whose magnitude exceeds
    ``K.epsilon()`` this equals the unguarded formula. Flooring the
    denominator only matters for (near-)zero targets, where the plain
    division would turn the loss into inf/NaN and poison the gradients.
    """
    denominator = K.maximum(K.abs(y_true), K.epsilon())
    return K.mean(K.abs(y_true - y_pred) / denominator) * 100

In [4]:
def base_model(L, input_dim=None):
    """Build and compile a fully connected regression network.

    The network is a stack of ReLU ``Dense`` layers whose widths are given by
    ``L``, followed by a single linear output unit. It is compiled with the
    ``MAPE_loss`` loss and the Adam optimizer.

    Parameters
    ----------
    L : sequence of int
        Width of each hidden layer, in order. Must contain at least one entry.
    input_dim : int, optional
        Number of input features. When omitted, the notebook-level
        ``INPUT_SHAPE`` global is read at call time, which is how existing
        callers (``base_model(L)``) have always been served.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.

    Raises
    ------
    ValueError
        If ``L`` is empty.
    """
    if len(L) == 0:
        raise ValueError("L must list at least one hidden layer size")
    if input_dim is None:
        # Fall back to the global set by the feature-selection cell.
        input_dim = INPUT_SHAPE

    model = Sequential()
    # `kernel_initializer` is the Keras 2 name of the legacy `init` argument.
    model.add(Dense(L[0], input_dim=input_dim,
                    kernel_initializer='lecun_uniform', activation='relu'))
    for units in L[1:]:
        model.add(Dense(units, kernel_initializer='lecun_uniform', activation='relu'))
    # Single linear unit: the regression output.
    model.add(Dense(1, kernel_initializer='lecun_uniform'))
    model.compile(loss=MAPE_loss, optimizer='adam')
    return model

In [5]:
# Load the train/test DataFrames from their pickle files.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
train = pd.read_pickle('dataFrames/train_updated_June29.pkl')
test = pd.read_pickle('dataFrames/test_updated_June29.pkl')

In [6]:
# --- Column groups ------------------------------------------------------------
continuous_cols = [
    'destinationLatitude', 'destinationLongitude', 'distanceKM',
    'sourceLatitude', 'sourceLongitude', 'taxiDurationMin',
    'weight', 'source', 'destination',
]

# Columns named after estimators; presumably their stored predictions — confirm.
pred_cols = [
    'XGBRegressor', 'GradientBoostingRegressor', 'BaggingRegressor',
    'DecisionTreeRegressor', 'Ridge', 'KNeighborsRegressor', 'Lasso',
    'Pipeline', 'AdaBoostRegressor', 'ExtraTreesRegressor',
    'RandomForestRegressor', 'LGBMRegressor', 'CatBoostRegressor',
    'StackingRegressor',
]

vehicle_cols = [
    'vehicleType_joft', 'vehicleType_khavar', 'vehicleType_tak',
    'vehicleType_treili', 'vehicleOption_bari', 'vehicleOption_hichkodam',
    'vehicleOption_kafi', 'vehicleOption_kompressi', 'vehicleOption_labehdar',
    'vehicleOption_mosaghaf_chadori', 'vehicleOption_mosaghaf_felezi',
    'vehicleOption_transit_chadori', 'vehicleOption_yakhchali',
]

# Columns copied across unchanged, in the order they are appended below.
passthrough_cols = continuous_cols + pred_cols + vehicle_cols + ['ID', 'price']

# Every remaining column of `train` is treated as categorical.
categorical_cols = train.columns.drop(passthrough_cols).tolist()

# --- Rename categorical columns to "0", "1", ... ------------------------------
categorical_labels = train[categorical_cols].columns
NOM = len(categorical_labels)
renaming_dict = {label: str(position) for position, label in enumerate(categorical_labels)}

train_renamed = train[categorical_cols].rename(columns=renaming_dict)
test_renamed = test[categorical_cols].rename(columns=renaming_dict)

# Re-attach the untouched columns (features first, then ID and price).
for renamed_frame, raw_frame in ((train_renamed, train), (test_renamed, test)):
    for column in passthrough_cols:
        renamed_frame[column] = raw_frame[column]

# --- Seed NumPy and hold out 20% of the training rows -------------------------
seed = 7
np.random.seed(seed)

# The split has its own fixed random_state, independent of `seed`.
X_train, X_val = train_test_split(train_renamed, test_size=0.2, random_state=42)
X_train.shape

(39645, 100)

In [7]:
# Feature importances loaded from disk (CatBoost, per the file name).
# NOTE(review): get_features() takes the first `nn` entries and uses their keys as
# feature names, so this is presumably ordered from most to least important — confirm.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
feature_importances = pd.read_pickle('dataFrames/catboost_feature_importances') 

In [8]:
def _top_feature_columns(nn):
    """Return the first ``nn`` entries of ``feature_importances`` as column names.

    Names that already exist in ``X_train`` are kept (in ranking order);
    names that do not are translated through ``renaming_dict`` and placed
    after the kept ones. The list is built without mutating anything that is
    being iterated, so consecutive unseen names are all translated.

    Raises ``KeyError`` if an unseen name has no entry in ``renaming_dict``.
    """
    ranked = feature_importances[:nn].keys().tolist()
    available = set(X_train.columns)
    kept = [name for name in ranked if name in available]
    translated = [renaming_dict[name] for name in ranked if name not in available]
    return kept + translated


def get_features(feature_type, nn=10):
    """Select one feature subset from the four notebook-level frames.

    Reads the globals ``X_train``, ``X_val``, ``train_renamed`` and
    ``test_renamed`` and applies the same column selection to each.

    Parameters
    ----------
    feature_type : str
        One of:

        * ``'all'`` – every column except ``ID`` and ``price``.
        * ``'all_but_pred'`` – as ``'all'`` but also without ``pred_cols``.
        * ``'all_but_state_cols'`` – ``continuous_cols + pred_cols + vehicle_cols``.
        * ``'cont_and_pred'`` – ``continuous_cols + pred_cols``.
        * ``'pred'`` – ``pred_cols``.
        * ``'cont'`` – ``continuous_cols``.
        * ``'most_important_features_preds'`` – top ``nn`` features plus ``pred_cols``.
        * ``'most_important_features'`` – top ``nn`` features only.
    nn : int, default 10
        Number of leading entries of ``feature_importances`` to use; only
        read by the two ``'most_important_features*'`` options.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_input_train, X_input_val, train_input, test_input)``.

    Raises
    ------
    ValueError
        If ``feature_type`` is not one of the options above.
    """
    dropped = None    # columns to remove (drop-style options)
    selected = None   # columns to keep (select-style options)

    if feature_type == 'all':
        dropped = ['ID', 'price']
    elif feature_type == 'all_but_pred':
        dropped = ['ID', 'price'] + pred_cols
    elif feature_type == 'all_but_state_cols':
        selected = continuous_cols + pred_cols + vehicle_cols
    elif feature_type == 'cont_and_pred':
        selected = continuous_cols + pred_cols
    elif feature_type == 'pred':
        selected = pred_cols
    elif feature_type == 'cont':
        selected = continuous_cols
    elif feature_type == 'most_important_features_preds':
        selected = _top_feature_columns(nn) + pred_cols
    elif feature_type == 'most_important_features':
        selected = _top_feature_columns(nn)
    else:
        raise ValueError("Unknown feature_type: %r" % (feature_type,))

    frames = (X_train, X_val, train_renamed, test_renamed)
    if dropped is not None:
        return tuple(frame.drop(dropped, axis=1) for frame in frames)
    return tuple(frame[selected] for frame in frames)

In [12]:
# Feature subset used for the experiments below. Options accepted by get_features():
#   'cont', 'pred', 'all', 'all_but_pred', 'cont_and_pred', 'all_but_state_cols',
#   'most_important_features_preds', 'most_important_features'
feature_frames = get_features('cont_and_pred', 10)
X_input_train, X_input_val, train_input, test_input = feature_frames

y_input_train = X_train.price
# Read by base_model() when it builds the first layer.
INPUT_SHAPE = X_input_train.shape[1]

In [13]:
def train_and_test(epochs, batch_size, L):
    """Train a fresh network on the training split and score it on the validation split.

    Uses the notebook globals ``X_input_train`` / ``y_input_train`` for
    fitting and ``X_input_val`` / ``X_val.price`` for scoring.

    Parameters
    ----------
    epochs : int
        Number of training epochs.
    batch_size : int
        Mini-batch size.
    L : sequence of int
        Hidden-layer widths handed to ``base_model``.

    Returns
    -------
    float
        ``mean_absolute_precision_error`` of the validation predictions.
    """
    def build_network():
        return base_model(L)

    regressor = KerasRegressor(build_fn=build_network, epochs=epochs,
                               batch_size=batch_size, verbose=1)
    regressor.fit(X_input_train, y_input_train)
    val_predictions = regressor.predict(X_input_val)
    return mean_absolute_precision_error(val_predictions, X_val.price)

In [14]:
# Hidden-layer widths for this round of experiments.
L = [256, 128, 16]

# Maps (epoch, batch_size) -> list of validation scores, one per repeated run.
result = {}
for epoch in [70]:  # alternative values left by the author: 10, 20, 50, 80, 100, 120
    for batch_size in [128]:
        run_scores = []
        # Repeat each configuration four times to gauge run-to-run variation.
        for _ in range(4):
            started_at = time.time()
            run_scores.append(train_and_test(epoch, batch_size, L))
            elapsed_minutes = (time.time() - started_at) / 60
            print(epoch, batch_size, '%.2f' % elapsed_minutes + ' mins')
        print("mean =", '%.2f ' % np.mean(run_scores), 'std = %.2f' % np.std(run_scores), run_scores)
        result[(epoch, batch_size)] = run_scores

Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70
70 128 0.68 mins
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch

Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70
70 128 0.70 mins
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58

In [None]:
# Display the scores gathered by the experiment loop, keyed by (epoch, batch_size).
# The commented lines that follow are a hand-maintained log of earlier runs.
result
#{(500, 128): [15.29, 15.32]}     [512,256,16] Epoch=500 Batch=128              Remove ridge, adaboost etc.  
#mean = 15.14  std = 0.05  [15.19, 15.09]   [512,256,16] Epoch=100 Batch=128    with ridge, adaboost etc. 
#mean = 15.24  std = 0.02  [15.26, 15.22]   [512,256,16] Epoch=100 Batch=128    all
#mean = 15.42  std = 0.20  [15.62, 15.22]   [512,256,16] Epoch=100 Batch=128    all_but_state
#mean = 15.33  std = 0.10  [15.23, 15.43]   [512,128,16] Epoch=500 Batch=128    cont_and_pred + CatBoostRegressor 
#mean = 15.27  std = 0.11  [15.38, 15.16]   [512,128,16] Epoch=500 Batch=128    most_important_features_preds 10 all
#mean = 15.06  std = 0.12  [14.94, 15.19]   [512,128,16] Epoch=100 Batch=128    most_important_features_preds 10 all
#mean = 15.10  std = 0.06  [15.04, 15.15]   [512,128,16] Epoch=200 Batch=128    most_important_features_preds 10 all
#mean = 14.99  std = 0.02  [14.97, 15.01]   [512,128,16] Epoch=50  Batch=128    most_important_features_preds 10 all
#mean = 14.97  std = 0.01  [14.98, 14.96]   [512,128,16] Epoch=70  Batch=128    most_important_features_preds 10 all
#mean = 15.04  std = 0.04  [14.99, 15.08]   [512,128,16] Epoch=80  Batch=128    most_important_features_preds 10 all
#mean = 15.00  std = 0.08  [15.08, 14.92]   [512,128,16] Epoch=75  Batch=128    most_important_features_preds 10 all
#mean = 14.99  std = 0.02  [14.97, 15.01]   [512,128,16] Epoch=70  Batch=128    most_important_features_preds 15 all
#mean = 15.01  std = 0.07  [14.94, 15.08]   [512,128,16] Epoch=70  Batch=128    most_important_features_preds 5 all
#mean = 15.05  std = 0.01  [15.04, 15.05]   [512,128,16] Epoch=70  Batch=128    most_important_features_preds 17 all
#mean = 15.02  std = 0.02  [15.00, 15.04]   [512,128,16] Epoch=70  Batch=128    most_important_features_preds 8 all
#mean = 15.03  std = 0.07  [14.97, 15.10]   [512,128,16] Epoch=70  Batch=128    most_important_features_preds 12 all
#mean = 15.01  std = 0.00  [15.01, 15.01]   [512,128,16] Epoch=70  Batch=128    most_important_features_preds 10 all
#mean = 15.01  std = 0.07  [14.94, 15.07]   [512,128,16] Epoch=70  Batch=64     most_important_features_preds 10 all
#mean = 15.10  std = 0.07  [15.22, 15.07, 15.04, 15.05]   [512,128,16] Epoch=70  Batch=32 most_important_features_preds 10 all  
#mean = 14.97  std = 0.01  [14.96, 14.97, 14.98, 14.96]   [512,128,16] Epoch=70  Batch=128 most_important_features_preds 10 all  
#mean = 15.02  std = 0.06  [14.96, 14.96, 15.11, 15.04]   [512,128,16] Epoch=70  Batch=64 most_important_features_preds 10 all  
#mean = 15.07  std = 0.12  [14.95, 15.19]   [512,128,16] Epoch=70  Batch=64 preds  
#mean = 14.95  std = 0.12  [14.91, 14.98]   [512,128,16] Epoch=70  Batch=128 preds  
#mean = 14.99  std = 0.06  [14.96, 15.09, 14.93, 14.97]   [512,128,16] Epoch=70  Batch=128 preds  


In [None]:
#INPUT_SHAPE  = X_input_train.shape[1]
#clf_val = KerasRegressor(build_fn=base_model,  batch_size= 100, verbose=1, epochs=250)
#clf_val.fit(X_input_train,y_input_train)
#preds_val = clf_val.predict(X_input_val)
#score = mean_absolute_precision_error(preds_val, X_val.price)
#print('%.2f' % score)

# Final

In [None]:
# Final model: fit on the complete training frame with the chosen settings.
L = [512, 128, 16]   # hidden-layer widths
EPOCH = 100
BATCH_SIZE = 128

# Read by base_model() when it builds the first layer.
INPUT_SHAPE = train_input.shape[1]

clf = KerasRegressor(
    build_fn=lambda: base_model(L),
    batch_size=BATCH_SIZE,
    epochs=EPOCH,
    verbose=1,
)
clf.fit(train_input, train_renamed.price)

In [None]:
# NOTE: clf was fitted on all of train_renamed, and X_val is a slice of that same
# frame, so this score is computed on rows the model has already seen.
preds2 = clf.predict(X_input_val)
score = mean_absolute_precision_error(y_pred=preds2, y_true=X_val.price)
print('%.2f' % score)

In [None]:
# Predict on the test features and truncate each prediction to an int.
preds = clf.predict(test_input)
y_preds_test = list(map(int, preds))

In [None]:
# Write the submission file: a header plus one "ID,price" row per test sample.
#
# A descriptive file name used to be assembled here from EPOCH, BATCH_SIZE, L and
# score, but it was always overwritten by the fixed path below. That dead
# computation was removed: it split the name on "." and would raise IndexError
# whenever the formatted score contained no decimal point (e.g. "nan").
#
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable output directory.
filename = "/Users/mohsenkiskani/Downloads/Ubaar/submissions/submission83.csv"
with open(filename, "w") as outputfile:
    outputfile.write("ID,price\n")
    # Predictions are in row order of `test`, so pair them with the IDs by
    # position. Label lookup (test.ID[i]) is only correct for a 0..n-1 index.
    for row_id, price in zip(test.ID.values, y_preds_test):
        # NOTE(review): y_preds_test already holds ints, so np.ceil is a no-op here.
        # If rounding up was intended, apply it before the int() truncation.
        outputfile.write(str(row_id) + "," + str(int(np.ceil(price))) + "\n")

# GridSearch

In [None]:
#epochs = [50, 100, 200, 300, 400, 500, 600, 700, 1000]  
#batch_size = [50, 100, 200, 300, 400, 500, 600, 1000]

#param_grid = dict(epochs=epochs, batch_size=batch_size)

#clf_test = KerasRegressor(build_fn=base_model, verbose=1)

#start_time = time.time()
#grid = GridSearchCV(estimator=clf_test, param_grid=param_grid, cv=2, n_jobs=1)
#grid_result = grid.fit(train_input, train_renamed.price) 
#print("Best: %.2f using %s" % (grid_result.best_score_, grid_result.best_params_))
#means = grid_result.cv_results_['mean_test_score']
#stds = grid_result.cv_results_['std_test_score']
#params = grid_result.cv_results_['params']
#for mean, stdev, param in zip(means, stds, params):
#    print("%.2f (%.2f) with: %r" % (mean, stdev, param))
    
#print('Total of %.2f' % ((time.time() - start_time) / 60) + ' mins')