In [1]:
import warnings 
warnings.filterwarnings('ignore')
import time 
import datetime
import pandas as pd 
import numpy as np 
import lightgbm as lgb
import xgboost as xgb

from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import make_scorer
from sklearn import model_selection 
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn import preprocessing
#from sklearn.svm import SVR
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC, Ridge, LinearRegression
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor, AdaBoostRegressor, GradientBoostingClassifier
from sklearn.kernel_ridge import KernelRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

from mlxtend.regressor import StackingRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4
from mpl_toolkits.mplot3d import Axes3D




import statsmodels.api as sm
from scipy.stats import spearmanr
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
from scipy.special import boxcox1p

In [2]:
def get_score(model, X, y_true):
    y_pred = model.predict(X)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def mape_fuc(labels, preds):
    return np.mean(np.abs((preds - labels)/(labels))) * 100

mape_score = make_scorer(mape_fuc)

# Data gathering 

In [3]:
data      = pd.read_csv('/Users/mohsenkiskani/.kaggle/competitions/ubaar-competition/train.csv')
test_data = pd.read_csv('/Users/mohsenkiskani/.kaggle/competitions/ubaar-competition/test.csv')

# Remove NANs
data      = data.dropna(axis = 0)

# Remove outliers
data.drop([28098])
THRESHOLD = 4.5e7
Aa = data[data.price > THRESHOLD]
data = data.drop(Aa.index.tolist())

specific_cols = ['distanceKM', 'taxiDurationMin', 'weight']
removed_indices = []
for col in specific_cols:
    df = data['price']/data[col]
    A = df[~df.isin([np.nan, np.inf, -np.inf])]
    B = (A - np.mean(A)) / np.std(A)
    V = B[B > 5]
    removed_indices.extend(V.index.tolist())
data = data.drop(set(removed_indices))

# Fill test NANs
test_data.loc[12577, 'distanceKM']      = 52
test_data.loc[12577, 'taxiDurationMin'] = 50
test_data.loc[13853, 'distanceKM']      = 500
test_data.loc[13853, 'taxiDurationMin'] = 380

all_data = pd.concat((data, test_data)) 
all_data['source']           = all_data['sourceLatitude']*all_data['sourceLongitude']
all_data['destination']      = all_data['destinationLatitude']*all_data['destinationLongitude']

ntrain = data.shape[0]
ntest  = test_data.shape[0]

categorical_vars = ['date', 'SourceState', 'destinationState', 'vehicleType', 'vehicleOption']

dummies_data = pd.get_dummies(all_data[categorical_vars])
all_data[dummies_data.columns] = dummies_data[dummies_data.columns]
all_data.drop(categorical_vars, axis=1, inplace=True)

train    = all_data[:ntrain]
test     = all_data[ntrain:]

feat_cols  = train.drop(['ID','price'],axis=1).columns.tolist()

train.shape

(49557, 86)

In [4]:
train.head()

Unnamed: 0,ID,destinationLatitude,destinationLongitude,distanceKM,price,sourceLatitude,sourceLongitude,taxiDurationMin,weight,source,...,vehicleType_treili,vehicleOption_bari,vehicleOption_hichkodam,vehicleOption_kafi,vehicleOption_kompressi,vehicleOption_labehdar,vehicleOption_mosaghaf_chadori,vehicleOption_mosaghaf_felezi,vehicleOption_transit_chadori,vehicleOption_yakhchali
0,88142929797,29.600574,52.537114,1092.0,15300000.0,36.666045,48.489706,751.0,20.0,1777.925742,...,1,0,0,1,0,0,0,0,0,0
1,30842979862,34.644923,50.876092,414.0,1800000.0,32.323951,50.855412,264.0,2.5,1643.847846,...,0,0,0,0,0,0,0,1,0,0
2,54262798716,31.819508,49.865235,310.0,3742000.0,32.575448,51.581011,292.0,14.97,1680.274542,...,0,0,0,0,1,0,0,0,0,0
3,64071173752,35.703801,51.398824,391.0,1300000.0,35.029685,48.085763,271.0,2.5,1684.429131,...,0,0,0,0,0,0,0,1,0,0
4,68088966447,36.730367,53.96548,756.0,8870000.0,31.586965,54.449607,573.0,15.0,1719.897831,...,0,1,0,0,0,0,0,0,0,0


# Model definition

In [None]:
rdg1 = Ridge()#random_state=5)
knn1 = KNeighborsClassifier(2)
lass1 = Lasso(fit_intercept = True)#, random_state=5)
entf1 = make_pipeline(RobustScaler(), ElasticNet(alpha=0.8, l1_ratio=.9))#, random_state=5))

xgb1 = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                        learning_rate=0.01, max_depth=12, 
                        min_child_weight=1.7817, n_estimators=8000,
                        reg_alpha=0.9640, reg_lambda=0.8571,
                        subsample=1, silent=1,nthread = -1)
#                        random_state =5 , 

gbst1 = GradientBoostingRegressor(n_estimators=15000, learning_rate=0.01,
                                  max_depth=10, max_features='sqrt',
                                  min_samples_leaf=15, min_samples_split=10, 
                                  loss='huber')#, random_state = 5)

lgb1  = lgb.LGBMRegressor(objective='regression',num_leaves=25, save_binary = True,  
                          learning_rate=0.005, n_estimators=120000, #random_state= 5, 
                          max_bin = 150, bagging_fraction = 0.95,
                          bagging_freq = 4, feature_fraction = 0.8,
                          feature_fraction_seed=50, bagging_seed=20,
                          min_data_in_leaf = 11, min_sum_hessian_in_leaf = 11)

bag1 = BaggingRegressor(n_estimators=600, max_samples=1.0, max_features=0.9,verbose=1)# random_state=5, 

dec1 = DecisionTreeRegressor(criterion='mae', splitter='best', max_depth=16, min_samples_split=20,
                             min_samples_leaf=10, min_weight_fraction_leaf=0.0, max_features=None, 
                             max_leaf_nodes=None, min_impurity_decrease=0.0, #random_state=5,
                             min_impurity_split=None, presort=False)

rfst1 = RandomForestRegressor(n_estimators=20, criterion='mae', 
                              max_depth=4, max_features='sqrt',
                              min_samples_leaf=5, min_samples_split =3)#, random_state = 5)

adbst1 = AdaBoostRegressor(n_estimators=1, learning_rate=0.01, loss='linear')#, random_state=5)

In [None]:
models_all = { "Gboost": gbst1, "xgb": xgb1, "bagging": bag1, "lgbm": lgb1, "dec_tree": dec1, "Random_forest": rfst1,
          "knn": knn1, "elasticNet": entf1, "ridge": rdg1, "lasso": lass1, "AdaBoost": adbst1}

models_base  = {"knn       ": knn1,   "elasticNet": entf1, "ridge     ": rdg1, "lasso     ": lass1
               ,"AdaBoost  ": adbst1, "dec_tree  ": dec1,  "lgbm      ": lgb1, "RndmForest": rfst1 
               ,"bagging   ": bag1,   "xgb       ": xgb1,  "Gboost    ": gbst1 
               }

models_final = {"knn       ": knn1,   "elasticNet": entf1, "ridge     ": rdg1, "lasso     ": lass1
               ,"AdaBoost  ": adbst1, "dec_tree  ": dec1,  "lgbm      ": lgb1, "RndmForest": rfst1 
               ,"bagging   ": bag1,   "xgb       ": xgb1,  "Gboost    ": gbst1 
               }

# Training data augmentation

In [None]:
pred_cols = []

for model_name in models_base:
    model1 = models_base[model_name]
    model2 = models_base[model_name]
    
    pred_cols.append('y_' + model_name)

    start_time = time.time()
    print("Started training    " + model_name + "        at time: ", datetime.datetime.now())
    
    train1, train2 = train_test_split(train, test_size=0.5)
    
    model1.fit(train1.drop(['ID','price'],axis=1), train1.price)
    model2.fit(train2.drop(['ID','price'],axis=1), train2.price)
    
    train2['y_' + model_name] = model1.predict(train2.drop(['ID','price'],axis=1))
    train1['y_' + model_name] = model2.predict(train1.drop(['ID','price'],axis=1))
    
    train = pd.concat([train1, train2])
    print("Done training       " +model_name, '       in %.2f' % float((time.time() - start_time)/60 ) +" mins")
    
train.to_pickle('dataFrames/updated_train_June_22.pkl')

# Results with random_state=5 
#Started training    knn               at time:  2018-06-21 17:40:03.117144
#Done training       knn               in 0.11 mins
#Started training    elasticNet        at time:  2018-06-21 17:40:09.494914
#Done training       elasticNet        in 0.02 mins
#Started training    ridge             at time:  2018-06-21 17:40:10.439206
#Done training       ridge             in 0.00 mins
#Started training    lasso             at time:  2018-06-21 17:40:10.728227
#Done training       lasso             in 0.13 mins
#Started training    AdaBoost          at time:  2018-06-21 17:40:18.251842
#Done training       AdaBoost          in 0.01 mins
#Started training    dec_tree          at time:  2018-06-21 17:40:18.692906
#Done training       dec_tree          in 4.10 mins
#Started training    lgbm              at time:  2018-06-21 17:44:24.569371
#Done training       lgbm              in 26.13 mins
#Started training    RndmForest        at time:  2018-06-21 18:10:32.619772
#Done training       RndmForest        in 3.93 mins
#Started training    bagging           at time:  2018-06-21 18:14:28.656966
#Done training       bagging           in 10.20 mins
#Started training    xgb               at time:  2018-06-21 18:24:40.703583
#Done training       xgb               in 40.93 mins
#Started training    Gboost            at time:  2018-06-21 19:06:57.921625
#Done training       Gboost            in 26.36 mins


# Results without random_state set 
#Started training    knn               at time:  2018-06-22 02:40:50.818880
#Done training       knn               in 0.11 mins
#Started training    elasticNet        at time:  2018-06-22 02:40:57.468835
#Done training       elasticNet        in 0.01 mins
#Started training    ridge             at time:  2018-06-22 02:40:58.343920
#Done training       ridge             in 0.01 mins
#Started training    lasso             at time:  2018-06-22 02:40:58.683624
#Done training       lasso             in 0.13 mins
#Started training    AdaBoost          at time:  2018-06-22 02:41:06.314617
#Done training       AdaBoost          in 0.01 mins
#Started training    dec_tree          at time:  2018-06-22 02:41:06.767423
#Done training       dec_tree          in 4.05 mins
#Started training    lgbm              at time:  2018-06-22 02:45:09.520694
#Done training       lgbm              in 24.48 mins
#Started training    RndmForest        at time:  2018-06-22 03:09:38.057324
#Done training       RndmForest        in 3.74 mins
#Started training    bagging           at time:  2018-06-22 03:13:22.515465
#Done training       bagging           in 9.86 mins
#Started training    xgb               at time:  2018-06-22 03:23:14.246388
#Done training       xgb               in 38.54 mins
#Started training    Gboost            at time:  2018-06-22 04:01:46.897449
#Done training       Gboost            in 23.86 mins

# Test data augmentation

In [None]:
train      = pd.read_pickle('dataFrames/updated_train_June_22.pkl')

for model_name in models_base:
    model = models_base[model_name]
    
    start_time = time.time()
    print("Started adding    " + model_name + "        to the test data at time: ", datetime.datetime.now() )
    
    model.fit(train[feat_cols] , train.price)
    test['y_' + model_name] = model.predict(test[feat_cols])

    print("Done adding       " +model_name, '       to the test data in %.2f' % float((time.time() - start_time)/60 ) +" mins")
    
test.to_pickle('dataFrames/updated_test_June_22.pkl')

# Results with random_state=5 
#Started adding    knn               to the test data at time:  2018-06-21 19:38:28.442436
#Done adding       knn               to the test data in 0.04 mins
#Started adding    elasticNet        to the test data at time:  2018-06-21 19:38:30.695390
#Done adding       elasticNet        to the test data in 0.02 mins
#Started adding    ridge             to the test data at time:  2018-06-21 19:38:31.623906
#Done adding       ridge             to the test data in 0.00 mins
#Started adding    lasso             to the test data at time:  2018-06-21 19:38:31.771744
#Done adding       lasso             to the test data in 0.12 mins
#Started adding    AdaBoost          to the test data at time:  2018-06-21 19:38:38.948664
#Done adding       AdaBoost          to the test data in 0.01 mins
#Started adding    dec_tree          to the test data at time:  2018-06-21 19:38:39.335052
#Done adding       dec_tree          to the test data in 7.59 mins
#Started adding    lgbm              to the test data at time:  2018-06-21 19:46:14.511570
#Done adding       lgbm              to the test data in 13.24 mins
#Started adding    RndmForest        to the test data at time:  2018-06-21 19:59:28.829603
#Done adding       RndmForest        to the test data in 9.03 mins
#Started adding    bagging           to the test data at time:  2018-06-21 20:08:30.364155
#Done adding       bagging           to the test data in 5.77 mins
#Started adding    xgb               to the test data at time:  2018-06-21 20:14:16.633645
#Done adding       xgb               to the test data in 32.06 mins
#Started adding    Gboost            to the test data at time:  2018-06-21 20:46:20.278354
#Done adding       Gboost            to the test data in 30.96 mins


# Results without random_state set 
#Started adding    knn               to the test data at time:  2018-06-22 04:25:38.845725
#Done adding       knn               to the test data in 0.04 mins
#Started adding    elasticNet        to the test data at time:  2018-06-22 04:25:41.066799
#Done adding       elasticNet        to the test data in 0.01 mins
#Started adding    ridge             to the test data at time:  2018-06-22 04:25:41.954959
#Done adding       ridge             to the test data in 0.00 mins
#Started adding    lasso             to the test data at time:  2018-06-22 04:25:42.074074
#Done adding       lasso             to the test data in 0.12 mins
#Started adding    AdaBoost          to the test data at time:  2018-06-22 04:25:49.228870
#Done adding       AdaBoost          to the test data in 0.01 mins
#Started adding    dec_tree          to the test data at time:  2018-06-22 04:25:49.595628
#Done adding       dec_tree          to the test data in 7.34 mins
#Started adding    lgbm              to the test data at time:  2018-06-22 04:33:09.923394
#Done adding       lgbm              to the test data in 12.15 mins
#Started adding    RndmForest        to the test data at time:  2018-06-22 04:45:18.979901
#Done adding       RndmForest        to the test data in 8.15 mins
#Started adding    bagging           to the test data at time:  2018-06-22 04:53:28.114233
#Done adding       bagging           to the test data in 5.62 mins
#Started adding    xgb               to the test data at time:  2018-06-22 04:59:05.126341
#Done adding       xgb               to the test data in 31.17 mins
#Started adding    Gboost            to the test data at time:  2018-06-22 05:30:15.310855
#Done adding       Gboost            to the test data in 30.37 mins

In [None]:
train      = pd.read_pickle('dataFrames/updated_train_June_22.pkl')

X_all = train.drop(['ID','price'],axis=1)
y_all = train.price
for model_name in models_final:
    model = models_final[model_name]
    X = X_all.drop(['y_' + model_name], axis=1)
    start_time = time.time()
    print("Started training the final model,    " + model_name + "        at time: ", datetime.datetime.now())
    score = model_selection.cross_val_score(model, X, y_all, scoring=mape_score, cv = 3)
    print("Done training the final model,       " +model_name, '       in %.2f' % float((time.time() - start_time)/60 ) +" mins, score= ", '%.2f' % score.mean())

# Results with random_state=5 
#Started training the final model,    knn               at time:  2018-06-21 21:44:26.142029
#Done training the final model,       knn               in 0.21 mins, score=  6.19
#Started training the final model,    elasticNet        at time:  2018-06-21 21:44:38.991180
#Done training the final model,       elasticNet        in 0.13 mins, score=  6.75
#Started training the final model,    ridge             at time:  2018-06-21 21:44:46.841943
#Done training the final model,       ridge             in 0.01 mins, score=  4.54
#Started training the final model,    lasso             at time:  2018-06-21 21:44:47.179869
#Done training the final model,       lasso             in 0.27 mins, score=  4.54
#Started training the final model,    AdaBoost          at time:  2018-06-21 21:45:03.127976
#Done training the final model,       AdaBoost          in 0.02 mins, score=  16.75
#Started training the final model,    dec_tree          at time:  2018-06-21 21:45:04.065791
#Done training the final model,       dec_tree          in 12.48 mins, score=  4.32
#Started training the final model,    lgbm              at time:  2018-06-21 21:57:32.929486
#Done training the final model,       lgbm              in 39.12 mins, score=  5.15
#Started training the final model,    RndmForest        at time:  2018-06-21 22:36:39.860109
#Done training the final model,       RndmForest        in 10.39 mins, score=  14.57
#Started training the final model,    bagging           at time:  2018-06-21 22:47:03.556538
#Done training the final model,       bagging           in 22.74 mins, score=  4.41
#Started training the final model,    xgb               at time:  2018-06-21 23:09:48.032251
#Done training the final model,       xgb               in 79.95 mins, score=  4.93
#Started training the final model,    Gboost            at time:  2018-06-22 00:29:44.866651
#Done training the final model,       Gboost            in 52.85 mins, score=  5.36


# Results without random_state set 
#Started training the final model,    knn               at time:  2018-06-22 06:00:37.899922
#Done training the final model,       knn               in 0.21 mins, score=  6.15
#Started training the final model,    elasticNet        at time:  2018-06-22 06:00:50.245410
#Done training the final model,       elasticNet        in 0.14 mins, score=  6.70
#Started training the final model,    ridge             at time:  2018-06-22 06:00:58.689795
#Done training the final model,       ridge             in 0.00 mins, score=  4.52
#Started training the final model,    lasso             at time:  2018-06-22 06:00:58.939183
#Done training the final model,       lasso             in 0.27 mins, score=  4.53
#Started training the final model,    AdaBoost          at time:  2018-06-22 06:01:14.859141
#Done training the final model,       AdaBoost          in 0.02 mins, score=  16.99
#Started training the final model,    dec_tree          at time:  2018-06-22 06:01:15.808657
#Done training the final model,       dec_tree          in 12.17 mins, score=  4.30
#Started training the final model,    lgbm              at time:  2018-06-22 06:13:25.991858

# Final Training

In [None]:
train      = pd.read_pickle('dataFrames/updated_train_June_22.pkl')
test       = pd.read_pickle('dataFrames/updated_test_June_22.pkl')

In [None]:
selected_final_model = 'dec_tree  '
selected_final_model = 'bagging   '
selected_final_model = 'ridge     '

X      = train.drop(['ID','price', 'y_'+selected_final_model], axis=1)
y      = train.price
X_test = test.drop(['ID','price', 'y_'+selected_final_model], axis=1)

model  = models_final[selected_final_model]

start_time = time.time()
print("Started fitting the selected final model,    " + selected_final_model +\
      "        at time: ", datetime.datetime.now())
model.fit(X, y)
y_pred_test = model.predict(X_test)
print("Done fitting the selected final model,       " + selected_final_model, \
      '       in %.2f' % float((time.time() - start_time)/60 ) +" mins")

# Results with random_state=5 
#Started fitting the selected final model,    dec_tree          at time:  2018-06-22 01:25:39.366947
#Done fitting the selected final model,       dec_tree          in 9.59 mins 
# dec_tree resulted in a final public leaderboard score of 16.50   =====> submission69.csv

#Started fitting the selected final model,    bagging           at time:  2018-06-22 01:44:57.771315
#Done fitting the selected final model,       bagging           in 12.72 mins
# bagging  resulted in a final public leaderboard score of 16.51   =====> submission70.csv

#Started fitting the selected final model,    ridge             at time:  2018-06-22 02:00:34.867528
#Done fitting the selected final model,       ridge             in 0.00 mins
# ridge    resulted in a final public leaderboard score of 16.34   =====> submission71.csv


# Results without random_state set 
#Started fitting the selected final model,    ridge             at time:  2018-06-22 09:20:03.496934
#Done fitting the selected final model,       ridge             in 0.00 mins
# ridge    resulted in a final public leaderboard score of 16.32   =====> submission72.csv


# Preventing overfitting in augmenting test data 

In [None]:
#train          = pd.read_pickle('dataFrames/updated_train_June_22.pkl')
#train1, train2 = train_test_split(train, test_size=0.8)

#for model_name in models_base:
#    model = models_base[model_name]
#    
#    start_time = time.time()
#    print("Started adding    " + model_name + "        to the test data at time: ", datetime.datetime.now() )
#    
#    model.fit(train1[feat_cols] , train1.price)
#    test['y_' + model_name] = model.predict(test[feat_cols])
#
#    print("Done adding       " +model_name, '       to the test data in %.2f' % float((time.time() - start_time)/60 ) +" mins")
#    
#test.to_pickle('dataFrames/updated_test_June_22_with_splitting_train_data.pkl')

#Started adding    knn               to the test data at time:  2018-06-22 12:44:01.855999
#Done adding       knn               to the test data in 0.02 mins
#Started adding    elasticNet        to the test data at time:  2018-06-22 12:44:03.149277
#Done adding       elasticNet        to the test data in 0.00 mins
#Started adding    ridge             to the test data at time:  2018-06-22 12:44:03.328871
#Done adding       ridge             to the test data in 0.00 mins
#Started adding    lasso             to the test data at time:  2018-06-22 12:44:03.381281
#Done adding       lasso             to the test data in 0.02 mins
#Started adding    AdaBoost          to the test data at time:  2018-06-22 12:44:04.307421
#Done adding       AdaBoost          to the test data in 0.00 mins
#Started adding    dec_tree          to the test data at time:  2018-06-22 12:44:04.396390
#Done adding       dec_tree          to the test data in 0.28 mins
#Started adding    lgbm              to the test data at time:  2018-06-22 12:44:21.483120
#Done adding       lgbm              to the test data in 11.24 mins
#Started adding    RndmForest        to the test data at time:  2018-06-22 12:55:36.140693
#Done adding       RndmForest        to the test data in 0.31 mins
#Started adding    bagging           to the test data at time:  2018-06-22 12:55:54.811298
#Done adding       bagging           to the test data in 0.95 mins
#Started adding    xgb               to the test data at time:  2018-06-22 12:56:51.752518
#Done adding       xgb               to the test data in 6.75 mins
#Started adding    Gboost            to the test data at time:  2018-06-22 13:03:36.976831
#Done adding       Gboost            to the test data in 5.15 mins

In [None]:
#X_all = train2.drop(['ID','price'],axis=1)
#y_all = train2.price
#for model_name in models_final:
#    model = models_final[model_name]
#    X = X_all.drop(['y_' + model_name], axis=1)
#    start_time = time.time()
#    print("Started training the final model,    " + model_name + "        at time: ", datetime.datetime.now())
#    score = model_selection.cross_val_score(model, X, y_all, scoring=mape_score, cv = 3)
#    print("Done training the final model,       " +model_name, '       in %.2f' % float((time.time() - start_time)/60) +" mins, score= ", '%.2f' % score.mean())
    
#Started training the final model,    knn               at time:  2018-06-22 13:12:05.399169
#Done training the final model,       knn               in 0.15 mins, score=  6.25
#Started training the final model,    elasticNet        at time:  2018-06-22 13:12:14.505529
#Done training the final model,       elasticNet        in 0.11 mins, score=  6.69
#Started training the final model,    ridge             at time:  2018-06-22 13:12:20.873534
#Done training the final model,       ridge             in 0.00 mins, score=  4.22
#Started training the final model,    lasso             at time:  2018-06-22 13:12:21.000378
#Done training the final model,       lasso             in 0.22 mins, score=  4.22
#Started training the final model,    AdaBoost          at time:  2018-06-22 13:12:33.962768
#Done training the final model,       AdaBoost          in 0.01 mins, score=  16.81
#Started training the final model,    dec_tree          at time:  2018-06-22 13:12:34.672086
#Done training the final model,       dec_tree          in 7.91 mins, score=  4.08
#Started training the final model,    lgbm              at time:  2018-06-22 13:20:29.435804    

In [None]:
#selected_final_model = 'dec_tree  '
#selected_final_model = 'bagging   '
#selected_final_model = 'ridge     '

#X      = train.drop(['ID','price', 'y_'+selected_final_model], axis=1)
#y      = train.price
#X_test = test.drop(['ID','price', 'y_'+selected_final_model], axis=1)

#model  = models_final[selected_final_model]

#start_time = time.time()
#print("Started fitting the selected final model,    " + selected_final_model +\
#      "        at time: ", datetime.datetime.now())
#model.fit(X, y)
#y_pred_test = model.predict(X_test)
#print("Done fitting the selected final model,       " + selected_final_model, \
#      '       in %.2f' % float((time.time() - start_time)/60 ) +" mins")

# Save to file

In [None]:
filename = "/Users/mohsenkiskani/Downloads/Ubaar/submissions/submission73.csv"
with open(filename,"w+") as outputfile:
    outputfile.write("ID,price\n")
    for i in range(y_pred_test.shape[0]):
        outputfile.write(str(test_data.ID[i])+","+str(int(np.ceil(y_pred_test[i])))+"\n")