In [1]:
import warnings 
warnings.filterwarnings('ignore')
import time 
import datetime
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
from scipy.special import boxcox1p
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC, Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor, AdaBoostRegressor, GradientBoostingClassifier
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.svm import SVR
from mlxtend.regressor import StackingRegressor

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

In [2]:
def get_score(model, X, y_true):
    y_pred = model.predict(X)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def mean_absolute_precision_error(y_pred, y_true):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def fitmodel(model, X_tr, y_tr, X_v, y_v):
    start_time = time.time()
    print(datetime.datetime.now())
    model.fit(X_tr,y_tr)
    y_pred = model.predict(X_v)
    score = np.mean(np.abs((y_v - y_pred) / y_v)) * 100
    print( '%.2f' % float((time.time() - start_time)/60 ) +" mins, score= ", '%.2f' % score)
    #return score

# Data gathering 

In [3]:
data      = pd.read_csv('/Users/mohsenkiskani/.kaggle/competitions/ubaar-competition/train.csv')
test_data = pd.read_csv('/Users/mohsenkiskani/.kaggle/competitions/ubaar-competition/test.csv')

# Remove NANs
data      = data.dropna(axis = 0)

# Remove outliers
data.drop([28098])
THRESHOLD = 3.5e7
Aa = data[data.price > THRESHOLD]
data = data.drop(Aa.index.tolist())

specific_cols = ['distanceKM', 'taxiDurationMin', 'weight']
removed_indices = []
for col in specific_cols:
    df = data['price']/data[col]
    A = df[~df.isin([np.nan, np.inf, -np.inf])]
    B = (A - np.mean(A)) / np.std(A)
    V = B[B > 5]
    removed_indices.extend(V.index.tolist())
data = data.drop(set(removed_indices))

# Fill test NANs
test_data.loc[12577, 'distanceKM']      = 52
test_data.loc[12577, 'taxiDurationMin'] = 50
test_data.loc[13853, 'distanceKM']      = 500
test_data.loc[13853, 'taxiDurationMin'] = 380

all_data = pd.concat((data, test_data)) 
all_data['source']           = all_data['sourceLatitude']*all_data['sourceLongitude']
all_data['destination']      = all_data['destinationLatitude']*all_data['destinationLongitude']

ntrain = data.shape[0]
ntest  = test_data.shape[0]

categorical_vars = ['date', 'SourceState', 'destinationState', 'vehicleType', 'vehicleOption']

dummies_data = pd.get_dummies(all_data[categorical_vars])
all_data[dummies_data.columns] = dummies_data[dummies_data.columns]
all_data.drop(categorical_vars, axis=1, inplace=True)

train    = all_data[:ntrain]
test     = all_data[ntrain:]

X = train.drop(['ID','price'],axis=1)
y = train.price

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,destinationLatitude,destinationLongitude,distanceKM,sourceLatitude,sourceLongitude,taxiDurationMin,weight,source,destination,SourceState_آذربایجان شرقی,...,vehicleType_treili,vehicleOption_bari,vehicleOption_hichkodam,vehicleOption_kafi,vehicleOption_kompressi,vehicleOption_labehdar,vehicleOption_mosaghaf_chadori,vehicleOption_mosaghaf_felezi,vehicleOption_transit_chadori,vehicleOption_yakhchali
3144,36.297494,59.605923,0.0,36.297494,59.605923,0.0,22.0,2163.545632,2163.545632,0,...,1,0,0,0,0,0,0,0,1,0
21290,36.301404,59.606267,1451.0,28.502316,53.556507,1031.0,4.0,1526.484486,2163.791179,0,...,0,0,0,0,0,0,0,0,0,1
19568,36.293036,59.604464,615.0,36.839502,54.428057,457.0,4.0,2005.102515,2163.226958,0,...,0,0,1,0,0,0,0,0,0,0
40708,30.397047,55.99667,1066.0,36.653551,51.497869,754.0,2.1,1887.579768,1702.13341,0,...,0,0,0,0,0,0,1,0,0,0
45211,34.796312,46.931152,62.0,34.326654,47.068694,58.0,25.0,1615.710773,1633.031008,0,...,1,0,0,1,0,0,0,0,0,0


In [None]:
#gbm0 = GradientBoostingRegressor(random_state=5, learning_rate=0.1,  min_samples_leaf = 1,  
#                                 max_features = 'sqrt', n_estimators = 170, loss='huber')
#param_test1 = {'min_samples_split':range(200,1001,200),
#              'max_depth':range(40,70,10)}
#gsearch1 = GridSearchCV(estimator = gbm0, param_grid = param_test1, cv = 2)
#modelfit(gbm0, X_train, y_train , X_val, y_val, printFeatureImportance=False)

In [None]:
#xgb0 = xgb.XGBRegressor(colsample_bytree=.8,  
#                        learning_rate=0.1, max_depth=12, 
#                        n_estimators=200,
#                        random_state =5)

# Initial Modelling

In [9]:
gbstf0 = GradientBoostingRegressor(n_estimators=15000, learning_rate=0.01,
                                  max_depth=10, max_features='sqrt',
                                  min_samples_leaf=15, min_samples_split=10, 
                                  loss='huber', random_state = 42)

In [4]:
xgb0 = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                        learning_rate=0.01, max_depth=12, 
                        min_child_weight=1.7817, n_estimators=8000,
                        reg_alpha=0.9640, reg_lambda=0.8571,
                        subsample=1, silent=1,
                        random_state =5 , nthread = -1)

In [5]:
lgb0 = lgb.LGBMRegressor(objective='regression',num_leaves=25, save_binary = True,  
                          learning_rate=0.01, n_estimators=60000,
                          max_bin = 150, bagging_fraction = 0.95,
                          bagging_freq = 4, feature_fraction = 0.8,
                          feature_fraction_seed=50, bagging_seed=20,
                          min_data_in_leaf = 11, min_sum_hessian_in_leaf = 11)

In [6]:
LGBF = lgb.LGBMRegressor(objective='regression',num_leaves=25, save_binary = True,  
                          learning_rate=0.01, n_estimators=60000,
                          max_bin = 150, bagging_fraction = 0.95,
                          bagging_freq = 4, feature_fraction = 0.8,
                          feature_fraction_seed=50, bagging_seed=20,
                          min_data_in_leaf = 11, min_sum_hessian_in_leaf = 11)

GBSTF = GradientBoostingRegressor(n_estimators=15000, learning_rate=0.01,
                                  max_depth=10, max_features='sqrt',
                                  min_samples_leaf=15, min_samples_split=10, 
                                  loss='huber', random_state = 42)

XGBF = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                        learning_rate=0.01, max_depth=12, 
                        min_child_weight=1.7817, n_estimators=8000,
                        reg_alpha=0.9640, reg_lambda=0.8571,
                        subsample=1, silent=1,
                        random_state =5 , nthread = -1)

BAGF  = BaggingRegressor(n_estimators=100, max_samples=1.0, max_features=1.0, random_state=5, verbose=1)

DECF  = DecisionTreeRegressor(max_depth=15)
RFSTF = RandomForestRegressor(n_estimators=20, criterion='mae', 
                              max_depth=4, max_features='sqrt',
                              min_samples_leaf=5, min_samples_split =3, random_state = 42)

KNNF  = KNeighborsClassifier(2)
LASSF = Lasso(fit_intercept = True)
ABSTF = AdaBoostRegressor(n_estimators=1000, learning_rate=0.05, loss='linear', random_state=5)

SVRF = SVR()
RDGF = Ridge()
ENTF = make_pipeline(RobustScaler(), ElasticNet(alpha=0.8, l1_ratio=.9, random_state=3))

models = { "Gboost": GBSTF, "xgb": XGBF, "bagging": BAGF, "lgbm": LGBF, "dec_tree": DECF, "Random_forest": RFSTF,
          "knn": KNNF, "elasticNet": ENTF, "ridge": RDGF, "lasso": LASSF, "AdaBoost": ABSTF, "SVR": SVRF}

#models = { "Gboost": GBSTF, "xgb": XGBF, "bagging": BAGF, "lgbm": LGBF}
 
models = {"lgb0": lgb0, "xgb0": xgb0, "gbstf0": gbstf0}

for model_name in models:
    model = models[model_name]
    start_time = time.time()
    print(datetime.datetime.now())
    model.fit(X_train, y_train)
    train_cols = X_train.columns.tolist()
    X_val['y_' + model_name] = model.predict(X_val[train_cols])
    score = mean_absolute_precision_error(X_val['y_' + model_name], y_val)
    print(model_name, '%.2f' % float((time.time() - start_time)/60 ) +" mins, score= ", '%.2f' % score)

#X_val.to_pickle('dataFrames/One_Hot_X_val.pkl')

# Correct outputs are as followed

#Gboost        23.55 mins, score=  16.76
#xgb           25.40 mins, score=  17.36
#bagging       0.83  mins, score=  18.82
#lgbm          5.92  mins, score=  17.33
#dec_tree      0.01  mins, score=  22.44
#Random_forest 5.56  mins, score=  39.04
#knn           0.02  mins, score=  24.62
#elasticNet    0.01  mins, score=  34.81
#ridge         0.00  mins, score=  36.68
#lasso         0.10  mins, score=  36.63
#AdaBoost      2.58  mins, score=  67.02
#SVR           2.94  mins, score=  72.61
# KernelRidge and LinearRegression() take more than 1 hour to run! Don't Run them. 

2018-06-14 19:09:20.082256
lgb0 5.97 mins, score=  17.33
2018-06-14 19:15:17.991234
xgb0 24.90 mins, score=  17.36


In [10]:
gbstf0.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='huber', max_depth=10,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=15, min_samples_split=10,
             min_weight_fraction_leaf=0.0, n_estimators=15000,
             presort='auto', random_state=42, subsample=1.0, verbose=0,
             warm_start=False)

In [11]:
train_cols = X_train.columns.tolist()
X_val['y_avg_boost'] = gbstf0.predict(X_val[train_cols])
score = mean_absolute_precision_error(X_val['y_avg_boost'], y_val)
print(model_name, '%.2f' % float((time.time() - start_time)/60 ) +" mins, score= ", '%.2f' % score)

xgb0 157.07 mins, score=  16.76


In [19]:
X_val['y_gbstf0'] =  gbstf0.predict(X_val[train_cols])

In [20]:
X_val.head()

Unnamed: 0,destinationLatitude,destinationLongitude,distanceKM,sourceLatitude,sourceLongitude,taxiDurationMin,weight,source,destination,SourceState_آذربایجان شرقی,...,vehicleOption_kompressi,vehicleOption_labehdar,vehicleOption_mosaghaf_chadori,vehicleOption_mosaghaf_felezi,vehicleOption_transit_chadori,vehicleOption_yakhchali,y_lgb0,y_xgb0,y_avg_boost,y_gbstf0
6273,35.6269,52.155742,506.0,32.800696,51.69913,327.0,15.0,1695.767447,1858.147405,0,...,0,0,0,0,0,0,6472644.0,5927568.0,5826879.0,5826879.0
15353,31.890439,54.365067,711.0,36.044303,50.537302,456.0,3.0,1821.581826,1733.725853,0,...,0,0,1,0,0,0,4404189.0,4183810.0,4119352.0,4119352.0
20645,33.636871,46.425809,905.0,32.252003,54.014301,700.0,24.0,1742.069398,1561.618948,0,...,0,0,0,0,0,0,16655810.0,16558360.0,16114580.0,16114580.0
32807,29.610726,52.54244,216.0,28.863114,54.160527,168.0,4.0,1563.241465,1555.819794,0,...,0,0,0,1,0,0,1416599.0,1462954.0,1409869.0,1409869.0
28418,27.176657,56.275919,1279.0,35.699569,51.39584,836.0,8.33,1834.809336,1529.391348,0,...,0,0,0,0,0,0,6866195.0,6833764.0,6698271.0,6698271.0


In [26]:
X_e = (X_val['y_lgb0'] + X_val['y_xgb0'])/2

In [38]:
X_h = ( X_val['y_xgb0'] * X_val['y_gbstf0'])**(1/2)

In [39]:
score = mean_absolute_precision_error(X_h, y_val)
score

16.805865324960063

# Plotting predictions

In [None]:
X_val = pd.read_pickle('dataFrames/One_Hot_X_val_new.pkl')
X_val.head()

In [None]:
pred_cols = ['y_Gboost', 'y_xgb', 'y_bagging', 'y_lgbm', 'y_dec_tree', 'y_Random_forest', 'y_knn',
             'y_elasticNet', 'y_ridge', 'y_lasso', 'y_AdaBoost', 'y_SVR']

best_cols = ['y_Gboost', 'y_xgb', 'y_bagging', 'y_lgbm']
best_pred = X_val[list(best_cols)]
best_pred.head()

In [None]:
for col in pred_cols:
    plt.scatter(X_val[col],y_val)
    plt.xlabel('Price')
    plt.ylabel(col)
    plt.title('Prediction vs Price')
    plt.show()

# Stacking 

In [12]:
regressors = [lgb0, xgb0]
stregr     = StackingRegressor(regressors=regressors, meta_regressor=gbstf0)
start_time = time.time()
stregr.fit(X_train, y_train)
y_pred = stregr.predict(X_val[train_cols])
score  = mean_absolute_precision_error(y_pred, y_val)
print("stackingRegressor model", '%.2f' % float((time.time() - start_time)/60 ) +" mins, score= ", '%.2f' % score)
# stackingRegressor model 43.59 mins, score=  18.15

stackingRegressor model 43.59 mins, score=  18.15


In [13]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=3):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                #instance.fit(X[train_index], y[train_index])
                instance.fit(X.iloc[train_index], y.iloc[train_index])
                y_pred = instance.predict(X.iloc[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [14]:
start_time = time.time() 
stacked_averaged_models  = StackingAveragedModels(base_models = (lgb0, xgb0), meta_model = gbstf0)
stacked_averaged_models.fit(X_train, y_train)
y_pred = stacked_averaged_models.predict(X_val[train_cols])
score  = mean_absolute_precision_error(y_pred, y_val)
print("stacking Averaged models", '%.2f' % float((time.time() - start_time)/60 ) +" mins, score =", '%.2f' % score)
# stacking Averaged models 79.53 mins, score = 17.95

stacking Averaged models 79.53 mins, score = 17.95


In [7]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)

In [8]:
start_time = time.time()
averaged_models = AveragingModels(models = (xgb0, lgb0))
averaged_models.fit(X_train, y_train)
train_cols = X_train.columns.tolist()
y_pred = averaged_models.predict(X_val[train_cols])
score  = mean_absolute_precision_error(y_pred, y_val)
print("Average models", '%.2f' % float((time.time() - start_time)/60 ) +" mins, score =", '%.2f' % score)
# Average models 30.67 mins, score=  17.01

Average models 30.67 mins, score = 17.01


# Final model

In [None]:
# Current best model 
start_time = time.time()
GBST = GradientBoostingRegressor(n_estimators=3200, learning_rate=0.05,
                                  max_depth=10, max_features='sqrt',
                                  min_samples_leaf=15, min_samples_split=10, 
                                  loss='huber', random_state =5)

GBST.fit(train.drop(['ID','price'],axis=1), train.price)
y_pred_test = GBST.predict(test.drop(['ID','price'],axis=1))

print('%.2f' % float((time.time() - start_time)/60 ) +" mins.")

# Save to file

In [None]:
filename = "/Users/mohsenkiskani/Downloads/Ubaar/submissions/submission32.csv"
with open(filename,"w+") as outputfile:
    outputfile.write("ID,price\n")
    for i in range(y_pred_test.shape[0]):
        outputfile.write(str(test_data.ID[i])+","+str(int(np.ceil(y_pred_test[i])))+"\n")