In [1]:
import warnings 
warnings.filterwarnings('ignore')
import time 
import datetime
import pandas as pd 
import numpy as np 
import lightgbm as lgb
import xgboost as xgb
from collections import defaultdict
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC, Ridge, LinearRegression
from catboost import CatBoostRegressor

In [2]:
def mape_func(labels, preds):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Parameters
    ----------
    labels : array-like
        Ground-truth values.
    preds : array-like
        Predicted values, same length as ``labels``.

    Returns
    -------
    float
        ``mean(|preds - labels| / |labels|) * 100``.

    Notes
    -----
    Both inputs are converted to float NumPy arrays and compared position by
    position. This makes plain lists work and prevents pandas from aligning
    two Series on their index labels (which silently produces NaN when the
    labels differ). A zero in ``labels`` is not special-cased and yields
    ``inf``/``nan`` in the result.
    """
    labels = np.asarray(labels, dtype=float)
    preds = np.asarray(preds, dtype=float)
    return np.mean(np.abs((preds - labels) / labels)) * 100

In [3]:
# Load the raw train/test tables from the working directory.
data      = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Remove NANs
data      = data.dropna(axis = 0)

# Remove outliers
# DataFrame.drop returns a new frame; the result must be assigned back,
# otherwise the row stays in `data`.
# NOTE(review): row label 28098 is presumably a hand-picked outlier - confirm.
data = data.drop([28098])

# Discard rows whose price exceeds the absolute ceiling.
THRESHOLD = 4.5e7
data = data[~(data.price > THRESHOLD)]

# For each size-like column, look at price per unit and flag rows whose
# z-score of that ratio is above 5 (only unusually *high* ratios are flagged).
specific_cols = ['distanceKM', 'taxiDurationMin', 'weight']
removed_indices = []
for col in specific_cols:
    price_per_unit = data['price'] / data[col]
    # Ignore ratios that are undefined or infinite (e.g. division by zero).
    finite_ratio = price_per_unit[~price_per_unit.isin([np.nan, np.inf, -np.inf])]
    z_score = (finite_ratio - np.mean(finite_ratio)) / np.std(finite_ratio)
    removed_indices.extend(z_score[z_score > 5].index.tolist())
# A row can be flagged by several columns; de-duplicate before dropping.
data = data.drop(sorted(set(removed_indices)))

# Fill test NANs
# Hand-entered replacement values for two specific test rows.
test_data.loc[12577, 'distanceKM']      = 52
test_data.loc[12577, 'taxiDurationMin'] = 50
test_data.loc[13853, 'distanceKM']      = 500
test_data.loc[13853, 'taxiDurationMin'] = 380

# Stack train and test so that both receive identical engineered columns.
all_data = pd.concat((data, test_data))
all_data['source']           = all_data['sourceLatitude']*all_data['sourceLongitude']
all_data['destination']      = all_data['destinationLatitude']*all_data['destinationLongitude']

ntrain = data.shape[0]
ntest  = test_data.shape[0]

categorical_vars = ['date', 'SourceState', 'destinationState', 'vehicleType', 'vehicleOption']

# One-hot encode the categorical columns, attach the indicator columns, then
# remove the raw columns.
# NOTE(review): get_dummies only expands object/category columns; if any of
# `categorical_vars` is numeric it passes through unchanged and is then
# removed by the drop below - confirm the dtypes.
dummies_data = pd.get_dummies(all_data[categorical_vars])
all_data[dummies_data.columns] = dummies_data[dummies_data.columns]
all_data = all_data.drop(categorical_vars, axis=1)

# Take explicit copies: later cells add prediction columns to `train` and
# `test`, and writing into a slice of `all_data` is ambiguous in pandas
# (SettingWithCopyWarning, hidden here by the global warnings filter).
train    = all_data[:ntrain].copy()
test     = all_data[ntrain:].copy()

# Model inputs: every column except the row identifier and the target.
feat_names = all_data.columns.tolist()
feat_names.remove('ID')
feat_names.remove('price')


#train = pd.read_pickle('dataFrames/train_updated_June27.pkl')
#test  = pd.read_pickle('dataFrames/test_updated_June27.pkl')

In [4]:
# Benchmark results, keyed by model name.
res = defaultdict(dict)

def benchmark(est, name=None, cv=False):
    """Fit ``est`` and record its timing and MAPE in the module-level ``res``.

    Parameters
    ----------
    est : estimator
        Object exposing ``fit`` and ``predict``.
    name : str, optional
        Key used in ``res``; defaults to the estimator's class name.
    cv : bool, default False
        If False, fit on ``X_train``/``y_train`` and score on
        ``X_val``/``y_val``. If True, run 3-fold cross-validation on
        ``X``/``y`` and store the mean score.

    Returns
    -------
    estimator
        The same ``est`` object (fitted when ``cv`` is False).

    Notes
    -----
    ``X_train``, ``y_train``, ``X_val``, ``y_val``, ``X`` and ``y`` are read
    from the notebook's global namespace; they must be defined before this
    function is called.
    """
    # Imported locally: the notebook's import cell brings in neither of these.
    from sklearn.metrics import make_scorer
    from sklearn.model_selection import cross_val_score

    if not name:
        name = est.__class__.__name__
    print("Started benchmarking    " + name + "        at time: ", datetime.datetime.now())
    if not cv:
        t0 = time.time()
        est.fit(X_train, y_train)
        res[name]['train_time'] = (time.time() - t0)/60
        t0 = time.time()
        pred = est.predict(X_val)
        res[name]['test_time'] = (time.time() - t0)/60
        res[name]['MAPE'] = mape_func(y_val, pred)
    else:
        t0 = time.time()
        # Scorer wrapping mape_func; the stored value is the mean raw MAPE
        # (in percent) across the folds, matching the 'MAPE' key above.
        mape_score = make_scorer(mape_func)
        res[name]['cv_score'] = np.mean(cross_val_score(est, X, y, scoring=mape_score, cv = 3))
        res[name]['cv_time'] = (time.time() - t0)/60
    print("Done benchmarking       " + name + "        at time: ", datetime.datetime.now())
    return est

In [5]:
# Wall-clock cost (minutes) of generating each train-side feature, by name.
generation_info_train = defaultdict(dict)

def train_feat_generate(est, name=None, split_number=5, random_state=None):
    """Add out-of-fold predictions of ``est`` to the global ``train`` frame.

    The global ``train`` is shuffled and cut into ``split_number`` contiguous
    folds. For each fold, ``est`` is fitted on the remaining folds (columns
    ``feat_names``, target ``price``) and predicts the held-out fold. The
    predictions are stored in a new column called ``name``.

    Parameters
    ----------
    est : estimator
        Object exposing ``fit`` and ``predict``; it is re-fitted once per fold.
    name : str, optional
        Name of the new column; defaults to the estimator's class name.
    split_number : int, default 5
        Number of folds.
    random_state : int or None, default None
        Forwarded to ``sklearn.utils.shuffle``; pass an int for a
        reproducible fold assignment.

    Returns
    -------
    pandas.DataFrame
        The shuffled ``train`` frame with the new column. The global
        ``train`` is rebound to this same frame.
    """
    global train
    if not name:
        name = est.__class__.__name__
    print("Started generating feature    " + name + "        at time: ", datetime.datetime.now())
    train = shuffle(train, random_state=random_state)
    # Split row *positions* instead of the frame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes, and the
    # positional form yields exactly the same contiguous folds.
    folds = np.array_split(np.arange(len(train)), split_number)
    oof_pred = np.empty(len(train), dtype=float)
    t0 = time.time()
    for i, holdout_pos in enumerate(folds):
        print('Started count   ', i, '    ',  datetime.datetime.now())
        fit_pos = np.concatenate([pos for j, pos in enumerate(folds) if j != i])
        df_train = train.iloc[fit_pos]
        est.fit(df_train[feat_names], df_train.price)
        # ravel() tolerates estimators that return an (n, 1) array.
        oof_pred[holdout_pos] = np.ravel(est.predict(train.iloc[holdout_pos][feat_names]))
        print('Done with count ', i,'    ', datetime.datetime.now())
    # Every row was held out exactly once, so the column is fully populated.
    train[name] = oof_pred
    generation_info_train[name]['generation_time'] = (time.time() - t0)/60
    print("Done generating feature       " + name + "        at time: ", datetime.datetime.now())
    return train

In [6]:
# Wall-clock cost (minutes) of generating each test-side feature, by name.
generation_info_test = defaultdict(dict)

def test_feat_generate(est, name=None):
    """Fit ``est`` on all of ``train`` and store its predictions in ``test``.

    The global ``train`` is reshuffled, ``est`` is fitted on its
    ``feat_names`` columns against ``price``, and the predictions for the
    global ``test`` frame are written to a column named ``name`` (the
    estimator's class name when ``name`` is not given). The elapsed time in
    minutes is recorded in ``generation_info_test``.

    Returns the global ``test`` frame.
    """
    global train, test
    column = name if name else est.__class__.__name__
    print("Started generating test feature    " + column + "        at time: ", datetime.datetime.now())
    train = shuffle(train)
    started = time.time()
    est.fit(train[feat_names], train.price)
    test[column] = est.predict(test[feat_names])
    elapsed_minutes = (time.time() - started)/60
    generation_info_test[column]['generation_time'] = elapsed_minutes
    print("Done generating test feature       " + column + "        at time: ", datetime.datetime.now())
    return test

In [7]:
# ---- Linear / neighbour models ----
# NOTE(review): `normalize=True` was removed from Ridge in scikit-learn 1.2.
# It is left as-is because swapping in a scaler changes the scaling
# semantics; revisit when upgrading scikit-learn.
ridge        = Ridge(alpha=0.0001, normalize=True)
knn          = KNeighborsRegressor(2)
lasso        = Lasso(fit_intercept = True, random_state=5)
enet         = make_pipeline(RobustScaler(), ElasticNet(alpha=0.8, l1_ratio=.9, random_state=5))

# ---- Gradient boosting ----
# NOTE(review): `silent` and `nthread` are legacy XGBoost argument names.
xgboosting   = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                        learning_rate=0.01, max_depth=12, 
                        min_child_weight=1.7817, n_estimators=8000,
                        reg_alpha=0.9640, reg_lambda=0.8571,
                        subsample=1, silent=1,
                        random_state =5 , nthread = -1)
gboosting    = GradientBoostingRegressor(n_estimators=15000, learning_rate=0.01,
                                  max_depth=10, max_features='sqrt',
                                  min_samples_leaf=15, min_samples_split=10, 
                                  loss='huber', random_state = 42)

# ---- Tree ensembles ----
randomForest = RandomForestRegressor(n_estimators=800, max_features=17, random_state=5, bootstrap=False, n_jobs=10)
extraTree    = ExtraTreesRegressor(n_estimators=3000, max_features=23, random_state=5, bootstrap=False, n_jobs=4)
adaBoost     = AdaBoostRegressor(n_estimators=600, learning_rate=0.01, loss='exponential', random_state=5)
# `presort=False` used to be passed here; it was the default value and the
# argument was removed in scikit-learn 0.24, so omitting it keeps the same
# model while avoiding a TypeError on newer versions.
decTree      = DecisionTreeRegressor( splitter='best', max_depth=16, min_samples_split=20, 
                             min_samples_leaf=10, min_weight_fraction_leaf=0.0, max_features=None, 
                             random_state=5, max_leaf_nodes=None)
bagging      = BaggingRegressor(n_estimators=600, max_samples=1.0, max_features=0.9, random_state=5, verbose=1)
lightgbm     = lgb.LGBMRegressor(objective='regression',num_leaves=25, save_binary = True,  
                          learning_rate=0.01, n_estimators=60000,
                          max_bin = 150, bagging_fraction = 0.95,
                          bagging_freq = 4, feature_fraction = 0.8,
                          feature_fraction_seed=50, bagging_seed=20,
                          min_data_in_leaf = 11, min_sum_hessian_in_leaf = 11)

cb_model = CatBoostRegressor(iterations=1200, learning_rate=0.1, depth=15, eval_metric='MAPE', random_seed = 7,
                             bagging_temperature = 1.5, od_type='Iter', metric_period = 100, od_wait=20)

# Full roster of candidate models ...
models = [ridge, knn, lasso, enet, xgboosting, gboosting, lightgbm, bagging, decTree, 
          adaBoost, extraTree, randomForest, cb_model]

# ... immediately narrowed to the single model processed by the loops below.
# Remove or edit this line to generate features for more models.
models = [cb_model]

In [None]:
# Add one out-of-fold prediction column to `train` per model in `models`.
# The frame is written to disk after every model, so the files always hold
# the columns generated so far. Both writes expect the `dataFrames/`
# directory to exist.
# Note: the loop variable `model` stays bound after the loop and is read
# again by a later cell.
for model in models:
    train = train_feat_generate(model)
    train.to_pickle('dataFrames/train_updated_June27.pkl')
    train.to_csv('dataFrames/train_updated_June27.csv')

In [None]:
# Add one prediction column to `test` per model in `models`; each model is
# fitted on the full `train` frame inside test_feat_generate. The frame is
# written to disk after every model. Both writes expect the `dataFrames/`
# directory to exist.
for model in models:
    test = test_feat_generate(model)
    test.to_pickle('dataFrames/test_updated_June27.pkl')
    test.to_csv('dataFrames/test_updated_June27.csv')

In [None]:
# MAPE of the CatBoost prediction column against the true training prices.
mape_func(train['price'], train['CatBoostRegressor'])

In [None]:
from mlxtend.regressor import StackingRegressor#, StackingCVRegressor

# Base learners whose predictions are handed to the meta-regressor.
# `gboosting` appears both as a base learner and as the meta-regressor.
stack_base_regressors = (
    randomForest,
    extraTree,
    decTree,
    bagging,
    gboosting,
    xgboosting,
    lightgbm,
    cb_model,
)
stack_gen = StackingRegressor(regressors=stack_base_regressors, meta_regressor=gboosting)

In [None]:
# Out-of-fold predictions of the stacked model, stored in train[name].
train = shuffle(train)
split_number=3
t0 = time.time()

name = "Stack-gen-fewer"

# Plain NumPy arrays are passed to the stacker. `.values` replaces
# DataFrame.as_matrix(), which was removed in pandas 1.0.
X_all = train[feat_names].values
y_all = train.price.values

# Split row positions rather than the frame itself: np.array_split on a
# DataFrame goes through the deprecated DataFrame.swapaxes, and the
# positional form yields exactly the same contiguous folds.
folds = np.array_split(np.arange(len(train)), split_number)
oof_pred = np.empty(len(train), dtype=float)

print("Started generating feature    " + name + "        at time: ", datetime.datetime.now())
for i, holdout_pos in enumerate(folds):
    print('Started count   ', i, '    ',  datetime.datetime.now())
    fit_pos = np.concatenate([pos for j, pos in enumerate(folds) if j != i])
    stack_gen.fit(X_all[fit_pos], y_all[fit_pos])
    # ravel() tolerates a prediction array of shape (n, 1).
    oof_pred[holdout_pos] = np.ravel(stack_gen.predict(X_all[holdout_pos]))
    print('Done with count ', i,'    ', datetime.datetime.now())

# Every row was held out exactly once, so the column is fully populated.
train[name] = oof_pred
print("Done generating feature       " + name + "        at time: ", datetime.datetime.now())


# Timing / score bookkeeping for the stacked model, keyed by feature name.
res_stack_gen = defaultdict(dict)
res_stack_gen[name]['train_generation_time'] =  (time.time() - t0)/60

In [None]:
# Fit the stacked model on the whole training frame and predict the test rows.
# `name` and `res_stack_gen` come from the out-of-fold cell above.
print("Started generating test feature    " + name + "        at time: ", datetime.datetime.now())
train = shuffle(train)
t0 = time.time()
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
stack_gen.fit(train[feat_names].values, train.price.values)
test[name] = stack_gen.predict(test[feat_names].values)
res_stack_gen[name]['test_generation_time'] = (time.time() - t0)/60
print("Done generating test feature       " + name + "        at time: ", datetime.datetime.now())

In [None]:
# Persist both frames, including the stacked-model column.
# Binary snapshots first ...
train.to_pickle('dataFrames/train_updated_June29.pkl')
test.to_pickle('dataFrames/test_updated_June29.pkl')
# ... then the CSV copies.
train.to_csv('dataFrames/train_updated_June29.csv')
test.to_csv('dataFrames/test_updated_June29.csv')

In [None]:
# MAPE of a `StackingRegressor` column against the true training prices.
# NOTE(review): none of the visible cells creates a column with this name -
# the stacked feature above is stored as "Stack-gen-fewer". Presumably this
# column comes from an earlier run / reloaded pickle; confirm or remove.
mape_func(train.price, train.StackingRegressor)

In [None]:
# MAPE of the out-of-fold stacked predictions against the true prices.
# The column name contains hyphens, so it has to be accessed with brackets:
# attribute syntax would be parsed as `train.Stack - gen - fewer`.
mape_func(train.price, train['Stack-gen-fewer'])

In [None]:
# Score the stacked model on its out-of-fold predictions (column train[name]).
# NOTE(review): the original cell scored `y_val` / `pred`, which are not
# defined anywhere in the notebook; the out-of-fold column is used instead -
# confirm this is the intended validation signal.
t0 = time.time()
stack_mape = mape_func(train.price, train[name])

# 'test_time' here only covers the scoring step above.
res_stack_gen[name]['test_time'] = (time.time() - t0)/60
res_stack_gen[name]['MAPE'] = stack_mape

# Build the summary table from the stacked-model results
# (not from `res`, which holds the single-model benchmark results).
res_stack_gen_df = pd.DataFrame(data=res_stack_gen).T
res_stack_gen_df.to_pickle('dataFrames/benchmarking_results_stack_gen.pkl')
res_stack_gen_df.sort_values('MAPE')

In [None]:
# Averaged stack of LightGBM and XGBoost with a ridge meta-model.
# NOTE(review): `StackingAveragedModels` is neither defined nor imported in
# the visible cells; it has to be provided before this cell can run.
avg_lgbm_xgb  = StackingAveragedModels(base_models = (lightgbm, xgboosting), meta_model = ridge)

# Generate the out-of-fold feature for the model built above. The original
# call passed `model`, the stale loop variable left over from an earlier
# cell, so `avg_lgbm_xgb` was never used.
train = train_feat_generate(avg_lgbm_xgb)

In [None]:
# Two alternative two-model stacks, each with `ridge` on top.
# Earlier variant kept for reference:
#stacked_averaged_models  = StackingAveragedModels(base_models = (xgboosting, randomForest), meta_model = ridge)
stacked_averaged_models = StackingAveragedModels(
    base_models=(lightgbm, extraTree),
    meta_model=ridge,
)
stacking_regressor = StackingRegressor(
    regressors=[xgboosting, randomForest],
    meta_regressor=ridge,
)