In [1]:
import warnings 
warnings.filterwarnings('ignore')
import time 
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
#from scipy.special import boxcox1p
from sklearn import linear_model
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC, Ridge, BayesianRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor 
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
#from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

import itertools
import xgboost as xgb
from sklearn.svm import SVR
import lightgbm as lgb
#from mlxtend.regressor import StackingRegressor

In [2]:
def get_score(model, X, y_true):
    """Score ``model`` on ``X`` with the mean absolute percentage error.

    ``model`` only needs a ``predict(X)`` method.  Its output is compared
    element-wise with ``y_true`` and each error is normalised by ``y_true``.
    The result is expressed in percent (i.e. already multiplied by 100).
    """
    predicted = model.predict(X)
    relative_error = (y_true - predicted) / y_true
    return 100 * np.mean(np.abs(relative_error))

def mean_absolute_precision_error(y_pred, y_true):
    """Mean absolute percentage error between predictions and targets.

    Note the argument order: predictions first, ground truth second.  Each
    absolute error is divided by the matching ``y_true`` value and the mean
    is returned in percent.
    """
    relative_error = (y_true - y_pred) / y_true
    return 100 * np.mean(np.abs(relative_error))

In [3]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor that returns the plain mean of its base models.

    Parameters
    ----------
    models : iterable of estimators
        Unfitted base regressors.  They are never fitted directly; ``fit``
        works on clones and stores them in ``models_``.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone every base model, fit each clone on ``(X, y)``, return ``self``."""
        # All clones are created up front, then trained one after another.
        self.models_ = list(map(clone, self.models))
        for fitted in self.models_:
            fitted.fit(X, y)
        return self

    def predict(self, X):
        """Average the fitted clones' predictions for ``X`` row by row."""
        # One column per base model, then collapse the columns with a mean.
        per_model = [fitted.predict(X) for fitted in self.models_]
        return np.column_stack(per_model).mean(axis=1)

# Data gathering

In [4]:
# Directory holding the competition's train.csv / test.csv.
DATA_DIR = '/Users/mohsenkiskani/.kaggle/competitions/ubaar-competition'

data      = pd.read_csv(DATA_DIR + '/train.csv')
test_data = pd.read_csv(DATA_DIR + '/test.csv')

# Remove NANs
data = data.dropna(axis=0)

# Remove outliers
# DataFrame.drop returns a new frame; without assigning the result back the
# row labelled 28098 would stay in the training data.
data = data.drop([28098])

# Drop every row whose price exceeds the hard ceiling.
THRESHOLD = 3.5e7
overpriced = data[data.price > THRESHOLD]
data = data.drop(overpriced.index.tolist())

# For each denominator column, z-score the price-per-unit ratio (ignoring
# NaN / +-inf ratios) and mark rows more than 5 standard deviations above
# the mean for removal.  Only the upper tail is filtered.
specific_cols = ['distanceKM', 'taxiDurationMin', 'weight']
removed_indices = []
for col in specific_cols:
    price_per_unit = data['price'] / data[col]
    finite_ratio = price_per_unit[~price_per_unit.isin([np.nan, np.inf, -np.inf])]
    z_score = (finite_ratio - np.mean(finite_ratio)) / np.std(finite_ratio)
    removed_indices.extend(z_score[z_score > 5].index.tolist())
# A row can be flagged by several columns; de-duplicate before dropping.
data = data.drop(sorted(set(removed_indices)))

# Fill test NANs
# Hand-entered replacement values for two test rows.
test_data.loc[12577, 'distanceKM']      = 52
test_data.loc[12577, 'taxiDurationMin'] = 50
test_data.loc[13853, 'distanceKM']      = 500
test_data.loc[13853, 'taxiDurationMin'] = 380

# Stack train and test so both receive exactly the same engineered and
# one-hot columns.  Row labels are kept as loaded (no ignore_index), so
# train and test labels may overlap inside all_data.
all_data = pd.concat((data, test_data))
all_data['source']      = all_data['sourceLatitude'] * all_data['sourceLongitude']
all_data['destination'] = all_data['destinationLatitude'] * all_data['destinationLongitude']

ntrain = data.shape[0]
ntest  = test_data.shape[0]

categorical_vars = ['date', 'SourceState', 'destinationState', 'vehicleType', 'vehicleOption']

# NOTE(review): get_dummies only expands object/category columns; a numeric
# column in categorical_vars (possibly 'date') would be passed through
# unchanged and then removed by the drop below -- confirm the dtypes.
dummies_data = pd.get_dummies(all_data[categorical_vars])
all_data[dummies_data.columns] = dummies_data[dummies_data.columns]
all_data.drop(categorical_vars, axis=1, inplace=True)

# Split back by position.  Explicit copies make train/test independent
# frames, so adding columns to `test` later does not write into a slice of
# all_data.
train    = all_data[:ntrain].copy()
test     = all_data[ntrain:].copy()

#X = train.drop(['ID','price'],axis=1)
#y = train.price

#X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
#X_train.head()

# Initial models 

In [5]:
# Stand-alone gradient boosting model.  random_state pins the per-split
# feature sub-sampling implied by max_features='sqrt', so repeated fits on
# the same data give the same model.
GBoost_F = GradientBoostingRegressor(n_estimators=15000, learning_rate=0.01,
                                   max_depth=10, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, loss='huber',
                                   random_state=42)

# NOTE(review): `silent` and `nthread` look like the legacy spellings of
# `verbosity` / `n_jobs` in newer xgboost releases -- confirm against the
# installed version before upgrading.
xgb_F = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                        learning_rate=0.01, max_depth=12, 
                        min_child_weight=1.7817, n_estimators=8000,
                        reg_alpha=0.9640, reg_lambda=0.8571,
                        subsample=1, silent=1, nthread = -1)

# Bagging and feature sub-sampling are seeded explicitly here.
lgb_F = lgb.LGBMRegressor(objective='regression',num_leaves=25,  
                          learning_rate=0.01, n_estimators=60000,
                          max_bin = 150, bagging_fraction = 0.95,
                          bagging_freq = 4, feature_fraction = 0.8,
                          feature_fraction_seed=50, bagging_seed=20,
                          min_data_in_leaf = 11, min_sum_hessian_in_leaf = 11)

# Unweighted average of the XGBoost and LightGBM regressors.
avg_model_F = AveragingModels(models = (xgb_F, lgb_F))

In [None]:
start_time = time.time()

# Columns this cell writes into `test`; they must never be used as features.
PREDICTION_COLS = ['y_gboost', 'y_xgb', 'y_lgb', 'y_avg_lgb_xgb']

# Build each design matrix exactly once so every model sees the same columns.
train_features = train.drop(['ID', 'price'], axis=1)
train_target   = train.price
# errors='ignore' keeps the cell re-runnable: on a second run `test` already
# carries the prediction columns, and they are stripped again here instead
# of leaking into the features.
test_features = (test.drop(['ID', 'price'], axis=1)
                     .drop(PREDICTION_COLS, axis=1, errors='ignore'))

# Same fitting order as before: the three individual models, then the
# averaging wrapper.
for estimator in (GBoost_F, xgb_F, lgb_F, avg_model_F):
    estimator.fit(train_features, train_target)

test['y_gboost']      = GBoost_F.predict(test_features)
test['y_xgb']         = xgb_F.predict(test_features)
test['y_lgb']         = lgb_F.predict(test_features)
test['y_avg_lgb_xgb'] = avg_model_F.predict(test_features)

# Persist the test frame together with all four prediction columns.
test.to_pickle('dataFrames/test_OneHotEncoding_Final_15th.pkl')

# Wall-clock duration of the whole cell, in minutes.
print( '%.2f' % float((time.time() - start_time)/60 )  + ' mins')

In [11]:
filename = "/Users/mohsenkiskani/Downloads/Ubaar/submissions/submission39_Gboost_best.csv"
# Write one "ID,price" line per test row, using the gradient boosting
# prediction rounded up to the next integer.
# IDs and predictions are paired by position (zip) rather than looked up by
# the labels 0..n-1, so the output does not depend on how `test` is indexed.
with open(filename, "w") as outputfile:
    outputfile.write("ID,price\n")
    for row_id, predicted_price in zip(test.ID, test.y_gboost):
        outputfile.write(str(row_id) + "," + str(int(np.ceil(predicted_price))) + "\n")

In [12]:
filename = "/Users/mohsenkiskani/Downloads/Ubaar/submissions/submission40_Avg_best.csv"
# Write one "ID,price" line per test row, using the averaged
# XGBoost/LightGBM prediction rounded up to the next integer.
# IDs and predictions are paired by position (zip) rather than looked up by
# the labels 0..n-1, so the output does not depend on how `test` is indexed.
with open(filename, "w") as outputfile:
    outputfile.write("ID,price\n")
    for row_id, predicted_price in zip(test.ID, test.y_avg_lgb_xgb):
        outputfile.write(str(row_id) + "," + str(int(np.ceil(predicted_price))) + "\n")