In [1]:
import warnings 
warnings.filterwarnings('ignore')
import time 
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
from scipy.special import boxcox1p
from sklearn import linear_model
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC, Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor, AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

import xgboost as xgb
from sklearn.svm import SVR
from mlxtend.regressor import StackingRegressor

In [2]:
def normalize_column(col):
    return (col - np.mean(col)) / np.std(col)

def get_score(model, X, y_true):
    y_pred = model.predict(X)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def scale_minmax(col):
    return (col-col.min())/(col.max()-col.min())

def mean_absolute_precision_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def one_hot_encode(dataframe, columns, rem_original_cols=False):
    """
    @param dataframe pandas DataFrame
    @param columns a list of columns to encode 
    @param rem_original_cols if True remove the original column in the resulting dataframe
    @return a DataFrame with one-hot encoding
    """
    for column in columns:
        dummies = pd.get_dummies(dataframe[column], prefix=column, drop_first=False)
        dataframe = pd.concat([dataframe, dummies], axis=1)
        if rem_original_cols:
            dataframe.drop(columns=[column], inplace=True)  
    return dataframe

# Data gathering 

In [4]:
data      = pd.read_csv('/Users/mohsenkiskani/.kaggle/competitions/ubaar-competition/train.csv')
test_data = pd.read_csv('/Users/mohsenkiskani/.kaggle/competitions/ubaar-competition/test.csv')

data      = data.dropna(axis = 0)

test_data.loc[12577, 'distanceKM']      = 52
test_data.loc[12577, 'taxiDurationMin'] = 50
test_data.loc[13853, 'distanceKM']      = 500
test_data.loc[13853, 'taxiDurationMin'] = 380

all_data = pd.concat((data, test_data)) 

all_data['source']           = all_data['sourceLatitude']*all_data['sourceLongitude']
all_data['destination']      = all_data['destinationLatitude']*all_data['destinationLongitude']

min_price = min(all_data['price'])

ntrain = data.shape[0]
ntest  = test_data.shape[0]

categorical_vars = ['date', 'SourceState', 'destinationState', 'vehicleType', 'vehicleOption']

all_data = all_data.copy()
categorical_var_encoders = {}
for var in categorical_vars:
    le = preprocessing.LabelEncoder().fit(all_data[var])
    all_data[var + '_ids']  = le.transform(all_data[var])
    all_data[var + '_ids']  = all_data[var + '_ids'].astype('int32')
    all_data.pop(var)
    categorical_var_encoders[var] = le

all_data = one_hot_encode(all_data, ['vehicleType_ids', 'vehicleOption_ids'], rem_original_cols=True)
    
train    = all_data[:ntrain]
test     = all_data[ntrain:]

X = train.drop(['ID','price'],axis=1)
y = train.price

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,destinationLatitude,destinationLongitude,distanceKM,sourceLatitude,sourceLongitude,taxiDurationMin,weight,source,destination,date_ids,...,vehicleType_ids_3,vehicleOption_ids_0,vehicleOption_ids_1,vehicleOption_ids_2,vehicleOption_ids_3,vehicleOption_ids_4,vehicleOption_ids_5,vehicleOption_ids_6,vehicleOption_ids_7,vehicleOption_ids_8
39085,36.473089,52.349822,184.0,35.700109,51.399743,199.0,21.0,1834.976428,1909.359717,124,...,1,0,0,1,0,0,0,0,0,0
30892,35.704176,51.40028,331.0,37.275731,49.584392,254.0,1.67,1848.294458,1835.204644,118,...,0,0,0,0,0,0,1,0,0,0
45277,35.699924,51.396715,447.0,32.665899,51.663805,285.0,19.0,1687.644636,1834.858819,83,...,1,0,0,0,0,0,0,0,1,0
16398,30.199563,53.182966,809.0,35.699078,51.401589,525.0,4.0,1834.989335,1606.102332,151,...,0,0,0,0,0,0,1,0,0,0
13653,27.180941,56.277756,1144.0,34.643252,50.877469,750.0,2.0,1762.56098,1529.682365,85,...,0,0,0,0,0,0,1,0,0,0


# Modelling

In [None]:
#train_bag = pd.read_pickle('train_bag-gboost-xgb-bag.pkl')

#X_train, X_val = train_test_split(train_bag.drop(['ID','y_gboost'], axis=1), 
#                                          test_size=0.33, random_state=42)


#y_train = X_train.price
#X_train = X_train.drop(['price'], axis=1)
#y_val   = X_val.price
#X_val   = X_val.drop(['price'], axis=1)

In [6]:
start_time = time.time()
GBoost = GradientBoostingRegressor(n_estimators=2200, learning_rate=0.05,
                                   max_depth=10, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

GBoost.fit(X_train, y_train)
print( '%.2f' % float((time.time() - start_time)/60 ) )
get_score(GBoost,X_val,y_val)
# score = 17.344001531383512

3.08


17.231427718495738

In [7]:
start_time = time.time()
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=10, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =5 , nthread = -1)

model_xgb.fit(X_train, y_train)
print( '%.2f' % float((time.time() - start_time)/60 ) )
get_score(model_xgb,X_val,y_val)

# score = 18.431812261871737

2.44


18.061770665332975

In [8]:
Bag = BaggingRegressor(n_estimators=100, max_samples=1.0, max_features=1.0, random_state=5, verbose=1)
Bag.fit(X_train, y_train)
get_score(Bag,X_val,y_val)
# 19.124410611169388

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   37.4s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s finished


19.068860946763646

In [14]:
lasso = Lasso()

clf_lasso = GridSearchCV(lasso, {'alpha': [1,1e1,1e2,1e3]}, verbose=1)
clf_lasso.fit(X_train,y_train)
print(clf_lasso.best_params_)
get_score(clf_lasso,X_val,y_val)
# {'alpha': 100.0}
# 40.78453998923231

Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=1)]: Done  21 out of  21 | elapsed:   22.4s finished


{'alpha': 100.0}


38.36124859141877

In [13]:
reg_lassolars  =  linear_model.LassoLars(alpha = 2)
reg_lassolars.fit(X_train, y_train)
get_score(reg_lassolars,X_val,y_val)
# 40.775030093207896

38.34742269801275

In [12]:
knn_val = KNeighborsClassifier(2)
knn_val.fit(X_train, y_train) 
get_score(knn_val,X_val,y_val)
# 32.25103227931944

32.46743416954558

In [11]:
linear_reg = LinearRegression()
linear_reg.fit(X_train,y_train)
get_score(linear_reg,X_val,y_val)
# 40.78006760576269

38.35952262552975

In [9]:
reg_dec_tree = DecisionTreeRegressor(max_depth=15)
reg_dec_tree.fit(X_train, y_train)
get_score(reg_dec_tree,X_val,y_val)
# 23.467060408266345

22.964376680861278

In [10]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.8, l1_ratio=.9, random_state=3))
ENet.fit(X_train, y_train)
get_score(ENet,X_val,y_val)

# score = 38.67009125705459

36.67135467581477

In [15]:
reg_dec_tree = DecisionTreeRegressor(max_depth=10, criterion = 'mae', max_features='sqrt',
                                     random_state=5, min_samples_leaf = 12, min_samples_split = 2)
reg_dec_tree.fit(X_train, y_train)
get_score(reg_dec_tree,X_val,y_val)
# 26.002284134401865

26.0226655271857

In [16]:
ridge = Ridge()

clf_ridge = GridSearchCV(ridge, {'alpha': [1e-2,1e-1,1,1e1,1e2]}, verbose=1)

clf_ridge.fit(X_train,y_train)
print(clf_ridge.best_params_)
get_score(clf_ridge,X_val,y_val)
# 40.78025846231952

Fitting 3 folds for each of 7 candidates, totalling 21 fits
{'alpha': 1}


[Parallel(n_jobs=1)]: Done  21 out of  21 | elapsed:    0.5s finished


38.358013001501675

# Stale models

In [None]:
#svr_rbf = SVR()

#clf_svr = GridSearchCV(svr_rbf,
#                   {'C': [1e-3],
#                    'kernel': ['rbf'],
#                    'gamma': [1e-3]}, verbose=1)

#clf_svr.fit(X_train,y_train)
#print(clf_svr.best_params_)
#get_score(clf_svr,X_val,y_val)

In [None]:
#make_pipeline(RobustScaler(), Lasso(alpha =1, random_state=1))
#lasso.fit(X_train, y_train)
#get_score(lasso,X_val,y_val)

# score = 53.923366658249186

In [None]:
# Computationally intensive 
#KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
#KRR.fit(X_train, y_train)
#get_score(KRR,X_val,y_val)

# score = 33.473933712491295

In [None]:
#ABoost = AdaBoostRegressor(n_estimators=1000, learning_rate=0.05, loss='linear', random_state=5)
#ABoost.fit(X_train, y_train)
#get_score(ABoost,X_val,y_val)

# score = 58.54226269060756

In [None]:
#xgb_model = xgb.XGBRegressor()
#clf = GridSearchCV(xgb_model,
#                   {'max_depth': [2,4,6],
#                    'n_estimators': [50,100,200]}, verbose=1)

#clf.fit(X_train,y_train)
#print(clf.best_params_)
#get_score(clf,X_val,y_val)

In [None]:
#RForest = RandomForestRegressor(n_estimators=20, criterion='mae', random_state = 5)
##                                max_depth=4, max_features='sqrt',
##                                min_samples_leaf=5, min_samples_split=3,
##                                random_state = 42)
#RForest.fit(X_train, y_train)
#get_score(RForest,X_val,y_val)

In [None]:
#dec_tree = DecisionTreeRegressor()

#clf_dec_tree = GridSearchCV(dec_tree, 
#                            {'criterion' : ['mae'], 
#                             'max_depth' : [2,4,6,8,10,12,14,16,18,20,22,24],
#                             'min_samples_split' : [2,4,6,8,10], 
#                             'min_samples_leaf' : [2,4,6,8,10,12,14,16,18,20,22,24],
#                             'max_features' : ['sqrt'], 
#                             'random_state' : [5]
#                            }, verbose=1)
#clf_dec_tree.fit(X_train,y_train)
#print(clf_dec_tree.best_params_)
#get_score(clf_dec_tree,X_val,y_val)

In [None]:
#xgb_model = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
#                             learning_rate=0.05, min_child_weight=1.7817,
#                             reg_alpha=0.4640, reg_lambda=0.8571,
#                             subsample=0.5213, silent=1, nthread = -1)

#GBoost = GradientBoostingRegressor( learning_rate=0.05, max_features='sqrt',
#                                   min_samples_leaf=15, min_samples_split=10, 
#                                   loss='huber')

#Bag = BaggingRegressor()#base_estimator=KNeighborsRegressor())

#regressors = [xgb_model, Bag]

#stregr = StackingRegressor(regressors=regressors, meta_regressor=GBoost)

#params = {'xgbregressor__n_estimators': [2000],#[100, 200, 500, 1000],
#          'xgbregressor__max_depth': [8],#[2,4,8,16],
#          'meta-gradientboostingregressor__n_estimators': [2000],#[100, 200, 500, 1000],
#          'meta-gradientboostingregressor__max_depth': [48],#[2,4,8,16],
#          'baggingregressor__base_estimator': [KNeighborsRegressor()]}#, DecisionTreeRegressor()]}

#grid = GridSearchCV(estimator=stregr, 
#                    param_grid=params, 
#                    cv=3,
#                    refit=True)
#grid.fit(X_train, y_train)
#print(grid.best_params_)
#get_score(grid,X_val,y_val)

# Stacking

In [None]:
stacking_regressor = StackingRegressor(regressors=[model_xgb, Bag], meta_regressor = linear_reg)

# Training the stacking classifier

stacking_regressor.fit(X_train, y_train)
get_score(stacking_regressor,X_val,y_val)

In [None]:
pred1 = model_xgb.predict(X_val)
pred2 = GBoost.predict(X_val)
pred3 = Bag.predict(X_val)
score1 = mean_absolute_precision_error(y_val, pred1)
score2 = mean_absolute_precision_error(y_val, pred2)
score3 = mean_absolute_precision_error(y_val, pred3)

In [None]:
def max_pred(p_true,p1,p2,p3):
    
    min_score = 100
    w1 = 0 
    w2 = 0 
    
    for j in range(1,100):
        for i in range(j,100):
            p_pred = 0.01*i*p1+(1-0.01*i-0.01*j)*p1 + 0.01*j*p2 
            score  = mean_absolute_precision_error(y_val, p_pred)
            if score < min_score:
                min_score = score 
                w1 = 0.01*i 
                w2 = 0.01*j 
    return min_score, w1, w2

In [None]:
ms,a1,a2 = max_pred(y_val,pred1,pred2,pred3)
print(ms,a1,a2)

In [None]:
pred1 = model_xgb.predict(X_val)
pred3 = Bag.predict(X_val)
score1 = mean_absolute_precision_error(y_val, pred1)
score3 = mean_absolute_precision_error(y_val, pred3)
score  = mean_absolute_precision_error(y_val, pred)
print(score1, score3, np.mean([score1,score3]), score) 

# Other's Stacking models

#### Averaged base models class

In [None]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)

In [None]:
averaged_models = AveragingModels(models = (model_xgb, GBoost, Bag))
averaged_models.fit(X_train, y_train)
get_score(averaged_models,X_val,y_val)


#### Stacking Averaged models Score

In [None]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                #instance.fit(X[train_index], y[train_index])
                instance.fit(X.iloc[train_index], y.iloc[train_index])
                y_pred = instance.predict(X.iloc[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [None]:
stacked_averaged_models  = StackingAveragedModels(base_models = (model_xgb, Bag),meta_model = linear_reg)
stacked_averaged_models.fit(X_train, y_train)
get_score(stacked_averaged_models,X_val,y_val)

# Trying a new model for each Vehicle Type

In [None]:
vTypes = set(all_data['vehicleType_ids'])
all_vType     = {}
final_models  = {}

for i in vTypes:
    tmp_vType = all_data[all_data['vehicleType_ids'] == i]
    #tmp_vType = tmp_vType.drop(['vehicleType_ids'], axis=1)
    all_vType[i] = tmp_vType
    start_time = time.time()
    vType_train = tmp_vType[tmp_vType['price'].notnull()]
    vType_test  = tmp_vType[tmp_vType['price'].isnull()].drop(['price'], axis=1)
    X = vType_train.drop(['ID','price', 'vehicleType_ids'],axis=1)
    y = vType_train['price']
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)

    if i != 1:
        minSampleLeaf = 20 
    else:
        minSampleLeaf = 15
    
    if i != 3:
        nEstimators = 1000
        minSampleSplit = 4
    else:
        nEstimators = 1500
        minSampleSplit = 2
    
    GBoost = GradientBoostingRegressor(learning_rate=0.1, max_features='sqrt', loss='huber',
                                       min_samples_leaf=minSampleLeaf, min_samples_split=minSampleSplit, 
                                       n_estimators=nEstimators, max_depth=8, alpha = 0.7)
    
    GBoost.fit(X_train,y_train)
    
    print("%.2f" % float((time.time() - start_time)/60 ) +" mins, vType: ",i, ", items: ", tmp_vType.shape[0],
          ", loss: ", "%.2f" % get_score(GBoost,X_val,y_val))
    
    GB_Final = GradientBoostingRegressor(learning_rate=0.1, max_features='sqrt', loss='huber',
                                         min_samples_leaf=minSampleLeaf, min_samples_split=minSampleSplit, 
                                         n_estimators=nEstimators, max_depth=8, alpha = 0.7)
    
    GB_Final.fit(X,y)
    final_models[i] = GB_Final  
    
    #clf_GBoost = GridSearchCV(GBoost, 
    #                      {
    #                       'min_samples_split': [2,4],
    #                       'n_estimators': [1000,1500], 
    #                       'max_depth': [4,8],
    #                       'min_samples_leaf': [20,15],
    #                      },  verbose=2)

    #clf_GBoost.fit(X_train,y_train)
    #, best model: ", GBoost.get_params)# clf_GBoost.best_params_) 

In [None]:
filename = "/Users/mohsenkiskani/Downloads/Ubaar/submissions/submission15.csv"
with open(filename,"w+") as outputfile:
    outputfile.write("ID,price\n")        
    
    for i in final_models.keys():
        tmp_test0            = test[test['vehicleType_ids']==i]
        tmp_test1            = tmp_test0.drop(['ID', 'price', 'vehicleType_ids'], axis=1)
        tmp_model            = final_models[i]
        tmp_test0['y_pred']  = tmp_model.predict(tmp_test1)
    
        for j in range(tmp_test0.shape[0]):
            y_pred_test = tmp_test0.iloc[j]['y_pred'] 
            if y_pred_test < 0:
                y_pred_test = min_price 
            outputfile.write(str(int(tmp_test0.iloc[j]['ID'] ))+","+str(int(np.ceil(y_pred_test)))+"\n")

# Trying a new model for each Vehicle Type and Vehicle Option ?

In [None]:
all_cats2 = all_data[['vehicleType_ids', 'vehicleOption_ids']].as_matrix().tolist()
all_cats2 = [(x[0],x[1]) for x in all_cats2 ]
all_cats2 = set(all_cats2)
all_data_cat2 = {}

for item in all_cats2:
    tmp_data1 = all_data[all_data['vehicleType_ids'] == item[0]]
    tmp_data2 = tmp_data1[tmp_data1['vehicleOption_ids'] == item[1]]
    tmp_data = tmp_data2.drop(['vehicleType_ids', 'vehicleOption_ids'], axis=1)
    all_data_cat2[item] = tmp_data
    #print(item, tmp_data.shape)

In [None]:
all_data_cat_shape2 = {}
for x in all_data_cat2.keys():
    
    cat_df2    = all_data_cat2[x]
    cat_train2 = cat_df2[cat_df2['price'].notnull()]
    cat_test2  = cat_df2[cat_df2['price'].isnull()].drop(['price'], axis=1)
    
    all_data_cat_shape2[x] = (cat_train2.shape[0], cat_test2.shape[0])

In [None]:
test_sum = pd.DataFrame() 
for item in  [(1,6)]: # all_cats2: 
    start_time = time.time()
    
    cat_df2    = all_data_cat2[item]
    cat_train2 = cat_df2[cat_df2['price'].notnull()]
    cat_test2  = cat_df2[cat_df2['price'].isnull()].drop(['price'], axis=1)

    X = cat_train2.drop(['ID','price'],axis=1)
    y = cat_train2['price']

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)

    #gboost_base = GradientBoostingRegressor()
    #GBoost      = GridSearchCV(gboost_base,
    #                   {'max_depth': [2,4,6,8,10,12],
    #                    'n_estimators': [50,100,200,500,1000]}, verbose=1)



    GBoost = GradientBoostingRegressor(n_estimators=2000, learning_rate=0.05,
                                       max_depth=6, max_features='sqrt',
                                       min_samples_leaf=8, min_samples_split=4, 
                                       loss='huber', random_state =5)

    GBoost.fit(X_train, y_train)
    #print(GBoost.best_params_)
    #'destinationLatitude','destinationLongitude','sourceLatitude','sourceLongitude',
    print(item, cat_df2.shape[0], "%.2f" % get_score(GBoost,X_val,y_val),
          "%.2f" % float((time.time() - start_time)/60 )) 


#cat_test['pred_price'] = GBoost.predict(cat_test.drop(['ID'], axis=1))
#cat_test['pred_price'] = cat_test['pred_price'].apply((lambda x: max(x, min_price) ))
#test_sum = pd.concat([test_sum,cat_test])

#test_sum.shape

# Final model

In [None]:
# Current best model 
start_time = time.time()
Final = GradientBoostingRegressor(n_estimators=2200, learning_rate=0.05,
                                   max_depth=10, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
Final.fit(train.drop(['ID','price'],axis=1), train.price)
y_pred_test = Final.predict(test.drop(['ID','price'],axis=1))
print( '%.2f' % float((time.time() - start_time)/60 ) )

# Save to file

In [None]:
filename = "/Users/mohsenkiskani/Downloads/Ubaar/submissions/submission24.csv"
with open(filename,"w+") as outputfile:
    outputfile.write("ID,price\n")
    for i in range(y_pred_test.shape[0]):
        if y_pred_test[i] < 0:
            y_pred_test[i] = 100000 
        outputfile.write(str(test_data.ID[i])+","+str(int(np.ceil(y_pred_test[i])))+"\n")

### Deal with high skewness

In [None]:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
# Check the skew of all numerical features
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)

#print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
#print(skewness.head(10))

skewness = skewness[abs(skewness) > 0.75]
#print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)
#all_data[skewed_features] = np.log1p(all_data[skewed_features])

# Plots 

In [None]:
fig1, (ay1, ay2) = plt.subplots(2, 1)


ay1.scatter(x = X_train['sourceLatitude'], y = y_train, marker = "+")
ay1.set_title('Price vs sourceLatitude')
ay1.set_xlabel('sourceLatitude')
ay1.set_ylabel('Price')

ay2.scatter(x = X_train['sourceLongitude'], y = y_train, marker = "+")
ay2.set_title('Price vs sourceLongitude')
ay2.set_xlabel('sourceLongitude')
ay2.set_ylabel('Price')

fig1.set_size_inches(28.5, 10.5)
fig1.savefig("/Users/mohsenkiskani/Downloads/Ubaar/plots/sourceEffects.png", dpi=100)
plt.show()

In [None]:
fig2, (ay3, ay4) = plt.subplots(2, 1)


ay3.scatter(x = X_train['destinationLatitude'], y = y_train, marker = "+")
ay3.set_title('Price vs destinationLatitude')
ay3.set_xlabel('destinationLatitude')
ay3.set_ylabel('Price')

ay4.scatter(x = X_train['destinationLongitude'], y = y_train, marker = "+")
ay4.set_title('Price vs destinationLongitude')
ay4.set_xlabel('destinationLongitude')
ay4.set_ylabel('Price')

fig2.set_size_inches(28.5, 10.5)
fig2.savefig("/Users/mohsenkiskani/Downloads/Ubaar/plots/destinationEffects.png", dpi=100)
plt.show()

In [None]:
fig3, (ay3, ay4, ay5) = plt.subplots(3, 1)


ay3.scatter(x = X_train['distanceKM'], y = y_train, marker = "+")
ay3.set_title('Price vs distanceKM')
ay3.set_xlabel('distanceKM')
ay3.set_ylabel('Price')

ay4.scatter(x = X_train['taxiDurationMin'], y = y_train, marker = "+")
ay4.set_title('Price vs taxiDurationMin')
ay4.set_xlabel('taxiDurationMin')
ay4.set_ylabel('Price')

ay5.scatter(x = X_train['weight'], y = y_train, marker = "+")
ay5.set_title('Price vs weight')
ay5.set_xlabel('weight')
ay5.set_ylabel('Price')

fig3.set_size_inches(28.5, 10.5)
fig3.savefig("/Users/mohsenkiskani/Downloads/Ubaar/plots/distance-time-Effects.png", dpi=100)
plt.show()

In [None]:
corrmat = data.corr()
plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=0.9, square=True)

In [None]:
sns.distplot(y_train , fit=norm);
(mu, sigma) = norm.fit(y_train)

print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(y_train, plot=plt)
plt.show()

In [None]:
y_train_lp = np.log1p(y_train)

sns.distplot(y_train_lp, fit=norm);
(mu, sigma) = norm.fit(y_train_lp)
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

#Now plot the distribution
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(y_train_lp, plot=plt)
plt.show()

In [None]:
all_dates = data['date'].unique()
date_price_vec = []

for date in all_dates:
    date_price_vec.append(np.mean(data.query('date == ' + str(date) ).price))
    
states = all_data['SourceState_ids'].unique()
source_price_vec = []
destin_price_vec = []

for state in states:
    source_price_vec.append(np.mean(data.query('date == ' + str(state) ).price))
    destin_price_vec.append(np.mean(data.query('date == ' + str(state) ).price))

In [None]:
fig1, (ay1, ay2, ay3, ay4, ay5) = plt.subplots(5, 1)


ay1.scatter(data.vehicleType, data.price, marker = "+")
ay1.plot([0, 1, 2, 3], [np.mean(data.query('vehicleType == 0').price), np.mean(data.query('vehicleType == 1').price),
         np.mean(data.query('vehicleType == 2').price), np.mean(data.query('vehicleType == 3').price)], 'r--')

ay1.set_title('Price vs vehicle type')
ay1.set_xlabel('Vehicle Type')
ay1.set_ylabel('Price')


ay2.scatter(data.vehicleOption, data.price, marker = "+")
ay2.plot([0, 1, 2, 3, 4, 5, 6, 7, 8], [np.mean(data.query('vehicleOption == 0').price), 
                                       np.mean(data.query('vehicleOption == 1').price),
                                       np.mean(data.query('vehicleOption == 2').price), 
                                       np.mean(data.query('vehicleOption == 3').price), 
                                       np.mean(data.query('vehicleOption == 4').price), 
                                       np.mean(data.query('vehicleOption == 5').price),
                                       np.mean(data.query('vehicleOption == 6').price),
                                       np.mean(data.query('vehicleOption == 7').price), 
                                       np.mean(data.query('vehicleOption == 8').price)], 'r--')


ay2.set_title('Price vs vehicle option')
ay2.set_xlabel('Vehicle Option')
ay2.set_ylabel('Price')

ay3.scatter(data.date, data.price, marker = "+")
ay3.plot(all_dates, date_price_vec, 'r--')
ay3.set_title('Price vs date')
ay3.set_xlabel('Date')
ay3.set_ylabel('Price')

ay4.scatter(data.SourceState, data.price, marker = "+")
ay4.plot(states, source_price_vec, 'r--')
ay4.set_title('Price vs source state')
ay4.set_xlabel('Date')
ay4.set_ylabel('Price')

ay5.scatter(data.destinationState, data.price, marker = "+")
ay5.plot(states, destin_price_vec, 'r--')
ay5.set_title('Price vs destination state')
ay5.set_xlabel('Date')
ay5.set_ylabel('Price')

fig1.set_size_inches(28.5, 10.5)
fig1.savefig("/Users/mohsenkiskani/Downloads/Ubaar/plots/categoryEffects.png", dpi=100)
plt.show()

# Which features are most important?

In [None]:
for col in X_train.columns:
    plt.figure(figsize=(16,6))
    plt.scatter(train[col], train['price'])
    plt.title('Price vs '+ col)
    plt.xlabel(col)
    plt.ylabel('Price')
    plt.show()

In [None]:
fig = plt.figure(figsize=(16,6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(train['destinationLatitude'], train['destinationLongitude'], train['price'], c='r', marker='o')

In [None]:
fig = plt.figure(figsize=(16,6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(train['sourceLatitude'], train['sourceLongitude'], train['price'], c='g', marker='d')

In [None]:


#all_data['destination_tuple'] = all_data['destination_tuple'].apply(lambda x: (int(x[0], int(x[1]))))
#for i in range(all_data.shape[0]):
#    x = all_data['destinationLatitude'].iloc[i]
#    y = all_data['destinationLongitude'].iloc[i]
    
#    lati_bucket = x // d_lati
#    long_bucket = y // d_long
    
#    all_data['destination_tuple'].iloc[i] = (lati_bucket, long_bucket)

In [None]:
all_data.head()