In [1]:
import pandas as pd
import numpy as np
import os
path = os.getcwd()

In [2]:
train = pd.read_csv(path + "\\TRAIN.csv")
test = pd.read_csv(path + "\\TEST.csv")
ss = pd.read_csv(path + "\\sample_submission.csv")

---

In [3]:
df = pd.concat([train, test], axis = 0).reset_index(drop=True)

In [4]:
df['time_stamp'] = pd.to_datetime(df['time_stamp'], unit = 'ms')

In [194]:
df.drop('index', inplace=True, axis = 1)

---

In [195]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
le = LabelEncoder()
sl = MinMaxScaler()

In [196]:
df["hour"] = pd.to_datetime(df.time_stamp, format="%Y-%m-%d").dt.hour
df["year"] = pd.to_datetime(df.time_stamp, format="%Y-%m-%d").dt.year
df["minute"] = pd.to_datetime(df.time_stamp, format="%Y-%m-%d").dt.minute
df["day"] = pd.to_datetime(df.time_stamp, format="%Y-%m-%d").dt.day
df["dayofweek"] = pd.to_datetime(df.time_stamp, format="%Y-%m-%d").dt.dayofweek
df["month"] = pd.to_datetime(df.time_stamp, format="%Y-%m-%d").dt.month
df["weekday"] = pd.to_datetime(df.time_stamp, format="%Y-%m-%d").dt.weekday

In [197]:
target = 'fare'
time = 'time_stamp'
time_feats = ['hour', 'year', 'minute', 'day','dayofweek', 'month', 'weekday']

features = [col for col in df.columns if col not in [target, time]]

In [198]:
features = ['cab_provider',
 'source',
 'destination',
 'distance',
 'surge_multiplier',
 'cab_type',
 'hour',
 'year',
 'minute',
 'day',
 'dayofweek',
 'month',
 'weekday']

In [199]:
label_cols = ['cab_provider', 'source','destination', 'cab_type']

df[label_cols] = df[label_cols].apply(le.fit_transform)

In [200]:
train_proc, test_proc = df[:train.shape[0]], df[train.shape[0]:].reset_index(drop=True)

In [201]:
from sklearn.model_selection import train_test_split, StratifiedKFold

trn, val = train_test_split(train_proc, test_size = 0.2, random_state = 1999)

##### Input for model
X_trn, X_val = trn[features], val[features]

##### Target column
y_trn, y_val = trn[target], val[target]

##### Features for test data that we will be predicting
X_test = test_proc[features]


In [202]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,BaggingRegressor, StackingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error, r2_score


In [203]:
%%time
lgb = LGBMRegressor(random_state=1999)

lgb.fit(X_trn, y_trn)

preds = lgb.predict(X_val)
preds = np.abs(preds)

error = np.sqrt(mean_squared_error(y_val, preds))
print(f'mean_squared_log_error is : {error}')

mean_squared_log_error is : 1.7691670385560418
Wall time: 646 ms


In [204]:
%%time
lgb = LGBMRegressor(random_state=1999)

lgb.fit(X_trn, y_trn)

preds = lgb.predict(X_val)
preds = np.abs(preds)

error = np.sqrt(mean_squared_error(y_val, preds))
print(f'mean_squared_log_error is : {error}')

mean_squared_log_error is : 1.7691670385560418
Wall time: 439 ms


In [205]:
%%time

xgb = XGBRegressor(random_state = 1999)

xgb.fit(train_proc[features], train_proc[target])
preds = xgb.predict(X_test)
preds = np.abs(preds)

#error = np.sqrt(mean_squared_error(y_val, preds))

#print(f'mean_squared_log_error is : {error}')

Wall time: 2.14 s


In [206]:
# Cross Validation for Boosting
def cross_val(regressor, train, test, features, name):
    N_splits = 7
    
    oofs = np.zeros(len(train))
    preds = np.zeros(len(test))
    
    target_col = train[target]
    
    folds = StratifiedKFold(n_splits = N_splits, shuffle = True,random_state = 1999)
    stratified_target = pd.qcut( train[target], 10, labels=False, duplicates='drop')
    for index, (trn_idx, val_idx) in enumerate(folds.split(train, stratified_target)):
        print(f'\n================================Fold{index + 1}===================================')
        
        #### Train Set
        X_trn, y_trn = train[features].iloc[trn_idx], train[target].iloc[trn_idx]
        
        #### Validation Set
        X_val, y_val = train[features].iloc[val_idx], train[target].iloc[val_idx]
        
        #### Test Set
        X_test = test[features]
        
        if name != 'cat':
            #### Scaling Data ####
            scaler = StandardScaler()
            _ = scaler.fit(X_trn)
            X_trn = scaler.transform(X_trn)
            X_val = scaler.transform(X_val)
            X_test = scaler.transform(X_test)
        
        ############ Fitting #############
        _ = regressor.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], early_stopping_rounds = 50, verbose = False)
        
        ############ Predicting #############
        val_preds = np.abs(regressor.predict(X_val))
        test_preds = np.abs(regressor.predict(X_test))
        
        error = np.sqrt(mean_squared_error(y_val, val_preds))
        print(f'\n Root Log Mean Squared Error for Validation set is : {error}')
        
        oofs[val_idx] = val_preds
        preds += test_preds / N_splits
        
    total_error = np.sqrt(mean_squared_error(target_col, oofs))
    print(f'\n\Root Log Mean Squared Error for oofs is {total_error}')
    
    return oofs, preds

In [207]:
%%time

xgb_oofs, xgb_preds = cross_val(xgb, train_proc, test_proc, features, 'xgb')



 Root Log Mean Squared Error for Validation set is : 1.7355348565691844


 Root Log Mean Squared Error for Validation set is : 1.632135031440744


 Root Log Mean Squared Error for Validation set is : 1.6719615570709805


 Root Log Mean Squared Error for Validation set is : 1.6074971791499562


 Root Log Mean Squared Error for Validation set is : 1.6071691541411763


 Root Log Mean Squared Error for Validation set is : 1.6276715167984774


 Root Log Mean Squared Error for Validation set is : 1.598347532950827

\Root Log Mean Squared Error for oofs is 1.6406655134246648
Wall time: 14.5 s


In [208]:
index = [i for i in range(25000)]
d = list(zip(index, xgb_preds))
ss = pd.DataFrame(d, columns=['index', target])


ss.to_csv(path + "\\basline_xgb.csv", index = False)

In [158]:
df['weekday'].unique()

array([0, 1, 2], dtype=int64)

In [159]:
df['dayofweek'].unique()

array([0, 1, 2], dtype=int64)

In [185]:
df["week"] = pd.to_datetime(df.time_stamp, format="%Y-%m-%d").dt.minute

In [188]:
df['week'].unique()

array([40, 41, 43, 49, 55,  3, 13, 14, 17, 28, 29, 36, 50, 15, 16, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 30, 11, 18, 38, 46, 34,  5,  8, 32, 35,
       44, 47, 53, 56, 59,  2, 42, 45, 48, 51, 54, 57,  0,  6,  9, 12, 33,
       39, 58, 31, 37,  1], dtype=int64)