In [192]:
import pandas as pd
import numpy as np
import os
# Base directory for the CSV files this notebook reads and writes: the
# current working directory of the kernel process.
path = os.getcwd()


from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,BaggingRegressor, StackingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error, r2_score

In [193]:
# Load the train/test sets and the sample submission from the working directory.
# os.path.join inserts the separator of the host OS; the previous hard-coded
# "\\" only produced a valid path on Windows.
train = pd.read_csv(os.path.join(path, "TRAIN.csv"), index_col='index')
test = pd.read_csv(os.path.join(path, "TEST.csv"), index_col='index')
ss = pd.read_csv(os.path.join(path, "sample_submission.csv"))

In [194]:
# Index labels of the test rows for each provider, captured before any
# encoding so the raw 'Uber' / 'Lyft' strings can still be compared.
is_uber_test = (test['cab_provider'] == 'Uber').to_numpy()
is_lyft_test = (test['cab_provider'] == 'Lyft').to_numpy()

uber_index = test.index[is_uber_test]
lyft_index = test.index[is_lyft_test]

In [195]:
# time_stamp is stored as epoch milliseconds; convert it to datetime64 in both frames
for frame in (train, test):
    frame['time_stamp'] = pd.to_datetime(frame['time_stamp'], unit='ms')

In [196]:
# Derive calendar features from the already-parsed time_stamp column.
# The column is datetime64 at this point, so the `.dt` accessor is used
# directly; re-parsing it with pd.to_datetime (and a date-only format string
# that does not apply to datetime input) was redundant work repeated six times.
for frame in (train, test):
    stamp_parts = frame['time_stamp'].dt
    frame['hour'] = stamp_parts.hour
    frame['minute'] = stamp_parts.minute
    frame['day'] = stamp_parts.day

In [197]:
# Stack train on top of test so categorical encodings are fitted on both at once.
# Row order is preserved (train rows first), and the index is renumbered 0..n-1.
# time_stamp is already datetime64 in both inputs, so no further conversion is
# needed here (the former pd.to_datetime(..., unit='ms') call was a no-op).
df = pd.concat([train, test], axis=0).reset_index(drop=True)

In [198]:
from sklearn.preprocessing import (
    LabelEncoder,
    MinMaxScaler,
    StandardScaler,
)

# Preprocessing objects: `le` for integer-encoding categoricals, `sl` for min-max scaling
le, sl = LabelEncoder(), MinMaxScaler()

In [199]:
# Integer code for every pickup/drop-off location.
# Named `location_codes` rather than `dict`: rebinding `dict` would shadow the
# builtin for every cell that runs afterwards.
location_codes = {
    'Theatre District': 1, 'Fenway': 2, 'Beacon Hill': 3,
    'North End': 4, 'Northeastern University': 5, 'Financial District': 6,
    'Boston University': 7, 'Haymarket Square': 8, 'West End': 9,
    'South Station': 10, 'North Station': 11, 'Back Bay': 12,
}

# Locations missing from the mapping become NaN
df['source'] = df['source'].map(location_codes)
df['destination'] = df['destination'].map(location_codes)

In [200]:
label_cols = ['cab_provider', 'cab_type']

# Integer-encode each categorical column. The single encoder is re-fitted per
# column, so afterwards `le` only remembers the classes of the last column.
for col in label_cols:
    df[col] = le.fit_transform(df[col])

## Lyft model (`cab_provider` is label-encoded as 0)

In [201]:
# Number of Lyft rows in the training set; used as the train/test cut point below
lyft_size = int((train['cab_provider'] == 'Lyft').sum())

In [202]:
# Independent copy of the rows whose encoded provider is 0
is_lyft_row = df['cab_provider'] == 0
df_lyft = df.loc[is_lyft_row].copy()

In [203]:
# Positional split: the first `lyft_size` rows are training rows, the rest are test rows
train_lyft = df_lyft.iloc[:lyft_size]
test_lyft = df_lyft.iloc[lyft_size:].reset_index(drop=True)

In [204]:
# Column roles
target = 'fare'
time = 'time_stamp'
provider = 'cab_provider'
time_feats = ['hour', 'year', 'minute', 'day','dayofweek', 'month', 'weekday']

# Model inputs: every column of df except the target, the raw timestamp and the provider flag
excluded = {target, time, provider}
features = [col for col in df.columns if col not in excluded]

In [205]:
from sklearn.model_selection import train_test_split, StratifiedKFold

# 80/20 hold-out split of the Lyft training rows, seeded for repeatability
trn, val = train_test_split(train_lyft, test_size=0.2, random_state=1999)

# Model inputs and targets for the fit and validation parts
X_trn, y_trn = trn[features], trn[target]
X_val, y_val = val[features], val[target]

# Inputs of the Lyft test rows to be predicted
X_test = test_lyft[features]

In [206]:
%%time
lgb = XGBRegressor(random_state=1999)

lgb.fit(X_trn, y_trn)

preds = lgb.predict(X_val)
preds = np.abs(preds)

error = np.sqrt(mean_squared_error(y_val, preds))
print(f'mean_squared_log_error is : {error}')

mean_squared_log_error is : 1.3741850942311564
Wall time: 753 ms


In [207]:
%%time

xgb = XGBRegressor(random_state = 1999)

xgb.fit(train_lyft[features], train_lyft[target])
preds = xgb.predict(X_test)
preds = np.abs(preds)

#error = np.sqrt(mean_squared_error(y_val, preds))

#print(f'mean_squared_log_error is : {error}')

Wall time: 841 ms


In [208]:
# Create the prediction column as float NaN rather than None: None makes the
# column object dtype, while NaN keeps it numeric and still marks rows that
# have not been predicted yet.
test['Predictions'] = np.nan

# Fill in the rows selected by lyft_index with the current predictions
test.loc[lyft_index, 'Predictions'] = preds

---
## Uber model (`cab_provider` is label-encoded as 1)

In [209]:
# Count of Uber rows in the training set; marks where Uber test rows start after the split
uber_size = int((train['cab_provider'] == 'Uber').sum())

In [210]:
# Independent copy of the rows whose encoded provider is 1
is_uber_row = df['cab_provider'] == 1
df_uber = df.loc[is_uber_row].copy()

In [211]:
# Positional split: training rows come first, the remaining rows are the test rows
train_uber = df_uber.iloc[:uber_size]
test_uber = df_uber.iloc[uber_size:].reset_index(drop=True)

In [212]:
from sklearn.model_selection import train_test_split, StratifiedKFold

# Seeded 80/20 hold-out split of the Uber training rows
trn, val = train_test_split(train_uber, test_size=0.2, random_state=1999)

# Feature matrices and target vectors for fitting and validating
X_trn, y_trn = trn[features], trn[target]
X_val, y_val = val[features], val[target]

# Feature matrix of the Uber test rows to be predicted
X_test = test_uber[features]

In [232]:
%%time
lgb = XGBRegressor(random_state=1999)

lgb.fit(X_trn, y_trn)

preds = lgb.predict(X_val)
preds = np.abs(preds)

error = np.sqrt(mean_squared_error(y_val, preds))
print(f'mean_squared_log_error is : {error}')

mean_squared_log_error is : 1.8069016905854733
Wall time: 729 ms


In [214]:
%%time

xgb = XGBRegressor(random_state = 1999)

xgb.fit(train_uber[features], train_uber[target])
preds = xgb.predict(X_test)
preds = np.abs(preds)

#error = np.sqrt(mean_squared_error(y_val, preds))

#print(f'mean_squared_log_error is : {error}')

Wall time: 801 ms


In [189]:
# Write `preds` into the 'Predictions' column for the rows selected by uber_index.
# NOTE(review): `preds` is expected to hold the Uber model's test predictions
# at this point; this depends on cell execution order — confirm.
test.loc[uber_index, 'Predictions'] = preds

In [190]:
# Build and save the submission file: a running row number plus the predicted fare.

# Refuse to write a submission that still has rows without a prediction
n_missing = int(test['Predictions'].isna().sum())
if n_missing:
    raise ValueError(f"{n_missing} test rows have no prediction")

# The row count comes from `test` itself. The former hard-coded 25000 combined
# with zip() silently dropped rows whenever the two lengths differed.
ss = pd.DataFrame({
    'index': range(len(test)),
    target: pd.to_numeric(test['Predictions']).to_numpy(),
})

# os.path.join keeps the output path valid on every OS (a literal "\\" is
# only a separator on Windows). The file name itself is unchanged.
ss.to_csv(os.path.join(path, "baseline_seperate.csv"), index=False)