In [265]:
import pandas as pd
import numpy as np
import os
path = os.getcwd()


from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,BaggingRegressor, StackingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error, r2_score

In [266]:
# Build the file locations with os.path.join so the notebook is not tied to a
# backslash path separator. TRAIN/TEST use their 'index' column as the row index.
train = pd.read_csv(os.path.join(path, "TRAIN.csv"), index_col='index')
test = pd.read_csv(os.path.join(path, "TEST.csv"), index_col='index')
ss = pd.read_csv(os.path.join(path, "sample_submission.csv"))

In [267]:
# Row labels of the test set, split by cab provider. They are captured here,
# while 'cab_provider' still holds the provider names as strings.
uber_index = test.loc[test['cab_provider'] == 'Uber'].index
lyft_index = test.loc[test['cab_provider'] == 'Lyft'].index

In [268]:
# Interpret the raw time_stamp values as milliseconds and turn them into
# datetimes, for the training and the test frame alike.
for frame in (train, test):
    frame['time_stamp'] = pd.to_datetime(frame['time_stamp'], unit='ms')

In [269]:
# time_stamp was already converted to datetimes in the previous cell, so the
# calendar parts are read straight from the .dt accessor instead of running
# pd.to_datetime on the column again for every derived feature.
for frame in (train, test):
    stamps = frame['time_stamp']
    frame['hour'] = stamps.dt.hour
    frame['minute'] = stamps.dt.minute
    frame['day'] = stamps.dt.day

In [270]:
# Stack train on top of test under a fresh 0..n-1 index. Train rows come first,
# which the positional train/test slicing further down depends on.
# time_stamp is already a datetime column in both inputs, so it is not
# converted a second time here.
df = pd.concat([train, test], axis=0, ignore_index=True)

In [271]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

# Preprocessing helpers: `le` is a label encoder, `sl` a min-max scaler.
# NOTE(review): `sl` is created but never used in the cells shown here — confirm it is still needed.
le, sl = LabelEncoder(), MinMaxScaler()

In [272]:
# Integer code per cab product: codes 1..12 follow the order of the list below.
dict_ = {
    name: code
    for code, name in enumerate(
        [
            'Lux Black XL', 'Black', 'UberX', 'Lyft', 'UberXL', 'WAV',
            'Shared', 'Lux', 'Lux Black', 'Black SUV', 'UberPool', 'Lyft XL',
        ],
        start=1,
    )
}

# Replace the product names by their codes; names missing from the mapping become NaN.
df['cab_type'] = df['cab_type'].map(dict_)

In [273]:
# Integer code per location: codes 1..12 follow the order of the list below.
# The same codes are used for pick-up ('source') and drop-off ('destination').
dict_ = {
    name: code
    for code, name in enumerate(
        [
            'Theatre District', 'Fenway', 'Beacon Hill',
            'North End', 'Northeastern University', 'Financial District',
            'Boston University', 'Haymarket Square', 'West End',
            'South Station', 'North Station', 'Back Bay',
        ],
        start=1,
    )
}

# Names missing from the mapping become NaN.
for location_col in ('source', 'destination'):
    df[location_col] = df[location_col].map(dict_)

In [274]:
label_cols = ['cab_provider']

# Encode each listed column as integers. LabelEncoder numbers the classes in
# sorted order, so with only 'Lyft' and 'Uber' present (assumption — confirm
# against the data) Lyft becomes 0 and Uber becomes 1, which is what the
# provider filters below rely on.
df[label_cols] = df[label_cols].apply(lambda column: le.fit_transform(column))

In [275]:
# Number of Lyft rows in the training data (counted on the raw provider names).
lyft_size = train.loc[train['cab_provider'] == 'Lyft'].shape[0]

# All Lyft rows of the combined frame, selected by the encoded provider value 0.
df_lyft = df.loc[df['cab_provider'] == 0].copy()


In [276]:
# The first lyft_size rows of df_lyft are taken as train, the remainder as test;
# only the test part gets a fresh 0..n-1 index.
train_lyft = df_lyft.iloc[:lyft_size]
test_lyft = df_lyft.iloc[lyft_size:].reset_index(drop=True)

In [277]:
# Column roles
target = 'fare'
time = 'time_stamp'
provider = 'cab_provider'

# NOTE(review): time_feats is not referenced in the cells shown here, and only
# hour/minute/day are created above — confirm whether it is still needed.
time_feats = ['hour', 'year', 'minute', 'day','dayofweek', 'month', 'weekday']

# Model inputs: every column of df except the target, the raw timestamp and
# the provider flag.
features = [col for col in df.columns if col not in (target, time, provider)]

In [278]:
from sklearn.model_selection import train_test_split, StratifiedKFold

# Hold out 20% of the Lyft training rows, with a fixed seed for repeatability.
trn, val = train_test_split(train_lyft, random_state = 1999, test_size = 0.2)

# Model inputs and target for each side of the split
X_trn, y_trn = trn[features], trn[target]
X_val, y_val = val[features], val[target]

# Inputs of the Lyft test rows that will be predicted
X_test = test_lyft[features]

In [279]:
%%time

# Fit XGBoost on all Lyft training rows (not just the `trn` part of the split).
xgb = XGBRegressor(random_state = 1999)
xgb.fit(train_lyft[features], train_lyft[target])

# Predict the Lyft test rows; np.abs turns any negative prediction positive.
preds = np.abs(xgb.predict(X_test))

# NOTE(review): no validation score is computed in this cell. `preds` holds
# predictions for X_test, so it cannot be scored against y_val; a hold-out
# metric would need a model fitted on X_trn and evaluated on X_val.

Wall time: 1.18 s


In [280]:
# Create an empty Predictions column on the test frame, then fill the Lyft rows
# with the predictions held in `preds`.
# NOTE(review): assumes `preds` is in the same row order as lyft_index — confirm.
test['Predictions'] = None
test.loc[lyft_index, 'Predictions'] = preds

---
# Uber

In [281]:
# Number of Uber rows in the training data (counted on the raw provider names).
uber_size = train.loc[train['cab_provider'] == 'Uber'].shape[0]

# All Uber rows of the combined frame, selected by the encoded provider value 1.
df_uber = df.loc[df['cab_provider'] == 1].copy()

In [282]:
# Distinct encoded cab_type values among the Uber rows, in order of first appearance.
uber_type = [*df_uber['cab_type'].unique()]

In [283]:
# The first uber_size rows of df_uber are taken as train, the remainder as test.
train_uber = df_uber.iloc[:uber_size]
test_uber = df_uber.iloc[uber_size:]

In [284]:
# Inputs for the Uber models. cab_type is not included: the loop below fits
# one model per cab type instead.
features = [
    'source', 'destination',         # encoded locations
    'distance', 'surge_multiplier',
    'hour', 'minute', 'day',         # parts of the timestamp
]

In [285]:
# Uber product names as they appear in the raw (un-encoded) test data.
ls = [
    'Black',
    'UberX',
    'UberXL',
    'WAV',
    'Black SUV',
    'UberPool',
]

In [286]:
# Train one LightGBM model per Uber product and write its predictions into the
# matching rows of `test`.
#
# The product name (raw `test`) and its integer code (`test_uber`) are paired
# through the data itself rather than by zipping two independently ordered
# lists, so a different row order cannot attach predictions to the wrong product.
uber_test_names = test.loc[test['cab_provider'] == 'Uber', 'cab_type']
if len(uber_test_names) != len(test_uber):
    raise ValueError('Uber rows of test and test_uber are not aligned')
uber_code_by_name = dict(zip(uber_test_names, test_uber['cab_type']))

for key in ls:
    if key not in uber_code_by_name:
        # No test rows for this product, so there is nothing to predict.
        continue
    cab_type = uber_code_by_name[key]
    print(f'Cab_Type: {cab_type}')

    # Labels of the raw test rows that belong to this product.
    test_index = test[(test['cab_provider'] == 'Uber') & (test['cab_type'] == key)].index

    train_rows = train_uber[train_uber['cab_type'] == cab_type]
    test_rows = test_uber[test_uber['cab_type'] == cab_type]

    lgb = LGBMRegressor(random_state=1999)
    # `verbose` is no longer passed to fit(): it only affected logging for an
    # eval_set (none is used here) and is rejected by LightGBM 4.x.
    lgb.fit(train_rows[features], train_rows[target])

    # np.abs turns any negative prediction positive.
    preds = np.abs(lgb.predict(test_rows[features]))

    test.loc[test_index, 'Predictions'] = preds

Cab_Type: 2
Cab_Type: 3
Cab_Type: 5
Cab_Type: 6
Cab_Type: 10
Cab_Type: 11


In [287]:
# Build the submission: a running 0..n-1 'index' column next to the predictions.
# The row count is taken from `test` itself instead of a hard-coded 25000, so
# zip() cannot silently drop rows when the test set has a different size.
index = list(range(len(test)))
d = list(zip(index, test['Predictions']))
ss = pd.DataFrame(d, columns=['index', target])

# os.path.join keeps the output location independent of the path separator.
ss.to_csv(os.path.join(path, "baseline_seperate.csv"), index = False)