# **IMPORTING LIBRARIES**

In [0]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor,ExtraTreesRegressor
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
import warnings; warnings.simplefilter('ignore')

# **IMPORTING DATASET**

In [0]:

data=pd.read_csv("/content/drive/My Drive/train.csv") #train set
test=pd.read_csv("/content/drive/My Drive/train.csv") #test set

In [9]:
data.head() #checking whether imported successfullly
data.shape

(1048575, 17)

In [10]:
data.isnull().sum() #checking for null values

ID                    0
vendor_id             0
pickup_loc            1
drop_loc              1
driver_tip            2
mta_tax               1
distance              1
pickup_time           1
drop_time             1
num_passengers        1
toll_amount           2
payment_method        1
rate_code             1
stored_flag           1
extra_charges         1
improvement_charge    2
total_amount          2
dtype: int64

# **PreProcessing**

In [0]:
data['pickup_time']=pd.to_datetime(data['pickup_time'])
data['drop_time']=pd.to_datetime(data['drop_time'])

In [0]:
data['trip_duration']=data['drop_time']-data['pickup_time']

In [0]:
time=pd.DatetimeIndex(data['trip_duration'])
data['trip_duration']=time.hour*60+time.minute

In [0]:
del data['pickup_time']
del data['drop_time']
del data['ID']
del data['vendor_id']
del data['drop_loc']
del data['pickup_loc']
del data['stored_flag']
del data['mta_tax']
del data['improvement_charge']

In [0]:
data.dropna(inplace=True)

In [16]:
data.isnull().sum()
data.shape

(1048573, 9)

In [0]:
id_vec = np.array(test.loc[:,test.columns == 'ID'])


In [0]:
y=data['total_amount']
del data['total_amount']
x=data

# Building Model

In [19]:
train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=.3,random_state=42)
model=RandomForestRegressor(random_state=42) #Random Forest Regressor
model.fit(train_x,train_y)
model.score(train_x,train_y)

0.9412083827429485

In [20]:
model.score(test_x,test_y)

0.9006328326827558

In [21]:
model1=GradientBoostingRegressor(alpha=0.1,max_depth=3,random_state=42) #Final model For prediction
model1.fit(train_x,train_y)

GradientBoostingRegressor(alpha=0.1, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [22]:
model1.score(train_x,train_y)

0.8415652110398391

In [23]:
model1.score(test_x,test_y)

0.9505131553615961

# **Tuning And Improving**

In [0]:
GBM=GradientBoostingRegressor(random_state=42)
gb_param_grid = {'n_estimators' : [100,500],
                'learning_rate': [0.1, 0.01],
                 'max_depth': [3,8],
                'max_features': [0.3, 0.1] 
                }
gsGBM = GridSearchCV(GBM,param_grid = gb_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs= 4, verbose = 1)
gsGBM.fit(train_x,train_y)

GBM_best = gsGBM.best_estimator_

In [0]:
GBM_best.score(train_x,train_y)

In [0]:
GBM_best.score(test_x,test_y)

# Preparing Test Set for predictions

In [25]:
test.isnull().sum()

ID                    0
vendor_id             0
pickup_loc            1
drop_loc              1
driver_tip            2
mta_tax               1
distance              1
pickup_time           1
drop_time             1
num_passengers        1
toll_amount           2
payment_method        1
rate_code             1
stored_flag           1
extra_charges         1
improvement_charge    2
total_amount          2
dtype: int64

In [0]:
test['pickup_time']=pd.to_datetime(test['pickup_time'])
test['drop_time']=pd.to_datetime(test['drop_time'])

In [0]:
test['trip_duration']=test['drop_time']-test['pickup_time']

In [0]:
time=pd.DatetimeIndex(test['trip_duration'])
test['trip_duration']=time.hour*60+time.minute

In [0]:
del test['pickup_time']
del test['drop_time']
del test['ID']
del test['vendor_id']
del test['drop_loc']
del test['pickup_loc']
del test['stored_flag']
del test['mta_tax']
del test['improvement_charge']

In [0]:
test.shape

(154235, 8)

# **Predictions**

In [0]:
predict = model1.predict(test)
# predict = predict.reshape(predict.shape[0], 1)
# predict = np.concatenate([id_vec, predict], axis=1)
# predict = pd.DataFrame(data=predict, columns=['ID', 'total_amount'])
# predict.to_csv("answer.csv", index=False, header=True)

In [0]:
print(predict)

[ 7.11729064 14.57231341 16.56904596 ...  9.08122584 14.45557539
 16.07017625]


In [0]:
predict = pd.DataFrame(data=predict)
predict.to_csv("answer.csv", index=False, header=True)

In [0]:
res = model.predict(np.array([[]]))
print(res)