In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Viva Notebook 
- 21F1005856
- Harish Sahadev M

# Imports

In [None]:
# Plotting graphs

import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Sklearn imports

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold, RandomizedSearchCV

from sklearn.feature_selection import SelectKBest, f_regression

from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
# ignore warning
# NOTE: the 'ignore' filter is installed without a category, so it suppresses
# every warning raised after this cell runs, not just one specific kind.
import warnings
warnings.filterwarnings('ignore')

# Dataset Overview
This dataset presents an opportunity to construct predictive models aimed at estimating the total amount paid by travelers for taxi journeys. With access to a training set containing the target variable `total_amount` along with various informative features, participants are challenged to create accurate predictive models.

## Data Files
The dataset is composed of the following files:

`train.csv`: The training set, which includes the target variable `total_amount` and accompanying feature attributes.

`test.csv`: The test set, containing similar feature attributes but without the target variable 'total_amount,' as it is the variable to be predicted.

`sample_submission.csv`: A sample submission file provided in the correct format for competition submissions.

# Reading Train Data

In [None]:
# Load the competition's training file and preview its first rows
train_csv = '/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/train.csv'
df = pd.read_csv(train_csv)
df.head()

In [None]:
# (rows, columns) of the training frame
print("Shape of dataset:-", df.shape, "\n--------")

# df.info() prints its own report (column names, non-null counts, dtypes)
print("Info:-")
df.info()

## Inference from Train Data
- Shape : (175000, 17) - 17 columns (16 features plus the label `total_amount`) and 175,000 entries
- `VendorID`, `PULocationID`, `DOLocationID` are integer values and contains no null values
- `tpep_pickup_datetime` and `tpep_dropoff_datetime` are of type object with no null values, but from initial observation they are datetime values. Hence they need to be converted to datetime during pre-processing
- `passenger_count`, `trip_distance`, `RatecodeID`, `extra`, `tip_amount`, `tolls_amount`, `improvement_surcharge`, `total_amount`, `congestion_surcharge` and `Airport_fee` are float types and some of them contains null values.
- `store_and_fwd_flag` and `payment_type` are of type object. They appear to be categorical variables with `store_and_fwd_flag` having some null values.

## Columns Description

The dataset comprises various columns, each offering valuable insights into taxi rides. Notably:

`total_amount`: The total amount paid by the traveler for the taxi ride.

`VendorID`: An identifier for taxi vendors.

`tpep_pickup_datetime` and `tpep_dropoff_datetime`: Timestamps indicating pickup and dropoff times.

`passenger_count`: The number of passengers during the ride.

`trip_distance`: The distance traveled during the trip.

`RatecodeID`: Rate code for the ride.

`store_and_fwd_flag`: A flag indicating whether the trip data was stored and forwarded.

`PULocationID` and `DOLocationID`: Pickup and dropoff location identifiers.

`payment_type`: Payment type used for the ride.

Other columns are self-explanatory and contribute to the modeling process.

# Sample Data

In [None]:
# Load the sample submission file shipped with the competition and preview it
sample_csv = '/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/sample.csv.csv'
sample_data = pd.read_csv(sample_csv)
sample_data.head()

Sample data has total_amount which is float, implying this to be a regression problem

# Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics of the training frame, shown as the cell's output
print("Description: \n")
df.describe()

## Label
- `total_amount`

In [None]:
# Summary statistics of the label, followed by its histogram.
# The histogram is restricted to the 0-50 range, so larger amounts are not drawn.
print(df.total_amount.describe())
df.total_amount.hist(range=[0, 50])

Majority of `total_amount` appears to be between 10 and 30

## Features

- **Numeric Features :** `VendorID`, `passenger_count`, `trip_distance`, `RatecodeID`, `PULocationID`, `DOLocationID`, `extra`, `tip_amount`, `tolls_amount`, `improvement_surcharge`, `congestion_surcharge`, `Airport_fee`
- **Categorical Features :** `store_and_fwd_flag`, `payment_type`
- Of type **DateTime** : `tpep_pickup_datetime`, `tpep_dropoff_datetime`

## Duplicate Check

In [None]:
# Count rows that are exact repeats of an earlier row
duplicate_row_count = df.duplicated().sum()
print("No. of duplicate values in dataset is:", duplicate_row_count)

## Plotting histograms for numeric features

In [None]:
# One 50-bin histogram per column that DataFrame.hist can plot, on a single large figure
df.hist(bins=50, color='blue', figsize=(20,15))
plt.show()

## Analyzing categorical features

### `store_and_fwd_flag`

In [None]:
# Count plot: number of rows for each value of `store_and_fwd_flag`
sns.catplot(data=df, x='store_and_fwd_flag', kind='count', palette='Set1')

It can be observed that `N` is the most frequently occurring flag

### `payment_type`

In [None]:
# Count plot: number of rows for each value of `payment_type`
sns.catplot(data=df, x='payment_type', kind='count', palette='Set2')

Credit card is the most used payment medium

## Visualization using Box-Plot

In [None]:
# One box plot per column, each in its own subplot so every column gets its own y-scale
df.plot(kind='box', subplots=True, color='blue', figsize=(16,10))
# Widen the horizontal gap between subplots
plt.subplots_adjust(wspace=3)
plt.show()

- It can be observed that `trip_distance` has outliers with trip distance more than 1000
- Similarly `tip_amount` can be observed to have outlier with amount greater than 200

## Correlation

In [None]:
# Pairwise correlation between the numeric columns of the training frame
corr_matrix = df.corr(numeric_only=True)

# Draw the matrix as an annotated heatmap on a 14x7 figure
heatmap_figure = plt.figure(figsize=(14,7))
sns.heatmap(data=corr_matrix, annot=True)

In [None]:
# Correlation of the label with every numeric column: the `total_amount`
# column of the correlation matrix computed in the previous cell
corr_matrix['total_amount']

# Feature Engineering

## Data Cleaning

### Pick_up (`tpep_pickup_datetime`) and Drop_off time (`tpep_dropoff_datetime`) columns

These are object type, for the cleaning process the following is done:
* First convert them to datetime
* Extract time duration
* Extract Day

In [None]:
# Copy the dataset so the feature engineering below works on `data`
# and leaves the originally loaded `df` untouched
data = df.copy()
data.head()

In [None]:
# Parse both timestamp columns into pandas datetime values
for timestamp_column in ('tpep_dropoff_datetime', 'tpep_pickup_datetime'):
    data[timestamp_column] = pd.to_datetime(data[timestamp_column])

In [None]:
# Confirm the dtype of the drop-off column after conversion
data['tpep_dropoff_datetime'].info()

In [None]:
# Confirm the dtype of the pick-up column after conversion
data['tpep_pickup_datetime'].info()

Adding a new feature `Duration_mins`, which is the difference between drop-off time and pick-up time in minutes
- There are some entries where the pick-up time is later than the drop-off time, which is not possible for a real trip
- To overcome that, the earlier timestamp is always subtracted from the later one, so the duration is never negative

In [None]:
# Create a new column `Duration_mins`: the trip length in whole minutes.
#
# A few rows have a pick-up time later than the drop-off time; taking the
# absolute value of the difference subtracts the earlier timestamp from the
# later one in either case.
#
# The column is computed for all rows at once and assigned in a single step.
# Writing it element by element through `data["Duration_mins"][i] = ...` is
# chained assignment: pandas warns about it, it does not modify `data` when
# Copy-on-Write is enabled, and it loops in Python over every row.
trip_length = (data['tpep_dropoff_datetime'] - data['tpep_pickup_datetime']).abs()

# NOTE: `.dt.seconds` is the seconds *component* of each timedelta, so whole
# days are not counted and a gap of 24 h or more wraps around. This is kept
# deliberately so the feature has the same definition as the one built for the
# test file; `.dt.total_seconds()` would count full days as well.
data["Duration_mins"] = (trip_length.dt.seconds / 60).round().astype('int')

In [None]:
# Converting Duration_mins to int type, then previewing the last rows
# to check the new column
data['Duration_mins'] = data['Duration_mins'].astype('int')
data.tail()

Adding another feature `Pickup_day` which denotes the day of pick-up

In [None]:
# Name of the weekday (as returned by `.dt.day_name()`) on which the trip started
data["Pickup_day"] = data['tpep_pickup_datetime'].dt.day_name()
data.head()

In [None]:
# Number of trips per pick-up weekday
data.Pickup_day.value_counts()

## Dropping columns
* `tpep_pickup_datetime`
* `tpep_dropoff_datetime`
* `store_and_fwd_flag` - This feature is dropped because it only indicates whether the trip data was stored and forwarded. The majority of its values are `N`, so it adds little information about the target variable `total_amount`

In [None]:
# Remove the two raw timestamp columns and the store-and-forward flag
columns_to_drop = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'store_and_fwd_flag']
data = data.drop(columns=columns_to_drop)
data.head()

## Removing Outliers

- Removing rows where `trip_distance` is 1000 or more
- Removing rows where `tip_amount` is 200 or more

In [None]:
# Keep only rows whose trip distance is below 1000 and whose tip is below 200
distance_in_range = data['trip_distance'] < 1000
tip_in_range = data['tip_amount'] < 200
data = data[distance_in_range & tip_in_range]
data.shape

In [None]:
# Same per-column box plots as before, now on the filtered data,
# to check the effect of removing the extreme rows
data.plot(kind='box', subplots=True, color='blue', figsize=(16,10))
plt.subplots_adjust(wspace=3)
plt.show()

# Splitting data

- Features = X 
- Labels = y

In [None]:
# Separate the label column (y) from all remaining columns (X)
label_column = 'total_amount'
y = data[label_column]
X = data.drop(columns=[label_column])
X.shape, y.shape

## `train_test_split`
20% validation set and `random_state`=42

In [None]:
# Hold out 20% of the rows; the seed is fixed so the split is repeatable.
# X_test / y_test are the held-out split of train.csv (used as the validation
# set below), not the competition's test file.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

## Checking for missing/nan values

In [None]:
# Number of missing values in each training feature
X_train.isnull().sum()

- There are null values in `passenger_count`, `RatecodeID`, `congestion_surcharge` and `Airport_fee`, which are all numeric features
- `SimpleImputer` with the median strategy will be used in the pipeline to deal with these null values
- The remaining categorical features do not have any null values

In [None]:
# Show the distinct values of each feature that contains nulls
for column_with_nulls in ('passenger_count', 'RatecodeID', 'congestion_surcharge', 'Airport_fee'):
    print(X_train[column_with_nulls].unique())

## Separating numeric and categorical features

In [None]:
# Column names of the numeric features and of all remaining (non-numeric) features
numeric_part = X_train.select_dtypes(include=[np.number])
non_numeric_part = X_train.select_dtypes(exclude=[np.number])
num_attributes = numeric_part.columns.values
cat_attributes = non_numeric_part.columns.values
print("Numeric attributes:", num_attributes, "\nCategorical Attributes:", cat_attributes)

# Pipeline
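- The numeric pipeline applies `SimpleImputer` (median) to the numeric features, followed by `StandardScaler`
- The categorical pipeline applies `SimpleImputer` (most frequent) to the categorical features, followed by `OneHotEncoder`
- Both pipelines are combined using `ColumnTransformer`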

In [None]:
# Numeric branch: fill missing values with the column median, then standardise
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(numeric_steps)

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode; categories not seen during fitting are ignored at transform time
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
]
cat_pipeline = Pipeline(categorical_steps)

# Route each group of columns to its own branch
branches = [
    ('num', num_pipeline, num_attributes),
    ('cat', cat_pipeline, cat_attributes),
]
full_pipeline = ColumnTransformer(branches)

# Fit the preprocessing on the training split only, and transform it
X_train_transformed = full_pipeline.fit_transform(X_train)

In [None]:
# Display the fitted preprocessing pipeline
full_pipeline

In [None]:
# Transforming Validation set
# Only .transform() is called, so the statistics learned from X_train are reused
X_test_transformed = full_pipeline.transform(X_test) # X_test is the validation split of train.csv

# Linear Regression

In [None]:
# Train an ordinary least-squares model and report its R2 on the training split
lin_reg = LinearRegression()
lin_reg.fit(X_train_transformed, y_train)
lin_reg_train_score = lin_reg.score(X_train_transformed, y_train)
print("Train score:", lin_reg_train_score)

In [None]:
# Predict the validation split with the linear model and report R2
y_pred = lin_reg.predict(X_test_transformed)
lin_reg_validation_r2 = r2_score(y_test, y_pred)
print("R2 score on Validation set:", lin_reg_validation_r2)

## Linear Regression Scores:-
- Train Score : 0.8562721209163499
- Validation Score : 0.8787869536470465
- Test Score (Final score) : 0.71214 

# Ridge Regression 
- with `PolynomialFeatures`

In [None]:
# Define pipeline with PolynomialFeatures (degree 2) followed by Ridge
ridge_reg_pipeline = Pipeline([('poly', PolynomialFeatures(degree=2)),
                               ('ridge', Ridge(alpha=0.5))])

# Fit the pipeline to the train data.
# Previously a bare Ridge(alpha=0.5) was fitted here and `ridge_reg_pipeline`
# was never used, so the polynomial features announced in the section title
# were not part of the model. `ridge_reg` keeps its name, and a fitted Pipeline
# offers the same .score() / .predict() calls used below and in the next cell.
# NOTE: scores recorded in the markdown for this section were produced by the
# plain Ridge model and will differ after this change.
ridge_reg = ridge_reg_pipeline.fit(X_train_transformed, y_train)
print("Train score:", ridge_reg.score(X_train_transformed, y_train))

In [None]:
# Predict the validation split with the ridge model and report R2
y_test_pred = ridge_reg.predict(X_test_transformed)
print("R2 score on Validation set:", r2_score(y_test, y_test_pred)) 

## Ridge Regression Scores:-
- Train Score : 0.8562721208905147
- Validation Score : 0.8787869536470465
- Test Score (Final score) : 0.783

# KNeighborsRegressor with HPT

In [None]:
# Base estimator whose hyperparameters are tuned below
knn = KNeighborsRegressor()

# Candidate values: neighbourhood size and how neighbours are weighted
param_grid = dict(n_neighbors=[1, 3, 5, 7, 9], weights=['uniform', 'distance'])

# Exhaustive 5-fold cross-validated search, ranked by R2
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='r2', cv=5)
grid_search.fit(X_train_transformed, y_train)

# R2 of the refitted best model on the training split
knn_train_score = grid_search.score(X_train_transformed, y_train)
print("Train score:", knn_train_score)

# R2 of the best model on the validation split
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test_transformed)
r2 = r2_score(y_test, y_pred)
print("R2 score on Validation set:", r2)

## KNeighborsRegressor Scores:-
- Train Score : 0.9999999999999912
- Validation Score : 0.9121901223906438
- Test Score (Final score) : 0.75751

# Decision Tree Regressor

In [None]:
# Fit the model to train data.
# random_state is fixed (42, as for the other seeded models in this notebook)
# so that re-running the notebook gives the same scores.
tree_reg = DecisionTreeRegressor(random_state=42).fit(X_train_transformed, y_train)

# Train Score (R2 on the training split)
print("Train score:", tree_reg.score(X_train_transformed, y_train))

In [None]:
# Score the fitted tree on the validation split with both MSE and R2
y_test_pred = tree_reg.predict(X_test_transformed)

tree_validation_mse = mean_squared_error(y_test, y_test_pred)
tree_validation_r2 = r2_score(y_test, y_test_pred)
print("MSE on validation set:", tree_validation_mse)
print("R2 score on validation set:", tree_validation_r2)

## Decision Tree Scores:-
- Train Score : 1.0
- Validation Score : 0.8954141083567229
- Test Score (Final score) : 0.90629

# Random Forest Regressor

In [None]:
# Instantiating RandomForestRegressor.
# random_state is fixed (42, as for the other seeded models in this notebook)
# so that re-running the notebook gives the same scores.
forest_reg = RandomForestRegressor(random_state=42)

# Fit the model to train data
forest_reg.fit(X_train_transformed, y_train)

# Train Score (R2 on the training split)
print("Train score:", forest_reg.score(X_train_transformed, y_train))

In [None]:
# Predicting on validation data with the untuned forest and reporting R2
y_test_pred = forest_reg.predict(X_test_transformed)
print("R2 score on validation set:", r2_score(y_test, y_test_pred))

## Random Forest Scores:-
- Train Score : 0.9921845298944929
- Validation Score : 0.958030168298421
- Test Score (Final score) : 0.93948 (without hpt)

## Fine tuning RandomForest Model (HPT)
- using `RandomizedSearchCV`

In [None]:
# Create a search grid of parameters that will be sampled by the randomized search.
#
# 'max_features' uses 1.0 (consider every feature at each split) instead of the
# string 'auto'. For RandomForestRegressor 'auto' meant exactly that, but the
# string is deprecated and rejected by newer scikit-learn releases.

param_grid = { 'bootstrap': [True], 
               'max_depth': [None, 3, 4 ,5],
               'max_features': [1.0, 'log2', 'sqrt'],
               'min_samples_leaf': [1, 2, 4], 
               'min_samples_split': [2, 5, 10],
               'n_estimators': [20, 50, 100, 150]
  }

In [None]:
# Using the random grid and searching for best hyperparameters

# Base model. Its random_state is fixed so the forests built for each candidate
# are repeatable; the search's own random_state below only fixes which
# parameter combinations are sampled.
rf = RandomForestRegressor(random_state=42)

# 10 sampled combinations, each evaluated with 5-fold cross-validation on R2,
# using all available cores
rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = param_grid, 
                               scoring='r2',
                               n_iter = 10, 
                               cv = 5, 
                               verbose=2, 
                               random_state=42, 
                               n_jobs = -1,
  )

rf_random.fit(X_train_transformed, y_train) #fit is to initiate training process

# Train Score (R2 of the refitted best model on the training split)
print("Train score:", rf_random.score(X_train_transformed, y_train))

In [None]:
# Predict on validation set with the best forest found by the randomized search
y_test_pred = rf_random.best_estimator_.predict(X_test_transformed) 

# r2 score of validation set
print(r2_score(y_test, y_test_pred)) 

In [None]:
# Best estimator: the forest refitted with the best sampled parameter combination
print("Best estimator:", rf_random.best_estimator_)

## RandomForest with HPT Scores:-
- Train Score : 0.9721485771517778
- Validation Score : 0.959106120321911
- Test Score (Final score) : 0.93825 (with hpt)
- Best Estimator : RandomForestRegressor(max_features='auto', min_samples_leaf=4,
                      min_samples_split=5)

# ExtraTree Regressor

In [None]:
# Extra-trees ensemble of 100 trees with a fixed seed
extra_trees_settings = {'n_estimators': 100, 'random_state': 42}
et = ExtraTreesRegressor(**extra_trees_settings)

# Train on the transformed training split
et.fit(X_train_transformed, y_train)

# R2 on the training split
et_train_score = et.score(X_train_transformed, y_train)
print("Train score:", et_train_score)

In [None]:
# Predict on validation set with the extra-trees model
y_test_pred = et.predict(X_test_transformed)

# r2 score of validation set
print("R2 score on validation set:", r2_score(y_test, y_test_pred))

## ExtraTree Regressor Scores:-
- Train Score : 0.9999999999997962
- Validation Score : 0.9577885930829522
- Test Score (Final score) : 0.93872

# XGBoost

In [None]:
# XGBoost regressor with 55 boosting rounds and a fixed seed
xgb_settings = {'n_estimators': 55, 'random_state': 42}
xgb_reg = xgb.XGBRegressor(**xgb_settings)

# Train on the transformed training split
xgb_reg.fit(X_train_transformed, y_train)

# R2 on the training split
xgb_train_score = xgb_reg.score(X_train_transformed, y_train)
print("Train score:", xgb_train_score)

In [None]:
# Predict on validation set with the XGBoost model
y_test_pred = xgb_reg.predict(X_test_transformed)

# r2 score of validation set
print("R2 score on validation set:", r2_score(y_test, y_test_pred))

# Validation R2 recorded from earlier manual runs with other n_estimators values:
# # r2_score(y_test, y_test_pred), n_estimators=100 => 0.9570924052868452
# # r2_score(y_test, y_test_pred), n_estimators=200 => 0.9565649812728337
# # r2_score(y_test, y_test_pred), n_estimators=300 => 0.956104914936009
# # r2_score(y_test, y_test_pred), n_estimators=50 => 0.957642158905133

## XGBoost Regressor Scores:-
- Train Score : 0.9707993350027639
- Validation Score : 0.9587988263827593
- Test Score (Final score) : 0.9404

# XGBoost with GridSearchCV HPT

In [None]:
# Fresh XGBoost regressor (seed fixed) to be tuned by the grid search
xgb_reg = xgb.XGBRegressor(random_state=42)

# Candidate values for every tuned hyperparameter
param_grid = dict(
    n_estimators=[45, 50, 55],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Exhaustive 5-fold cross-validated search on all cores.
# Candidates are ranked by negated MSE, so a higher score means a lower error.
search_settings = dict(
    estimator=xgb_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search_xgb = GridSearchCV(**search_settings)

# Run the search on the transformed training split
grid_search_xgb.fit(X_train_transformed, y_train)

# Report the winning combination; the sign is flipped back to show a plain MSE
best_xgb_params = grid_search_xgb.best_params_
best_xgb_mse = -grid_search_xgb.best_score_
print("Best Parameters: ", best_xgb_params)
print("Best MSE: ", best_xgb_mse)

In [None]:
# Train Score (R2 on the training split).
# `grid_search_xgb` was created with scoring='neg_mean_squared_error', so its
# .score() method returns the negated MSE, not R2. The R2 is therefore computed
# explicitly from the predictions of the refitted best model, which keeps this
# number comparable with the train scores reported for the other models.
y_train_pred = grid_search_xgb.predict(X_train_transformed)
print("Train score:", r2_score(y_train, y_train_pred))

In [None]:
# Predicting on the validation set with the best estimator found by the grid search
y_test_pred = grid_search_xgb.best_estimator_.predict(X_test_transformed)

# r2 score of validation set
print("R2 score on validation set:", r2_score(y_test, y_test_pred))

## XGBoost with GridSearchCV HPT Scores:-
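- Train Score : -22.732853359104283 (this is the negated MSE, because the grid search was created with `scoring='neg_mean_squared_error'`; it is not an R² value and is not comparable with the other train scores)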
- Validation Score : 0.9606110336184028
- Test Score (Final score) : 0.9449
- Best Parameters : {'colsample_bytree': 1.0, 'learning_rate': 0.1, 
        'max_depth': 7, 'n_estimators': 55, 'subsample': 1.0}


# AdaBoostRegressor

In [None]:
# AdaBoost regressor with default settings and a fixed seed
ada = AdaBoostRegressor(random_state=42)

# Train on the transformed training split
ada.fit(X_train_transformed, y_train)

# R2 on the training split
ada_train_score = ada.score(X_train_transformed, y_train)
print("Train score:", ada_train_score)

In [None]:
# Predicting on validation set with the AdaBoost model
y_test_pred = ada.predict(X_test_transformed)

# r2 score of validation set
print("R2 score on validation set:", r2_score(y_test, y_test_pred))

## AdaBoostRegressor Scores:-
- Train Score : 0.7175502903194824
- Validation Score : 0.7185086329395634
- Test Score (Final score) : 0.68406

# BaggingRegressor with DecisionTreeRegressor estimator

In [None]:
# Bagging ensemble of 100 decision trees with a fixed seed
base_tree = DecisionTreeRegressor()
bag_reg_dt = BaggingRegressor(estimator=base_tree, n_estimators=100, random_state=42)

# Train on the transformed training split
bag_reg_dt.fit(X_train_transformed, y_train)

# R2 on the training split
bagging_train_score = bag_reg_dt.score(X_train_transformed, y_train)
print("Train score:", bagging_train_score)

In [None]:
# Predicting on validation set with the bagging model
y_test_pred = bag_reg_dt.predict(X_test_transformed)

# r2 score of validation set
print("R2 score on validation set:", r2_score(y_test, y_test_pred))

## Bagging Regressor Scores:-
- Train Score : 0.9918079079965448
- Validation Score : 0.9579793043810756
- Test Score (Final score) : 0.94036

# GradientBoostingRegressor

In [None]:
# Gradient boosting with 10 stages, learning rate 0.1 and a fixed seed
gbr_settings = {'n_estimators': 10, 'learning_rate': 0.1, 'random_state': 42}
gbr = GradientBoostingRegressor(**gbr_settings)

# Train on the transformed training split
gbr.fit(X_train_transformed, y_train)

# R2 on the training split
gbr_train_score = gbr.score(X_train_transformed, y_train)
print("Train score:", gbr_train_score)

In [None]:
# Predicting on validation set with the gradient boosting model
y_test_pred = gbr.predict(X_test_transformed)

# r2 score of validation set
print("R2 score on validation set:", r2_score(y_test, y_test_pred))

## GradientBoosting Regressor Scores:-
- Train Score : 0.7276096915248715
- Validation Score : 0.7277481362227073

# MLPRegressor

In [None]:
# Multi-layer perceptron: one hidden layer of 100 units, at most 1000 iterations, fixed seed
mlp_settings = {'hidden_layer_sizes': (100,), 'max_iter': 1000, 'random_state': 42}
MLP = MLPRegressor(**mlp_settings)

# Train on the transformed training split
MLP.fit(X_train_transformed, y_train)

# R2 on the training split
mlp_train_score = MLP.score(X_train_transformed, y_train)
print("Train score:", mlp_train_score)

In [None]:
# Predicting on validation set with the MLP model
y_test_pred = MLP.predict(X_test_transformed)

# r2 score of validation set
print("R2 score on validation set:", r2_score(y_test, y_test_pred))

## MLP Regressor Scores:-
- Train Score : 0.948900745734809
- Validation Score : 0.9514413717481092

# Comparing top Models

## Model 1 :- RandomForest Regressor
### Without HPT
- Train Score : 0.9921845298944929
- Validation Score : 0.958030168298421
- Test Score (Final score) : 0.93948 (without hpt)

### With HPT (RandomizedSearchCV)
- Train Score : 0.9721485771517778
- Validation Score : 0.959106120321911
- Test Score (Final score) : 0.93825 (with hpt)
- Best Estimator : RandomForestRegressor(max_features='auto', min_samples_leaf=4,
                      min_samples_split=5)

---------

## Model 2 :- BaggingRegressor with DecisionTreeRegressor Estimator
- Train Score : 0.9918079079965448
- Validation Score : 0.9579793043810756
- Test Score (Final score) : 0.94036

---------

## Model 3 :- XGBoost
### Without HPT
- Train Score : 0.9707993350027639
- Validation Score : 0.9587988263827593
- Test Score (Final score) : 0.9404

### With HPT (GridSearchCV) - BEST MODEL
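- Train Score : -22.732853359104283 (this is the negated MSE, because the grid search was created with `scoring='neg_mean_squared_error'`; it is not an R² value and is not comparable with the other train scores)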
- Validation Score : 0.9606110336184028
- Test Score (Final score) : 0.9449
- Best Parameters : {'colsample_bytree': 1.0, 'learning_rate': 0.1, 
        'max_depth': 7, 'n_estimators': 55, 'subsample': 1.0}


# Conclusion

- RandomForestRegressor with HPT gives a final prediction score of **0.93825**, the Bagging Regressor with a Decision Tree Regressor as base estimator gives a test score of **0.94036**, and the best score, **0.9449**, is obtained by XGBoost with GridSearchCV.
- It can generally be observed that tree-based regression models work better in this case.

# Test Data loading and transforming

In [None]:
# Load the competition's test file (no label column) and preview it
test_csv = "/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/test.csv"
X_test_new = pd.read_csv(test_csv)
X_test_new.head()

In [None]:
# Apply to the test file the same feature engineering that was applied to the
# training data, so that `full_pipeline` receives the columns it was fitted on.

# Converting to pandas datetime
for timestamp_column in ('tpep_dropoff_datetime', 'tpep_pickup_datetime'):
    X_test_new[timestamp_column] = pd.to_datetime(X_test_new[timestamp_column])

# Create `Duration_mins`: the trip length in whole minutes.
# The absolute value subtracts the earlier timestamp from the later one even
# when pick-up and drop-off are swapped. The column is computed for all rows at
# once: the previous per-row `X_test_new["Duration_mins"][i] = ...` writes were
# chained assignment, which pandas warns about and which does not modify the
# frame when Copy-on-Write is enabled.
trip_length = (X_test_new['tpep_dropoff_datetime'] - X_test_new['tpep_pickup_datetime']).abs()

# NOTE: `.dt.seconds` is the seconds *component* of each timedelta (whole days
# are not counted); kept so the feature is defined exactly as for the training
# data. The result is cast to int, as is done for the training column.
X_test_new["Duration_mins"] = (trip_length.dt.seconds / 60).round().astype('int')

# Pickup_day: weekday name of the pick-up timestamp
X_test_new["Pickup_day"] = X_test_new['tpep_pickup_datetime'].dt.day_name()

# Drop columns that were also removed from the training data
X_test_new = X_test_new.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'store_and_fwd_flag'])

In [None]:
# Reuse the preprocessing fitted on the training split (transform only, no refit)
X_test_new_transformed = full_pipeline.transform(X_test_new) # Transforming test feature set

# Predict with the tuned XGBoost model, which this notebook identifies as its
# best model (highest validation and final scores). The MLP was used here
# before, although it scored lower on the validation split.
y_pred_new = grid_search_xgb.best_estimator_.predict(X_test_new_transformed) # Predicting on test set

In [None]:
# Build the submission frame: a 1-based running ID next to each predicted amount
row_ids = list(range(1, len(y_pred_new) + 1))
submission = pd.DataFrame({"ID": row_ids, "total_amount": y_pred_new})

# Write it without the index column, as required for a submission file
submission.to_csv('submission.csv', index=False)