## Author: Kubam Ivo
## Date : 7/16/2020

### Tutorial: Use automated machine learning to predict taxi fares

In [None]:
# Download and prepare Data
from azureml.opendatasets import NycTlcGreen
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta

In [None]:
# Sample 2,000 green-taxi trips from each of twelve consecutive months, starting
# with January 2015, and combine them into a single DataFrame.
start = datetime.strptime("1/1/2015","%m/%d/%Y")
end = datetime.strptime("1/31/2015","%m/%d/%Y")

# Collect the monthly samples in a list and concatenate once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and it
# re-copied the whole accumulated frame on every iteration.
monthly_samples = []
for sample_month in range(12):
    # Both window bounds are shifted by the same number of months from January.
    temp_df_green = NycTlcGreen(start + relativedelta(months=sample_month), end + relativedelta(months=sample_month)) \
        .to_pandas_dataframe()
    monthly_samples.append(temp_df_green.sample(2000))

# Row labels are kept as they are (no ignore_index), matching what append produced.
green_taxi_df = pd.concat(monthly_samples)

green_taxi_df.head(10)

In [None]:
# Creating various time-based features from pickup datetime field
def build_time_features(vector):
    """Derive calendar features from a row's pickup timestamp.

    Args:
        vector: One row as handed over by ``DataFrame.apply(..., axis=1)``
            (a ``pandas.Series``), or any plain sequence, whose first
            element is the pickup datetime.

    Returns:
        pandas.Series holding, in order: month number, day of month,
        day of week (the value of ``weekday()``) and hour of day.
    """
    # Rows arrive as label-indexed Series. ``vector[0]`` on such a Series depends on
    # the deprecated "integer key means position" fallback, so read the first
    # element positionally and explicitly. Plain sequences keep ordinary indexing.
    if isinstance(vector, pd.Series):
        pickup_datetime = vector.iloc[0]
    else:
        pickup_datetime = vector[0]

    month_num = pickup_datetime.month
    day_of_month = pickup_datetime.day
    day_of_week = pickup_datetime.weekday()
    hour_of_day = pickup_datetime.hour

    return pd.Series((month_num, day_of_month, day_of_week, hour_of_day))

# Run build_time_features on every row of the pickup-datetime column and store the
# four returned values as new columns on green_taxi_df.
time_feature_columns = ["month_num", "day_of_month","day_of_week", "hour_of_day"]
pickup_only_df = green_taxi_df[["lpepPickupDatetime"]]
green_taxi_df[time_feature_columns] = pickup_only_df.apply(build_time_features, axis=1)
green_taxi_df.head(10)

In [None]:
# Drop the columns that the rest of the notebook does not use. The pickup datetime
# goes too, now that the time-based features have been derived from it.
columns_to_remove = [
    "lpepPickupDatetime",
    "lpepDropoffDatetime",
    "puLocationId",
    "doLocationId",
    "extra",
    "mtaTax",
    "improvementSurcharge",
    "tollsAmount",
    "ehailFee",
    "tripType",
    "rateCodeID",
    "storeAndFwdFlag",
    "paymentType",
    "fareAmount",
    "tipAmount",
]
# Delete in place, one column at a time; a missing column raises KeyError.
for unwanted_column in columns_to_remove:
    del green_taxi_df[unwanted_column]

green_taxi_df.head(5)

In [None]:
# Summary statistics of green_taxi_df, inspected before the cleaning step that follows
green_taxi_df.describe()

In [None]:
# Cleaning the dataset: each query keeps only the rows inside its bounds, and the
# filters are applied one after another on the result of the previous one.
final_df = (
    green_taxi_df
    # pickup coordinates inside a fixed latitude/longitude box
    # (presumably the New York City area; confirm the bounds)
    .query("pickupLatitude >=40.53 and pickupLatitude<=40.88")
    .query('pickupLongitude>=-74.09 and pickupLongitude<=-73.72')
    # trip distance from 0.25 (inclusive) up to 31 (exclusive)
    .query('tripDistance>=0.25 and tripDistance<31')
    # at least one passenger and a positive total amount
    .query('passengerCount>0 and totalAmount>0')
    # the coordinates were only needed for the filters above
    .drop(["pickupLongitude", "pickupLatitude", "dropoffLongitude", "dropoffLatitude"], axis=1)
)


In [None]:
# Summary statistics of the cleaned data (final_df)
final_df.describe()

In [None]:
# Configure and connect to AML workspace
from azureml.core import Workspace
# NOTE(review): from_config() is called without arguments, so the workspace details
# must come from a saved configuration (presumably a config.json located by the SDK).
# Confirm that one is available where this notebook runs.
ws = Workspace.from_config()

In [None]:
# Splitting data into train and test sets (80% / 20%, fixed seed for repeatability)
from sklearn.model_selection import train_test_split
# NOTE: the whole of final_df is split, so despite their names both X_train and X_test
# still contain the label column "totalAmount" alongside the features.
X_train, X_test = train_test_split(final_df, test_size=0.2, random_state=123)

In [None]:
# AutoML settings

import logging

# Keyword arguments that are unpacked into the AutoMLConfig constructor.
automl_settings = {
    # time limits, in minutes, as the key names indicate
    "iteration_timeout_minutes": 2,
    "experiment_timeout_minutes": 20,
    "enable_early_stopping": True,
    # metric and featurization mode, passed through as plain strings
    "primary_metric": "spearman_correlation",
    "featurization": "auto",
    # standard-library logging level
    "verbosity": logging.INFO,
    "n_cross_validations": 5,
}

In [None]:
# Configuring AutoML
from azureml.train.automl import AutoMLConfig

# Regression task trained on X_train, with "totalAmount" as the column to predict.
# The remaining options come from the automl_settings dictionary.
automl_config = AutoMLConfig(
    task="regression",
    debug_log="automated_ml_errors.log",
    training_data=X_train,
    label_column_name="totalAmount",
    **automl_settings,
)

In [None]:
# Create the experiment in the workspace and submit the AutoML configuration to it
from azureml.core import Experiment

experiment = Experiment(workspace=ws, name='taxi-experiment')
# show_output=True is passed so progress is reported while the run executes
local_run = experiment.submit(automl_config, show_output=True)

In [None]:
# Explore the results using Jupyter widget
from azureml.widgets import RunDetails
# Show the widget for the run submitted to the experiment (local_run)
RunDetails(local_run).show()

In [None]:
# Fetch the best run and its fitted model from the AutoML run, then print both,
# one per line
best_run, fitted_model = local_run.get_output()
print(best_run, fitted_model, sep="\n")

In [None]:
# Testing the model accuracy on the held-out rows
from math import sqrt

from sklearn.metrics import mean_squared_error

# X_test is a slice of final_df and still contains the label column, so separate it
# from the features before predicting; otherwise the target itself is fed to the
# model as an input. drop() is used instead of pop() so that X_test is not mutated
# and this cell can be re-run.
y_test = X_test["totalAmount"]
x_test_features = X_test.drop(columns=["totalAmount"])

# Pass the DataFrame (not .values) so the feature columns keep the names that the
# training data had.
y_predict = fitted_model.predict(x_test_features)
print(y_predict[:10])

# Root mean squared error between actual and predicted totalAmount
rmse = sqrt(mean_squared_error(y_test, y_predict))
print("RMSE: %.3f" % rmse)

In [None]:
# Display the best run as the cell's output
best_run