In [None]:
# Load the NYC taxi sample Parquet file from the storage account and print its schema.
nyc_trip_path = 'abfss://sampledataset@mtcssynapseworkshop.dfs.core.windows.net/nyctaxi/NYCTripSmall.parquet'
df = spark.read.format('parquet').load(nyc_trip_path)
df.printSchema()

In [None]:
from datetime import datetime
# NOTE(review): this wildcard import can shadow Python builtins (e.g. sum, min,
# max, abs) with their Spark column-function equivalents in later cells.
from pyspark.sql.functions import *

# Placeholder for a development-time downsample: no sampling is applied here,
# the full DataFrame is passed through unchanged.
sampled_taxi_df = df

# Project only the columns used for modelling, then preview the first 10 rows.
taxi_columns = [
    'PassengerCount',
    'TripDistanceMiles',
    'PickupLongitude',
    'PickupLatitude',
    'DropoffLongitude',
    'DropoffLatitude',
    'PaymentType',
    'FareAmount',
    'TipAmount',
]
taxi_df = sampled_taxi_df.select(*taxi_columns)
taxi_df.show(10)

In [None]:
# Randomly split the Spark DataFrame into training and validation subsets
# using weights 0.8 / 0.2 and a fixed seed so the split is repeatable.
training_data, validation_data = taxi_df.randomSplit(weights=[0.8, 0.2], seed=223)

In [None]:
from azureml.core import Workspace

# Identify the Azure Machine Learning workspace to connect to.
# Replace these three values with your own subscription, resource group and
# workspace name; you should be owner or contributor on the subscription and
# the resource group.
subscription_id = "89da9f33-fd31-4ece-861e-5fab7af4dc11"
resource_group = "mtcs-dev-aml-rg"
workspace_name = "mtcs-dev-aml"

# Build the workspace handle used by the following cells.
ws = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name)

In [None]:
import pandas
from azureml.core import Dataset

# Get the Azure Machine Learning default datastore
datastore = ws.get_default_datastore()

# Collect the Spark training split into pandas and write it to a local CSV.
# DataFrame.to_csv() returns None when it is given a path, so the DataFrame is
# kept in training_pd and the file is written in a separate statement.
training_pd = training_data.toPandas()
training_pd.to_csv('training_pd.csv', index=False)

# Upload the CSV to the datastore, then expose the uploaded file as an
# Azure Machine Learning tabular dataset.
datastore.upload_files(files = ['training_pd.csv'],
                       target_path = 'train-dataset/tabular/',
                       overwrite = True,
                       show_progress = True)
dataset_training = Dataset.Tabular.from_delimited_files(path = [(datastore, 'train-dataset/tabular/training_pd.csv')])

In [None]:
import logging

# Keyword settings later unpacked into the AutoML configuration.
# The two timeouts are deliberately short: they are workshop values only.
automl_settings = dict(
    iteration_timeout_minutes=1,
    experiment_timeout_minutes=15,
    enable_early_stopping=True,
    primary_metric='r2_score',
    featurization='auto',
    verbosity=logging.INFO,
    n_cross_validations=2,
)

In [None]:
from azureml.train.automl import AutoMLConfig

# Name of the column the regression model is trained to predict.
label = "TipAmount"

# Assemble the AutoML regression configuration from the uploaded training
# dataset, the notebook's Spark context and the shared automl_settings.
automl_config = AutoMLConfig(
    task='regression',
    training_data=dataset_training,
    label_column_name=label,
    spark_context=sc,
    model_explainability=False,
    debug_log='automated_ml_errors.log',
    **automl_settings)

In [None]:
from azureml.core.experiment import Experiment

# Name and tag the experiment, then submit the AutoML configuration to it.
expName = "hyssh-aml-synapse"
tags = {"Synapse": "regression"}
experiment = Experiment(workspace=ws, name=expName)
local_run = experiment.submit(automl_config, tags=tags, show_output=True)

# Keep the detailed output of the run for later inspection.
run_details = local_run.get_details()

## It'll take around 40 mins to finish the training

# Cancel run to save time
To save time, please go to [AML Studio](https://ml.azure.com/experiments) and cancel your experiment.

![cancel](https://github.com/hyssh/synapse-workshop21/raw/master/images/cancelRun.png)

Remember to look for the experiment name you defined in the previous cell:
```
##Replace###-aml-synapse-regression
```

In [None]:
# Retrieve the best run and its fitted model from the AutoML run submitted earlier.

best_run, fitted_model = local_run.get_output()

In [None]:
# Score the validation split with the best model.
validation_data_pd = validation_data.toPandas()

# Remove the label column from the features in place and keep it as a
# one-column DataFrame of actual values.
label_series = validation_data_pd.pop(label)
y_test = label_series.to_frame()

# Predict on the remaining feature columns.
y_predict = fitted_model.predict(validation_data_pd)

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Flatten the actual label values into a plain Python list.
y_actual = y_test.values.flatten().tolist()

# Root-mean-square error: square root of the mean squared error.
mse = mean_squared_error(y_actual, y_predict)
rmse = sqrt(mse)

print("Root Mean Square Error:", rmse, sep="\n")

In [None]:
import numpy as np

# Calculate the aggregate absolute-percent error and the derived "accuracy".
# NOTE: the value computed here is sum(|actual - predicted|) / sum(actual),
# i.e. the weighted form of MAPE, not a per-row average of percentage errors.
actual_values = np.asarray(y_actual, dtype=float).ravel()
predicted_values = np.asarray(y_predict, dtype=float).ravel()

# numpy is used explicitly rather than the sum()/abs() builtins.
sum_errors = float(np.abs(actual_values - predicted_values).sum())
sum_actuals = float(actual_values.sum())

# Avoid a ZeroDivisionError when the actual values sum to zero (for example an
# empty validation split); the ratio is undefined in that case.
if sum_actuals == 0:
    mean_abs_percent_error = float('nan')
else:
    mean_abs_percent_error = sum_errors / sum_actuals

print("Model MAPE:")
print(mean_abs_percent_error)
print()
print("Model Accuracy:")
print(1 - mean_abs_percent_error)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Calculate the R2 score by using the predicted and actual tip amounts
y_test_actual = y_test[label]
r2 = r2_score(y_test_actual, y_predict)

# Endpoints of the diagonal reference line (perfect prediction), computed once.
axis_min, axis_max = np.min(y_test_actual), np.max(y_test_actual)

# Plot the actual versus predicted tip amount values
plt.style.use('ggplot')
plt.figure(figsize=(10, 7))
plt.scatter(y_test_actual, y_predict)
plt.plot([axis_min, axis_max], [axis_min, axis_max], color='lightblue')
plt.xlabel("Actual Tip Amount")
plt.ylabel("Predicted Tip Amount")
# The title names the tip amount so it agrees with the axes and the label column.
plt.title("Actual vs Predicted Tip Amount R^2={}".format(r2))
plt.show()

In [None]:
# Register the best run's model file under a fixed model name, then report
# the registered name and version.
model_path = 'outputs/model.pkl'
description = 'My automated ML model'
model = best_run.register_model(
    model_name='NYCGreenTaxiModel',
    model_path=model_path,
    description=description)
print(model.name, model.version)