In [None]:
!pip install azure-ai-ml

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import azureml.core


# Show which azureml.core version the kernel has loaded.
sdk_version = azureml.core.VERSION
print("SDK version:", sdk_version)


In [None]:
# Handle to the workspace
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

# A config.json holding subscription_id, workspace_name and resource_group
# has to exist before this cell runs; from_config takes the workspace from it.
credential = DefaultAzureCredential()

# Connect to the workspace.
ml_client = MLClient.from_config(credential)

print(ml_client)

In [None]:
# to get larger datasets: http://jmcauley.ucsd.edu/data/amazon/

In [None]:
# Download the gzipped Automotive reviews file; -P stores it under data/.
!wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Automotive_5.json.gz -P data/

In [None]:
import pandas as pd
import gzip

def parse(path):
  """Yield one record per non-blank line of a gzip-compressed text file.

  Each line is expected to hold a single dict written either as a Python
  literal or as strict JSON.

  Args:
    path: Location of the ``.gz`` file to read.

  Yields:
    The object decoded from each non-blank line, in file order.

  Raises:
    ValueError: If a line is neither a Python literal nor valid JSON.
  """
  import ast
  import json

  # Text mode with an explicit encoding; the context manager closes the file
  # once the generator is exhausted or discarded.
  with gzip.open(path, 'rt', encoding='utf-8') as g:
    for l in g:
      l = l.strip()
      if not l:
        continue  # tolerate blank lines instead of failing on them
      try:
        # literal_eval only accepts literals, so downloaded data cannot
        # execute arbitrary code the way eval() would allow.
        record = ast.literal_eval(l)
      except (ValueError, SyntaxError):
        # Strict JSON (true/false/null) is not a Python literal.
        record = json.loads(l)
      yield record

def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Records are keyed by their 0-based position in the stream, and that key
  becomes the row index of the returned frame.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Read the downloaded reviews into a single frame and show its dimensions.
reviews_path = 'data/reviews_Automotive_5.json.gz'
pdf_main = getDF(reviews_path)
pdf_main.shape

In [None]:
# Binary label derived from the 'overall' rating: >= 4 becomes 1, < 3 becomes 0.
# Rows matching neither mask (e.g. a rating of exactly 3) get no value here.
overall_rating = pdf_main['overall']
is_positive = overall_rating >= 4
is_negative = overall_rating < 3

pdf_main.loc[is_positive, 'sentiment'] = 1
pdf_main.loc[is_negative, 'sentiment'] = 0

pdf_main.head()

In [None]:
def generate_datasets(pdf_target_training, label = 'sentiment', random_state = None):
    """Split a labelled frame into stratified train / validation / test frames.

    20% of the rows are held out and then halved, giving an 80/10/10 split.
    Both splits are shuffled and stratified on the label column.

    Args:
        pdf_target_training: DataFrame holding the feature columns and the
            label column.
        label: Name of the label column. It is removed before splitting and
            re-attached under the same name as the last column of each result.
        random_state: Optional seed forwarded to both ``train_test_split``
            calls. The default ``None`` leaves the split unseeded.

    Returns:
        Tuple ``(pdf_X_train, pdf_X_val, pdf_X_test)`` of DataFrames.
    """
    features = pdf_target_training.drop(label, axis=1)
    target = pdf_target_training[label]

    X_train, X_test_val, y_train, y_test_val = train_test_split(
        features, target,
        stratify=target,
        shuffle=True,
        test_size=0.20,
        random_state=random_state)

    X_val, X_test, y_val, y_test = train_test_split(
        X_test_val, y_test_val,
        stratify=y_test_val,
        shuffle=True,
        test_size=0.5,
        random_state=random_state)

    def _with_label(X, y):
        # Work on a copy so nothing is assigned onto a frame returned by
        # train_test_split, and honour `label` instead of a fixed column name.
        pdf = X.copy()
        pdf[label] = y
        return pdf

    pdf_X_train = _with_label(X_train, y_train)
    pdf_X_val = _with_label(X_val, y_val)
    pdf_X_test = _with_label(X_test, y_test)

    print(f'Total records for: "pdf_X_train": [{pdf_X_train.shape[0]}]')
    print(f'Total records for: "pdf_X_val": [{pdf_X_val.shape[0]}]')
    print(f'Total records for: "pdf_X_test": [{pdf_X_test.shape[0]}]')

    return pdf_X_train, pdf_X_val, pdf_X_test

In [None]:
# Keep only the text and label columns and drop rows where either is missing.
pdf_labeled = pdf_main[['reviewText', 'sentiment']].dropna()
pdf_train, pdf_val, pdf_test = generate_datasets(pdf_labeled, 'sentiment')

## Prepare MLTable data and register Datasets

In [None]:
import os

# Local folder that holds the MLTable definition and the CSV it points at.
data_dir = "./data/table"
# exist_ok=True keeps the cell re-runnable when the folder already exists.
os.makedirs(data_dir, exist_ok=True)


In [None]:
%%writefile {data_dir}/MLTable
# The CSV in this folder is written by pandas DataFrame.to_csv, which encodes
# as UTF-8 by default, so it is read back as utf8 rather than ascii.
paths:
  - file: ./*.csv
transformations:
  - read_delimited:
      delimiter: ','
      encoding: 'utf8'
    

In [None]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes


def register_mltable(pdf, name, description):
    """Write `pdf` as the folder's data.csv and register the folder as an MLTable asset.

    Args:
        pdf: DataFrame to export.
        name: Name of the data asset to create or update.
        description: Description stored with the asset.

    Returns:
        The object returned by ``ml_client.data.create_or_update``, i.e. the
        registered asset rather than the local definition.
    """
    pdf.to_csv(os.path.join(data_dir, "data.csv"))
    asset = Data(
        path=data_dir,
        type=AssetTypes.MLTABLE,
        description=description,
        name=name
    )
    return ml_client.data.create_or_update(asset)


# All three splits reuse the same folder, so data.csv is overwritten before
# each registration.
# NOTE(review): this relies on create_or_update uploading the folder contents
# before it returns, exactly as the original cell did — confirm.

# Training set
ds_train_set = register_mltable(
    pdf_train, "train_setv2",
    "Small amazon review for sentiment analysis [train set]")

# Validation set
ds_validate_set = register_mltable(
    pdf_val, "validate_setv2",
    "Small amazon review for sentiment analysis [validate set]")

# Test set
ds_test_set = register_mltable(
    pdf_test, "test_setv2",
    "Small amazon review for sentiment analysis [test set]")

# Show the last registration result as the cell output.
ds_test_set

In [None]:
from azure.ai.ml.entities import AmlCompute
from azure.ai.ml.entities import Environment

# Retrieve an already attached Azure Machine Learning Compute.
cluster_name = "gpucluster"
compute_target = ml_client.compute.get(cluster_name)
print(compute_target)

# Fetch the existing environment from the workspace. An Environment entity
# built from a name alone carries no image or build context, so it cannot
# stand in for an environment that already exists.
env = ml_client.environments.get(name="AzureML-AutoML-DNN-Text-GPU", label="latest")

In [None]:

from azure.ai.ml import Input, command
from azure.ai.ml.entities import Data, Environment
from azure.ai.ml.constants import AssetTypes

source_directory = "./project"

# Input names use underscores so that every ${{inputs.<name>}} binding in the
# command string refers to a key that exists in this dict. The CLI flags passed
# to the script keep their hyphenated spelling.
# Dataset inputs are declared as MLTable assets and referenced by asset name,
# which works whether ds_* holds the local definition or the registered asset.
job_inputs = {
    "training_dataset": Input(type=AssetTypes.MLTABLE, path=f"azureml:{ds_train_set.name}@latest"),
    "val_dataset": Input(type=AssetTypes.MLTABLE, path=f"azureml:{ds_validate_set.name}@latest"),
    "test_dataset": Input(type=AssetTypes.MLTABLE, path=f"azureml:{ds_test_set.name}@latest"),
    "target_name": 'sentiment',
    "text_field": 'reviewText',
    "is_test": 0,
    "is_final": 0,
    "is_jump": 1,
    "is_local": 0,
    "evaluation_strategy": "epoch",
}

# NOTE(review): only four inputs are forwarded on the command line, as in the
# original cell; confirm against train_transformer.py whether the remaining
# ones (text_field, is_test, ...) should be passed as flags too.
job = command(
    code=source_directory,
    # The script has to be launched through the interpreter.
    command=(
        "python train_transformer.py"
        " --val-dataset ${{inputs.val_dataset}}"
        " --training-dataset ${{inputs.training_dataset}}"
        " --test-dataset ${{inputs.test_dataset}}"
        " --target-name ${{inputs.target_name}}"
    ),
    inputs=job_inputs,
    environment=env,
    compute=cluster_name,
)


In [None]:
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.train.hyperdrive import choice, loguniform

# Search space for the HyperDrive sweep. Every entry except the learning rate
# offers a single candidate value.
# NOTE(review): 5.5e-5 appears twice among the learning-rate candidates —
# confirm whether the repetition is intended.
hyperparameter_space = {
    '--base-checkpoint': choice("bert-base-cased"),
    '--batch-size': choice(8),
    '--no-epochs': choice(4),
    '--learning-rate': choice(5.5e-5, 5e-5, 4.5e-5, 4e-5, 5.5e-5, 6e-5, 3.5e-5, 6.5e-5),
    '--warmup-steps': choice(0),
    '--weight-decay': choice(0.0),
    '--adam-beta1': choice(0.9),
    '--adam-beta2': choice(0.999),
    '--adam-epsilon': choice(1e-8),
}

ps = RandomParameterSampling(hyperparameter_space)


In [None]:
# Early-termination policy and sweep configuration for the HyperDrive run.
# NOTE(review): confirm that evaluation_interval=5 is ever reached, given that
# '--no-epochs' is fixed at 4 in the sampling space.
policy = BanditPolicy(evaluation_interval=5, slack_factor=0.1)
# NOTE(review): `src` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. Define the run configuration for
# train_transformer.py before this cell.
hyperdrive_config = HyperDriveConfig(run_config=src,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                     primary_metric_name='eval_f1_weighted',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=20,
                                     max_concurrent_runs=3)


In [None]:
from azureml.pipeline.steps import HyperDriveStep, HyperDriveStepRun, PythonScriptStep

# Wrap the HyperDrive configuration as a pipeline step.
hd_step_name = 'HyperDrive_Step'
hd_step = HyperDriveStep(name=hd_step_name, hyperdrive_config=hyperdrive_config, allow_reuse=True)


In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# `ws` (an SDK v1 Workspace handle) is required by the v1 pipeline objects in
# the following cells but was never created in this notebook. from_config
# presumably picks up the same config.json used for MLClient — confirm.
ws = azureml.core.Workspace.from_config()

# NOTE(review): despite its name, this variable refers to the "gpucluster"
# compute target.
cpu_compute = ComputeTarget(workspace=ws, name="gpucluster")

In [None]:
# At this point the bare name `Environment` is bound to the SDK v2 entity class
# (imported from azure.ai.ml.entities), which is not what a workspace= lookup
# needs. The SDK v1 class is addressed explicitly through azureml.core.
# Requires `ws` to be an SDK v1 Workspace.
env_cpu = azureml.core.Environment.get(workspace=ws, name="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu")

In [None]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Run configuration for this script step, using the env_cpu environment.
rcfg = RunConfiguration()
rcfg.environment = env_cpu

# Command-line arguments handed to register_model.py.
register_model_args = [
    '--is-test', 0,
    '--test-run-id', '',
    '--metric-name', 'test_f1_weighted',
    '--second-metric', 'test_f1',
    '--target-name', 'sentiment',
    '--model-name', 'sentiment_classifier',
]

register_model_step = PythonScriptStep(
    script_name='register_model.py',
    source_directory=source_directory,
    name="Register_Best_Model",
    compute_target=cpu_compute,
    arguments=register_model_args,
    allow_reuse=True,
    runconfig=rcfg,
)

# Order the registration step after the HyperDrive step.
register_model_step.run_after(hd_step)


In [None]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Fresh run configuration for the deployment step, again using env_cpu.
rcfg = RunConfiguration()
rcfg.environment = env_cpu

# Command-line arguments handed to deploy_model.py.
deploy_model_args = [
    '--endpoint-name', 'sentiment-endpoint',
    '--model-name', 'sentiment_classifier',
]

deploy_model_step = PythonScriptStep(
    script_name='deploy_model.py',
    source_directory=source_directory,
    name="Deploy_Latest_Model",
    compute_target=cpu_compute,
    arguments=deploy_model_args,
    allow_reuse=True,
    runconfig=rcfg,
)

# Order the deployment step after the registration step.
deploy_model_step.run_after(register_model_step)


In [None]:
# Experiment and Pipeline were used here without ever being imported.
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

exp = Experiment(workspace=ws, name='transformer_hp')
# Only the last step is listed. The earlier steps are presumably pulled in
# through the run_after chain set up on the steps — confirm.
steps = [deploy_model_step]
pipeline = Pipeline(workspace=ws, steps=steps)


In [None]:
# Submit the pipeline under the experiment created above.
pipeline.submit(experiment_name=exp.name, credential_passthrough=True)


In [None]:
from datetime import datetime

# Minute-resolution timestamp embedded in the published pipeline's name.
timenow = datetime.now().strftime('%Y-%m-%d-%H-%M')
pipeline_name = f"Sentiment-Classifier-{timenow}-Pipeline"
print(pipeline_name)

# The generated name is used as the description too.
published_pipeline = pipeline.publish(name=pipeline_name, description=pipeline_name)
print("Newly published pipeline id: {}".format(published_pipeline.id))