In [1]:
!az login

^C


In [3]:
!az account show

{
  "environmentName": "AzureCloud",
  "homeTenantId": "cf36141c-ddd7-45a7-b073-111f66d0b30c",
  "id": "1d374132-87d3-49d8-a13a-910b42de3dde",
  "isDefault": true,
  "managedByTenants": [],
  "name": "Visual Studio Professional Subscription",
  "state": "Enabled",
  "tenantId": "cf36141c-ddd7-45a7-b073-111f66d0b30c",
  "user": {
    "name": "h.g.gunasekaran@avanade.com",
    "type": "user"
  }
}


In [14]:
import os
import argparse
import mlflow
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import AmlCompute

In [5]:
# authenticate
credential = DefaultAzureCredential()
# # Get a handle to the workspace
ml_client = MLClient(
    credential=credential,
    subscription_id="1d374132-87d3-49d8-a13a-910b42de3dde",
    resource_group_name="capstoneamazonrg",
    workspace_name="capstoneamazonml"
)

In [7]:
dependencies_dir = "./dependencies"
os.makedirs(dependencies_dir, exist_ok=True)

In [123]:
%%writefile {dependencies_dir}/dataprep.yml
name: dataprep
channels:
- anaconda
- conda-forge
dependencies:
- python=3.11.5
- pip=21.3.1
- pandas~=1.3.0
- scipy~=1.10.0
- numpy~=1.22.0
- pip:
  - wheel~=0.38.1
  - azureml-core==1.55.0.post1
  - azureml-defaults==1.55.0
  - azureml-mlflow==1.55.0
  - azureml-telemetry==1.55.0
  - azureml-automl-common-tools==1.55.0
  - scikit-learn~=1.0.0
  - joblib~=1.2.0
  # vulnerabilities
  - cryptography>=42.0.4
  - certifi >= 2023.07.22
  - requests >= 2.31.0
  - nltk==3.8.1
  - mlfow==2.10.2

Overwriting ./dependencies/dataprep.yml


In [8]:
%%writefile {dependencies_dir}/dataprep.yml
channels:
  - anaconda
  - conda-forge
dependencies:
  - python=3.9
  - pip=21.3.1
  - pandas~=1.3.0
  - scipy~=1.10.0
  - numpy~=1.22.0
  - pip:
      - wheel~=0.38.1
      - azureml-core==1.55.0.post1
      - azureml-defaults==1.55.0
      - azureml-mlflow==1.55.0
      - azureml-telemetry==1.55.0
      - azureml-automl-common-tools==1.55.0
      - scikit-learn~=1.0.0
      - joblib~=1.2.0
      - nltk==3.8.1
name: dataprep

Overwriting ./dependencies/dataprep.yml


In [124]:
from azure.ai.ml.entities import Environment

data_prep_env_name = "data_prep_env"

data_prep_job_env = Environment(
    name=data_prep_env_name,
    description="For data preparation",
    tags={"data_prep": "0.1"},
    conda_file=os.path.join(dependencies_dir, "dataprep.yml"),
    image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest",
)
data_prep_job_env = ml_client.environments.create_or_update(data_prep_job_env)

print(
    f"Environment with name {data_prep_job_env.name} is registered to workspace, the environment version is {data_prep_job_env.version}"
)

Environment with name data_prep_env is registered to workspace, the environment version is 1


In [9]:
# Name assigned to the compute cluster
cpu_compute_target = "di-cluster"

try:
    # let's see if the compute target already exists
    cpu_cluster = ml_client.compute.get(cpu_compute_target)
    print(
        f"You already have a cluster named {cpu_compute_target}, we'll reuse it as is."
    )

except Exception:
    print("Creating a new cpu compute target...")

    # Let's create the Azure Machine Learning compute object with the intended parameters
    cpu_cluster = AmlCompute(
        name=cpu_compute_target,
        # Azure Machine Learning Compute is the on-demand VM service
        type="amlcompute",
        # VM Family
        size="STANDARD_DS11_V2",
        # Minimum running nodes when there is no job running
        min_instances=0,
        # Nodes in cluster
        max_instances=1,
        # How many seconds will the node running after the job termination
        idle_time_before_scale_down=120,
        # Dedicated or LowPriority. The latter is cheaper but there is a chance of job termination
        tier="Dedicated",
    )
    print(
        f"AMLCompute with name {cpu_cluster.name} will be created, with compute size {cpu_cluster.size}"
    )
    # Now, we pass the object to MLClient's create_or_update method
    cpu_cluster = ml_client.compute.begin_create_or_update(cpu_cluster)

Creating a new cpu compute target...
AMLCompute with name di-cluster will be created, with compute size STANDARD_DS11_V2


In [32]:
# Read Amazon Fine Food Review
csv_path = 'Data/Raw/amazon_fine_food_reviews.csv'
reviews = pd.read_csv(csv_path, header=0, index_col=0)

In [33]:
reviews.head(10)

Unnamed: 0_level_0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,4,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...
7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,5,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...
8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,5,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...
9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...
10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,5,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...


In [None]:
reviews_small = reviews[1:101]
csv_path_small = 'Data/Raw/amazon_fine_food_reviews_small.csv'
reviews_small.to_csv(csv_path)

In [108]:
import datetime
import pandas as pd
import re
import nltk
from nltk.tokenize import RegexpTokenizer 
from nltk.stem import WordNetLemmatizer,PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords

# Create deduplicated frame
# First by sorting the values by their ProductId
# Second by dropping duplicates using all columns (which in proven cases are mostly similar) but the product id (which we have seen it differs)
# reviews_dd = reviews.sort_values(by='ProductId').drop_duplicates(subset=['UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'])
# reviews_dd = reviews.sort_values(by='ProductId').drop_duplicates(subset=['UserId', 'ProfileName', 'Score', 'Time', 'Summary', 'Text'])
# Either keeping or removing Time is valid depending on the approach
reviews_dd = reviews.sort_values(by='ProductId').drop_duplicates(subset=['UserId', 'Text'])

# Convert time
reviews_dd['DateTime']=reviews_dd['Time'].apply(lambda x: datetime.datetime.fromtimestamp(x))

# Remove null values
reviews_dd.dropna(inplace=True)

reviews_dd['Year']=reviews_dd['Time'].apply(lambda x: datetime.datetime.fromtimestamp(x).year)

# Create empty column for binarized score
reviews_dd['binary_score'] = 0

# Replace only the positive reviews by 1, leave the others with 0 (being negative)
reviews_dd.loc[reviews_dd['Score'] > 3,'binary_score'] = 1

# Lemmatization and Stemming
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english') # SnowballStemmer() and other options
nltk.download('stopwords')

#Removing the word 'not' from stopwords
default_stopwords = set(stopwords.words('english'))
#excluding some useful words from stop words list as we doing sentiment analysis
excluding = set(['against','not','don', "don't",'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
             'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 
             'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",'shouldn', "shouldn't", 'wasn',
             "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"])

custom_stopwords = default_stopwords - excluding

reviews_dd['CleanedText'] = reviews_dd['Text'].apply(lambda x:preprocess(x))

# Removed SWR
reviews_dd['CleanedSwrText'] = reviews_dd['Text'].apply(lambda x:prepare_text(x))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\h.g.gunasekaran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
dataprep_dir = "./AzureML/DataPrep"
os.makedirs(dataprep_dir, exist_ok=True)

In [65]:
%%writefile {dataprep_dir}/dataprep.py
import argparse
import datetime
import pandas as pd
import mlflow
import mlflow.sklearn
import re
import nltk
from nltk.tokenize import RegexpTokenizer 
from nltk.stem import WordNetLemmatizer,PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB         # Naive Bayes
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import os

def preprocess(sentence):
     
    sentence = str(sentence)
    sentence = sentence.lower()
    # sentence = sentence.replace('{html}',"") 
    # For Regex Pattern Object
    cleanr = re.compile(r'<\s*[^>]*\s*>')
    cleantext = re.sub(cleanr, ' ', sentence)
    rem_hyp = re.sub(r'(\w+)-(\w+)', r'\1 \2', cleantext)
    rem_punc = re.sub(r'[^\w\s]', '', rem_hyp)
    # re_clean = re.sub(r'[^a-z0-9A-Z_]',' ', cleantext)
    rem_http = re.sub(r'http\S+', '', rem_punc)
    rem_url = re.sub(r"www.\S+", " ", rem_http)
    rem_pat = re.sub("\s*\b(?=\w*(\w)\1{2,})\w*\b",' ', rem_url)
    rem_num = re.sub('[0-9]+', '', rem_pat)

    return rem_num

def tokenize_data(text):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)  
    return tokens

def remove_stop_words(cleant_text):
    
    # Lemmatization and Stemming
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer('english') # SnowballStemmer() and other options
    nltk.download('stopwords')

    #Removing the word 'not' from stopwords
    default_stopwords = set(stopwords.words('english'))
    #excluding some useful words from stop words list as we doing sentiment analysis
    excluding = set(['against','not','don', "don't",'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
                 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 
                 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",'shouldn', "shouldn't", 'wasn',
                 "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"])

    custom_stopwords = default_stopwords - excluding
    
    tokens = tokenize_data(cleant_text)
    filtered_words = [w for w in tokens if len(w) > 2 if not w in custom_stopwords]
    # stem_words=[stemmer.stem(w) for w in filtered_words]
    # lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    
    return " ".join(filtered_words)

def prepare_text(text):
    
    cleant_text = preprocess(text)
    remove_sw = remove_stop_words(cleant_text)

    return remove_sw

def main():
    """Main function of the script."""

    # input and output arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, help="path to input data")
    parser.add_argument("--test_train_ratio", type=float, required=False, default=0.25)
    parser.add_argument("--train_data", type=str, help="path to train data")
    parser.add_argument("--test_data", type=str, help="path to test data")
    args = parser.parse_args()
   
    # Start Logging
    mlflow.start_run()

    # enable autologging
    mlflow.sklearn.autolog()

    ###################
    #<prepare the data>
    ###################
    print(" ".join(f"{k}={v}" for k, v in vars(args).items()))

    print("input data:", args.data)
    
    reviews = pd.read_csv(args.data, header=0, index_col=0)

    mlflow.log_metric("num_samples", reviews.shape[0])
    mlflow.log_metric("num_features", reviews.shape[1] - 1)
    
    print(reviews.columns)
    
    # Create deduplicated frame
    # First by sorting the values by their ProductId
    # Second by dropping duplicates using all columns (which in proven cases are mostly similar) but the product id (which we have seen it differs)
    # reviews_dd = reviews.sort_values(by='ProductId').drop_duplicates(subset=['UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'])
    # reviews_dd = reviews.sort_values(by='ProductId').drop_duplicates(subset=['UserId', 'ProfileName', 'Score', 'Time', 'Summary', 'Text'])
    # Either keeping or removing Time is valid depending on the approach
    reviews_dd = reviews.sort_values(by='ProductId').drop_duplicates(subset=['UserId', 'Text'])

    # Convert time
    reviews_dd['DateTime']=reviews_dd['Time'].apply(lambda x: datetime.datetime.fromtimestamp(x))

    # Remove null values
    reviews_dd.dropna(inplace=True)

    reviews_dd['Year']=reviews_dd['Time'].apply(lambda x: datetime.datetime.fromtimestamp(x).year)

    # Create empty column for binarized score
    reviews_dd['binary_score'] = 0
    
    # Replace only the positive reviews by 1, leave the others with 0 (being negative)
    reviews_dd.loc[reviews_dd['Score'] > 3,'binary_score'] = 1

    reviews_dd['CleanedText'] = reviews_dd['Text'].apply(lambda x:preprocess(x))

    # Removed SWR
    reviews_dd['CleanedSwrText'] = reviews_dd['Text'].apply(lambda x:prepare_text(x))
    
    print(reviews_dd.head(10))
    
    reviews_dd_train, reviews_dd_test = train_test_split(
        reviews_dd,
        test_size=args.test_train_ratio,
    )

    # output paths are mounted as folder, therefore, we are adding a filename to the path
    reviews_dd_train.to_csv(os.path.join(args.train_data, "data.csv"), index=False)

    reviews_dd_test.to_csv(os.path.join(args.test_data, "data.csv"), index=False)

    ####################
    #</prepare the data>
    ####################

    # Stop Logging
    mlflow.end_run()

if __name__ == "__main__":
    main()

Overwriting ./AzureML/DataPrep/dataprep.py


In [86]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output
from azure.ai.ml import load_component

data_prep_component = command(
    name="data_prep",
    display_name="Data preparation for training",
    description="",
    inputs={
        "data": Input(type="uri_folder"),
        "test_train_ratio": Input(type="number"),
    },
    outputs=dict(
        train_data=Output(type="uri_folder", mode="rw_mount"),
        test_data=Output(type="uri_folder", mode="rw_mount"),
    ),
    # The source folder of the component
    code="./AzureML/DataPrep/",  # location of source code
    command="""python dataprep.py \
            --data ${{inputs.data}} --test_train_ratio ${{inputs.test_train_ratio}} \
            --train_data ${{outputs.train_data}} --test_data ${{outputs.test_data}} \
            """,
    environment=f"{data_prep_job_env.name}:7",
)

In [None]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB         # Naive Bayes
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

reviews_model = reviews_dd.drop(['ProductId', 'UserId', 'ProfileName',
       'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time',
       'Summary', 'Text', 'DateTime', 'Year',
       'CleanedText'], axis=1)
reviews_model.dropna(inplace=True)

reviews_model.head()

# Define training datat and labels
X = reviews_model['CleanedSwrText']
y = reviews_model['binary_score']

# Split the new DataFrame into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30 ,random_state=99)

# Use CountVectorizer to convert reviews into matrices
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)

# equivalent to:
# vect.fit(X_train) # words that are on the training set
# X_train_dtm = vect.transform(X_train)

# Perform the same in test
X_test_dtm = vect.transform(X_test)

# Use Naive Bayes to predict binary score
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

# Calculate accuracy.
print((metrics.accuracy_score(y_test, y_pred_class)))

print(classification_report(y_test, y_pred_class))

In [24]:
train_dir = "./AzureML/Train"
os.makedirs(train_dir, exist_ok=True)

In [77]:
%%writefile {train_dir}/train.py
import argparse
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB         # Naive Bayes
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import os
import pandas as pd
import mlflow


def select_first_file(path):
    """Selects first file in folder, use under assumption there is only one file in folder
    Args:
        path (str): path to directory or file to choose
    Returns:
        str: full path of selected file
    """
    files = os.listdir(path)
    return os.path.join(path, files[0])


# Start Logging
mlflow.start_run()

# enable autologging
mlflow.sklearn.autolog()

os.makedirs("./outputs", exist_ok=True)


def main():
    """Main function of the script."""

    # input and output arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_data", type=str, help="path to train data")
    parser.add_argument("--test_data", type=str, help="path to test data")
    parser.add_argument("--registered_model_name", type=str, help="model name")
    parser.add_argument("--model", type=str, help="path to model file")
    args = parser.parse_args()

    # paths are mounted as folder, therefore, we are selecting the file from folder
    train_df = pd.read_csv(select_first_file(args.train_data))

    # Extracting the label column
    y_train = train_df.pop("binary_score")

    # convert the dataframe values to array
    X_train = train_df.pop("CleanedSwrText")

    # paths are mounted as folder, therefore, we are selecting the file from folder
    test_df = pd.read_csv(select_first_file(args.test_data))

    # Extracting the label column
    y_test = test_df.pop("binary_score")

    # convert the dataframe values to array
    X_test = test_df.pop("CleanedSwrText")

    print(f"Training with data of shape {X_train.shape}")
    
    # Use CountVectorizer to convert reviews into matrices
    vect = CountVectorizer()
    X_train_dtm = vect.fit_transform(X_train)

    # equivalent to:
    # vect.fit(X_train) # words that are on the training set
    # X_train_dtm = vect.transform(X_train)

    # Perform the same in test
    X_test_dtm = vect.transform(X_test)

    # Use Naive Bayes to predict binary score
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred = nb.predict(X_test_dtm)
    
    # Calculate accuracy.
    print((metrics.accuracy_score(y_test, y_pred)))

    print(classification_report(y_test, y_pred))

    # Registering the model to the workspace
    print("Registering the model via MLFlow")
    mlflow.sklearn.log_model(
        sk_model=nb,
        registered_model_name=args.registered_model_name,
        artifact_path=args.registered_model_name,
    )

    # Saving the model to a file
    mlflow.sklearn.save_model(
        sk_model=nb,
        path=os.path.join(args.model, "trained_model"),
    )

    # Stop Logging
    mlflow.end_run()


if __name__ == "__main__":
    main()

Overwriting ./AzureML/Train/train.py


In [78]:
%%writefile {train_dir}/train.yml
# <component>
name: review_classifier_model
display_name: Review Classifier Model
# version: 1 # Not specifying a version will automatically update the version
type: command
inputs:
  train_data: 
    type: uri_folder
  test_data: 
    type: uri_folder  
  registered_model_name:
    type: string
outputs:
  model:
    type: uri_folder
code: .
environment:
  # for this step, we'll use an AzureML curate environment
  azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu:1
command: >-
  python train.py 
  --train_data ${{inputs.train_data}} 
  --test_data ${{inputs.test_data}} 
  --registered_model_name ${{inputs.registered_model_name}} 
  --model ${{outputs.model}}
# </component>

Overwriting ./AzureML/Train/train.yml


In [79]:
# importing the Component Package
from azure.ai.ml import load_component

# Loading the component from the yml file
train_component = load_component(source=os.path.join(train_dir, "train.yml"))

# Now we register the component to the workspace
train_component = ml_client.create_or_update(train_component)

# Create (register) the component in your workspace
print(
    f"Component {train_component.name} with Version {train_component.version} is registered"
)

Uploading Train (0.0 MBs): 100%|##########| 3886/3886 [00:00<00:00, 8237.70it/s]




Component review_classifier_model with Version 2024-03-07-14-24-11-2352776 is registered


In [82]:
# the dsl decorator tells the sdk that we are defining an Azure Machine Learning pipeline
from azure.ai.ml import dsl, Input, Output


@dsl.pipeline(
    compute=cpu_compute_target,
    description="E2E data_perp-train pipeline",
)
def reviews_rating_pipeline(
    pipeline_job_data_input,
    pipeline_job_test_train_ratio,
    pipeline_job_registered_model_name,
):
    # using data_prep_function like a python call with its own inputs
    data_prep_job = data_prep_component(
        data=pipeline_job_data_input,
        test_train_ratio=pipeline_job_test_train_ratio,
    )

    # using train_func like a python call with its own inputs
    train_job = train_component(
        train_data=data_prep_job.outputs.train_data,  # note: using outputs from previous step
        test_data=data_prep_job.outputs.test_data,  # note: using outputs from previous step
        registered_model_name=pipeline_job_registered_model_name,
    )

    # a pipeline returns a dictionary of outputs
    # keys will code for the pipeline output identifier
    return {
        "pipeline_job_train_data": data_prep_job.outputs.train_data,
        "pipeline_job_test_data": data_prep_job.outputs.test_data,
    }

In [83]:
import pandas as pd
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

ml_client = MLClient.from_config(credential=DefaultAzureCredential())
data_asset = ml_client.data.get("AmazonFineFoodReivewsSmall", version="1")

registered_model_name = "reviews_rating_classifier"

# Let's instantiate the pipeline with the parameters of our choice
pipeline = reviews_rating_pipeline(
    pipeline_job_data_input=Input(type="uri_file", path=data_asset.path),
    pipeline_job_test_train_ratio=0.25,
    pipeline_job_registered_model_name=registered_model_name,
)

Found the config file in: .\config.json


In [84]:
# submit the pipeline job
pipeline_job = ml_client.jobs.create_or_update(
    pipeline,
    # Project's name
    experiment_name="e2e_registered_components",
)
ml_client.jobs.stream(pipeline_job.name)

RunId: quiet_garlic_498b6jswxb
Web View: https://ml.azure.com/runs/quiet_garlic_498b6jswxb?wsid=/subscriptions/1d374132-87d3-49d8-a13a-910b42de3dde/resourcegroups/capstoneamazonrg/workspaces/capstoneamazonml

Streaming logs/azureml/executionlogs.txt

[2024-03-07 14:45:44Z] Completing processing run id 601e9191-5ba8-441d-ae30-a699e1d3c00f.
[2024-03-07 14:45:45Z] Submitting 1 runs, first five are: 278dfb87:c7d33b28-e67c-4522-aa2d-f458d5844469
[2024-03-07 14:50:31Z] Completing processing run id c7d33b28-e67c-4522-aa2d-f458d5844469.

Execution Summary
RunId: quiet_garlic_498b6jswxb
Web View: https://ml.azure.com/runs/quiet_garlic_498b6jswxb?wsid=/subscriptions/1d374132-87d3-49d8-a13a-910b42de3dde/resourcegroups/capstoneamazonrg/workspaces/capstoneamazonml



In [87]:
import uuid

# Create a unique name for the endpoint
online_endpoint_name = "rating-endpointt-" + str(uuid.uuid4())[:8]

In [89]:
from azure.ai.ml.entities import ManagedOnlineEndpoint

# define an online endpoint
endpoint = ManagedOnlineEndpoint(
    name=online_endpoint_name,
    description="this is an online endpoint",
    auth_mode="key",
    tags={
        "training_dataset": "ratings_defaults",
    },
)

In [90]:
# create the online endpoint
# expect the endpoint to take approximately 2 minutes.

endpoint = ml_client.online_endpoints.begin_create_or_update(endpoint).result()

In [91]:
endpoint = ml_client.online_endpoints.get(name=online_endpoint_name)

print(
    f'Endpoint "{endpoint.name}" with provisioning state "{endpoint.provisioning_state}" is retrieved'
)

Endpoint "rating-endpointt-ccd406b9" with provisioning state "Succeeded" is retrieved


In [92]:
   from azure.ai.ml.entities import (
       AutoPauseSettings,
       AutoScaleSettings,
       DefaultScaleSettings,
       IdentityConfiguration,
       ManagedIdentityConfiguration,
       SynapseSparkCompute,
   )

In [93]:
from azure.ai.ml.entities import ManagedOnlineDeployment

registered_model_name = "reviews_rating_classifier"

# Let's pick the latest version of the model
latest_model_version = max(
    [int(m.version) for m in ml_client.models.list(name=registered_model_name)]
)

print(latest_model_version)

# Choose the latest version of our registered model for deployment
model = ml_client.models.get(name=registered_model_name, version=latest_model_version)

# define an online deployment
test_deployment = ManagedOnlineDeployment(
    name="blue",
    endpoint_name=online_endpoint_name,
    model=model,
    instance_type="Standard_D2as_v4", 
    instance_count=1
)

1


In [94]:
# create the online deployment
test_deployment = ml_client.online_deployments.begin_create_or_update(
    test_deployment
).result()


# blue deployment takes 100% traffic
# expect the deployment to take approximately 8 to 10 minutes.
endpoint.traffic = {"blue": 100}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

Check: endpoint rating-endpointt-ccd406b9 exists


............................................................................................

Readonly attribute principal_id will be ignored in class <class 'azure.ai.ml._restclient.v2022_05_01.models._models_py3.ManagedServiceIdentity'>
Readonly attribute tenant_id will be ignored in class <class 'azure.ai.ml._restclient.v2022_05_01.models._models_py3.ManagedServiceIdentity'>


ManagedOnlineEndpoint({'public_network_access': 'Enabled', 'provisioning_state': 'Succeeded', 'scoring_uri': 'https://rating-endpointt-ccd406b9.germanywestcentral.inference.ml.azure.com/score', 'openapi_uri': 'https://rating-endpointt-ccd406b9.germanywestcentral.inference.ml.azure.com/swagger.json', 'name': 'rating-endpointt-ccd406b9', 'description': 'this is an online endpoint', 'tags': {'training_dataset': 'ratings_defaults'}, 'properties': {'azureml.onlineendpointid': '/subscriptions/1d374132-87d3-49d8-a13a-910b42de3dde/resourcegroups/capstoneamazonrg/providers/microsoft.machinelearningservices/workspaces/capstoneamazonml/onlineendpoints/rating-endpointt-ccd406b9', 'AzureAsyncOperationUri': 'https://management.azure.com/subscriptions/1d374132-87d3-49d8-a13a-910b42de3dde/providers/Microsoft.MachineLearningServices/locations/germanywestcentral/mfeOperationsStatus/oe:a47cb470-a91b-4883-9a14-14ecf10908a8:9b0fd2ea-1127-4a61-a168-334e6a02c486?api-version=2022-02-01-preview'}, 'print_as_ya

In [95]:
# return an object that contains metadata for the endpoint
endpoint = ml_client.online_endpoints.get(name=online_endpoint_name)

# print a selection of the endpoint's metadata
print(
    f"Name: {endpoint.name}\nStatus: {endpoint.provisioning_state}\nDescription: {endpoint.description}"
)

Name: rating-endpointt-ccd406b9
Status: Succeeded
Description: this is an online endpoint


In [105]:
import os
import json
# Create a directory to store the sample request file.
deploy_dir = "./AzureML/Resources"
os.makedirs(deploy_dir, exist_ok=True)


test_dict = {
  "input_data": {
    "columns": ["CleanedSwrText"],
    "index": [0],
    "data": [["movie good"]]
    }
}
json_object = json.dumps(test_dict, indent=4)
with open(f"{deploy_dir}/sample-request.json", "w") as outfile:
    outfile.write(json_object)

In [106]:
# test the blue deployment with the sample data
ml_client.online_endpoints.invoke(
    endpoint_name=online_endpoint_name,
    deployment_name="blue",
    request_file="./AzureML/Resources/sample-request.json",
)

HttpResponseError: (None) An unexpected error occurred in scoring script. Check the logs for more info.
Code: None
Message: An unexpected error occurred in scoring script. Check the logs for more info.

In [None]:
#Seems we have to process the to be predicted data too and then feed it to the model 