In [1]:
import pandas as pd
import numpy as np
from math import sqrt
import warnings
warnings.filterwarnings('ignore')

from azureml.core.run import Run
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.core.model import Model
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.train.automl import AutoMLConfig
import pickle
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
import mlflow

In [2]:
from azureml.core import Workspace, Dataset

subscription_id = '7a12af0d-a67c-4fb2-85ce-acb1b512790a'
resource_group = 'MLOps'
workspace_name = 'MLOps_WS'

workspace = Workspace(subscription_id, resource_group, workspace_name)

In [3]:
# setup mlflow

uri = workspace.get_mlflow_tracking_uri()

mlflow.set_tracking_uri(uri)

In [4]:
dataset = Dataset.get_by_name(workspace, name='weather_ds_processed')

print(dataset.name, dataset.version)

weather_ds_processed 1


In [5]:
df = dataset.to_pandas_dataframe()
df.head()

Unnamed: 0,Column1,Timestamp,Location,Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_condition,Future_Weather_Condition
0,0,2006-03-31 22:00:00,"Port of Turku, Finland",9.472222,0.89,14.1197,251,15.8263,1015.13,1,1.0
1,1,2006-03-31 23:00:00,"Port of Turku, Finland",9.355556,0.86,14.2646,259,15.8263,1015.63,1,1.0
2,2,2006-04-01 00:00:00,"Port of Turku, Finland",9.377778,0.89,3.9284,204,14.9569,1015.94,1,1.0
3,3,2006-04-01 01:00:00,"Port of Turku, Finland",8.288889,0.83,14.1036,269,15.8263,1016.41,1,1.0
4,4,2006-04-01 02:00:00,"Port of Turku, Finland",8.755556,0.83,11.0446,259,15.8263,1016.51,1,1.0


In [6]:
df.shape

(96449, 11)

In [7]:
df.drop('Column1',axis=1, inplace=True)

In [8]:
df.shape

(96449, 10)

In [9]:
# training and validation set

df_training = df.iloc[:77160]   # 80% values

df_training.shape

(77160, 10)

In [10]:
df_validation = df.drop(df_training.index)
df_validation.shape

(19289, 10)

In [11]:
!mkdir Data

mkdir: cannot create directory ‘Data’: File exists


In [15]:
# storing the training and validation set

df_training.to_csv('Data/training_data.csv', index=False)

df_validation.to_csv('Data/validation_data.csv', index=False)

In [12]:
## registering the training and validation dataset -->> 

datastore = workspace.get_default_datastore()

In [13]:
datastore.upload(src_dir='Data', target_path='data')

"Datastore.upload" is deprecated after version 1.0.69. Please use "Dataset.File.upload_directory" to upload your files             from a local directory and create FileDataset in single method call. See Dataset API change notice at https://aka.ms/dataset-deprecation.


Uploading an estimated of 2 files
Uploading Data/validation_data.csv
Uploaded Data/validation_data.csv, 1 files out of an estimated total of 2
Uploading Data/training_data.csv
Uploaded Data/training_data.csv, 2 files out of an estimated total of 2
Uploaded 2 files


$AZUREML_DATAREFERENCE_6ac883d3146440aaa989892f23b18e65

In [14]:
training_dataset = Dataset.Tabular.from_delimited_files(datastore.path('data/training_data.csv'))

In [15]:
validation_dataset = Dataset.Tabular.from_delimited_files(datastore.path('data/validation_data.csv'))

In [16]:
training_ds = training_dataset.register(workspace=workspace,
                                        name='training_dataset',
                                        description='dataset to use for ML traininig')

In [17]:
validation_ds = validation_dataset.register(workspace=workspace,
                                            name='validation_dataset',
                                            description='dataset for validation of our ML models')

# Data Ingestion

In [18]:
dataset = Dataset.get_by_name(workspace, name='training_dataset')

print(dataset.name, dataset.version)

training_dataset 1


In [19]:
df = dataset.to_pandas_dataframe()

df.head()

Unnamed: 0,Timestamp,Location,Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_condition,Future_Weather_Condition
0,2006-03-31 22:00:00,"Port of Turku, Finland",9.472222,0.89,14.1197,251,15.8263,1015.13,1,1.0
1,2006-03-31 23:00:00,"Port of Turku, Finland",9.355556,0.86,14.2646,259,15.8263,1015.63,1,1.0
2,2006-04-01 00:00:00,"Port of Turku, Finland",9.377778,0.89,3.9284,204,14.9569,1015.94,1,1.0
3,2006-04-01 01:00:00,"Port of Turku, Finland",8.288889,0.83,14.1036,269,15.8263,1016.41,1,1.0
4,2006-04-01 02:00:00,"Port of Turku, Finland",8.755556,0.83,11.0446,259,15.8263,1016.51,1,1.0


In [20]:
df.shape

(77160, 10)

In [21]:
df.columns

Index(['Timestamp', 'Location', 'Temperature_C', 'Humidity', 'Wind_speed_kmph',
       'Wind_bearing_degrees', 'Visibility_km', 'Pressure_millibars',
       'Weather_condition', 'Future_Weather_Condition'],
      dtype='object')

In [22]:
# feature selection

x = df[['Temperature_C', 'Humidity', 'Wind_speed_kmph',
       'Wind_bearing_degrees', 'Visibility_km', 'Pressure_millibars',
       'Weather_condition']].values

y = df['Future_Weather_Condition'].values
y

array([1., 1., 1., ..., 1., 1., 1.])

In [23]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=1)

In [25]:
from sklearn.preprocessing import StandardScaler

In [26]:
sc = StandardScaler()

In [27]:
xtrain = sc.fit_transform(xtrain)

xtest = sc.transform(xtest)

# Model Training 


## 1 - Support Vector Machine

In [28]:
myexperiment = Experiment(workspace, "support-vector-machine")  # for monitoring and logging

mlflow.set_experiment("mlflow-support-vector-machine")

2023/11/03 14:37:47 INFO mlflow.tracking.fluent: Experiment with name 'mlflow-support-vector-machine' does not exist. Creating a new experiment.


<Experiment: artifact_location='', creation_time=1699022267492, experiment_id='6ac17452-2241-4093-acfd-7405b271cab5', last_update_time=None, lifecycle_stage='active', name='mlflow-support-vector-machine', tags={}>

In [36]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

In [37]:
parameters = { 'kernel':('linear', 'rbf'), 'C':[1,10] }

In [38]:
svc = svm.SVC()  # support vector classifier

In [33]:
# initialize a run in Azureml and mlflow experiments

run = myexperiment.start_logging()
mlflow.start_run()

run.log("dataset name", dataset.name)
run.log("dataset version", dataset.version)

In [39]:
svc_grid = GridSearchCV(svc, parameters)

In [40]:
svc_grid.fit(xtrain, ytrain)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [41]:
svc_grid.get_params(deep=True)

{'cv': None,
 'error_score': nan,
 'estimator__C': 1.0,
 'estimator__break_ties': False,
 'estimator__cache_size': 200,
 'estimator__class_weight': None,
 'estimator__coef0': 0.0,
 'estimator__decision_function_shape': 'ovr',
 'estimator__degree': 3,
 'estimator__gamma': 'scale',
 'estimator__kernel': 'rbf',
 'estimator__max_iter': -1,
 'estimator__probability': False,
 'estimator__random_state': None,
 'estimator__shrinking': True,
 'estimator__tol': 0.001,
 'estimator__verbose': False,
 'estimator': SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=False),
 'iid': 'deprecated',
 'n_jobs': None,
 'param_grid': {'kernel': ('linear', 'rbf'), 'C': [1, 10]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': None,
 'verbose': 0}

In [42]:
from sklearn.svm import SVC

svc = SVC(C=svc_grid.get_params(deep=True)['estimator__C'],
         kernel=svc_grid.get_params(deep=True)['estimator__kernel'])

In [43]:
svc.fit(xtrain, ytrain)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [44]:
# logging parameters to Azureml and mlflow

run.log("C", svc_grid.get_params(deep=True)['estimator__C'])
run.log("Kernel", svc_grid.get_params(deep=True)['estimator__kernel'])

In [45]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

predicted_svc = svc.predict(xtest)

In [46]:
acc = accuracy_score(ytest, predicted_svc)
precision = precision_score(ytest, predicted_svc)
recall = recall_score(ytest, predicted_svc)

In [47]:
run.log("Test accuracy", acc)
run.log("Precision", precision)
run.log("Recall", recall)

In [48]:
acc

0.9496500777604977

In [49]:
precision

0.9692012967875037

In [50]:
run.get_metrics()

{'dataset name': 'training_dataset',
 'dataset version': 1,
 'Kernel': 'rbf',
 'C': 1.0,
 'Precision': 0.9692012967875037,
 'Test accuracy': 0.9496500777604977,
 'Recall': 0.9734329904536373}

In [51]:
run.complete()

print("Run ID", run.id)

Run ID 62729f4a-8376-412e-9b6c-acc46b0736eb


In [57]:
mlflow.end_run()

In [52]:
workspace.get_details()

{'id': '/subscriptions/7a12af0d-a67c-4fb2-85ce-acb1b512790a/resourceGroups/MLOps/providers/Microsoft.MachineLearningServices/workspaces/MLOps_WS',
 'name': 'MLOps_WS',
 'identity': {'principal_id': 'fbed4930-dabe-4bd4-8ec7-dde3cfb27cf6',
  'tenant_id': '207d936e-f035-4978-85cc-8b810421b510',
  'type': 'SystemAssigned'},
 'location': 'eastus2',
 'type': 'Microsoft.MachineLearningServices/workspaces',
 'tags': {},
 'sku': 'Basic',
 'workspaceid': '720a7946-3ec3-49c5-b38f-d9778859842f',
 'sdkTelemetryAppInsightsKey': '066b0d89-53d2-4b58-8bc8-ac382527e4fb',
 'description': '',
 'friendlyName': 'MLOps_WS',
 'creationTime': '2023-10-31T12:02:24.1333751Z',
 'keyVault': '/subscriptions/7a12af0d-a67c-4fb2-85ce-acb1b512790a/resourceGroups/MLOps/providers/Microsoft.Keyvault/vaults/mlopsws6971472015',
 'applicationInsights': '/subscriptions/7a12af0d-a67c-4fb2-85ce-acb1b512790a/resourceGroups/MLOps/providers/Microsoft.insights/components/mlopsws3340088976',
 'storageAccount': '/subscriptions/7a12af

## 2 - Random Forest Classifier

In [53]:
myexperiment = Experiment(workspace, "random-forest-classifier")  # for monitoring and logging

mlflow.set_experiment("mlflow-random-forest-classifier")

2023/11/03 15:25:26 INFO mlflow.tracking.fluent: Experiment with name 'mlflow-random-forest-classifier' does not exist. Creating a new experiment.


<Experiment: artifact_location='', creation_time=1699025126540, experiment_id='8b12e835-fbb9-4a77-8329-88aad6ce4991', last_update_time=None, lifecycle_stage='active', name='mlflow-random-forest-classifier', tags={}>

In [54]:
from sklearn.ensemble import RandomForestClassifier

In [55]:
rf = RandomForestClassifier(max_depth=10, random_state=0, n_estimators=100)

In [58]:
# initialize the Azureml and mlflow

run = myexperiment.start_logging()
mlflow.start_run()

run.log("dataset name", dataset.name)
run.log("dataset version", dataset.version)

In [59]:
rf.fit(xtrain, ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [60]:
run.log("max_depth", 10)
run.log("random_state", 0)
run.log("n_enstimators", 100)

In [61]:
predicted_rf = rf.predict(xtest)

In [62]:
acc = accuracy_score(ytest, predicted_rf)
precision = precision_score(ytest, predicted_rf)
recall = recall_score(ytest, predicted_rf)

In [63]:
run.log("Test accuracy", acc)
run.log("Precision", precision)
run.log("Recall", recall)

In [64]:
run.complete()

print("Run ID", run.id)

Run ID 1e545301-aede-4a26-b99f-601a3c33a676


In [65]:
mlflow.end_run()

In [66]:
run.get_details()

{'runId': '1e545301-aede-4a26-b99f-601a3c33a676',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2023-11-03T15:27:57.838347Z',
 'endTimeUtc': '2023-11-03T15:31:11.46767Z',
 'services': {},
 'properties': {'ContentSnapshotId': 'e9fb45c8-9b1e-4652-a76d-3006293b8ffd'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {},
 'submittedBy': 'Himanshu Ramchandani'}

In [67]:
run.get_metrics()

{'dataset name': 'training_dataset',
 'dataset version': 1,
 'max_depth': 10,
 'n_enstimators': 100,
 'random_state': 0,
 'Precision': 0.9706878774488142,
 'Recall': 0.9753570635684156,
 'Test accuracy': 0.9526308968377397}

# Model Packaging

compantability and interoperability issues

In [68]:
# ONNX - Open Neural Network Exchange

# convert SVC model into ONNX format file

from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

initial_type = [('float_input', FloatTensorType([None, 6]))] # doubt

onx = convert_sklearn(svc, initial_types=initial_type)

with open("outputs/svc.onnx", "wb") as f:
    f.write(onx.SerializeToString())

The maximum opset needed by this model is only 1.


In [69]:
# ONNX - Open Neural Network Exchange

# convert rf model into ONNX format file

from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

initial_type = [('float_input', FloatTensorType([None, 6]))] # doubt

onx = convert_sklearn(rf, initial_types=initial_type)

with open("outputs/rf.onnx", "wb") as f:
    f.write(onx.SerializeToString())

The maximum opset needed by this model is only 9.
The maximum opset needed by this model is only 1.


# Model Registering

In [71]:
pd.__version__

'1.1.5'

In [72]:
# Register to Azure WS

model = Model.register(model_path='./outputs/svc.onnx',
                      model_name='support-vector-classifier',
                      tags={ 'dataset':dataset.name,
                            'version': dataset.version,
                            'hyperparameter-C': '1',
                            'testdata-accuracy': '0.9496'
                      },
                       model_framework='pandas==1.1.5',
                       description='Support vector classifier to predict weather',
                       workspace=workspace)

print('Name', model.name)
print('Version', model.version)

Registering model support-vector-classifier
Name support-vector-classifier
Version 1


In [73]:
# Register to Azure WS

model = Model.register(model_path='./outputs/rf.onnx',
                      model_name='random-forest-classifier',
                      tags={ 'dataset':dataset.name,
                            'version': dataset.version,
                            'hyperparameter-C': '1',
                            'testdata-accuracy': '0.9526'
                      },
                       model_framework='pandas==1.1.5',
                       description='Random Forest classifier to predict weather',
                       workspace=workspace)

print('Name', model.name)
print('Version', model.version)

Registering model random-forest-classifier
Name random-forest-classifier
Version 1
