# Linear Regression (Sklearn)

This tutorial shows how to train a scikit-learn linear regression model, upload it to ADLS, and score it with SynapseML PREDICT.

## Train Sklearn Model

In [1]:
import fsspec
import pandas

# Storage credentials for the ADLS Gen2 account that holds the dataset.
# Replace the placeholders locally; never commit a real account key.
adls_account_name = 'ajagarwdemoadlsg2' #Provide exact ADLS account name
adls_account_key = '####' #Provide exact ADLS account key

# fsspec.open() returns a lazy OpenFile; nothing is read until it is entered.
fsspec_handle = fsspec.open('abfs://ajagarwfs/predict/dataset/LengthOfStay_cooked_small.csv', account_name=adls_account_name, account_key=adls_account_key)

# Enter the OpenFile itself (instead of calling .open() on it) so that leaving
# the block closes the OpenFile and every file object it wraps.
with fsspec_handle as f:
    train_df = pandas.read_csv(f)

StatementMeta(demosprkpool, 44, 1, Finished, Available)

In [2]:
import os
import shutil
import mlflow
import json
from mlflow.utils import model_utils

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression


class LinearRegressionModel():
  """Small wrapper around sklearn's ``LinearRegression``.

  Datasets passed to ``train``/``predict`` are mappings whose
  ``FEATURES_KEY`` / ``TARGETS_KEY`` entries are iterables of samples.
  ``save`` exports the fitted estimator with ``mlflow.sklearn.save_model``
  and writes the constructor arguments next to it as JSON.
  """
  # File (inside the export folder) holding the constructor arguments.
  _ARGS_FILENAME = 'args.json'
  # Keys looked up in the dataset mapping.
  FEATURES_KEY = 'features'
  TARGETS_KEY = 'targets'
  # Not referenced inside this class.
  TARGETS_PRED_KEY = 'targets_pred'

  def __init__(self, fit_intercept, nb_input_features=9, nb_output_features=1):
    """Store the hyper-parameters; the estimator itself is built later.

    Args:
      fit_intercept: forwarded (as a bool) to ``LinearRegression``.
      nb_input_features: number of feature columns (only recorded here).
      nb_output_features: number of target columns (only recorded here).
    """
    self.fit_intercept = fit_intercept
    self.nb_input_features = nb_input_features
    self.nb_output_features = nb_output_features
    # Always define the attribute so a missing create_model() call is
    # detectable instead of surfacing as an AttributeError.
    self.model = None

  def get_args(self):
    """Return the constructor arguments as a JSON-serialisable dict."""
    return {
        'nb_input_features': self.nb_input_features,
        'nb_output_features': self.nb_output_features,
        'fit_intercept': self.fit_intercept
    }

  def create_model(self):
    """Instantiate a fresh, unfitted ``LinearRegression`` estimator."""
    # Recent scikit-learn releases validate that fit_intercept is a real
    # boolean, so truthy values such as 10 are normalised here.
    self.model = LinearRegression(fit_intercept=bool(self.fit_intercept))

  @staticmethod
  def _stack(dataset, key):
    """Stack the samples stored under ``key`` along a new leading axis."""
    return np.stack(list(dataset[key]), axis=0)

  def _require_model(self):
    """Return the estimator, or raise if ``create_model`` was never called."""
    if self.model is None:
      raise RuntimeError(
          'No model available: call create_model() (or train()) first.')
    return self.model

  def train(self, dataset):
    """Fit the estimator on ``dataset`` (creating it first if needed).

    Args:
      dataset: mapping with ``FEATURES_KEY`` and ``TARGETS_KEY`` entries.
    """
    if self.model is None:
      self.create_model()

    features = self._stack(dataset, LinearRegressionModel.FEATURES_KEY)
    targets = self._stack(dataset, LinearRegressionModel.TARGETS_KEY)

    self.model.fit(features, targets)

  def predict(self, dataset):
    """Return the estimator's predictions for ``dataset[FEATURES_KEY]``.

    Raises:
      RuntimeError: if no estimator has been created yet.
    """
    estimator = self._require_model()
    features = self._stack(dataset, LinearRegressionModel.FEATURES_KEY)
    return estimator.predict(features)

  def save(self, path):
    """Export the estimator and its arguments to the folder ``path``.

    Any existing content at ``path`` is removed first.

    Raises:
      RuntimeError: if no estimator has been created yet.
    """
    # Validate before deleting so a failed call never destroys an old export.
    estimator = self._require_model()

    if os.path.exists(path):
      shutil.rmtree(path)

    # save the sklearn model with mlflow
    mlflow.sklearn.save_model(estimator, path)

    # save args
    self._save_args(path)

  def _save_args(self, path):
    """Write ``get_args()`` as JSON to ``<path>/args.json``."""
    args_filename = os.path.join(path, LinearRegressionModel._ARGS_FILENAME)
    with open(args_filename, 'w') as f:
      json.dump(self.get_args(), f)


def train(train_df, output_model_path):
  """Fit a LinearRegressionModel on ``train_df`` and export it.

  The last column of ``train_df`` is used as the target and the leading
  ``nb_input_features`` columns (9 by default) as the features. The fitted
  model is saved to ``output_model_path`` and its predictions on the
  training data are printed.

  Args:
    train_df: pandas DataFrame holding features followed by the target.
    output_model_path: local folder the model is written to.
  """
  print("Start to train LinearRegressionModel.")

  # Initialize model class obj
  # fit_intercept is a boolean flag in scikit-learn; the previous value 10
  # only worked by being truthy and is rejected by newer releases.
  model_class = LinearRegressionModel(fit_intercept=True)

  # Initialize input dataset
  dataset = train_df.to_numpy()
  datasets = {
      LinearRegressionModel.TARGETS_KEY: dataset[:, -1],
      # Slice by the model's declared feature count rather than a literal 9.
      LinearRegressionModel.FEATURES_KEY: dataset[:, :model_class.nb_input_features],
  }

  with mlflow.start_run(nested=True):
    model_class.create_model()
    model_class.train(datasets)
    model_class.save(output_model_path)
    print(model_class.predict(datasets))


# Train on the DataFrame loaded in the first cell and export the model to a
# local folder; the "Upload to ADLS" cell copies that same folder to storage.
train(train_df, './artifacts/output')

StatementMeta(demosprkpool, 44, 2, Finished, Available)

Start to train LinearRegressionModel.
[1. 7. 2. 2. 2. 1. 7. 2. 2. 2. 1. 7. 2. 2. 2. 1. 7. 2. 2. 2. 1. 7. 2. 2.
 2. 1. 7. 2. 2. 2.]

## Upload to ADLS

In [3]:
import fsspec
import pandas 
from fsspec.core import split_protocol

# Destination folder on ADLS for the exported model.
STORAGE_PATH = 'abfs://ajagarwfs/predict/models/mlflow/sklearn/e2e_linear_regression/'

# Only the URL scheme is needed to pick the fsspec filesystem implementation.
protocol = split_protocol(STORAGE_PATH)[0]
print(protocol)

# Replace the placeholders locally; never commit a real account key.
adls_account_name = 'ajagarwdemoadlsg2' #Provide exact ADLS account name
adls_account_key = '####' #Provide exact ADLS account key
storage_options = dict(
    account_name=adls_account_name,
    account_key=adls_account_key,
)

# Recursively copy the local export folder to the ADLS destination.
fs = fsspec.filesystem(protocol, **storage_options)
fs.put('./artifacts/output', STORAGE_PATH, recursive=True, overwrite=True)

StatementMeta(demosprkpool, 44, 3, Finished, Available)

abfs


[None, None, None, None, None, None]

### Import SynapseML Predict

In [4]:
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, pandas_udf,udf,lit

import azure.synapse.ml.predict as pcontext
import azure.synapse.ml.predict.utils._logger as synapse_predict_logger

# Show which version of the SynapseML Predict library is installed.
print(pcontext.__version__)

StatementMeta(demosprkpool, 44, 4, Finished, Available)

1.0.0

### Set some input parameters
<p>The model and the data are both stored on ADLS.</p>
<p>The return type is INT.</p>

In [5]:
# Input dataset on ADLS (same CSV that was used for training).
DATA_FILE = "abfss://ajagarwfs@ajagarwdemoadlsg2.dfs.core.windows.net/predict/dataset/LengthOfStay_cooked_small.csv"
# Folder on ADLS the trained model was uploaded to.
ADLS_MODEL_URI_SKLEARN = "abfss://ajagarwfs@ajagarwdemoadlsg2.dfs.core.windows.net/predict/models/mlflow/sklearn/e2e_linear_regression/"
# Type of the PREDICT output column.
# NOTE(review): the training cell prints float predictions (e.g. "2.") where
# the PREDICT output below shows 1, so INT looks like it truncates the
# regression output - consider a floating-point return type; confirm.
RETURN_TYPES = "INT"

StatementMeta(demosprkpool, 44, 5, Finished, Available)

### Enable SynapseML predict
Set the Spark configuration `spark.synapse.ml.predict.enabled` to `true` to enable the library.

In [6]:
# Enable the SynapseML PREDICT feature for this Spark session.
# (`spark` is presumably the session object provided by the notebook runtime.)
spark.conf.set("spark.synapse.ml.predict.enabled","true")

StatementMeta(demosprkpool, 44, 6, Finished, Available)

### Bind Model

In [7]:
# Bind the uploaded model and register it under the alias
# "sklearn_linear_regression"; the PREDICT SQL call further down refers to the
# model by that alias. The arguments appear to be (return type, runtime,
# alias, model URI) - confirm against the azure.synapse.ml.predict docs.
model = pcontext.bind_model(RETURN_TYPES, "mlflow", "sklearn_linear_regression", ADLS_MODEL_URI_SKLEARN).register()

StatementMeta(demosprkpool, 44, 7, Finished, Available)

### Load Data

In [8]:
# Read the CSV with a header row and let Spark infer the column types.
csv_reader = spark.read.options(header="true", inferSchema=True)
df = csv_reader.csv(DATA_FILE)

# Keep only the first nine columns (the model's input features).
df = df.select(*df.columns[:9])

# Expose the features to Spark SQL under the name 'data'.
df.createOrReplaceTempView('data')
df.show(10)
df

StatementMeta(demosprkpool, 44, 8, Finished, Available)

+-------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+
|         hematocrit|       neutrophils|            sodium|          glucose|    bloodureanitro|        creatinine|               bmi|             pulse|       respiration|
+-------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+
|-0.0425062678333053|-0.144044124667474|  0.22121278292454|-1.06191476900289|-0.153676232509828|-0.102901720870172| 0.880066129351256|0.0557393384330164|0.0175353508059786|
|  0.303138384392267|  1.78219153507209|-0.568357686732667|  -0.479546395875|-0.303900204562006| -1.18560153346413| 0.735433409953405|  1.09254240594927|0.0175353508059786|
| -0.289395305137286| -0.38482358213492| -1.68832178164949|0.589228275750129|0.0716597255684404|-0.818583210523663| 0.625518172645604|0

DataFrame[hematocrit: double, neutrophils: double, sodium: double, glucose: double, bloodureanitro: double, creatinine: double, bmi: double, pulse: double, respiration: double]

In [9]:
# Sanity check: query the 'data' temp view registered in the previous cell.
spark.sql(
    """
        select * from data
    """
).show()

StatementMeta(demosprkpool, 44, 9, Finished, Available)

+-------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+
|         hematocrit|       neutrophils|            sodium|          glucose|    bloodureanitro|        creatinine|               bmi|             pulse|       respiration|
+-------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+
|-0.0425062678333053|-0.144044124667474|  0.22121278292454|-1.06191476900289|-0.153676232509828|-0.102901720870172| 0.880066129351256|0.0557393384330164|0.0175353508059786|
|  0.303138384392267|  1.78219153507209|-0.568357686732667|  -0.479546395875|-0.303900204562006| -1.18560153346413| 0.735433409953405|  1.09254240594927|0.0175353508059786|
| -0.289395305137286| -0.38482358213492| -1.68832178164949|0.589228275750129|0.0716597255684404|-0.818583210523663| 0.625518172645604|0

### Model Prediction using Spark SQL

In [10]:
# Score every row of the 'data' view with the model registered under the
# alias 'sklearn_linear_regression'.
# Keep the DataFrame in `predictions`: DataFrame.show() returns None, so
# chaining .show() onto the assignment would leave `predictions` empty.
predictions = spark.sql(
                  """
                      SELECT PREDICT('sklearn_linear_regression', *) AS predict FROM data
                  """
              )
predictions.show()

StatementMeta(demosprkpool, 44, 10, Finished, Available)

+-------+
|predict|
+-------+
|      1|
|      7|
|      1|
|      1|
|      1|
|      1|
|      7|
|      1|
|      1|
|      1|
|      1|
|      7|
|      1|
|      1|
|      1|
|      1|
|      7|
|      1|
|      1|
|      1|
+-------+
only showing top 20 rows