# Linear Regression (Sklearn)
This tutorial shows how to train a scikit-learn linear regression model, save it with MLflow, upload it to ADLS, and score it with SynapseML PREDICT.
Duration 00:02:38
# Train SKLearn Model

In [1]:
import fsspec
import pandas

# Lazily-opened handle to the training CSV stored on ADLS.
fsspec_handle = fsspec.open('abfss://wplushiramsynapsefs@wplushiramsynapseadlsv2.dfs.core.windows.net/LengthOfStay_cooked_small.csv')

# Enter the OpenFile itself rather than calling .open() on it, so the
# underlying file and any wrappers are closed when the block exits.
with fsspec_handle as f:
    train_df = pandas.read_csv(f)

StatementMeta(threemid, 0, 1, Finished, Available)

In [2]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

StatementMeta(threemid, 0, 2, Finished, Available)

Unnamed: 0,hematocrit,neutrophils,sodium,glucose,bloodureanitro,creatinine,bmi,pulse,respiration,number_of_issues,asthma,depress,dialysisrenalendstage,fibrosisandother,gender,hemo,irondef,malnutrition,pneum,psychologicaldisordermajor,psychother,secondarydiagnosisnonicd9,substancedependence,rcount,lengthofstay
0,-0.042506,-0.144044,0.221213,-1.061915,-0.153676,-0.102902,0.880066,0.055739,0.017535,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,1
1,0.303138,1.782192,-0.568358,-0.479546,-0.3039,-1.185602,0.735433,1.092542,0.017535,0,0,0,0,0,0,0,0,0,0,0,0,3,0,4,7
2,-0.289395,-0.384824,-1.688322,0.589228,0.07166,-0.818583,0.625518,0.055739,-1.926099,0,0,0,0,0,1,0,0,0,0,0,0,9,0,1,2
3,-0.042506,-0.144044,-0.823302,0.366311,-0.153676,0.449993,-0.762316,0.055739,0.017535,0,0,0,0,0,1,0,0,0,0,0,0,3,0,1,2
4,-0.486907,0.244907,-0.312522,-1.3905,-0.153676,1.207098,-0.448994,-1.326665,0.017535,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2


In [3]:
import os
import shutil
import mlflow
import json
from mlflow.utils import model_utils

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression

StatementMeta(threemid, 0, 3, Finished, Available)

In [4]:
class LinearRegressionModel():
  """Small wrapper around sklearn's ``LinearRegression`` with MLflow persistence.

  Datasets are passed as mappings indexed by ``FEATURES_KEY`` and
  ``TARGETS_KEY``; each value is an iterable of per-sample arrays that is
  stacked along a new leading axis before being handed to sklearn.
  """
  # Name of the JSON file, written next to the saved model, holding get_args().
  _ARGS_FILENAME = 'args.json'
  # Keys looked up in the dataset mappings given to train() / predict().
  FEATURES_KEY = 'features'
  TARGETS_KEY = 'targets'
  # Not referenced inside this class.
  TARGETS_PRED_KEY = 'targets_pred'

  def __init__(self, fit_intercept, nb_input_features=9, nb_output_features=1):
    """Store the hyper-parameters; the estimator itself is created later.

    Args:
      fit_intercept: forwarded to ``LinearRegression(fit_intercept=...)``.
      nb_input_features: recorded in the saved args; not used for fitting here.
      nb_output_features: recorded in the saved args; not used for fitting here.
    """
    self.fit_intercept = fit_intercept
    self.nb_input_features = nb_input_features
    self.nb_output_features = nb_output_features
    # Always define the attribute so "no estimator yet" is an explicit state.
    self.model = None

  def get_args(self):
    """Return the constructor arguments as a JSON-serialisable dict."""
    args = {
        'nb_input_features': self.nb_input_features,
        'nb_output_features': self.nb_output_features,
        'fit_intercept': self.fit_intercept
    }
    return args

  def create_model(self):
    """(Re)create the underlying, unfitted sklearn estimator."""
    self.model = LinearRegression(fit_intercept=self.fit_intercept)

  @staticmethod
  def _stack_samples(samples):
    """Stack an iterable of per-sample arrays along a new leading axis."""
    return np.stack(list(samples), axis=0)

  def _require_model(self):
    """Return the estimator, or fail with a descriptive error if there is none.

    Raises:
      AttributeError: no estimator has been created. The exception type is
        kept as AttributeError because that is what accessing the missing
        attribute raised before it was initialised in ``__init__``.
    """
    model = getattr(self, 'model', None)
    if model is None:
      raise AttributeError(
          'LinearRegressionModel has no underlying model; '
          'call create_model() and train() first.')
    return model

  def train(self, dataset):
    """Fit the estimator on ``dataset``.

    Args:
      dataset: mapping providing ``FEATURES_KEY`` and ``TARGETS_KEY`` entries.

    The estimator is created on demand if create_model() was not called.
    """
    features = self._stack_samples(dataset[LinearRegressionModel.FEATURES_KEY])
    targets = self._stack_samples(dataset[LinearRegressionModel.TARGETS_KEY])

    if getattr(self, 'model', None) is None:
      self.create_model()

    self.model.fit(features, targets)

  def predict(self, dataset):
    """Return the estimator's predictions for ``dataset[FEATURES_KEY]``.

    Raises:
      AttributeError: no estimator has been created yet.
    """
    model = self._require_model()
    features = self._stack_samples(dataset[LinearRegressionModel.FEATURES_KEY])
    targets_pred = model.predict(features)
    return targets_pred

  def save(self, path):
    """Save the estimator with MLflow into ``path`` together with the args file.

    Anything already at ``path`` is removed first.

    Raises:
      AttributeError: no estimator has been created yet.
    """
    # Check before deleting: a failed save must not wipe an existing model.
    model = self._require_model()

    if os.path.exists(path):
      shutil.rmtree(path)

    # save the sklearn model with mlflow
    mlflow.sklearn.save_model(model, path)

    # save args
    self._save_args(path)

  def _save_args(self, path):
    """Write get_args() as JSON to ``<path>/args.json``."""
    args_filename = os.path.join(path, LinearRegressionModel._ARGS_FILENAME)
    with open(args_filename, 'w') as f:
      args = self.get_args()
      json.dump(args, f)

StatementMeta(threemid, 0, 4, Finished, Available)

In [5]:
def train(train_df, output_model_path):
  """Fit a LinearRegressionModel on ``train_df`` and save it.

  Args:
    train_df: pandas DataFrame; the last column is used as the target and the
      leading ``nb_input_features`` columns as features.
    output_model_path: directory the model is saved to (replaced if present).

  Prints the model's predictions on the training features.
  """
  print("Start to train LinearRegressionModel.")

  # Initialize model class obj.
  # fit_intercept must be a real bool: recent scikit-learn releases validate
  # estimator parameters and reject other values such as the int 10.
  model_class = LinearRegressionModel(fit_intercept=True)

  # Initialize input dataset
  dataset = train_df.to_numpy()
  datasets = {}
  datasets[LinearRegressionModel.TARGETS_KEY] = dataset[:, -1]
  # NOTE(review): the leading columns include the CSV's unnamed index column
  # ('Unnamed: 0' in the head() preview) - confirm that is intended.
  datasets[LinearRegressionModel.FEATURES_KEY] = dataset[:, :model_class.nb_input_features]

  with mlflow.start_run(nested=True):
    model_class.create_model()
    model_class.train(datasets)
    model_class.save(output_model_path)
    print(model_class.predict(datasets))

StatementMeta(threemid, 0, 5, Finished, Available)

In [6]:
# Train on the loaded frame and write the model under ./artifacts/output.
train(train_df=train_df, output_model_path='./artifacts/output')

StatementMeta(threemid, 0, 6, Finished, Available)

Start to train LinearRegressionModel.
[1. 7. 2. 2. 2. 1. 7. 2. 2. 2. 1. 7. 2. 2. 2. 1. 7. 2. 2. 2. 1. 7. 2. 2.
 2. 1. 7. 2. 2. 2.]

# Upload to ADLS

In [7]:
import fsspec
import pandas 
from fsspec.core import split_protocol

# Remote folder that receives the locally saved model directory.
STORAGE_PATH = 'abfs://wplushiramsynapsefs/predict/models/mlflow/sklearn/e2e_linear_regression/'

# Only the protocol is needed. Index the result instead of unpacking into `_`:
# at notebook top level that would rebind IPython's last-output variable.
protocol = split_protocol(STORAGE_PATH)[0]
print(protocol)

fs = fsspec.filesystem(protocol)
# Kept as the cell's last expression so the put() result is still displayed.
fs.put(
    './artifacts/output',
    STORAGE_PATH,
    recursive=True,
    overwrite=True,
)

StatementMeta(threemid, 0, 7, Finished, Available)

abfs


[None, None, None, None, None, None]

# Import SynapseML Predict

In [8]:
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, pandas_udf,udf,lit

import azure.synapse.ml.predict as pcontext
import azure.synapse.ml.predict.utils._logger as synapse_predict_logger

# Show which version of the azure.synapse.ml.predict package is loaded.
print(pcontext.__version__)

StatementMeta(threemid, 0, 8, Submitted, Running)

# Set some input parameters
The model and the data are both stored on ADLS. Use the full `abfss://` path for each, not the mount path.

Return type is int

In [None]:
# CSV that is read into a Spark DataFrame and scored below.
DATA_FILE = "abfss://wplushiramsynapsefs@wplushiramsynapseadlsv2.dfs.core.windows.net/LengthOfStay_cooked_small.csv"
# Model location passed to bind_model().
# NOTE(review): appears to be the same container/folder as the upload target above - confirm.
ADLS_MODEL_URI_SKLEARN = "abfss://wplushiramsynapsefs@wplushiramsynapseadlsv2.dfs.core.windows.net/predict/models/mlflow/sklearn/e2e_linear_regression/"
# Return type passed as the first argument to bind_model().
RETURN_TYPES = "INT"

StatementMeta(, , , Waiting, )

# Enable SynapseML predict
Set the Spark configuration `spark.synapse.ml.predict.enabled` to `true` to enable the library.

In [None]:
# Turn on SynapseML PREDICT for this session.
# `spark` is not defined in this notebook - presumably the session object injected by the Synapse runtime.
spark.conf.set("spark.synapse.ml.predict.enabled","true")

StatementMeta(, , , Waiting, )

# Bind Model

In [None]:
# Bind the saved model under an alias and register it; the PREDICT query
# refers to the model by this alias.
model = pcontext.bind_model(
    return_types=RETURN_TYPES,
    runtime="mlflow",
    model_alias="sklearn_linear_regression",
    model_uri=ADLS_MODEL_URI_SKLEARN,
).register()

StatementMeta(, , , Waiting, )

# Load Data

In [None]:
# Read the CSV with a header row and inferred column types.
df = spark.read.csv(DATA_FILE, header=True, inferSchema=True)

# Keep only the first nine columns and expose them to SQL as the `data` view.
feature_columns = df.columns[:9]
df = df.select(*feature_columns)
df.createOrReplaceTempView('data')

df.show(10)
df

StatementMeta(, , , Waiting, )

In [None]:
%%sql
SELECT * FROM data

StatementMeta(, , , Waiting, )

In [None]:
# Score every row of the `data` view with the registered model alias.
# Keep the DataFrame itself: show() only prints and returns None, so chaining
# it onto the assignment would leave `predictions` bound to None.
predictions = spark.sql(
                  """
                      SELECT PREDICT('sklearn_linear_regression', *) AS predict FROM data
                  """
              )
predictions.show()

StatementMeta(, , , Waiting, )

## Special Thanks
* Ajay Agarwal
* Tian Wei
* Nellie Gustafsson