Based on: https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-create-a-neural-network-for-regression-with-pytorch.md

In [None]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

In [None]:
torch.manual_seed(42)

In [None]:
X, y = fetch_california_housing(return_X_y=True)

In [None]:
class HousingDataset(torch.utils.data.Dataset):
    """Dataset yielding ``(features, target)`` pairs as float32 tensors.

    Args:
        X: Feature matrix, either a NumPy array or a ``torch.Tensor``.
        y: Target vector, either a NumPy array or a ``torch.Tensor``.
        scale_data: When true, NumPy features are standardized with
            ``StandardScaler`` before being converted. Tensor features are
            treated as already preprocessed and are stored unscaled.
    """

    def __init__(self, X, y, scale_data=True):
        # The attributes used to be assigned only when BOTH inputs were
        # non-tensors, so a tensor input produced a dataset without X / y and
        # __len__ / __getitem__ raised AttributeError. Each input is now
        # handled on its own.
        if torch.is_tensor(X):
            self.X = X.to(torch.float32)
        else:
            # Apply scaling if necessary
            if scale_data:
                X = StandardScaler().fit_transform(X)
            self.X = torch.from_numpy(X.astype(np.float32))

        if torch.is_tensor(y):
            self.y = y.to(torch.float32)
        else:
            self.y = torch.from_numpy(y.astype(np.float32))

    def __len__(self):
        """Return the number of rows in the feature matrix."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the ``i``-th feature row together with its target."""
        return self.X[i], self.y[i]

In [None]:
dataset = HousingDataset(X, y)
trainloader = torch.utils.data.DataLoader(
    dataset, batch_size=10, shuffle=True, num_workers=1)

In [None]:
next(iter(trainloader))

In [None]:
class MLP(nn.Module):
    """Fully connected network: 8 inputs -> 64 -> 32 -> 1 output, ReLU between layers."""

    def __init__(self):
        super().__init__()
        # Build Linear/ReLU pairs from consecutive widths, then drop the
        # trailing activation so the output layer stays linear.
        widths = (8, 64, 32, 1)
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.pop()
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Pass ``x`` through the sequential stack and return its output."""
        prediction = self.layers(x)
        return prediction

In [None]:
# Initialize the MLP
mlp = MLP()

# Define the loss function and optimizer
loss_function = nn.L1Loss()
optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-4)

In [None]:
# Run the training loop
# Uses `mlp`, `trainloader`, `loss_function` and `optimizer` defined in the cells above.
log_every = 200  # number of mini-batches averaged into each printed loss value

for epoch in range(0, 5):  # train for 5 epochs

    # Print epoch
    print(f'Starting epoch {epoch+1}')

    # Running sum of the batch losses since the last report
    current_loss = 0.0

    # Iterate over the DataLoader for training data
    for i, data in enumerate(trainloader, 0):

        # Get and prepare inputs; targets become a column so they line up
        # with the (batch, 1) output of the model
        inputs, targets = data
        targets = targets.reshape((targets.shape[0], 1))

        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
        outputs = mlp(inputs)

        # Compute loss
        loss = loss_function(outputs, targets)

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

        # Report the mean loss over the last `log_every` mini-batches.
        # The earlier version printed on `i % 200 == 0` but divided the sum by
        # 500, and its first report covered a single batch, so the printed
        # numbers were not real averages.
        current_loss += loss.item()
        if (i + 1) % log_every == 0:
            print('Loss after mini-batch %5d: %.3f' %
                  (i + 1, current_loss / log_every))
            current_loss = 0.0

# Process is complete.
print('Training process has finished.')

### Save Model

In [None]:
torch.save(mlp, "housing_model.pt")

In [None]:
scripted = torch.jit.script(mlp)
scripted.save("housing_model.ts")

### Load and Test Model

In [None]:
# Unpickle the whole module object written by torch.save(mlp, "housing_model.pt") above.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True, which
# rejects full-module pickles; confirm the installed version and pass weights_only=False
# there if this call starts failing.
loaded_mlp = torch.load("housing_model.pt")

In [None]:
testX, testY = next(iter(trainloader))

In [None]:
loaded_mlp(testX)

In [None]:
testY

In [None]:
scripted_mlp = torch.jit.load("housing_model.ts")

In [None]:
scripted_mlp(testX).flatten()

### Columns as separate input variables

In [None]:
import numpy as np
import pandas as pd
import torch

from inspect import signature
from torch import nn
from torch.utils.data import DataLoader
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

In [None]:
torch.manual_seed(42)

In [None]:
housing = fetch_california_housing()

In [None]:
class HousingDataset2(torch.utils.data.Dataset):
    """Dataset that exposes each of the eight feature columns as its own tensor.

    Args:
        X: Feature matrix with eight columns, as a NumPy array or ``torch.Tensor``.
        y: Target vector, as a NumPy array or ``torch.Tensor``.
        scale_data: When true, NumPy features are standardized with
            ``StandardScaler`` before being converted. Tensor features are
            treated as already preprocessed and are stored unscaled.
    """

    def __init__(self, X, y, scale_data=True):
        # The attributes used to be assigned only when BOTH inputs were
        # non-tensors, so a tensor input produced a dataset with no columns
        # and __len__ / __getitem__ raised AttributeError.
        if torch.is_tensor(X):
            self.X = X.to(torch.float32)
        else:
            # Apply scaling if necessary
            if scale_data:
                X = StandardScaler().fit_transform(X)
            self.X = torch.from_numpy(X.astype(np.float32))

        if torch.is_tensor(y):
            self.y = y.to(torch.float32)
        else:
            self.y = torch.from_numpy(y.astype(np.float32))

        # Split dataset into separate variables, one per feature column
        self.MedInc = self.X[:, 0]
        self.HouseAge = self.X[:, 1]
        self.AveRooms = self.X[:, 2]
        self.AveBedrms = self.X[:, 3]
        self.Population = self.X[:, 4]
        self.AveOccup = self.X[:, 5]
        self.Latitude = self.X[:, 6]
        self.Longitude = self.X[:, 7]

    def __len__(self):
        """Return the number of samples (length of the first feature column)."""
        return len(self.MedInc)

    def __getitem__(self, i):
        """Return the eight feature values of sample ``i`` followed by its target."""
        return (
            self.MedInc[i],
            self.HouseAge[i],
            self.AveRooms[i],
            self.AveBedrms[i],
            self.Population[i],
            self.AveOccup[i],
            self.Latitude[i],
            self.Longitude[i],
            self.y[i],
        )

In [None]:
dataset2 = HousingDataset2(housing.data, housing.target)
trainloader2 = torch.utils.data.DataLoader(dataset2, batch_size=10, shuffle=True, num_workers=1)

In [None]:
next(iter(trainloader2))

In [None]:
class MLP2(nn.Module):
    """8 -> 64 -> 32 -> 1 ReLU network whose forward pass takes one tensor per feature."""

    def __init__(self):
        super().__init__()
        # Build Linear/ReLU pairs from consecutive widths, then drop the
        # trailing activation so the output layer stays linear.
        widths = (8, 64, 32, 1)
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.pop()
        self.layers = nn.Sequential(*stack)

    def forward(self, inc, age, rms, bdrms, pop, occup, lat, lon):
        """Stack the eight feature tensors as columns and run the network on them."""
        stacked = torch.column_stack((inc, age, rms, bdrms, pop, occup, lat, lon))
        return self.layers(stacked)

In [None]:
# Initialize the MLP
mlp2 = MLP2()

In [None]:
# Define the loss function and optimizer
loss_function = nn.L1Loss()
optimizer = torch.optim.Adam(mlp2.parameters(), lr=1e-4)

In [None]:
# Run the training loop
# Uses `mlp2`, `trainloader2`, `loss_function` and `optimizer` defined in the cells above.
log_every = 200  # number of mini-batches averaged into each printed loss value

for epoch in range(0, 5):  # train for 5 epochs

    # Print epoch
    print(f'Starting epoch {epoch+1}')

    # Running sum of the batch losses since the last report
    current_loss = 0.0

    # Iterate over the DataLoader for training data
    for i, data in enumerate(trainloader2, 0):

        # Each batch carries the eight feature columns followed by the targets;
        # targets become a column so they line up with the (batch, 1) model output
        a,b,c,d,e,f,g,h,targets = data
        targets = targets.reshape((targets.shape[0], 1))

        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
        outputs = mlp2(a,b,c,d,e,f,g,h)

        # Compute loss
        loss = loss_function(outputs, targets)

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

        # Report the mean loss over the last `log_every` mini-batches.
        # The earlier version printed on `i % 200 == 0` but divided the sum by
        # 500, and its first report covered a single batch, so the printed
        # numbers were not real averages.
        current_loss += loss.item()
        if (i + 1) % log_every == 0:
            print('Loss after mini-batch %5d: %.3f' %
                  (i + 1, current_loss / log_every))
            current_loss = 0.0

# Process is complete.
print('Training process has finished.')

### Save Model

In [None]:
torch.save(mlp2, "housing_model2.pt")

In [None]:
scripted = torch.jit.script(mlp2)
scripted.save("housing_model2.ts")

### Load and Test Model

In [None]:
a,b,c,d,e,f,g,h,targets = next(iter(trainloader2))

In [None]:
# Unpickle the whole module object written by torch.save(mlp2, "housing_model2.pt") above.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True, which
# rejects full-module pickles; confirm the installed version and pass weights_only=False
# there if this call starts failing.
loaded_mlp2 = torch.load("housing_model2.pt")

In [None]:
loaded_mlp2(a,b,c,d,e,f,g,h)

In [None]:
print(signature(loaded_mlp2.forward))

In [None]:
scripted_mlp2 = torch.jit.load("housing_model2.ts")

In [None]:
scripted_mlp2(a,b,c,d,e,f,g,h)

## PySpark

### Convert dataset to Spark DataFrame

In [None]:
housing = fetch_california_housing()

In [None]:
X = StandardScaler().fit_transform(housing.data.astype(np.float32))

In [None]:
pdf = pd.DataFrame(X, columns=housing.feature_names)
pdf

In [None]:
foo = pdf.to_dict('series')

In [None]:
foo.keys()

In [None]:
pdf.dtypes

In [None]:
from pyspark.sql.types import FloatType, StructField, StructType

# Without an explicit schema Spark turns the pandas float32 columns into DoubleType(),
# so declare one nullable FloatType() field per DataFrame column, in column order.
schema = StructType([StructField(name, FloatType(), True) for name in pdf.columns])

df = spark.createDataFrame(pdf, schema=schema)

In [None]:
df.schema

In [None]:
df.show(truncate=12)

### Save DataFrame as parquet

In [None]:
df.write.mode("overwrite").parquet("california_housing")

## Inference using Spark ML Model
Note: you can restart the kernel and run from this point to simulate running in a different node or environment.

In [None]:
import sparkext
import torch

In [None]:
df = spark.read.parquet("california_housing")

In [None]:
columns = df.columns
columns

In [None]:
from torch import nn

class MLP2(nn.Module):
    """8 -> 64 -> 32 -> 1 ReLU network whose forward pass takes one tensor per feature.

    Redefined in this section so the notebook can be run from here after a
    kernel restart.
    NOTE(review): presumably needed so the saved "housing_model2.pt" can be
    loaded by the model wrapper below; confirm how it loads the file.
    """

    def __init__(self):
        super().__init__()
        # Build Linear/ReLU pairs from consecutive widths, then drop the
        # trailing activation so the output layer stays linear.
        widths = (8, 64, 32, 1)
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.pop()
        self.layers = nn.Sequential(*stack)

    def forward(self, inc, age, rms, bdrms, pop, occup, lat, lon):
        """Stack the eight feature tensors as columns and run the network on them."""
        stacked = torch.column_stack((inc, age, rms, bdrms, pop, occup, lat, lon))
        return self.layers(stacked)

In [None]:
my_model = sparkext.torch.Model("housing_model2.pt") \
                .setInputCols(columns) \
                .setOutputCol("preds")

In [None]:
predictions = my_model.transform(df)

In [None]:
predictions.show()

## Inference using Spark DL UDF
Note: you can restart the kernel and run from this point to simulate running in a different node or environment.

In [None]:
import torch

from pyspark.sql.functions import col
from sparkext.torch import model_udf

In [None]:
df = spark.read.parquet("california_housing")

In [None]:
columns = df.columns
columns

### Using Saved Model

Since the model is pickled, the model class must be defined before loading.

In [None]:
from torch import nn

class MLP2(nn.Module):
    """8 -> 64 -> 32 -> 1 ReLU network whose forward pass takes one tensor per feature.

    Redefined in this section because the saved model is pickled, so the
    class must exist before the file is loaded.
    """

    def __init__(self):
        super().__init__()
        # Build Linear/ReLU pairs from consecutive widths, then drop the
        # trailing activation so the output layer stays linear.
        widths = (8, 64, 32, 1)
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.pop()
        self.layers = nn.Sequential(*stack)

    def forward(self, inc, age, rms, bdrms, pop, occup, lat, lon):
        """Stack the eight feature tensors as columns and run the network on them."""
        stacked = torch.column_stack((inc, age, rms, bdrms, pop, occup, lat, lon))
        return self.layers(stacked)

In [None]:
classify = model_udf("housing_model2.pt", input_columns=columns)

In [None]:
predictions = df.withColumn("preds", classify(*columns))

In [None]:
%%time
preds = predictions.collect()

In [None]:
predictions.show(truncate=12)

## Inference using Spark DL API
Note: you can restart the kernel and run from this point to simulate running in a different node or environment.

In [1]:
from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.functions import struct, col
from pyspark.sql.types import ArrayType, FloatType

In [2]:
df = spark.read.parquet("california_housing")

                                                                                

In [3]:
columns = df.columns
columns

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [4]:
df.show()

[Stage 1:>                                                          (0 + 1) / 1]

+------------+-----------+------------+------------+-----------+-------------+----------+----------+
|      MedInc|   HouseAge|    AveRooms|   AveBedrms| Population|     AveOccup|  Latitude| Longitude|
+------------+-----------+------------+------------+-----------+-------------+----------+----------+
|  0.85111564|  0.5053942|  0.29677927| -0.21630338| -0.3077235| 0.0066712764|-0.8576533| 0.7934686|
|  0.12676695|  0.5053942| -0.21044567| -0.21279162| 0.20621344|  0.037127122|-0.8623344| 0.7934686|
|  0.22788419| 0.26701993|-0.021574577|-0.046525124| 0.06845716|  0.064333126|-0.8576533| 0.7934686|
|  0.83821946|  0.5053942|  0.46901828| -0.10617764| 0.16647606|  0.047397293|-0.8576533|  0.798461|
| -0.09778573| 0.34647804| 0.040665437|-0.025475439| 0.24595083| -0.018044483|-0.8623344|  0.798461|
|-0.114524566| -0.8453931|  0.08634951| -0.06806936|   0.348385|   -0.0268871|-0.8623344| 0.7834877|
| -0.17863737| -1.0837674|  -0.3686518| -0.09916091|  0.7722505| -0.061561517|-0.8670172|0.

                                                                                

### Using TorchScript Model (single input)

In [5]:
def predict_batch_fn(model_path="/home/leey/devpub/leewyang/sparkext/examples/pytorch/housing_model.ts"):
    """Load the single-input TorchScript model and return a batch prediction function.

    Args:
        model_path: Location of the TorchScript file. The default is the
            path this notebook was originally run with; pass another path
            (e.g. via ``functools.partial``) when running elsewhere.

    Returns:
        A ``predict(inputs)`` callable that takes one NumPy array holding a
        batch of rows and returns the model output as a NumPy array.
    """
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using {} device".format(device))

    scripted_mlp = torch.jit.load(model_path)
    scripted_mlp.to(device)
    scripted_mlp.eval()

    def predict(inputs):
        torch_inputs = torch.from_numpy(inputs).to(device)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            outputs = scripted_mlp(torch_inputs)
        # Tensor.numpy() only works on CPU tensors, so move the result back
        # first; the earlier version raised on the "cuda" device.
        return outputs.cpu().numpy()

    return predict

In [6]:
classify = predict_batch_udf(predict_batch_fn,
                             return_type=FloatType(),
                             batch_size=50)

In [7]:
%%time
# first pass caches model/fn
predictions = df.withColumn("preds", classify(struct(*columns)))
preds = predictions.collect()

[Stage 2:>                                                          (0 + 8) / 8]

CPU times: user 138 ms, sys: 9.12 ms, total: 148 ms
Wall time: 2.7 s


                                                                                

In [8]:
predictions.show()

+------------+-----------+------------+------------+-----------+-------------+----------+----------+---------+
|      MedInc|   HouseAge|    AveRooms|   AveBedrms| Population|     AveOccup|  Latitude| Longitude|    preds|
+------------+-----------+------------+------------+-----------+-------------+----------+----------+---------+
|  0.85111564|  0.5053942|  0.29677927| -0.21630338| -0.3077235| 0.0066712764|-0.8576533| 0.7934686|2.5334318|
|  0.12676695|  0.5053942| -0.21044567| -0.21279162| 0.20621344|  0.037127122|-0.8623344| 0.7934686|1.9301689|
|  0.22788419| 0.26701993|-0.021574577|-0.046525124| 0.06845716|  0.064333126|-0.8576533| 0.7934686|1.9071498|
|  0.83821946|  0.5053942|  0.46901828| -0.10617764| 0.16647606|  0.047397293|-0.8576533|  0.798461|2.4639113|
| -0.09778573| 0.34647804| 0.040665437|-0.025475439| 0.24595083| -0.018044483|-0.8623344|  0.798461|1.8513522|
|-0.114524566| -0.8453931|  0.08634951| -0.06806936|   0.348385|   -0.0268871|-0.8623344| 0.7834877|1.7754748|
|

In [9]:
%%time
# Demonstrates the failure mode: this model's predict function takes a single input, so
# passing the columns individually raises the ValueError shown in the output below.
# Combine the columns with struct(...) instead, as in the cell above.
predictions = df.withColumn("preds", classify(*columns))
preds = predictions.collect()

22/09/21 12:52:41 WARN TaskSetManager: Lost task 6.0 in stage 4.0 (TID 17) (192.168.86.223 executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/leey/devpub/leewyang/spark/python/pyspark/ml/functions.py", line 303, in predict
    raise ValueError(
ValueError: Multiple input columns found, but model expected a single input, use `struct` or `array` to combine columns into tensors.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:559)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:108)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:52)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:512)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at s

[Stage 4:>                                                          (0 + 1) / 8]

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/home/leey/devpub/leewyang/spark/python/pyspark/ml/functions.py", line 303, in predict
    raise ValueError(
ValueError: Multiple input columns found, but model expected a single input, use `struct` or `array` to combine columns into tensors.


In [10]:
%%time
# Same failure as the previous cell, this time passing Column objects instead of column
# names: multiple input columns still reach a predict function that takes a single input.
predictions = df.withColumn("preds", classify(*[col(c) for c in columns]))
preds = predictions.collect()

22/09/21 12:52:42 WARN TaskSetManager: Lost task 5.0 in stage 4.0 (TID 16) (192.168.86.223 executor 0): TaskKilled (Stage cancelled)


[Stage 5:>                                                          (0 + 8) / 8]

22/09/21 12:52:43 WARN TaskSetManager: Lost task 6.0 in stage 5.0 (TID 25) (192.168.86.223 executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/leey/devpub/leewyang/spark/python/pyspark/ml/functions.py", line 303, in predict
    raise ValueError(
ValueError: Multiple input columns found, but model expected a single input, use `struct` or `array` to combine columns into tensors.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:559)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:108)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:52)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:512)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at s

[Stage 5:>                                                          (0 + 7) / 8]

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/home/leey/devpub/leewyang/spark/python/pyspark/ml/functions.py", line 303, in predict
    raise ValueError(
ValueError: Multiple input columns found, but model expected a single input, use `struct` or `array` to combine columns into tensors.


### Using TorchScript Model (separate input variables)

In [11]:
from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.functions import struct, col
from pyspark.sql.types import ArrayType, FloatType

In [12]:
df = spark.read.parquet("california_housing")

                                                                                

In [13]:
columns = df.columns
columns

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [14]:
def predict_batch_fn(model_path="/home/leey/devpub/leewyang/sparkext/examples/pytorch/housing_model2.ts"):
    """Load the eight-input TorchScript model and return a batch prediction function.

    Args:
        model_path: Location of the TorchScript file. The default is the
            path this notebook was originally run with; pass another path
            (e.g. via ``functools.partial``) when running elsewhere.

    Returns:
        A ``predict`` callable taking one NumPy array per feature column and
        returning the model output as a NumPy array.
    """
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using {} device".format(device))
    scripted_mlp = torch.jit.load(model_path)
    scripted_mlp.to(device)
    scripted_mlp.eval()

    # The eight parameters are spelled out rather than collapsed into *args.
    # NOTE(review): presumably predict_batch_udf matches input columns against
    # these parameters; confirm before changing the signature.
    def predict(inc, age, rms, bdrms, pop, occ, lat, lon):
        feature_arrays = (inc, age, rms, bdrms, pop, occ, lat, lon)
        torch_inputs = [torch.from_numpy(arr).to(device) for arr in feature_arrays]
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            outputs = scripted_mlp(*torch_inputs)
        # Tensor.numpy() only works on CPU tensors, so move the result back
        # first; the earlier version raised on the "cuda" device.
        return outputs.cpu().numpy()

    return predict

In [15]:
classify = predict_batch_udf(predict_batch_fn,
                             return_type=FloatType(),
                             batch_size=50)

In [16]:
%%time
# first pass caches model/fn
predictions = df.withColumn("preds", classify(struct(*columns)))
preds = predictions.collect()

[Stage 7:>                                                          (0 + 8) / 8]

CPU times: user 152 ms, sys: 7.28 ms, total: 159 ms
Wall time: 1.46 s


                                                                                

In [17]:
%%time
predictions = df.withColumn("preds", classify(*columns))
preds = predictions.collect()

CPU times: user 54.9 ms, sys: 0 ns, total: 54.9 ms
Wall time: 479 ms


In [18]:
%%time
predictions = df.withColumn("preds", classify(*[col(c) for c in columns]))
preds = predictions.collect()

CPU times: user 137 ms, sys: 16.4 ms, total: 153 ms
Wall time: 444 ms


In [19]:
predictions.show()

+------------+-----------+------------+------------+-----------+-------------+----------+----------+---------+
|      MedInc|   HouseAge|    AveRooms|   AveBedrms| Population|     AveOccup|  Latitude| Longitude|    preds|
+------------+-----------+------------+------------+-----------+-------------+----------+----------+---------+
|  0.85111564|  0.5053942|  0.29677927| -0.21630338| -0.3077235| 0.0066712764|-0.8576533| 0.7934686|2.5334318|
|  0.12676695|  0.5053942| -0.21044567| -0.21279162| 0.20621344|  0.037127122|-0.8623344| 0.7934686|1.9301689|
|  0.22788419| 0.26701993|-0.021574577|-0.046525124| 0.06845716|  0.064333126|-0.8576533| 0.7934686|1.9071498|
|  0.83821946|  0.5053942|  0.46901828| -0.10617764| 0.16647606|  0.047397293|-0.8576533|  0.798461|2.4639113|
| -0.09778573| 0.34647804| 0.040665437|-0.025475439| 0.24595083| -0.018044483|-0.8623344|  0.798461|1.8513522|
|-0.114524566| -0.8453931|  0.08634951| -0.06806936|   0.348385|   -0.0268871|-0.8623344| 0.7834877|1.7754748|
|

### Using Triton Server

In [20]:
from functools import partial
from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.functions import struct, col
from pyspark.sql.types import ArrayType, FloatType

In [21]:
df = spark.read.parquet("california_housing")

In [22]:
columns = df.columns
columns

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

#### Start Triton Server on each executor

In [23]:
num_executors = 1

nodeRDD = sc.parallelize(list(range(num_executors)), num_executors)

def start_triton(it):
    """Start the "spark-triton" container on this executor unless one is already listed.

    Intended to be run through ``mapPartitions``; ``it`` (the partition
    iterator) is not used.

    Returns:
        ``[True]`` once an existing container was found, or the newly started
        server reported ready.

    Raises:
        TimeoutError: if a newly started server does not report ready within
            ``ready_timeout_s`` seconds.
    """
    import docker
    import time
    import tritonclient.grpc as grpcclient

    ready_timeout_s = 300  # upper bound on the readiness wait (previously unbounded)
    poll_interval_s = 5

    docker_client = docker.from_env()
    containers = docker_client.containers.list(filters={"name": "spark-triton"})
    if containers:
        print(">>>> containers: {}".format([c.short_id for c in containers]))
    else:
        container = docker_client.containers.run(
            "nvcr.io/nvidia/tritonserver:22.07-py3", "tritonserver --model-repository=/models",
            detach=True,
            device_requests=[docker.types.DeviceRequest(device_ids=["0"], capabilities=[['gpu']])],
            name="spark-triton",
            network_mode="host",
            remove=True,
            shm_size="64M",
            volumes={"/home/leey/devpub/leewyang/sparkext/examples/models": {"bind": "/models", "mode": "ro"}}
        )
        print(">>>> starting triton: {}".format(container.short_id))

        # wait for triton to be running
        time.sleep(15)
        triton_client = grpcclient.InferenceServerClient("localhost:8001")
        deadline = time.monotonic() + ready_timeout_s
        ready = False
        while not ready:
            try:
                ready = triton_client.is_server_ready()
            except Exception:
                # Treat any client error while the server is starting as "not ready yet".
                ready = False
            if not ready:
                if time.monotonic() >= deadline:
                    raise TimeoutError(
                        "Triton server was not ready after {} seconds".format(ready_timeout_s))
                # Sleep on every unsuccessful poll; the earlier loop only slept
                # on exceptions and spun when the server answered "not ready".
                time.sleep(poll_interval_s)

    return [True]

nodeRDD.mapPartitions(start_triton).collect()

                                                                                

[True]

In [24]:
def triton_fn(triton_uri, model_name):
    """Create a prediction function that forwards batches to a Triton server over gRPC.

    Args:
        triton_uri: Address passed to ``InferenceServerClient`` (e.g. "localhost:8001").
        model_name: Name of the model to query; its metadata is fetched once here.

    Returns:
        A ``predict(inputs)`` callable. ``inputs`` is either one NumPy array
        (sent as the model's first input) or a mapping from input name to
        NumPy array. The result is a single NumPy array when the model has
        one output, otherwise a dict of arrays keyed by output name.
    """
    import numpy as np
    import tritonclient.grpc as grpcclient

    # Triton datatype name -> NumPy dtype used to cast request data.
    # np.bool_ replaces the np.bool8 alias, which was removed in NumPy 2.0;
    # the duplicated "FP64" entry is gone and the unsigned types are added.
    np_types = {
        "BOOL": np.dtype(np.bool_),
        "INT8": np.dtype(np.int8),
        "INT16": np.dtype(np.int16),
        "INT32": np.dtype(np.int32),
        "INT64": np.dtype(np.int64),
        "UINT8": np.dtype(np.uint8),
        "UINT16": np.dtype(np.uint16),
        "UINT32": np.dtype(np.uint32),
        "UINT64": np.dtype(np.uint64),
        "FP16": np.dtype(np.float16),
        "FP32": np.dtype(np.float32),
        "FP64": np.dtype(np.float64),
        "BYTES": np.dtype(object),
    }

    client = grpcclient.InferenceServerClient(triton_uri)
    model_meta = client.get_model_metadata(model_name)

    def _make_input(spec, data):
        # Build one request tensor, casting the data to the dtype the model declares.
        tensor = grpcclient.InferInput(spec.name, data.shape, spec.datatype)
        tensor.set_data_from_numpy(data.astype(np_types[spec.datatype]))
        return tensor

    def predict(inputs):
        if isinstance(inputs, np.ndarray):
            # single ndarray input
            request = [_make_input(model_meta.inputs[0], inputs)]
        else:
            # dict of multiple ndarray inputs, looked up by the model's input names
            request = [_make_input(spec, inputs[spec.name]) for spec in model_meta.inputs]

        response = client.infer(model_name, inputs=request)

        if len(model_meta.outputs) > 1:
            # return dictionary of numpy arrays
            return {o.name: response.as_numpy(o.name) for o in model_meta.outputs}
        else:
            # return single numpy array
            return response.as_numpy(model_meta.outputs[0].name)

    return predict

In [25]:
classify = predict_batch_udf(partial(triton_fn, triton_uri="localhost:8001", model_name="housing_model"),
                             return_type=FloatType(),
                             batch_size=50)

In [26]:
%%time
# first pass caches model/fn
predictions = df.withColumn("preds", classify(struct(*columns)))
preds = predictions.collect()

[Stage 13:>                                                         (0 + 8) / 8]

CPU times: user 86 ms, sys: 10 ms, total: 96 ms
Wall time: 1.18 s


                                                                                

In [27]:
predictions.show()

+------------+-----------+------------+------------+-----------+-------------+----------+----------+---------+
|      MedInc|   HouseAge|    AveRooms|   AveBedrms| Population|     AveOccup|  Latitude| Longitude|    preds|
+------------+-----------+------------+------------+-----------+-------------+----------+----------+---------+
|  0.85111564|  0.5053942|  0.29677927| -0.21630338| -0.3077235| 0.0066712764|-0.8576533| 0.7934686| 2.533432|
|  0.12676695|  0.5053942| -0.21044567| -0.21279162| 0.20621344|  0.037127122|-0.8623344| 0.7934686|1.9301689|
|  0.22788419| 0.26701993|-0.021574577|-0.046525124| 0.06845716|  0.064333126|-0.8576533| 0.7934686|1.9071497|
|  0.83821946|  0.5053942|  0.46901828| -0.10617764| 0.16647606|  0.047397293|-0.8576533|  0.798461|2.4639113|
| -0.09778573| 0.34647804| 0.040665437|-0.025475439| 0.24595083| -0.018044483|-0.8623344|  0.798461| 1.851352|
|-0.114524566| -0.8453931|  0.08634951| -0.06806936|   0.348385|   -0.0268871|-0.8623344| 0.7834877|1.7754749|
|

In [28]:
%%time
# Demonstrates the failure mode: the Triton predict function takes a single input, so
# passing the columns individually raises the ValueError shown in the output below.
predictions = df.withColumn("preds", classify(*columns))
preds = predictions.collect()

22/09/21 12:53:25 WARN TaskSetManager: Lost task 6.0 in stage 15.0 (TID 70) (192.168.86.223 executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/leey/devpub/leewyang/spark/python/pyspark/ml/functions.py", line 303, in predict
    raise ValueError(
ValueError: Multiple input columns found, but model expected a single input, use `struct` or `array` to combine columns into tensors.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:559)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:108)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:52)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:512)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at 

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/home/leey/devpub/leewyang/spark/python/pyspark/ml/functions.py", line 303, in predict
    raise ValueError(
ValueError: Multiple input columns found, but model expected a single input, use `struct` or `array` to combine columns into tensors.


In [29]:
%%time
# Same failure as the previous cell, this time passing Column objects instead of column
# names: multiple input columns still reach a predict function that takes a single input.
predictions = df.withColumn("preds", classify(*[col(c) for c in columns]))
preds = predictions.collect()

[Stage 15:>                                                         (0 + 1) / 8]

22/09/21 12:53:26 WARN TaskSetManager: Lost task 5.0 in stage 15.0 (TID 69) (192.168.86.223 executor 0): TaskKilled (Stage cancelled)
22/09/21 12:53:26 WARN TaskSetManager: Lost task 6.0 in stage 16.0 (TID 78) (192.168.86.223 executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/leey/devpub/leewyang/spark/python/pyspark/ml/functions.py", line 303, in predict
    raise ValueError(
ValueError: Multiple input columns found, but model expected a single input, use `struct` or `array` to combine columns into tensors.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:559)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:108)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:52)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:512)
	at org.apache.spark

[Stage 16:>                                                         (0 + 1) / 8]

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/home/leey/devpub/leewyang/spark/python/pyspark/ml/functions.py", line 303, in predict
    raise ValueError(
ValueError: Multiple input columns found, but model expected a single input, use `struct` or `array` to combine columns into tensors.


#### Stop Triton Server on each executor

In [30]:
def stop_triton(it):
    """Stop every container matching the "spark-triton" name filter on this executor.

    Intended to be run through ``mapPartitions``; ``it`` (the partition
    iterator) is not used.

    Returns:
        ``[True]`` after all matching containers have been asked to stop.
    """
    import docker

    client = docker.from_env()
    containers = client.containers.list(filters={"name": "spark-triton"})
    print(">>>> stopping containers: {}".format([c.short_id for c in containers]))
    # Stop each container that was just reported; the earlier version printed
    # the full list but only stopped the first entry.
    for container in containers:
        container.stop(timeout=120)

    return [True]

nodeRDD.mapPartitions(stop_triton).collect()

22/09/21 12:53:26 WARN TaskSetManager: Lost task 7.0 in stage 16.0 (TID 79) (192.168.86.223 executor 0): TaskKilled (Stage cancelled)


                                                                                

[True]

In [31]:
spark.stop()