Based on: https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-create-a-neural-network-for-regression-with-pytorch.md

In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

In [2]:
torch.manual_seed(42)

<torch._C.Generator at 0x7f81fec06290>

In [3]:
X, y = fetch_california_housing(return_X_y=True)

In [4]:
class HousingDataset(torch.utils.data.Dataset):
    def __init__(self, X, y, scale_data=True):
        if not torch.is_tensor(X) and not torch.is_tensor(y):
            # Apply scaling if necessary
            if scale_data:
                X = StandardScaler().fit_transform(X)
            self.X = torch.from_numpy(X.astype(np.float32))
            self.y = torch.from_numpy(y.astype(np.float32))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        return self.X[i], self.y[i]

In [5]:
dataset = HousingDataset(X, y)
trainloader = torch.utils.data.DataLoader(
    dataset, batch_size=10, shuffle=True, num_workers=1)

In [6]:
next(iter(trainloader))

[tensor([[ 0.4159,  0.3465,  0.2294, -0.1113, -0.3130,  0.0527, -0.5955,  0.3193],
         [-0.3706,  0.5849, -0.1514, -0.2040,  0.3943,  0.0550, -0.8155,  0.6687],
         [-0.2024, -0.8454,  0.1928,  0.0087,  0.5435, -0.0279,  0.9449, -1.2679],
         [ 0.1064, -1.9578,  0.2968,  0.0363,  2.7556, -0.0217, -0.4925,  0.7685],
         [ 0.1057,  1.0616,  0.1675, -0.0081, -0.3651, -0.0372, -0.6751,  0.7186],
         [ 0.3343, -1.4811, -0.7187,  0.2041,  0.0967,  0.0529, -0.8483,  0.8234],
         [-0.2691,  1.3000, -0.6491, -0.0872,  1.7074, -0.1372, -0.7312,  0.6088],
         [-0.0891, -0.3686,  0.0260, -0.1563,  0.3996,  0.0449,  1.1790, -1.3378],
         [ 0.1397, -0.2097,  0.1816, -0.1881,  0.4049, -0.0229, -0.7547,  1.2676],
         [-0.3399,  0.3465, -0.4621, -0.0519,  0.6115, -0.0284, -0.6704,  0.5189]]),
 tensor([1.8790, 1.1770, 3.3160, 1.5430, 2.3400, 1.5240, 2.8750, 1.2360, 1.2100,
         2.0530])]

In [7]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(8, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.layers(x)

In [8]:
# Initialize the MLP
mlp = MLP()

# Define the loss function and optimizer
loss_function = nn.L1Loss()
optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-4)

In [9]:
# Run the training loop
for epoch in range(0, 5):  # 5 epochs at maximum

    # Print epoch
    print(f'Starting epoch {epoch+1}')

    # Set current loss value
    current_loss = 0.0

    # Iterate over the DataLoader for training data
    for i, data in enumerate(trainloader, 0):

        # Get and prepare inputs
        inputs, targets = data
        targets = targets.reshape((targets.shape[0], 1))

        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
        outputs = mlp(inputs)

        # Compute loss
        loss = loss_function(outputs, targets)

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

        # Print statistics
        current_loss += loss.item()
        if i % 200 == 0:
            print('Loss after mini-batch %5d: %.3f' %
                  (i + 1, current_loss / 500))
            current_loss = 0.0

# Process is complete.
print('Training process has finished.')

Starting epoch 1
Loss after mini-batch     1: 0.004
Loss after mini-batch   201: 0.733
Loss after mini-batch   401: 0.534
Loss after mini-batch   601: 0.403
Loss after mini-batch   801: 0.330
Loss after mini-batch  1001: 0.269
Loss after mini-batch  1201: 0.232
Loss after mini-batch  1401: 0.226
Loss after mini-batch  1601: 0.223
Loss after mini-batch  1801: 0.214
Loss after mini-batch  2001: 0.214
Starting epoch 2
Loss after mini-batch     1: 0.002
Loss after mini-batch   201: 0.211
Loss after mini-batch   401: 0.205
Loss after mini-batch   601: 0.199
Loss after mini-batch   801: 0.192
Loss after mini-batch  1001: 0.194
Loss after mini-batch  1201: 0.196
Loss after mini-batch  1401: 0.193
Loss after mini-batch  1601: 0.194
Loss after mini-batch  1801: 0.187
Loss after mini-batch  2001: 0.197
Starting epoch 3
Loss after mini-batch     1: 0.001
Loss after mini-batch   201: 0.190
Loss after mini-batch   401: 0.183
Loss after mini-batch   601: 0.181
Loss after mini-batch   801: 0.190
Loss

### Save Model

In [10]:
torch.save(mlp, "housing_model.pt")

### Load and Test Model

In [11]:
loaded_mlp = torch.load("housing_model.pt")

In [12]:
testX, testY = next(iter(trainloader))

In [13]:
loaded_mlp(testX).flatten()

tensor([2.8778, 0.6233, 3.9021, 2.4543, 1.0209, 1.8093, 1.4593, 3.2933, 2.9263,
        1.4790], grad_fn=<ReshapeAliasBackward0>)

In [14]:
testY

tensor([2.8380, 0.5740, 5.0000, 2.0430, 1.2680, 1.8380, 1.4340, 2.6830, 1.6100,
        1.3050])

### Columns as separate input variables

In [15]:
import numpy as np
import pandas as pd
import torch

from inspect import signature
from torch import nn
from torch.utils.data import DataLoader
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

In [16]:
torch.manual_seed(42)

<torch._C.Generator at 0x7f81fec06290>

In [17]:
housing = fetch_california_housing()

In [18]:
class HousingDataset2(torch.utils.data.Dataset):
    def __init__(self, X, y, scale_data=True):
        if not torch.is_tensor(X) and not torch.is_tensor(y):
            # Apply scaling if necessary
            if scale_data:
                X = StandardScaler().fit_transform(X)
            self.X = torch.from_numpy(X.astype(np.float32))
            self.y = torch.from_numpy(y.astype(np.float32))
            
            # Split dataset into separate variables
            self.MedInc = self.X[:,0]
            self.HouseAge = self.X[:,1]
            self.AveRooms = self.X[:,2]
            self.AveBedrms = self.X[:,3]
            self.Population = self.X[:,4]
            self.AveOccup = self.X[:,5]
            self.Latitude = self.X[:,6]
            self.Longitude = self.X[:,7]

    def __len__(self):
        return len(self.MedInc)

    def __getitem__(self, i):
        # Note: also returning combined X for ease of use later
        return self.MedInc[i], self.HouseAge[i], self.AveRooms[i], self.AveBedrms[i], self.Population[i], self.AveOccup[i], self.Latitude[i], self.Longitude[i], self.y[i]

In [19]:
dataset2 = HousingDataset2(housing.data, housing.target)
trainloader2 = torch.utils.data.DataLoader(dataset2, batch_size=10, shuffle=True, num_workers=1)

In [20]:
next(iter(trainloader2))

[tensor([ 0.4159, -0.3706, -0.2024,  0.1064,  0.1057,  0.3343, -0.2691, -0.0891,
          0.1397, -0.3399]),
 tensor([ 0.3465,  0.5849, -0.8454, -1.9578,  1.0616, -1.4811,  1.3000, -0.3686,
         -0.2097,  0.3465]),
 tensor([ 0.2294, -0.1514,  0.1928,  0.2968,  0.1675, -0.7187, -0.6491,  0.0260,
          0.1816, -0.4621]),
 tensor([-0.1113, -0.2040,  0.0087,  0.0363, -0.0081,  0.2041, -0.0872, -0.1563,
         -0.1881, -0.0519]),
 tensor([-0.3130,  0.3943,  0.5435,  2.7556, -0.3651,  0.0967,  1.7074,  0.3996,
          0.4049,  0.6115]),
 tensor([ 0.0527,  0.0550, -0.0279, -0.0217, -0.0372,  0.0529, -0.1372,  0.0449,
         -0.0229, -0.0284]),
 tensor([-0.5955, -0.8155,  0.9449, -0.4925, -0.6751, -0.8483, -0.7312,  1.1790,
         -0.7547, -0.6704]),
 tensor([ 0.3193,  0.6687, -1.2679,  0.7685,  0.7186,  0.8234,  0.6088, -1.3378,
          1.2676,  0.5189]),
 tensor([1.8790, 1.1770, 3.3160, 1.5430, 2.3400, 1.5240, 2.8750, 1.2360, 1.2100,
         2.0530])]

In [21]:
class MLP2(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(8, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, inc, age, rms, bdrms, pop, occup, lat, lon):       
        combined = torch.column_stack((inc, age, rms, bdrms, pop, occup, lat, lon))
        return self.layers(combined)

In [22]:
# Initialize the MLP
mlp2 = MLP2()

In [23]:
# Define the loss function and optimizer
loss_function = nn.L1Loss()
optimizer = torch.optim.Adam(mlp2.parameters(), lr=1e-4)

In [24]:
# Run the training loop
for epoch in range(0, 5):  # 5 epochs at maximum

    # Print epoch
    print(f'Starting epoch {epoch+1}')

    # Set current loss value
    current_loss = 0.0

    # Iterate over the DataLoader for training data
    for i, data in enumerate(trainloader2, 0):

        # Get and prepare inputs
        a,b,c,d,e,f,g,h,targets = data
        targets = targets.reshape((targets.shape[0], 1))

        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
        outputs = mlp2(a,b,c,d,e,f,g,h)

        # Compute loss
        loss = loss_function(outputs, targets)

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

        # Print statistics
        current_loss += loss.item()
        if i % 200 == 0:
            print('Loss after mini-batch %5d: %.3f' %
                  (i + 1, current_loss / 500))
            current_loss = 0.0

# Process is complete.
print('Training process has finished.')

Starting epoch 1
Loss after mini-batch     1: 0.004
Loss after mini-batch   201: 0.733
Loss after mini-batch   401: 0.534
Loss after mini-batch   601: 0.403
Loss after mini-batch   801: 0.330
Loss after mini-batch  1001: 0.269
Loss after mini-batch  1201: 0.232
Loss after mini-batch  1401: 0.226
Loss after mini-batch  1601: 0.223
Loss after mini-batch  1801: 0.214
Loss after mini-batch  2001: 0.214
Starting epoch 2
Loss after mini-batch     1: 0.002
Loss after mini-batch   201: 0.211
Loss after mini-batch   401: 0.205
Loss after mini-batch   601: 0.199
Loss after mini-batch   801: 0.192
Loss after mini-batch  1001: 0.194
Loss after mini-batch  1201: 0.196
Loss after mini-batch  1401: 0.193
Loss after mini-batch  1601: 0.194
Loss after mini-batch  1801: 0.187
Loss after mini-batch  2001: 0.197
Starting epoch 3
Loss after mini-batch     1: 0.001
Loss after mini-batch   201: 0.190
Loss after mini-batch   401: 0.183
Loss after mini-batch   601: 0.181
Loss after mini-batch   801: 0.190
Loss

In [25]:
torch.save(mlp2, "housing_model2.pt")

In [26]:
a,b,c,d,e,f,g,h,targets = next(iter(trainloader2))

In [27]:
mlp2(a,b,c,d,e,f,g,h)

tensor([[2.8778],
        [0.6233],
        [3.9021],
        [2.4543],
        [1.0209],
        [1.8093],
        [1.4593],
        [3.2933],
        [2.9263],
        [1.4790]], grad_fn=<AddmmBackward0>)

In [28]:
print(signature(mlp2.forward))

(inc, age, rms, bdrms, pop, occup, lat, lon)


## PySpark

### Convert dataset to Spark DataFrame

In [29]:
housing = fetch_california_housing()

In [30]:
X = StandardScaler().fit_transform(housing.data.astype(np.float32))

In [31]:
pdf = pd.DataFrame(X, columns=housing.feature_names)
pdf

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,2.344766,0.982143,0.628559,-0.153758,-0.974429,-0.049597,1.052549,-1.327837
1,2.332238,-0.607019,0.327041,-0.263336,0.861439,-0.092512,1.043185,-1.322845
2,1.782699,1.856182,1.155620,-0.049016,-0.820777,-0.025843,1.038502,-1.332825
3,0.932967,1.856182,0.156966,-0.049833,-0.766028,-0.050329,1.038502,-1.337818
4,-0.012881,1.856182,0.344711,-0.032906,-0.759847,-0.085616,1.038502,-1.337818
...,...,...,...,...,...,...,...,...
20635,-1.216128,-0.289187,-0.155023,0.077354,-0.512592,-0.049110,1.801647,-0.758824
20636,-0.691593,-0.845393,0.276881,0.462365,-0.944405,0.005021,1.806329,-0.818721
20637,-1.142593,-0.924851,-0.090318,0.049414,-0.369537,-0.071734,1.778238,-0.823714
20638,-1.054583,-0.845393,-0.040211,0.158778,-0.604429,-0.091225,1.778238,-0.873626


In [32]:
pdf.dtypes

MedInc        float32
HouseAge      float32
AveRooms      float32
AveBedrms     float32
Population    float32
AveOccup      float32
Latitude      float32
Longitude     float32
dtype: object

In [33]:
from pyspark.sql.types import *

# Spark is somehow auto-converting Pandas float32 to DoubleType(), so forcing FloatType()
schema = StructType([
StructField("MedInc",FloatType(),True),
StructField("HouseAge",FloatType(),True),
StructField("AveRooms",FloatType(),True),
StructField("AveBedrms",FloatType(),True),
StructField("Population",FloatType(),True),
StructField("AveOccup",FloatType(),True),
StructField("Latitude",FloatType(),True),
StructField("Longitude",FloatType(),True)
])

df = spark.createDataFrame(pdf, schema=schema)

In [34]:
df.schema

StructType(List(StructField(MedInc,FloatType,true),StructField(HouseAge,FloatType,true),StructField(AveRooms,FloatType,true),StructField(AveBedrms,FloatType,true),StructField(Population,FloatType,true),StructField(AveOccup,FloatType,true),StructField(Latitude,FloatType,true),StructField(Longitude,FloatType,true)))

In [35]:
df.show(truncate=12)

[Stage 0:>                                                          (0 + 1) / 1]

+------------+----------+------------+------------+-----------+------------+---------+----------+
|      MedInc|  HouseAge|    AveRooms|   AveBedrms| Population|    AveOccup| Latitude| Longitude|
+------------+----------+------------+------------+-----------+------------+---------+----------+
|    2.344766| 0.9821427|  0.62855947| -0.15375753|-0.97442853|-0.049596533|1.0525488|-1.3278369|
|   2.3322382|-0.6070189|  0.32704142| -0.26333576|  0.8614389| -0.09251223|1.0431849|-1.3228445|
|   1.7826993| 1.8561815|   1.1556205|-0.049016476|-0.82077736|-0.025842525| 1.038502|-1.3328254|
|  0.93296736| 1.8561815|  0.15696616|-0.049833003|-0.76602805|-0.050329294| 1.038502|-1.3378178|
|-0.012881001| 1.8561815|  0.34471077|-0.032905966| -0.7598467| -0.08561575| 1.038502|-1.3378178|
| 0.087446585| 1.8561815| -0.26972958| 0.014669393|-0.89407074|-0.089618415| 1.038502|-1.3378178|
| -0.11136628| 1.8561815| -0.20091766| -0.30663314| -0.2927116| -0.09072491|1.0338209|-1.3378178|
| -0.39513668| 1.856

                                                                                

### Save DataFrame as parquet

In [36]:
df.write.mode("overwrite").parquet("california_housing")

                                                                                

## Inference using Spark ML Model
Note: you can restart the kernel and run from this point to simulate running in a different node or environment.

In [37]:
import sparkext
import torch

In [38]:
df = spark.read.parquet("california_housing")

In [39]:
columns = df.columns
columns

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [40]:
from torch import nn

class MLP2(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(8, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, inc, age, rms, bdrms, pop, occup, lat, lon):       
        combined = torch.column_stack((inc, age, rms, bdrms, pop, occup, lat, lon))
        return self.layers(combined)

In [41]:
my_model = sparkext.torch.Model("housing_model2.pt") \
                .setInputCols(columns) \
                .setOutputCol("preds")

In [42]:
predictions = my_model.transform(df)

Loading model on driver from housing_model2.pt
ModelSummary(num_params=2689, inputs=[TensorSummary(shape=[64, 8], dtype=torch.float32, name=None)], outputs=[TensorSummary(shape=[1], dtype=torch.float32, name=None)]) -> array<float>


In [43]:
predictions.show()

[Stage 3:>                                                          (0 + 1) / 1]

+------------+-----------+------------+------------+-----------+-------------+----------+----------+-----------+
|      MedInc|   HouseAge|    AveRooms|   AveBedrms| Population|     AveOccup|  Latitude| Longitude|      preds|
+------------+-----------+------------+------------+-----------+-------------+----------+----------+-----------+
|  0.85111564|  0.5053942|  0.29677927| -0.21630338| -0.3077235| 0.0066712764|-0.8576533| 0.7934686|[2.5334318]|
|  0.12676695|  0.5053942| -0.21044567| -0.21279162| 0.20621344|  0.037127122|-0.8623344| 0.7934686|[1.9301689]|
|  0.22788419| 0.26701993|-0.021574577|-0.046525124| 0.06845716|  0.064333126|-0.8576533| 0.7934686|[1.9071498]|
|  0.83821946|  0.5053942|  0.46901828| -0.10617764| 0.16647606|  0.047397293|-0.8576533|  0.798461|[2.4639113]|
| -0.09778573| 0.34647804| 0.040665437|-0.025475439| 0.24595083| -0.018044483|-0.8623344|  0.798461|[1.8513522]|
|-0.114524566| -0.8453931|  0.08634951| -0.06806936|   0.348385|   -0.0268871|-0.8623344| 0.7834

                                                                                

## Inference using Spark DL UDF
Note: you can restart the kernel and run from this point to simulate running in a different node or environment.

In [44]:
import torch

from pyspark.sql.functions import col
from sparkext.torch import model_udf

In [45]:
df = spark.read.parquet("california_housing")

In [46]:
columns = df.columns
columns

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

### Using Saved Model

Since the model is pickled, the model class must be defined before loading.

In [47]:
from torch import nn

class MLP2(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(8, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, inc, age, rms, bdrms, pop, occup, lat, lon):       
        combined = torch.column_stack((inc, age, rms, bdrms, pop, occup, lat, lon))
        return self.layers(combined)

In [48]:
classify = model_udf("housing_model2.pt", input_columns=columns)

Loading model on driver from housing_model2.pt
ModelSummary(num_params=2689, inputs=[TensorSummary(shape=[64, 8], dtype=torch.float32, name=None)], outputs=[TensorSummary(shape=[1], dtype=torch.float32, name=None)]) -> array<float>


In [49]:
predictions = df.withColumn("preds", classify(*columns))

In [50]:
%%time
preds = predictions.collect()



CPU times: user 88.5 ms, sys: 11.5 ms, total: 100 ms
Wall time: 7.25 s


                                                                                

In [51]:
predictions.show(truncate=12)

+------------+-----------+------------+------------+-----------+------------+----------+----------+-----------+
|      MedInc|   HouseAge|    AveRooms|   AveBedrms| Population|    AveOccup|  Latitude| Longitude|      preds|
+------------+-----------+------------+------------+-----------+------------+----------+----------+-----------+
|  0.85111564|  0.5053942|  0.29677927| -0.21630338| -0.3077235|0.0066712764|-0.8576533| 0.7934686|[2.5334318]|
|  0.12676695|  0.5053942| -0.21044567| -0.21279162| 0.20621344| 0.037127122|-0.8623344| 0.7934686|[1.9301689]|
|  0.22788419| 0.26701993|-0.021574577|-0.046525124| 0.06845716| 0.064333126|-0.8576533| 0.7934686|[1.9071498]|
|  0.83821946|  0.5053942|  0.46901828| -0.10617764| 0.16647606| 0.047397293|-0.8576533|  0.798461|[2.4639113]|
| -0.09778573| 0.34647804| 0.040665437|-0.025475439| 0.24595083|-0.018044483|-0.8623344|  0.798461|[1.8513522]|
|-0.114524566| -0.8453931|  0.08634951| -0.06806936|   0.348385|  -0.0268871|-0.8623344| 0.7834877|[1.77