In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the California housing dataset
data = fetch_california_housing()
X = data.data
y = data.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Standardize features by removing the mean and scaling to unit variance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert the numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

# Define the neural network model
class Net(nn.Module):
    def __init__(self, input_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.fc3(x)
        return x

# Initialize the model, loss function, and optimizer
input_size = X_train.shape[1]
model = Net(input_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    if (epoch+1)%10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# Save the trained model
torch.save(model.state_dict(), 'model.pth')

Epoch [10/100], Loss: 4.2936
Epoch [20/100], Loss: 3.4008
Epoch [30/100], Loss: 2.5683
Epoch [40/100], Loss: 1.8828
Epoch [50/100], Loss: 1.4255
Epoch [60/100], Loss: 1.1322
Epoch [70/100], Loss: 0.9406
Epoch [80/100], Loss: 0.8285
Epoch [90/100], Loss: 0.7669
Epoch [100/100], Loss: 0.7251


In [3]:
X

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]], shape=(20640, 8))

In [4]:
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894], shape=(20640,))

In [5]:
import pandas as pd

pdf = pd.DataFrame(X_test, columns=data.feature_names)
pdf

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,0.621343,-1.955549,0.565493,-0.094238,1.320148,-0.024522,1.382349,-1.575524
1,2.312259,1.855027,0.515777,-0.102445,-0.642364,-0.062410,0.844094,-1.285764
2,0.452247,0.029126,0.434669,-0.236373,-0.211312,-0.001183,-0.742591,1.127242
3,5.858342,1.855027,1.622906,-0.074516,-0.142975,-0.002936,-0.723869,0.582692
4,-0.089063,-1.002905,0.027598,-0.045546,0.965319,-0.002721,0.886218,-0.636301
...,...,...,...,...,...,...,...,...
4123,-0.035296,-0.288422,-0.346185,0.049532,0.083065,-0.095223,-0.798757,0.572700
4124,2.120833,0.664222,0.351040,-0.327799,-0.805323,-0.053764,0.853455,-1.280768
4125,-0.210343,1.855027,-0.049907,-0.002631,-0.394422,-0.088379,0.998550,-1.335722
4126,-0.805316,-1.399840,-0.821129,-0.013165,-0.198170,-0.105647,1.274698,-1.570528


In [6]:
from pyspark.sql.types import *

# Spark is somehow auto-converting Pandas float32 to DoubleType(), so forcing FloatType()
schema = StructType([
StructField("MedInc",FloatType(),True),
StructField("HouseAge",FloatType(),True),
StructField("AveRooms",FloatType(),True),
StructField("AveBedrms",FloatType(),True),
StructField("Population",FloatType(),True),
StructField("AveOccup",FloatType(),True),
StructField("Latitude",FloatType(),True),
StructField("Longitude",FloatType(),True)
])

df = spark.createDataFrame(pdf, schema=schema)
df.show(truncate=12)

[Stage 0:>                                                          (0 + 1) / 1]

+------------+-----------+------------+------------+-----------+------------+-----------+-----------+
|      MedInc|   HouseAge|    AveRooms|   AveBedrms| Population|    AveOccup|   Latitude|  Longitude|
+------------+-----------+------------+------------+-----------+------------+-----------+-----------+
|   0.6213434| -1.9555492|   0.5654933|-0.094237834|  1.3201483| -0.02452237|  1.3823495| -1.5755241|
|   2.3122587|   1.855027|  0.51577705| -0.10244485| -0.6423641|-0.062409732| 0.84409356| -1.2857636|
|  0.45224655|0.029125877|  0.43466917| -0.23637258|-0.21131223|-0.001183...| -0.7425912|  1.1272416|
|   5.8583417|   1.855027|   1.6229062| -0.07451551|-0.14297475|-0.002936...|-0.72386926|  0.5826918|
|-0.089063354| -1.0029051| 0.027597873|-0.045545746|  0.9653191|-0.002720...| 0.88621795|-0.63630056|
|  0.17687838|  0.4260609| 0.022269458| -0.35598207| -0.7606405|-0.022938322|  1.2746983| -1.2158215|
|  0.40637818|  0.5848349|  0.02567751|  -0.2016987| -0.5030607| -0.03613399| -0.8

                                                                                

In [23]:
def predict_batch_fn():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    def predict(inputs):
        torch_inputs = torch.from_numpy(inputs).to(device)
        outputs = model(torch_inputs) # .flatten()
        return outputs.detach().numpy()

    return predict

In [24]:
from pyspark.ml.functions import predict_batch_udf

classify = predict_batch_udf(predict_batch_fn,
                             return_type=FloatType(),
                             input_tensor_shapes=[[8]],
                             batch_size=50)

In [25]:
columns = df.columns
columns

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [26]:
import pyspark.sql.functions as F

preds = df.withColumn("preds", classify(F.struct(*columns)))
preds.show()

[Stage 5:>                                                          (0 + 1) / 1]

+------------+-----------+------------+------------+-----------+-------------+-----------+-----------+---------+
|      MedInc|   HouseAge|    AveRooms|   AveBedrms| Population|     AveOccup|   Latitude|  Longitude|    preds|
+------------+-----------+------------+------------+-----------+-------------+-----------+-----------+---------+
|   0.6213434| -1.9555492|   0.5654933|-0.094237834|  1.3201483|  -0.02452237|  1.3823495| -1.5755241| 2.515409|
|   2.3122587|   1.855027|  0.51577705| -0.10244485| -0.6423641| -0.062409732| 0.84409356| -1.2857636|4.3639803|
|  0.45224655|0.029125877|  0.43466917| -0.23637258|-0.21131223|-0.0011831677| -0.7425912|  1.1272416| 2.168374|
|   5.8583417|   1.855027|   1.6229062| -0.07451551|-0.14297475|-0.0029363814|-0.72386926|  0.5826918| 7.552373|
|-0.089063354| -1.0029051| 0.027597873|-0.045545746|  0.9653191|-0.0027205816| 0.88621795|-0.63630056|1.3233948|
|  0.17687838|  0.4260609| 0.022269458| -0.35598207| -0.7606405| -0.022938322|  1.2746983| -1.21

                                                                                

In [28]:
X_test

array([[ 6.21343361e-01, -1.95554921e+00,  5.65493282e-01, ...,
        -2.45223692e-02,  1.38234945e+00, -1.57552414e+00],
       [ 2.31225880e+00,  1.85502695e+00,  5.15777055e-01, ...,
        -6.24097310e-02,  8.44093558e-01, -1.28576367e+00],
       [ 4.52246550e-01,  2.91258761e-02,  4.34669171e-01, ...,
        -1.18316765e-03, -7.42591188e-01,  1.12724161e+00],
       ...,
       [-2.10343318e-01,  1.85502695e+00, -4.99074110e-02, ...,
        -8.83794576e-02,  9.98549595e-01, -1.33572237e+00],
       [-8.05315528e-01, -1.39984018e+00, -8.21128591e-01, ...,
        -1.05647042e-01,  1.27469827e+00, -1.57052827e+00],
       [ 1.72286874e+00, -2.88422137e-01,  9.32053427e-01, ...,
         1.21162102e-02, -8.12798477e-01,  8.47472885e-01]],
      shape=(4128, 8))

In [29]:
y_test

array([2.408  , 5.00001, 1.346  , ..., 2.44   , 1.042  , 2.701  ],
      shape=(4128,))

In [31]:
#predictions = [x['preds'] for x in preds.select('preds').collect()]
predictions = preds.select('preds').toPandas()['preds'].tolist()

                                                                                

In [32]:
from sklearn.metrics import mean_absolute_error

# Compute MAE
mae = mean_absolute_error(y_test, predictions)

print("Mean Absolute Error on test set:", mae)

Mean Absolute Error on test set: 0.6161130974278431


In [33]:
type(y_test)

numpy.ndarray

In [34]:
torch.tensor(y_test)

tensor([2.4080, 5.0000, 1.3460,  ..., 2.4400, 1.0420, 2.7010],
       dtype=torch.float64)

In [35]:
torch.tensor(predictions)

tensor([2.5154, 4.3640, 2.1684,  ..., 2.4202, 1.7867, 3.3725])

In [36]:
criterion(torch.tensor(y_test), torch.tensor(predictions))

tensor(0.7111, dtype=torch.float64)

In [38]:
import torch
import torch.nn as nn

# Define the neural network model using nn.Sequential
input_size = 8  # Number of features in the California housing dataset
test_model = nn.Sequential(
    nn.Linear(input_size, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, 1)
)

# Load the trained model's state if available
# Uncomment the following lines if you have a saved model state
model.load_state_dict(torch.load('model.pth'))
model.eval()

# Create an arbitrary input tensor with explicit feature values
# Let's define explicit values for the 8 features:
# - MedInc: Median income in block group
# - HouseAge: Median house age in block group
# - AveRooms: Average number of rooms per household
# - AveBedrms: Average number of bedrooms per household
# - Population: Block group population
# - AveOccup: Average number of household members
# - Latitude: Latitude coordinate
# - Longitude: Longitude coordinate

# For demonstration, we'll use the following values (made-up for this example):
input_tensor = torch.tensor([
    5.0,      # MedInc
    30.0,     # HouseAge
    6.0,      # AveRooms
    1.0,      # AveBedrms
    1000.0,   # Population
    3.0,      # AveOccup
    34.05,    # Latitude (e.g., Los Angeles)
    -118.25   # Longitude (e.g., Los Angeles)
], dtype=torch.float32)


# Perform a forward pass through the model
#output = test_model(input_tensor.unsqueeze(0))
output = model(input_tensor.unsqueeze(0))
output

tensor([[315.5470]], grad_fn=<AddmmBackward0>)