## Setup

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.formula.api as sm
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.preprocessing import OneHotEncoder


## Data

In [None]:
# Mole-rat dataset (file 18e4MoleRatLayabouts.csv) from the abdData GitHub repository.
url = 'https://raw.githubusercontent.com/Middleton-Lab/abdData/main/inst/extdata/datasets/18/18e4MoleRatLayabouts.csv'

# The raw file names its log columns with dots; the rest of the notebook
# (formula, plots, feature extraction) refers to the underscore names below.
column_names = {'ln.energy': 'log_energy', 'ln.mass': 'log_mass'}

# Load the comma-separated, UTF-8 encoded file and rename in a single chain
df = pd.read_csv(url, delimiter=',', encoding='utf-8').rename(columns=column_names)

# Preview the first rows (rendered as the cell output)
df.head()

In [None]:
# Observed data: log energy against log mass, colored by caste
scatter_kwargs = dict(x='log_mass', y='log_energy', hue='caste')
plt.figure(figsize=(10, 5))
sns.scatterplot(data=df, **scatter_kwargs)
plt.show()

## Regression of log energy on caste and log mass using `statsmodels`

In [None]:
# Additive linear model: log energy explained by caste and log mass
formula = "log_energy ~ caste + log_mass"
model_ols = sm.ols(formula, data=df).fit()

# Print the text summary of the fitted model
fit_summary = model_ols.summary()
print(fit_summary)

For comparison, the same model fit in R with `lm()` gives:

```
Call:
lm(formula = ln.energy ~ caste + ln.mass, data = D)

Residuals:
     Min       1Q   Median       3Q      Max
-0.73388 -0.19371  0.01317  0.17578  0.47673

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.09687    0.94230  -0.103   0.9188
casteworker  0.39334    0.14611   2.692   0.0112 *
ln.mass      0.89282    0.19303   4.625 5.89e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2966 on 32 degrees of freedom
Multiple R-squared:  0.409,	Adjusted R-squared:  0.3721
F-statistic: 11.07 on 2 and 32 DF,  p-value: 0.0002213
```

## Format data for NN

## Define the loss function and optimizer

## Train the model

In [None]:
# Numeric predictor: log mass as an (n, 1) column so it can be stacked
# side by side with the encoded categorical columns
x1 = df['log_mass'].to_numpy()[:, np.newaxis]

In [None]:
# Categorical predictor: one-hot encode caste as a dense array. `drop` is left
# at its default, so every category level keeps its own indicator column.
# NOTE(review): together with the linear layer's bias this parameterization is
# redundant, so the learned weights are not directly comparable with the OLS
# coefficients -- confirm whether that is intended.
caste_column = df['caste'].to_numpy().reshape(-1, 1)
encoder = OneHotEncoder(sparse_output=False)
x2_encoded = encoder.fit(caste_column).transform(caste_column)

# Design matrix: log mass first, followed by the caste indicator columns
X = np.concatenate((x1, x2_encoded), axis=1)

In [None]:
# Response: log energy as an (n, 1) column
y = df['log_energy'].to_numpy()[:, np.newaxis]

In [None]:
# Cast the NumPy design matrix and response to float32 PyTorch tensors
X_tensor, y_tensor = (
    torch.tensor(array, dtype=torch.float32) for array in (X, y)
)

In [None]:
# Step 2: Define the model
class MultipleRegressionModel(nn.Module):
    """Multiple linear regression expressed as a single fully connected layer.

    Parameters
    ----------
    in_features : int, default 3
        Number of predictor columns per observation. The default of 3 matches
        this notebook's design matrix (x1 plus the columns of x2_encoded).
    """

    def __init__(self, in_features=3):
        super().__init__()
        # One affine map (weights + bias) from the predictors to a single output
        self.linear = nn.Linear(in_features, 1)

    def forward(self, x):
        """Apply the linear layer to ``x`` (last dimension ``in_features``); the output's last dimension is 1."""
        return self.linear(x)

# Seed PyTorch's RNG before the layer is created: its initial weights are drawn
# at random, so without a seed every Restart & Run All trains from a different
# starting point and yields different losses and predictions.
torch.manual_seed(42)
model = MultipleRegressionModel()

# Step 3: Define the loss function and optimizer
# Mean squared error, minimized with plain SGD at a fixed learning rate
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [None]:
# Step 4: Train the model
num_epochs = 1000
loss_values = []  # log of the training loss, one entry per epoch

# Training mode only needs to be set once; nothing inside the loop changes it
model.train()

for epoch in range(num_epochs):
    # Forward pass on the full data set (every step uses all rows)
    outputs = model(X_tensor)
    loss = criterion(outputs, y_tensor)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Read the scalar loss once and reuse it for both logging and storage
    loss_value = loss.item()

    if (epoch + 1) % 100 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss_value:.4f}')

    # Stored on the log scale; the loss plots label their y-axis accordingly
    loss_values.append(np.log(loss_value))


In [None]:
# Log training loss per epoch, drawn with seaborn
epochs = range(1, num_epochs + 1)
plt.figure(figsize=(10, 5))
ax = sns.lineplot(x=epochs, y=loss_values, color="darkred")
ax.set(xlabel='Epoch', ylabel='log Loss', title='Training Loss')
plt.show()

In [None]:
# The same per-epoch log-loss values, drawn with matplotlib's axes interface
loss_fig, loss_ax = plt.subplots(figsize=(10, 5))
loss_ax.plot(range(1, num_epochs + 1), loss_values)
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('log-Loss')
loss_ax.set_title('log-Loss vs. Epoch')
plt.show()

## Predictions

In [None]:
# Switch to evaluation mode and predict on the training inputs. Gradients are
# not needed for inference, so skip autograd bookkeeping instead of recording
# a graph and detaching from it afterwards.
model.eval()
with torch.no_grad():
    predicted = model(X_tensor).numpy()

In [None]:
# Attach the network's predictions to the data frame for plotting
df['predicted_energy'] = predicted

# Both layers share the data, x variable and caste coloring; only the
# y variable (and the marker of the predicted layer) differs.
shared_kwargs = dict(data=df, x='log_mass', hue='caste')

plt.figure(figsize=(10, 5))
ax = sns.scatterplot(y='log_energy', **shared_kwargs)
sns.scatterplot(y='predicted_energy', marker='x', ax=ax, **shared_kwargs)
plt.legend()
plt.show()