Idea:https://www.kaggle.com/code/stefancomanita/regression-with-neural-networks-using-pytorc

# Mass Prediction

In [1]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Check GPU support

In [2]:
if torch.cuda.is_available():
    print("CUDA (GPU support) is available in PyTorch!")
    device = torch.device("cuda")
else:
    print("CUDA (GPU support) is not available. Using CPU.")
    device = torch.device("cpu")

CUDA (GPU support) is available in PyTorch!


### Load dataset

In [3]:
HOME = os.getcwd()
csv_path = os.path.join(HOME, 'output.csv')
data = pd.read_csv(csv_path)

In [4]:
data.head()

Unnamed: 0,image_name,object_id,x_center,y_center,width,height,image_area,area,mass
0,20231103_115922,12.0,0.238281,0.453125,0.554688,0.638672,16449,2644.837592,
1,20231103_115922,5.0,0.507812,0.083984,0.845703,0.5,33923,5454.485113,
2,20231103_122808,33.0,0.519531,0.449219,0.724609,0.701172,14953,3171.787338,
3,20231103_122808,0.0,0.099609,0.220703,0.457031,0.546875,32178,6825.504779,
4,20231103_123054,5.0,0.511719,0.089844,0.800781,0.435547,25249,4428.619201,


### Drop row without mass and area

### Calculate ID counts

In [5]:
value_counts = data['object_id'].value_counts()
value_counts

object_id
9.0     583
5.0     546
18.0    494
0.0     390
17.0    374
21.0    302
2.0     273
29.0    236
31.0    236
30.0    236
28.0    229
11.0    206
27.0    179
37.0    172
15.0    151
10.0    149
26.0    126
23.0    119
4.0     117
13.0    111
1.0      90
33.0     90
39.0     87
19.0     58
35.0     55
16.0     44
24.0     39
32.0     37
36.0     37
12.0     37
20.0     29
34.0     28
22.0     15
38.0     13
6.0      12
7.0       9
25.0      9
14.0      5
3.0       1
Name: count, dtype: int64

### Drop row less than 30

In [6]:
#ids_to_keep = value_counts[value_counts >= 30].index
#data = data[data['object_id'].isin(ids_to_keep)]

In [7]:
data.dropna(subset=['area'], inplace=True)
data.dropna(subset=['mass'], inplace=True)

### Drop unnecessary column

In [8]:
X = data.drop(columns=['mass', 'image_name', 'x_center', 'y_center', 'width', 'height'], axis=1)
y = data['mass'].copy()

X, y

(      object_id  image_area         area
 2287        0.0       35176  6950.489050
 2288       11.0       28482  5627.809561
 2289        5.0       41530  8205.987328
 2290        0.0       35552  7101.427179
 2291        5.0       42119  8413.169761
 ...         ...         ...          ...
 3519       30.0       26907  5189.143068
 3520        1.0       13331  2570.946826
 3521        9.0       13447  2602.772291
 3522       30.0       28809  5576.207848
 3523        1.0       13979  2705.745062
 
 [1083 rows x 3 columns],
 2287    61.0
 2288    52.0
 2289    75.0
 2290    61.0
 2291    75.0
         ... 
 3519    42.0
 3520    29.0
 3521    38.0
 3522    42.0
 3523    29.0
 Name: mass, Length: 1083, dtype: float64)

In [9]:
X_cat = X['object_id'].copy()
X = X.drop(columns=['object_id'])
print(X)

      image_area         area
2287       35176  6950.489050
2288       28482  5627.809561
2289       41530  8205.987328
2290       35552  7101.427179
2291       42119  8413.169761
...          ...          ...
3519       26907  5189.143068
3520       13331  2570.946826
3521       13447  2602.772291
3522       28809  5576.207848
3523       13979  2705.745062

[1083 rows x 2 columns]


### Create one-hot encoding

In [10]:
# Import necessary library
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the 'object_id' column
encoder = OneHotEncoder(sparse=False)
X_cat_one_hot = encoder.fit_transform(X_cat.values.reshape(-1, 1))

# Convert to a DataFrame
X_cat_one_hot_df = pd.DataFrame(X_cat_one_hot, columns=encoder.get_feature_names_out(['object_id']))

# Reset index of X and concatenate with the one-hot encoded DataFrame
X.reset_index(drop=True, inplace=True)
X = pd.concat([X, X_cat_one_hot_df], axis=1)

print(X.head())
print(X.shape)

   image_area         area  object_id_0.0  object_id_1.0  object_id_5.0  \
0       35176  6950.489050            1.0            0.0            0.0   
1       28482  5627.809561            0.0            0.0            0.0   
2       41530  8205.987328            0.0            0.0            1.0   
3       35552  7101.427179            1.0            0.0            0.0   
4       42119  8413.169761            0.0            0.0            1.0   

   object_id_9.0  object_id_10.0  object_id_11.0  object_id_13.0  \
0            0.0             0.0             0.0             0.0   
1            0.0             0.0             1.0             0.0   
2            0.0             0.0             0.0             0.0   
3            0.0             0.0             0.0             0.0   
4            0.0             0.0             0.0             0.0   

   object_id_18.0  object_id_19.0  object_id_20.0  object_id_25.0  \
0             0.0             0.0             0.0             0.0   
1 



### Creating training & testing sets

In [11]:
# Scale the features
X = StandardScaler().fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y.values, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train, X_test, y_train, y_test = map(
    torch.tensor, (X_train, X_test, y_train, y_test)
)

# Create Tensor datasets
train_ds = TensorDataset(X_train.float(), y_train.float())
test_ds = TensorDataset(X_test.float(), y_test.float())

# Data loaders
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=32)

### Define model

In [12]:
'''# Define the MLP model
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(X.shape[1], 100)
        self.fc2 = nn.Linear(100, 50)
        self.fc3 = nn.Linear(50, 25)
        self.fc4 = nn.Linear(25, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    #this is mandatory
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.fc4(x)
        return x

model = MLP().to(device)'''

'# Define the MLP model\nclass MLP(nn.Module):\n    def __init__(self):\n        super().__init__()\n        self.fc1 = nn.Linear(X.shape[1], 100)\n        self.fc2 = nn.Linear(100, 50)\n        self.fc3 = nn.Linear(50, 25)\n        self.fc4 = nn.Linear(25, 1)\n        self.relu = nn.ReLU()\n        self.dropout = nn.Dropout(0.2)\n\n    #this is mandatory\n    def forward(self, x):\n        x = self.relu(self.fc1(x))\n        x = self.dropout(x)\n        x = self.relu(self.fc2(x))\n        x = self.dropout(x)\n        x = self.relu(self.fc3(x))\n        x = self.fc4(x)\n        return x\n\nmodel = MLP().to(device)'

In [13]:
# Define the MLP model
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(X.shape[1], 400)
        self.fc2 = nn.Linear(400, 100)
        self.fc3 = nn.Linear(100, 25)
        self.fc4 = nn.Linear(25, 1)
        self.relu = nn.ReLU()

    #this is mandatory
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.fc4(x)
        return x

model = MLP().to(device)

### Define loss function and optimizer

In [14]:
# Loss function and optimizer
criterion = nn.MSELoss()
#add L2 Regularization(weight_decay), larger regularize the model more
#optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
optimizer = optim.Adam(model.parameters(), lr=0.001)

### Training & Evalulation

In [15]:
import numpy as np
# Training loop
num_epochs = 400
for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
      inputs, targets = inputs.to(device), targets.to(device)
      optimizer.zero_grad()
      outputs = model(inputs).squeeze()
      loss = criterion(outputs, targets)
      loss.backward()
      optimizer.step()

    # Evaluation with mean squared error
    model.eval()
    with torch.no_grad():
        mse = 0
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs).squeeze()
            mse += criterion(outputs, targets).item()

    mse /= len(test_loader)
    print(f'Epoch {epoch+1}/{num_epochs}, MSE: {mse:.4f}')
    print(f'RMSE:{mse ** (1/2):.4f}')

print("Training complete")

Epoch 1/400, MSE: 1648.5293
RMSE:40.6021
Epoch 2/400, MSE: 298.4952
RMSE:17.2770
Epoch 3/400, MSE: 153.6153
RMSE:12.3942
Epoch 4/400, MSE: 117.6322
RMSE:10.8458
Epoch 5/400, MSE: 112.6375
RMSE:10.6131
Epoch 6/400, MSE: 109.1586
RMSE:10.4479
Epoch 7/400, MSE: 104.5098
RMSE:10.2230
Epoch 8/400, MSE: 101.9867
RMSE:10.0988
Epoch 9/400, MSE: 100.0048
RMSE:10.0002
Epoch 10/400, MSE: 101.3140
RMSE:10.0655
Epoch 11/400, MSE: 99.2365
RMSE:9.9618
Epoch 12/400, MSE: 97.4611
RMSE:9.8722
Epoch 13/400, MSE: 96.4061
RMSE:9.8187
Epoch 14/400, MSE: 94.4139
RMSE:9.7167
Epoch 15/400, MSE: 97.1051
RMSE:9.8542
Epoch 16/400, MSE: 96.0264
RMSE:9.7993
Epoch 17/400, MSE: 92.4041
RMSE:9.6127
Epoch 18/400, MSE: 91.4693
RMSE:9.5640
Epoch 19/400, MSE: 93.2262
RMSE:9.6554
Epoch 20/400, MSE: 94.9020
RMSE:9.7418
Epoch 21/400, MSE: 90.1697
RMSE:9.4958
Epoch 22/400, MSE: 91.5692
RMSE:9.5692
Epoch 23/400, MSE: 89.9225
RMSE:9.4827
Epoch 24/400, MSE: 88.4589
RMSE:9.4053
Epoch 25/400, MSE: 89.7729
RMSE:9.4749
Epoch 26/400,