In [14]:
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Read the feature table and the target table from the local data folder.
X = pd.read_csv("./data/lucas_organic_carbon_training_and_test_data_NEW.csv")
labels = pd.read_csv("./data/lucas_organic_carbon_target.csv")

# Turn the values of the target column "x" into integer class codes. The
# fitted encoder is kept so codes can be mapped back to the original values.
label_encoder = LabelEncoder().fit(labels["x"])
y = label_encoder.transform(labels["x"])

In [3]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42)

In [5]:
# Convert every split to torch tensors: features become float32, class codes
# become int64 (torch.long).
X_train = torch.tensor(X_train, dtype=torch.float32)
X_val = torch.tensor(X_val, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)

y_train = torch.tensor(y_train, dtype=torch.long)
y_val = torch.tensor(y_val, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

In [6]:
# Create a custom neural network class with multiple hidden layers
class ComplexSoilClassifier(nn.Module):
    """Fully connected classifier with a configurable stack of hidden layers.

    Every linear layer except the last is followed by a ReLU. The last layer
    returns its raw outputs; no softmax is applied inside the model.

    Args:
        input_dim: Number of features in each input sample.
        hidden_dims: Width of each hidden layer, in order. Must contain at
            least one entry (the first and last entries are indexed directly).
        output_dim: Number of values produced per sample.
    """

    def __init__(self, input_dim, hidden_dims, output_dim):
        super().__init__()
        first_width, last_width = hidden_dims[0], hidden_dims[-1]

        self.input_layer = nn.Linear(input_dim, first_width)
        # One linear layer per consecutive pair of hidden widths.
        self.hidden_layers = nn.ModuleList(
            [
                nn.Linear(width_in, width_out)
                for width_in, width_out in zip(hidden_dims[:-1], hidden_dims[1:])
            ]
        )
        self.output_layer = nn.Linear(last_width, output_dim)

    def forward(self, x):
        """Run ``x`` through all layers and return the output-layer values."""
        activations = torch.relu(self.input_layer(x))
        for hidden in self.hidden_layers:
            activations = torch.relu(hidden(activations))
        return self.output_layer(activations)


In [7]:
# Define hyperparameters
input_dim = X_train.shape[1]  # number of feature columns per sample
hidden_dims = [512, 256, 128, 64, 32]  # You can adjust the number of neurons in each hidden layer
# Size the output layer from the fitted label encoder so it covers every
# encoded class, even one that happens to be absent from the training split.
output_dim = len(label_encoder.classes_)

In [8]:
# Seed torch's global RNG before building the model so the initial weights
# (and, by default, the shuffling order of a DataLoader created afterwards)
# are the same on every Restart & Run All.
torch.manual_seed(42)
model = ComplexSoilClassifier(input_dim, hidden_dims, output_dim)

In [9]:
# Loss and optimizer: cross-entropy between the model outputs and the integer
# class targets, minimized with Adam over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [10]:
# Train the neural network
num_epochs = 250
batch_size = 64

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Put the model in training mode explicitly, so re-running this cell after an
# evaluation cell has called model.eval() still trains in the right mode.
model.train()

for epoch in range(num_epochs):
    print(f"starting epoch {epoch} out of {num_epochs}")
    epoch_start = time.perf_counter()
    # The batch targets are named `targets` rather than `labels` so the loop
    # does not overwrite the `labels` DataFrame loaded at the top of the notebook.
    for inputs, targets in train_loader:
        optimizer.zero_grad()  # clear gradients accumulated by the previous step
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    # Wall-clock duration of the epoch, rounded to whole seconds.
    fit_time = np.round(time.perf_counter() - epoch_start)
    print(f"finished epoch {epoch} in {fit_time}s")

starting epoch 0 out of 250


2023-11-07 11:30:30.720 python[84598:4282213] apply_selection_policy_once: avoid use of removable GPUs (via (null):GPUSelectionPolicy->avoidRemovable)


finished epoch 0 in 1.0s
starting epoch 1 out of 250
finished epoch 1 in 1.0s
starting epoch 2 out of 250
finished epoch 2 in 1.0s
starting epoch 3 out of 250
finished epoch 3 in 1.0s
starting epoch 4 out of 250
finished epoch 4 in 1.0s
starting epoch 5 out of 250
finished epoch 5 in 1.0s
starting epoch 6 out of 250
finished epoch 6 in 1.0s
starting epoch 7 out of 250
finished epoch 7 in 1.0s
starting epoch 8 out of 250
finished epoch 8 in 1.0s
starting epoch 9 out of 250
finished epoch 9 in 1.0s
starting epoch 10 out of 250
finished epoch 10 in 1.0s
starting epoch 11 out of 250
finished epoch 11 in 1.0s
starting epoch 12 out of 250
finished epoch 12 in 1.0s
starting epoch 13 out of 250
finished epoch 13 in 1.0s
starting epoch 14 out of 250
finished epoch 14 in 1.0s
starting epoch 15 out of 250
finished epoch 15 in 1.0s
starting epoch 16 out of 250
finished epoch 16 in 1.0s
starting epoch 17 out of 250
finished epoch 17 in 1.0s
starting epoch 18 out of 250
finished epoch 18 in 1.0s
sta

In [11]:
# Score the trained model on the validation split.
model.eval()  # inference mode
with torch.no_grad():  # gradients are not needed for scoring
    outputs = model(X_val)

# Per row, keep the index of the largest output as the predicted class.
_, predicted = outputs.max(dim=1)
validation_accuracy = accuracy_score(y_val, predicted.numpy())
print(f"Validation Accuracy: {validation_accuracy}")

Validation Accuracy: 0.781986531986532


In [12]:
# Display the model's repr to inspect its layer structure.
model

ComplexSoilClassifier(
  (input_layer): Linear(in_features=4000, out_features=512, bias=True)
  (hidden_layers): ModuleList(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): Linear(in_features=256, out_features=128, bias=True)
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): Linear(in_features=64, out_features=32, bias=True)
  )
  (output_layer): Linear(in_features=32, out_features=5, bias=True)
)

In [15]:
# Once you're satisfied with the performance on the validation set, you can evaluate the model on the test set
# Inference mode is set here explicitly so this cell does not depend on an
# earlier cell having called model.eval().
model.eval()
with torch.no_grad():
    outputs = model(X_test)
    # Per row, the index of the largest output is the predicted class.
    predicted = outputs.argmax(dim=1)

# Convert to NumPy once and reuse the arrays for every metric below.
y_true = y_test.numpy()
y_pred = predicted.numpy()

test_accuracy = accuracy_score(y_true, y_pred)
print("Classification Report:")
print(classification_report(y_true, y_pred))
# Weighted F1: per-class F1 averaged with weights proportional to class support.
f1 = f1_score(y_true, y_pred, average='weighted')
print(f"F1: {f1}")
print(f"Test Accuracy: {test_accuracy}")

Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.56      0.57       130
           1       0.42      0.38      0.40       224
           2       0.47      0.42      0.44       190
           3       0.53      0.61      0.57        28
           4       0.91      0.94      0.93      1407

    accuracy                           0.80      1979
   macro avg       0.58      0.58      0.58      1979
weighted avg       0.79      0.80      0.79      1979

F1: 0.7920711109535548
Test Accuracy: 0.7973724103082365
