In [26]:
import numpy as np
import sklearn
import torch
import os

In [27]:
# Download the dataset archive only when it is not already present in the
# working directory, so re-running this cell does not fetch it again.
if not os.path.exists('tree_species_classifier_data.npz'):
  # IPython shell escape: runs wget and saves the response under the file name given to -O.
  !wget -O tree_species_classifier_data.npz "https://www.dropbox.com/scl/fi/b7mw23k3ifaeui9m8nnn3/tree_species_classifier_data.npz?rlkey=bgxp37c1t04i7q35waf3slc26&dl=1"

In [28]:
# Open the downloaded archive and unpack the four arrays it provides.
data = np.load('tree_species_classifier_data.npz')
train_features, train_labels, test_features, test_labels = (
    data[key]
    for key in ('train_features', 'train_labels', 'test_features', 'test_labels')
)

# Shape and dtype of each array.
print(
    f'train_features shape: {train_features.shape}, dtype: {train_features.dtype}',
    f'train_labels shape: {train_labels.shape}, dtype: {train_labels.dtype}',
    f'test_features shape: {test_features.shape}, dtype: {test_features.dtype}',
    f'test_labels shape: {test_labels.shape}, dtype: {test_labels.dtype}',
    sep='\n',
)

# The meaning of the rows and columns is answered from the paper, not computed
# here; this only reports the value range of each array.
print(
    f'train_features min: {train_features.min()}, max: {train_features.max()}',
    f'train_labels min: {train_labels.min()}, max: {train_labels.max()}',
    f'test_features min: {test_features.min()}, max: {test_features.max()}',
    f'test_labels min: {test_labels.min()}, max: {test_labels.max()}',
    sep='\n',
)

# Which classes exist is answered from the paper; the number of examples per
# class in each split is counted here as a (unique labels, counts) pair.
train_class_counts = np.unique(train_labels, return_counts=True)
test_class_counts = np.unique(test_labels, return_counts=True)
print(
    f'train class counts: {train_class_counts}',
    f'test class counts: {test_class_counts}',
    sep='\n',
)


train_features shape: (15707, 426), dtype: int16
train_labels shape: (15707,), dtype: uint8
test_features shape: (1554, 426), dtype: int16
test_labels shape: (1554,), dtype: uint8
train_features min: 0, max: 14998
train_labels min: 0, max: 7
test_features min: 0, max: 6908
test_labels min: 0, max: 7
train class counts: (array([0, 1, 2, 3, 4, 5, 6, 7], dtype=uint8), array([2519,  821, 1575, 3980, 2640,   88,  852, 3232]))
test class counts: (array([0, 1, 2, 3, 4, 5, 6, 7], dtype=uint8), array([389,  30, 278, 404, 100,  22,  43, 288]))


In [32]:
# Preprocess the data, then train and evaluate two scikit-learn classifiers.
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Fixed seed passed to every estimator below so that re-running the cell
# reproduces the same projection, the same fitted models and the same
# reported accuracies.
RANDOM_STATE = 0

# Project the raw features onto 32 whitened principal components. The
# projection is fitted on the training split only and then applied to both
# splits, so no test information is used when fitting it.
pca = PCA(n_components=32, whiten=True, random_state=RANDOM_STATE)
pca.fit(train_features)
train_features_pca = pca.transform(train_features)
test_features_pca = pca.transform(test_features)

# Train a linear classifier and a neural network
lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
lr.fit(train_features_pca, train_labels)
lr_preds = lr.predict(test_features_pca)
lr_acc = accuracy_score(test_labels, lr_preds)

# Three hidden layers of 100 units each.
nnet = MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=1000, random_state=RANDOM_STATE)
nnet.fit(train_features_pca, train_labels)
nn_preds = nnet.predict(test_features_pca)
nn_acc = accuracy_score(test_labels, nn_preds)

print("Accuracy of the classifiers on the test set:")
print(f'Linear classifier accuracy: {lr_acc:.2f}')
print(f'Neural network classifier accuracy: {nn_acc:.2f}')
print()

# Training-set accuracy is reported next to test accuracy so the gap between
# the two (overfitting) is visible for each model.
print("Accuracy of the classifiers on the training set:")
# Get accuracy of the linear classifier on the training set
lr_train_preds = lr.predict(train_features_pca)
lr_train_acc = accuracy_score(train_labels, lr_train_preds)
print(f'Linear classifier training accuracy: {lr_train_acc:.2f}')

# Get accuracy of the neural network on the training set
nn_train_preds = nnet.predict(train_features_pca)
nn_train_acc = accuracy_score(train_labels, nn_train_preds)
print(f'Neural network training accuracy: {nn_train_acc:.2f}')

Accuracy of the classifiers on the test set:
Linear classifier accuracy: 0.83
Neural network classifier accuracy: 0.83

Accuracy of the classifiers on the training set:
Linear classifier training accuracy: 0.86
Neural network training accuracy: 1.00


In [30]:
# Classifiers implemented in PyTorch

from torch.utils.data import TensorDataset, DataLoader
from torch import nn as nn
from torch import optim

def _as_tensor_dataset(features, labels):
    """Pair a feature matrix (as float32) with its label vector (as int64) in a TensorDataset."""
    feature_tensor = torch.tensor(features, dtype=torch.float32)
    label_tensor = torch.tensor(labels, dtype=torch.long)
    return TensorDataset(feature_tensor, label_tensor)


# Training batches are reshuffled on every pass; test batches keep their order.
train_dataset = _as_tensor_dataset(train_features_pca, train_labels)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_dataset = _as_tensor_dataset(test_features_pca, test_labels)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

def calculate_accuracy(model, loader):
    """Return the fraction of examples in ``loader`` that ``model`` classifies correctly.

    Args:
        model: Callable mapping a batch of features to per-class scores with
            one row per example; the predicted class is the index of the
            largest score in each row.
        loader: Iterable yielding ``(features, labels)`` batches.

    Returns:
        ``correct / total`` as a float, or NaN when ``loader`` yields no
        examples (the accuracy is undefined in that case, and NaN avoids a
        ``ZeroDivisionError`` while still being visibly "not a number").

    Note:
        The model's train/eval mode is not changed here; callers that need
        evaluation-mode behaviour must call ``model.eval()`` beforehand.
    """
    correct = 0
    total = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for features, labels in loader:
            preds = model(features).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    if total == 0:
        return float('nan')
    return correct / total

def train_model(model, train_loader, test_loader, num_epochs=100, lr=1e-2, weight_decay=0.001):
    """Train ``model`` with SGD on cross-entropy loss, printing accuracy after every epoch.

    Args:
        model: Module to optimise in place; it is called on each feature batch
            and its output is passed to ``nn.CrossEntropyLoss`` with the labels.
        train_loader: Iterable of ``(features, labels)`` batches used for the
            gradient updates and for the reported training accuracy.
        test_loader: Iterable of ``(features, labels)`` batches used only for
            the reported test accuracy.
        num_epochs: Number of full passes over ``train_loader``.
        lr: Learning rate given to ``optim.SGD``.
        weight_decay: Weight-decay coefficient given to ``optim.SGD``.

    Returns:
        None. Progress is reported via ``print``; the model is left in
        evaluation mode after the last epoch.
    """
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
    for epoch in range(num_epochs):
        # Training mode for the update pass.
        model.train()
        for features, labels in train_loader:
            optimizer.zero_grad()
            y_pred = model(features)
            loss = loss_fn(y_pred, labels)
            loss.backward()
            optimizer.step()
        # Evaluation mode while measuring accuracy on both splits.
        model.eval()
        train_acc = calculate_accuracy(model, train_loader)
        test_acc = calculate_accuracy(model, test_loader)
        print(f'Epoch {epoch + 1}/{num_epochs}, Train Accuracy: {train_acc:.2f}, Test Accuracy: {test_acc:.2f}')

def _relu_mlp(widths):
    """Stack Linear layers with the given consecutive widths, placing a ReLU between each pair."""
    modules = []
    for position, (n_in, n_out) in enumerate(zip(widths, widths[1:])):
        if position:
            # Activation goes between layers, never after the output layer.
            modules.append(nn.ReLU())
        modules.append(nn.Linear(n_in, n_out))
    return nn.Sequential(*modules)


# Linear classifier: 32 input features -> 8 class scores.
lm = nn.Linear(32, 8)
train_model(model=lm, train_loader=train_loader, test_loader=test_loader)

print()

# Neural network: three hidden layers of 100 units with ReLU activations.
nnet = _relu_mlp([32, 100, 100, 100, 8])
train_model(model=nnet, train_loader=train_loader, test_loader=test_loader)

Epoch 1/100, Train Accuracy: 0.76, Test Accuracy: 0.74
Epoch 2/100, Train Accuracy: 0.79, Test Accuracy: 0.79
Epoch 3/100, Train Accuracy: 0.81, Test Accuracy: 0.79
Epoch 4/100, Train Accuracy: 0.82, Test Accuracy: 0.80
Epoch 5/100, Train Accuracy: 0.82, Test Accuracy: 0.81
Epoch 6/100, Train Accuracy: 0.83, Test Accuracy: 0.81
Epoch 7/100, Train Accuracy: 0.83, Test Accuracy: 0.81
Epoch 8/100, Train Accuracy: 0.83, Test Accuracy: 0.81
Epoch 9/100, Train Accuracy: 0.83, Test Accuracy: 0.81
Epoch 10/100, Train Accuracy: 0.83, Test Accuracy: 0.82
Epoch 11/100, Train Accuracy: 0.83, Test Accuracy: 0.82
Epoch 12/100, Train Accuracy: 0.84, Test Accuracy: 0.82
Epoch 13/100, Train Accuracy: 0.84, Test Accuracy: 0.82
Epoch 14/100, Train Accuracy: 0.84, Test Accuracy: 0.82
Epoch 15/100, Train Accuracy: 0.84, Test Accuracy: 0.82
Epoch 16/100, Train Accuracy: 0.84, Test Accuracy: 0.82
Epoch 17/100, Train Accuracy: 0.84, Test Accuracy: 0.82
Epoch 18/100, Train Accuracy: 0.84, Test Accuracy: 0.82
E