In [None]:
%%capture
!pip install datasets timm

# Import necessary libraries
from datetime import datetime
t = datetime.now()  # Record the current time
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import DataLoader
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from timm import create_model
from torch.optim.lr_scheduler import OneCycleLR
from tqdm import tqdm
from sklearn.metrics import f1_score, classification_report, confusion_matrix, roc_curve, auc
import torch.backends.cudnn as cudnn
import matplotlib.pyplot as plt
import seaborn as sns
import torchvision as transforms
from sklearn.preprocessing import label_binarize
import numpy as np

# Clear the CUDA cache
torch.cuda.empty_cache()
# Enable benchmark mode in cuDNN to optimize performance
cudnn.benchmark = True

# Define constants
IMG_SIZE = 256  # Image size
EPOCHS = 6  # Number of epochs
BATCH_SIZE = 64  # Batch size
N_CLASSES = 31  # Number of classes
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'  # Device to use (GPU if available)
FP16 = True  # Use mixed precision training
LR_MAX = 5e-5  # Maximum learning rate

Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->timm)
  Download

In [None]:
# Fetch the image dataset from the Hugging Face Hub (cached under ./data)
ds = load_dataset("mikkoim/aquamonitor-jyu", cache_dir="data")

# Fetch the accompanying metadata file into the working directory
hf_hub_download(
    repo_id="mikkoim/aquamonitor-jyu",
    filename="aquamonitor-jyu.parquet.gzip",
    repo_type="dataset",
    local_dir=".",
)

# Load the metadata table
metadata = pd.read_parquet('aquamonitor-jyu.parquet.gzip')

# Sorted list of distinct taxon groups; a class's position in it is its integer label
classes = sorted(metadata["taxon_group"].unique())

# Forward (name -> label) and reverse (label -> name) lookups
class_map = {name: idx for idx, name in enumerate(classes)}
class_map_inv = dict(enumerate(classes))

# Strip the ".jpg" extension from the filenames in the "img" column
metadata["img"] = metadata["img"].str.removesuffix(".jpg")

# Lookup from extension-less image filename to integer label
encoded_labels = metadata["taxon_group"].map(class_map)
label_dict = dict(zip(metadata["img"], encoded_labels))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.73k [00:00<?, ?B/s]

train.tar:   0%|          | 0.00/143M [00:00<?, ?B/s]

val.tar:   0%|          | 0.00/22.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/40880 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6394 [00:00<?, ? examples/s]

aquamonitor-jyu.parquet.gzip:   0%|          | 0.00/1.30M [00:00<?, ?B/s]

In [None]:
# Channel-wise normalisation statistics shared by all three pipelines
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# Classes with fewer metadata rows than this receive the stronger augmentation
RARE_CLASS_THRESHOLD = 1700

# Define transformations for training data
train_transforms = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),  # Resize to the configured square input size
    transforms.RandomApply([transforms.RandomRotation(10)], p=0.7),  # With 70% probability, rotate by a random angle in [-10, 10] degrees
    transforms.RandomApply([transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05)], p=0.7),  # With 70% probability, jitter brightness, contrast, saturation, and hue
    transforms.RandomApply([transforms.RandomResizedCrop(IMG_SIZE, scale=(0.9, 1.0))], p=0.7),  # With 70% probability, crop 90-100% of the area and resize back
    transforms.RandomApply([transforms.GaussianBlur(kernel_size=3)], p=0.2),  # Apply Gaussian blur with 20% probability
    transforms.RandomApply([transforms.RandomAffine(degrees=5, translate=(0.05, 0.05))], p=0.7),  # With 70% probability, small random rotation (up to 5 degrees) and translation (up to 5%)
    transforms.ToTensor(),  # Convert images to tensor
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD)  # Normalize each channel with the shared mean and std
])

# Define stronger transformations for rare classes
strong_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),  # Randomly flip images horizontally with 50% probability
    transforms.RandomRotation(degrees=30),  # Rotate by a random angle in [-30, 30] degrees
    transforms.RandomPerspective(distortion_scale=0.5, p=0.5),  # Apply random perspective transformation with 50% probability
    transforms.RandomApply([transforms.GaussianBlur(kernel_size=3)], p=0.5),  # Apply Gaussian blur with 50% probability
    transforms.ColorJitter(brightness=0.2, contrast=0.2),  # Randomly change brightness and contrast
    transforms.Resize((IMG_SIZE, IMG_SIZE)),  # Resize to the configured square input size
    transforms.ToTensor(),  # Convert images to tensor
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD)  # Normalize each channel with the shared mean and std
])

# Define transformations for validation data (no random augmentation)
val_transforms = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),  # Resize to the configured square input size
    transforms.ToTensor(),  # Convert images to tensor
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD)  # Normalize each channel with the shared mean and std
])

# Names of the taxon groups that have fewer than RARE_CLASS_THRESHOLD metadata rows
rare_classes = {
    label
    for label, count in metadata["taxon_group"].value_counts().items()
    if count < RARE_CLASS_THRESHOLD
}

def preprocess(batch, is_validation=False):
    """Turn a raw dataset batch into transformed images and label tensors.

    Args:
        batch: Mapping with a "jpg" sequence of images and a parallel
            "__key__" sequence; each key is looked up in ``label_dict``.
        is_validation: When True, every image goes through ``val_transforms``.
            Otherwise images whose class name is in ``rare_classes`` go through
            ``strong_transforms`` and all others through ``train_transforms``.

    Returns:
        dict with "img" (list of transformed images) and "label" (list of
        scalar ``torch.long`` tensors), in the same order as the input.

    Raises:
        KeyError: If a key is not present in ``label_dict``.
    """
    images, labels = [], []
    for img, key in zip(batch["jpg"], batch["__key__"]):
        label = label_dict[key]
        # NOTE(review): during training the augmentation pipeline is chosen from
        # the label itself; confirm this cannot act as a label shortcut.
        if is_validation:
            pipeline = val_transforms
        elif class_map_inv[label] in rare_classes:
            pipeline = strong_transforms
        else:
            pipeline = train_transforms
        images.append(pipeline(img))
        labels.append(torch.tensor(label, dtype=torch.long))
    return {"img": images, "label": labels}


In [None]:
def _train_transform(batch):
    """Preprocess a batch with the training-time augmentation pipelines."""
    return preprocess(batch, is_validation=False)


def _val_transform(batch):
    """Preprocess a batch with the validation pipeline."""
    return preprocess(batch, is_validation=True)


# Attach the on-the-fly preprocessing to each split
ds_train = ds["train"].with_transform(_train_transform)
ds_val = ds["validation"].with_transform(_val_transform)

# Settings common to both loaders; only the training loader shuffles
_loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=2, pin_memory=True, prefetch_factor=4)

train_loader = DataLoader(ds_train, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(ds_val, **_loader_kwargs)

In [None]:
# Build a pretrained Swin Transformer V2 classifier with N_CLASSES outputs
model = create_model(
    "swinv2_base_window16_256",
    pretrained=True,
    num_classes=N_CLASSES,
    drop_path_rate=0.2,  # drop-path rate used for regularization
)

# Freeze the patch embedding and the first two entries of model.layers so
# they receive no gradient updates during fine-tuning
for frozen_module in (model.patch_embed, model.layers[:2]):
    for param in frozen_module.parameters():
        param.requires_grad = False

# Move to the target device, then wrap with torch.compile
model = torch.compile(model.to(DEVICE))

model.safetensors:   0%|          | 0.00/357M [00:00<?, ?B/s]

In [None]:
# Cross-entropy loss with label smoothing
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

# AdamW over the model parameters
optimizer = optim.AdamW(
    model.parameters(),
    lr=LR_MAX,
    weight_decay=0.05,
)

# One-cycle schedule, stepped once per training batch: the first 20% of the
# steps ramp the learning rate up to LR_MAX, the rest anneal it with a cosine
lr_scheduler = OneCycleLR(
    optimizer,
    max_lr=LR_MAX,
    epochs=EPOCHS,
    steps_per_epoch=len(train_loader),
    pct_start=0.2,
    anneal_strategy="cos",
    div_factor=25,  # starting LR is LR_MAX / 25
    final_div_factor=10000,  # final LR is the starting LR / 10000
)

# Gradient scaler for mixed precision; a pass-through when FP16 is False
scaler = torch.amp.GradScaler('cuda', enabled=FP16)


In [None]:
def validate():
    """Evaluate the global ``model`` on ``val_loader``.

    Returns:
        tuple: ``(mean_loss, f1, accuracy, labels, preds, scores)`` where
        ``mean_loss`` is the average of the per-batch losses, ``f1`` is the
        weighted F1 score, ``accuracy`` is the fraction of correct
        predictions, and the last three are per-sample lists of true labels,
        predicted labels, and softmax score vectors.
    """
    model.eval()
    loss_sum = 0
    hits = 0
    seen = 0
    all_preds, all_labels, all_scores = [], [], []

    # No gradients are needed while evaluating
    with torch.no_grad():
        for batch in val_loader:
            images = batch["img"].to(DEVICE)
            labels = batch["label"].to(DEVICE)

            logits = model(images)
            loss_sum += criterion(logits, labels).item()

            preds = logits.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_scores.extend(logits.softmax(dim=1).cpu().numpy())

            hits += (preds == labels).sum().item()
            seen += labels.size(0)

    accuracy = hits / seen
    weighted_f1 = f1_score(all_labels, all_preds, average='weighted')
    mean_loss = loss_sum / len(val_loader)

    return mean_loss, weighted_f1, accuracy, all_labels, all_preds, all_scores


def train():
    """Fine-tune the global ``model`` for ``EPOCHS`` epochs.

    Each epoch runs one pass over ``train_loader`` (with autocast and gradient
    scaling when ``FP16`` is set), steps the LR scheduler once per batch, then
    calls ``validate()``. Whenever the validation F1 improves, the model's
    state dict is written to "best_model.pt".

    Returns:
        dict: Per-epoch lists under the keys 'train_loss', 'val_loss',
        'train_acc', 'val_acc' and 'val_f1'.
    """
    # Start below any attainable score so the first epoch always writes a
    # checkpoint; the evaluation cell loads "best_model.pt" unconditionally.
    best_f1 = float("-inf")
    history = {
        'train_loss': [],
        'val_loss': [],
        'train_acc': [],
        'val_acc': [],
        'val_f1': []
    }

    for epoch in range(EPOCHS):
        model.train()  # Set the model to training mode
        total_loss = 0.0
        train_correct = 0
        train_total = 0

        loop = tqdm(train_loader, desc=f"Epoch [{epoch+1}/{EPOCHS}]")  # Progress bar for training
        for batch in loop:
            images, labels = batch["img"].to(DEVICE), batch["label"].to(DEVICE)
            optimizer.zero_grad()  # Zero the parameter gradients

            with torch.amp.autocast('cuda', enabled=FP16):  # Mixed precision forward pass
                outputs = model(images)
                loss = criterion(outputs, labels)

            if FP16:
                scaler.scale(loss).backward()  # Backward pass on the scaled loss
                scaler.step(optimizer)  # Optimizer step through the scaler
                scaler.update()  # Update the scale factor
            else:
                loss.backward()  # Backward pass
                optimizer.step()  # Optimizer step

            lr_scheduler.step()  # Per-batch learning-rate update

            # Read the loss once and reuse it for the running total and the progress bar
            batch_loss = loss.item()
            total_loss += batch_loss
            preds = torch.argmax(outputs, dim=1)
            train_correct += (preds == labels).sum().item()
            train_total += labels.size(0)

            loop.set_postfix(loss=batch_loss)  # Update progress bar with loss

        train_loss = total_loss / len(train_loader)  # Mean of the per-batch losses
        train_acc = train_correct / train_total  # Fraction of correct training predictions

        # Only the three scalar metrics are used here; per-sample outputs are discarded
        val_loss, val_f1, val_acc, *_ = validate()

        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['train_acc'].append(train_acc)
        history['val_acc'].append(val_acc)
        history['val_f1'].append(val_f1)

        print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Train Accuracy = {train_acc:.4f}, Val Loss = {val_loss:.4f}, Val Accuracy = {val_acc:.4f}, Val F1 = {val_f1:.4f}")

        if val_f1 > best_f1:  # Save the best model based on F1 score
            best_f1 = val_f1
            torch.save(model.state_dict(), "best_model.pt")
            print(f"Best model saved with F1 Score: {best_f1:.4f}")

        torch.cuda.empty_cache()  # Clear the CUDA cache

    return history

In [None]:
# Train the model and store the training history
history = train()

Epoch [1/6]:   0%|          | 0/639 [00:00<?, ?it/s]W0315 17:15:06.045000 2368 torch/_inductor/utils.py:1137] [0/0] Not enough SMs to use max_autotune_gemm mode
Epoch [1/6]: 100%|██████████| 639/639 [18:05<00:00,  1.70s/it, loss=0.805]


Epoch 1: Train Loss = 1.7732, Train Accuracy = 0.6218, Val Loss = 1.4043, Val Accuracy = 0.7343, Val F1 = 0.7051
Best model saved with F1 Score: 0.7051


Epoch [2/6]: 100%|██████████| 639/639 [09:52<00:00,  1.08it/s, loss=0.768]


Epoch 2: Train Loss = 0.7627, Train Accuracy = 0.9769, Val Loss = 1.3935, Val Accuracy = 0.7390, Val F1 = 0.7119
Best model saved with F1 Score: 0.7119


Epoch [3/6]: 100%|██████████| 639/639 [09:54<00:00,  1.07it/s, loss=0.689]


Epoch 3: Train Loss = 0.6942, Train Accuracy = 0.9914, Val Loss = 1.3057, Val Accuracy = 0.7889, Val F1 = 0.7654
Best model saved with F1 Score: 0.7654


Epoch [4/6]: 100%|██████████| 639/639 [09:50<00:00,  1.08it/s, loss=0.659]


Epoch 4: Train Loss = 0.6728, Train Accuracy = 0.9962, Val Loss = 1.3079, Val Accuracy = 0.7848, Val F1 = 0.7636


Epoch [5/6]: 100%|██████████| 639/639 [09:50<00:00,  1.08it/s, loss=0.659]


Epoch 5: Train Loss = 0.6629, Train Accuracy = 0.9984, Val Loss = 1.3183, Val Accuracy = 0.7857, Val F1 = 0.7625


Epoch [6/6]: 100%|██████████| 639/639 [09:51<00:00,  1.08it/s, loss=0.653]


Epoch 6: Train Loss = 0.6599, Train Accuracy = 0.9989, Val Loss = 1.3104, Val Accuracy = 0.7857, Val F1 = 0.7629


In [None]:
# Load the best checkpoint written during training. map_location lets a
# checkpoint saved on GPU be restored when the current DEVICE is the CPU.
model.load_state_dict(torch.load("best_model.pt", map_location=DEVICE))
model.eval()

# Collect per-sample predictions, true labels, and softmax scores on the validation set
all_preds, all_labels, all_scores = [], [], []
with torch.no_grad():
    for batch in tqdm(val_loader):
        images, labels = batch["img"].to(DEVICE), batch["label"].to(DEVICE)
        outputs = model(images)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
        all_scores.extend(torch.softmax(outputs, dim=1).cpu().numpy())

# The checkpoint file is "best_model.pt"; the message must name the same file
print("Training complete. Best model saved as best_model.pt")
print(classification_report(all_labels, all_preds, zero_division=0))
# Elapsed wall-clock time since `t` was recorded in the first cell
print("Training time:", datetime.now() - t)

100%|██████████| 100/100 [01:48<00:00,  1.08s/it]

Training complete. Best model saved as best_model.pth
              precision    recall  f1-score   support

           0       0.84      1.00      0.91       474
           1       0.47      0.85      0.60       200
           2       0.72      0.50      0.59       280
           3       1.00      0.96      0.98       240
           4       0.74      0.98      0.84       309
           5       1.00      0.05      0.10       158
           6       0.55      0.90      0.68        80
           7       0.00      0.00      0.00       120
           8       1.00      1.00      1.00        40
           9       0.94      0.91      0.92       210
          10       1.00      1.00      1.00        15
          11       0.69      0.63      0.66        65
          12       1.00      0.38      0.55        76
          13       1.00      0.56      0.72       107
          14       0.97      0.94      0.96       227
          15       0.71      0.98      0.82       200
          16       0.98    




In [None]:
# Write the weights currently held by `model` to "best_model.pt".
# NOTE(review): this is the same path train() uses for its best checkpoint, so
# the file is overwritten with whatever weights `model` holds when this cell runs.
torch.save(obj=model.state_dict(), f="best_model.pt")