In [None]:
# Imports
import torch
from torch.utils.data import DataLoader
import torch.nn as nn

from torchvision import transforms
import torchvision.models as models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

import logging
import numpy as np
import pandas as pd
import os
import sys

sys.path.append(os.path.abspath(os.path.join('..', 'src')))
from lepidoptera_dataset import LepidopteraDataset
from utils import show_sample, check_folder_exists

In [None]:
# Configuration of Cross Validation and logging

PATH_TO_DATA = 'C:/Users/Leo/Desktop/BA_MothClassification/data/'
PATH_TO_LABELS = PATH_TO_DATA + 'processed/testing_dataset_top20_max50.csv'
PATH_TO_IMAGES = PATH_TO_DATA + 'processed/testing_dataset_top20_max50_images'
PATH_TO_LOGFILE = PATH_TO_DATA + 'status/test_baseline.log'
PATH_TO_FEATURES = PATH_TO_DATA + 'processed/features/'
PATH_TO_DATASETS = PATH_TO_DATA + 'processed/resized_datasets/'

FOLDS = 10

KNN_PARAM_GRID = {
    'n_neighbors': [5,10,15,20,25,30],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [20, 30, 40],
    'p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}

SCORER = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

FOUNDATIONAL_MODELS = [
    'ResNet50_ImageNet1KV1', 
    'ResNet50_ImageNet1KV2', 
    'ResNet101_ImageNet1KV1', 
    'ResNet101_ImageNet1KV1'
]

DATASETS = {
    'top277_max3000': PATH_TO_DATASETS + 'dataset_top277_max3000.csv',
    'top277_max2000': PATH_TO_DATASETS + 'dataset_top277_max2000.csv',
    'top277_max1000': PATH_TO_DATASETS + 'dataset_top277_max1000.csv',
    'top277_max500': PATH_TO_DATASETS + 'dataset_top277_max500.csv',

    'top387_max2000': PATH_TO_DATASETS + 'dataset_top387_max2000.csv',
    'top387_max1000': PATH_TO_DATASETS + 'dataset_top387_max1000.csv',
    'top387_max500': PATH_TO_DATASETS + 'dataset_top387_max500.csv',

    'top589_max1000': PATH_TO_DATASETS + 'dataset_top589_max1000.csv',
    'top589_max500': PATH_TO_DATASETS + 'dataset_top589_max500.csv'
}

BASELINE_DATASET = DATASETS['top277_max3000']


# Configure logging
logging.basicConfig(
    filename=PATH_TO_LOGFILE,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Device chosen: {device}')

Device chosen: cpu


### Loading Dataset

In [None]:


# LOADING DATASET

# prepare rows not containing BLACK / CHECK to be inspected by brightness check and objective size estimation
csv_file = pd.read_csv(PATH_TO_LABELS)
csv_file['status'] = csv_file['status'].astype('str') # to ensure status (CHECK, WHITE, BLACK) is of type string

ignore_statuses = ['CHECK', 'BLACK', 'MISSING'] # these statuses are ignored in dataset
csv_file_filtered = csv_file[~csv_file['status'].isin(ignore_statuses)] # selects all samples which's status has not been set to CHECK or BLACK
csv_file_filtered.reset_index(drop=True, inplace=True)

transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to match ResNet input size
    transforms.ToTensor(),          # Convert to tensor
])

full_dataset = LepidopteraDataset(csv_file=csv_file_filtered, root_dir=PATH_TO_IMAGES, transform=transform)


print(f'Dataset contains {len(full_dataset)} samples.')

Dataset contains 985 samples.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data_frame['scientificName_encoded'] = self.label_encoder.fit_transform(self.data_frame['scientificName'])


In [None]:


def get_model(model_name):

    if model_name == 'DINOv2basic': 
        model = models.vit_b_16(weights=models.ViT_B_16_Weights.DEFAULT)
        model.head = nn.Identity()
    elif model_name == 'DINOv2large': 
        model = models.vit_l_16(weights=models.ViT_L_16_Weights.DEFAULT)
        model.head = nn.Identity()   

    # different resnets
    elif model_name == 'ResNet50_ImageNet1KV1':
        model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        model.fc = nn.Identity()
    elif model_name == 'ResNet50_ImageNet1KV2':
        model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
        model.fc = nn.Identity()
    elif model_name == 'ResNet101_ImageNet1KV1':
        model = models.resnet101(weights=models.ResNet101_Weights.IMAGENET1K_V1)
        model.fc = nn.Identity() #
    elif model_name == 'ResNet101_ImageNet1KV2':
        model = models.resnet101(weights=models.ResNet101_Weights.IMAGENET1K_V2)
        model.fc = nn.Identity()
    else:
        raise ValueError("Unknown model name")
    return model


def extract_features_resnet(loader, model_name):
    model = get_model(model_name)
    model.eval()
    features = []
    labels = []
    with torch.no_grad():
        for batch, (images, lbls, _, _ ) in enumerate(loader):
            if (batch + 1) % 2 == 0: print(f'Batch [{batch+1}/{len(loader)}] with {len(images)} samples') 
            images = images.to(device)
            outputs = model(images).cpu().numpy()
            features.append(outputs)
            labels.append(lbls.cpu().numpy())

    features = np.concatenate(features)
    labels = np.concatenate(labels)
    return features, labels

def save_features(features, labels, filename): 
    np.savez_compressed(filename, features=features, labels=labels) 
    print(f"Features and labels saved to {filename}") 
    
def load_features(filename): 
    data = np.load(filename) 
    features = data['features'] 
    labels = data['labels'] 
    print(f"Features and labels loaded from {filename}") 
    return features, labels


In [None]:


cv_train_val_indices, test_indices = train_test_split(range(len(full_dataset)), test_size=0.1, random_state=42)
cv_train_val_dataset = torch.utils.data.Subset(full_dataset, cv_train_val_indices)
test_dataset = torch.utils.data.Subset(full_dataset, test_indices)

cv_train_val_loader = DataLoader(cv_train_val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Extract features for training and validation sets

cv_train_val_features, cv_train_val_labels = extract_features_resnet(cv_train_val_loader, MODEL_NAME)
PATH_TO_FEATURES_CV_TRAIN_VAL = PATH_TO_FEATURES + f'cv_train_val_features_{MODEL_NAME}.npz'
save_features(cv_train_val_features, cv_train_val_labels, PATH_TO_FEATURES_CV_TRAIN_VAL)

test_features, test_labels = extract_features_resnet(test_loader, MODEL_NAME)
PATH_TO_FEATURES_TEST = PATH_TO_FEATURES + f'test_features_{MODEL_NAME}.npz'
save_features(test_features, test_labels, PATH_TO_FEATURES_TEST)


Features and labels saved to C:/Users/Leo/Desktop/BA_MothClassification/data/processed/features/cv_train_val_features_ResNet50_IMAGENET1K_V2.npz
Features and labels saved to C:/Users/Leo/Desktop/BA_MothClassification/data/processed/features/test_features_ResNet50_IMAGENET1K_V2.npz


In [99]:
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
knn = KNeighborsClassifier()
grid_search = GridSearchCV(estimator=knn, param_grid=KNN_PARAM_GRID, scoring='accuracy', cv=skf, verbose=1)
loaded_features, loaded_labels = load_features(PATH_TO_FEATURES_CV_TRAIN_VAL)
grid_search.fit(loaded_features, loaded_labels)

Fitting 10 folds for each of 288 candidates, totalling 2880 fits


In [None]:

print(f'Cross Validation performed with {len(cv_train_val_features)} features and {len(cv_train_val_labels)} labels.')


best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validation Accuracy: {best_score:.4f}")


# Evaluate the KNN classifier on the validation set using the best estimator found by GridSearchCV

print(f'Final test on best parameters performed with {len(test_features)} features and {len(test_labels)} labels.')
loaded_features, loaded_labels = load_features(PATH_TO_FEATURES_TEST)

test_predictions = grid_search.best_estimator_.predict(loaded_features)
accuracy = accuracy_score(loaded_labels, test_predictions)
print(f"Testing Accuracy: {accuracy:.4f}")

# save results of runs in processed/results

Cross Validation performed with 886 features and 886 labels.
Best Parameters: {'algorithm': 'auto', 'leaf_size': 20, 'n_neighbors': 10, 'p': 2, 'weights': 'distance'}
Best Cross-Validation Accuracy: 0.6116
Final test on best parameters performed with 99 features and 99 labels.
Testing Accuracy: 0.5859


### TODO
- implement extensive logging and data gathering for later visualization
- separate PARAMS für Linear o.ä.? -> epochs
- wie kann ich results in csv katalogisieren/kategorisieren
- direkt Ergebnisse anschaulich darstellen?
- einzelne zellenergebnisse zwischenspeichern wenn auf workstation in einem script executed?
- andere datensätze erstellen
- DINOv2 Implementieren
- welche andere Optionen gibt es noch anstatt von DINO2 oder ResNet?
- welche params kommen für KNN CV in frage und warum?
- umap
- gewichtung ähnliche klassen nur bei linear?

### Todo Todday:
- create dataset resized for experiment
- dataset status updates (MISSING)
- dataset sort out dark samples (BLACK)

- send to device (GPU on Workstation)
- implement logging
- welche Versionen von ResNet und DINOv2?