In [1]:
# Imports
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn

import torchvision.models as models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

import time
import logging
import numpy as np
import pandas as pd
import os
import sys
from torchvision import transforms

sys.path.append(os.path.abspath(os.path.join('..', 'src')))
from moth_dataset import MothDataset
from utils import show_sample

In [12]:
# Configuration of Cross Validation and logging

PATH_TO_DATA = 'C:/Users/Leo/Desktop/BA_MothClassification/data/'
PATH_TO_LABELS = PATH_TO_DATA + 'processed/testing_dataset_top20_max50.csv'
PATH_TO_IMAGES = PATH_TO_DATA + 'processed/testing_dataset_top20_max50_images'
PATH_TO_LOGFILE = PATH_TO_DATA + 'status/test_baseline.log'

FOLDS = 10

KNN_PARAM_GRID = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance']
    #'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    #'leaf_size': [20, 30, 40],
    #'p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}

SCORER = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

# Configure logging
logging.basicConfig(
    filename=PATH_TO_LOGFILE,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

### Loading Dataset

In [3]:


# LOADING DATASET

# prepare rows not containing BLACK / CHECK to be inspected by brightness check and objective size estimation
csv_file = pd.read_csv(PATH_TO_LABELS)
csv_file['status'] = csv_file['status'].astype('str') # to ensure status (CHECK, WHITE, BLACK) is of type string

ignore_statuses = ['CHECK', 'BLACK', 'MISSING'] # these statuses are ignored in dataset
csv_file_filtered = csv_file[~csv_file['status'].isin(ignore_statuses)] # selects all samples which's status has not been set to CHECK or BLACK
csv_file_filtered.reset_index(drop=True, inplace=True)

transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to match ResNet input size
    transforms.ToTensor(),          # Convert to tensor
])

full_dataset = MothDataset(csv_file=csv_file_filtered, root_dir=PATH_TO_IMAGES, transform=transform)
#dataloader = DataLoader(full_dataset, batch_size=100, shuffle=False)

# images = [sample[0] for sample in full_dataset]
# labels = [sample[1] for sample in full_dataset]

print(len(csv_file_filtered['gbifID']))
print(len(csv_file_filtered['scientificName']))


985
985


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data_frame['scientificName_encoded'] = self.label_encoder.fit_transform(self.data_frame['scientificName'])


In [None]:


# Define models
def get_model(model_name):
    '''
    third model?
    '''
    if model_name == 'DINOv2':
        model = models.vit_b_16(pretrained=True)  # Placeholder, replace with actual model initialization
    elif model_name == 'ResNet50':
        model = models.resnet50(pretrained=True)
        #model = models.model(weights=models.model_Weights.DEFAULT)
        model.fc = nn.Identity() # Replace fc layer with identity to get embeddings
    else:
        raise ValueError("Unknown model name")
    return model

def extract_features(loader):
    model = get_model('ResNet50')
    features = []
    labels = []
    with torch.no_grad():
        for images, lbls, _, _ in loader:
            outputs = model(images).squeeze().cpu().numpy()
            features.append(outputs)
            labels.append(lbls.cpu().numpy())

    features = np.concatenate(features)
    labels = np.concatenate(labels)
    return features, labels


# combine get_model and extract_features???


In [10]:
# Function to extract features using ResNet50



cv_train_val_indices, test_indices = train_test_split(range(len(full_dataset)), test_size=0.1, random_state=42)
cv_train_val_dataset = torch.utils.data.Subset(full_dataset, cv_train_val_indices)
test_dataset = torch.utils.data.Subset(full_dataset, test_indices)

cv_train_val_loader = DataLoader(cv_train_val_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Extract features for training and validation sets
cv_train_val_features, cv_train_val_labels = extract_features(cv_train_val_loader)
test_features, test_labels = extract_features(test_loader)





886 features and 886 labels
99 features and 99 labels


In [13]:


skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
knn = KNeighborsClassifier()
grid_search = GridSearchCV(estimator=knn, param_grid=KNN_PARAM_GRID, scoring='accuracy', cv=skf, verbose=1)
grid_search.fit(cv_train_val_features, cv_train_val_labels) # warum gibt es diese unterscheidung und nicht einfach 


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [16]:

print(f'Cross Validation performed with {len(cv_train_val_features)} features and {len(cv_train_val_labels)} labels.')


best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validation Accuracy: {best_score:.4f}")


# Evaluate the KNN classifier on the validation set using the best estimator found by GridSearchCV

print(f'Final test on best parameters performed with {len(test_features)} features and {len(test_labels)} labels.')

test_predictions = grid_search.best_estimator_.predict(test_features)
accuracy = accuracy_score(test_labels, test_predictions)
print(f"Testing Accuracy: {accuracy:.4f}")

Cross Validation performed with 886 features and 886 labels.
Best Parameters: {'n_neighbors': 9, 'weights': 'distance'}
Best Cross-Validation Accuracy: 0.5554
Final test on best parameters performed with 99 features and 99 labels.
Testing Accuracy: 0.6162


In [None]:
model = get_model('ResNet50')

for fold, (train_idx, val_idx) in enumerate(skf.split(range(len(full_dataset)), labels)):

    
    train_subset = torch.utils.data.Subset(full_dataset, train_idx)
    val_subset = torch.utils.data.Subset(full_dataset, val_idx)
    train_loader = DataLoader(train_subset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=32, shuffle=False)

    # Extract features and labels for training and validation
    features_train, labels_train = [], []
    features_val, labels_val = [], []

    # Process training data to extract ResNet embeddings
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        for images, labels, _, _ in train_loader:
            embeddings = model(images).cpu().numpy()  # Extract features
            features_train.append(embeddings)
            labels_train.append(labels.numpy())

        for images, labels, _, _ in val_loader: # _ _ for gbifid and img_name of samples
            embeddings = model(images).cpu().numpy()
            features_val.append(embeddings)
            labels_val.append(labels.numpy())

    # Concatenate the features and labels
    features_train = np.concatenate(features_train)
    labels_train = np.concatenate(labels_train)
    features_val = np.concatenate(features_val)
    labels_val = np.concatenate(labels_val)

    # Set up KNN and GridSearchCV
    knn = KNeighborsClassifier()
    grid_search = GridSearchCV(knn, PARAM_GRID, scoring=SCORER, refit='accuracy', cv=FOLDS)
    
    # Training and timing
    start_train_time = time.time()
    grid_search.fit(features_train, labels_train)
    end_train_time = time.time()

    # Inference and timing
    start_infer_time = time.time()
    predictions = grid_search.predict(features_val)
    end_infer_time = time.time()

    # Calculate metrics
    acc = accuracy_score(labels_val, predictions)
    precision = precision_score(labels_val, predictions, average='weighted')
    recall = recall_score(labels_val, predictions, average='weighted')
    f1 = f1_score(labels_val, predictions, average='weighted')

    # Log results
    logging.info(f"Fold: {fold}, Best Params: {grid_search.best_params_}, Accuracy: {acc}, "
                 f"Precision: {precision}, Recall: {recall}, F1: {f1}, "
                 f"Train Time: {end_train_time - start_train_time:.2f}s, "
                 f"Inference Time: {end_infer_time - start_infer_time:.2f}s")

    print(f"Completed fold {fold} with Accuracy: {acc:.4f}")



Completed fold 0 with Accuracy: 0.4742
Completed fold 1 with Accuracy: 0.4573
Completed fold 2 with Accuracy: 0.4756


### TODO
- implement extensive logging and data gathering for later visualization
- features zwischenspeichern?
- separate PARAMS für Linear o.ä.? -> epochs
- send to device (GPU on Workstation)
- wie kann ich results in csv katalogisieren/kategorisieren
- direkt Ergebnisse anschaulich darstellen?
- einzelne zellenergebnisse zwischenspeichern wenn auf workstation in einem script executed?
- andere datensätze erstellen
- DINOv2 Implementieren
- welche andere Optionen gibt es noch anstatt von DINO2 oder ResNet?
- welche params kommen für KNN CV in frage und warum?

### Todo 16.11:
- [ ] überprüfen ob klassen auch wirklich labels übereinstimmen
- [ ] PAPER lesen von Jonas alle
- [ ] DINOv2 Feature Extraction Implementieren
- [x] implement small testing dataset for best classifier
- [ ] baseline_V1 mit anderem Code vergleichen
- [ ] status column: DL_Fail bei 200 fails eintragen und testlauf?
