In [1]:
# Imports
import torch
from torch.utils.data import DataLoader
import torch.nn as nn

from torchvision import transforms
import torchvision.models as models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

import logging
import numpy as np
import pandas as pd
import os
import sys
import gc

sys.path.append(os.path.abspath(os.path.join('..', 'src')))
from lepidoptera_dataset import LepidopteraDataset
from utils import show_sample, check_folder_exists

In [2]:
# Configuration of Cross Validation and logging

# Filesystem layout: everything except the raw images lives below PATH_TO_DATA.
PATH_TO_DATA = '/home/lgierz/BA_MothClassification/data/'
PATH_TO_DATASETS = f'{PATH_TO_DATA}processed/cv_datasets/'
PATH_TO_IMAGES = '/mnt/data/lgierz/moth_dataset_top589_max3000/'
PATH_TO_FEATURES = f'{PATH_TO_DATA}processed/features/'
PATH_TO_LOGFILE = f'{PATH_TO_DATA}status/feature_extraction.log'

# Check the directories up front, before any expensive work starts.
# NOTE(review): what check_folder_exists does on a missing folder is defined in utils - confirm there.
for folder in (PATH_TO_DATA, PATH_TO_DATASETS, PATH_TO_IMAGES, PATH_TO_FEATURES):
    check_folder_exists(folder, min_fileamount=0)

# Number of splits used by the stratified cross validation.
FOLDS = 10

# Hyper-parameter grid for the KNN grid search.
KNN_PARAM_GRID = dict(
    n_neighbors=[5, 10, 15, 20, 30],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree'],  # 'kd_tree' and 'brute' are currently left out
    leaf_size=[20, 30, 40],
    p=[1, 2],  # 1 for Manhattan distance, 2 for Euclidean distance
)

# Scorers: plain accuracy plus the class-weighted variants of precision / recall / F1.
SCORER = {'accuracy': make_scorer(accuracy_score)}
SCORER.update({
    metric_name: make_scorer(metric_fn, average='weighted')
    for metric_name, metric_fn in (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
})

# Backbones available for feature extraction; every name must match a branch of get_model().
FOUNDATIONAL_MODELS = [
    'ResNet50_ImageNet1KV1',
    'ResNet50_ImageNet1KV2',
    'ResNet101_ImageNet1KV1',
    'ResNet101_ImageNet1KV2',  # was a second 'ResNet101_ImageNet1KV1', so the V2 weights could never be selected
]

# Label CSVs, keyed by '<top-N classes>_<max samples per class>'.
# Every file follows the pattern 'dataset_<key>.csv' inside PATH_TO_DATASETS.
DATASETS = {
    dataset_key: PATH_TO_DATASETS + f'dataset_{dataset_key}.csv'
    for dataset_key in (
        'top277_max3000', 'top277_max2000', 'top277_max1000', 'top277_max500',
        'top387_max2000', 'top387_max1000', 'top387_max500',
        'top589_max1000', 'top589_max500',
    )
}

# Selection for this run.
DATASET_NAME = 'top277_max3000'
MODEL_NAME = FOUNDATIONAL_MODELS[1]

# Label file belonging to the selected dataset.
PATH_TO_LABELS = DATASETS[DATASET_NAME]


# Send INFO-and-above log records to the status log file.
logging.basicConfig(
    format='[%(asctime)s][%(levelname)s] - %(message)s',
    level=logging.INFO,
    filename=PATH_TO_LOGFILE,
)

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f'Device chosen: {device}')
logging.info(f'[INIT] Device chosen: {device}')




Device chosen: cuda


### Loading Dataset

In [3]:


# LOADING DATASET

# Read the label table for the selected dataset.
csv_file = pd.read_csv(PATH_TO_LABELS)
csv_file['status'] = csv_file['status'].astype('str')  # status values are compared as strings below

# Drop every sample whose status marks it as unusable, then renumber the rows from 0.
ignore_statuses = ['CHECK', 'BLACK', 'MISSING']
keep_mask = ~csv_file['status'].isin(ignore_statuses)
csv_file_filtered = csv_file.loc[keep_mask].reset_index(drop=True)

# Preprocessing applied to every image: fixed 224x224 size, then conversion to a tensor.
# NOTE(review): there is no transforms.Normalize step here; the torchvision ImageNet weights used
# further down are documented to expect mean/std-normalised input - confirm whether
# LepidopteraDataset normalises internally.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
]
transform = transforms.Compose(preprocessing_steps)

full_dataset = LepidopteraDataset(
    csv_file=csv_file_filtered,
    root_dir=PATH_TO_IMAGES,
    transform=transform,
)


print(f'Dataset contains {len(full_dataset)} samples.')

[DATASET] In-memory index built.
Dataset contains 830870 samples.


In [3]:


def get_model(model_name):
    """Return a pretrained torchvision backbone whose classification layer is replaced by nn.Identity.

    Args:
        model_name: One of 'DINOv2basic', 'DINOv2large', 'ResNet50_ImageNet1KV1',
            'ResNet50_ImageNet1KV2', 'ResNet101_ImageNet1KV1', 'ResNet101_ImageNet1KV2'.

    Returns:
        The model, still on the CPU and in its default (training) mode; moving it to a
        device and calling ``eval()`` is left to the caller.

    Raises:
        ValueError: If ``model_name`` is not one of the names listed above.
    """
    # NOTE(review): despite their names, the two 'DINOv2*' entries load torchvision's
    # vit_b_16 / vit_l_16 with the DEFAULT torchvision weights, not DINOv2 checkpoints.
    if model_name == 'DINOv2basic':
        model = models.vit_b_16(weights=models.ViT_B_16_Weights.DEFAULT)
        # torchvision's VisionTransformer stores its classifier in `heads`;
        # assigning `head` only adds an unused attribute and leaves the classifier in place.
        model.heads = nn.Identity()
    elif model_name == 'DINOv2large':
        model = models.vit_l_16(weights=models.ViT_L_16_Weights.DEFAULT)
        model.heads = nn.Identity()

    # different resnets: the classifier is the `fc` layer
    elif model_name == 'ResNet50_ImageNet1KV1':
        model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        model.fc = nn.Identity()
    elif model_name == 'ResNet50_ImageNet1KV2':
        model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
        model.fc = nn.Identity()
    elif model_name == 'ResNet101_ImageNet1KV1':
        model = models.resnet101(weights=models.ResNet101_Weights.IMAGENET1K_V1)
        model.fc = nn.Identity()
    elif model_name == 'ResNet101_ImageNet1KV2':
        model = models.resnet101(weights=models.ResNet101_Weights.IMAGENET1K_V2)
        model.fc = nn.Identity()
    else:
        raise ValueError("Unknown model name")
    return model


def extract_features_resnet(loader, model_name):
    """Run every batch of ``loader`` through a pretrained backbone and collect the outputs.

    The backbone comes from ``get_model(model_name)`` and is moved to the module-level
    ``device``. Samples with a negative label are treated as unreadable images: they are
    reported (log + stdout) and excluded from the result.

    Args:
        loader: Sized iterable yielding ``(images, labels, gbifids, _)`` batches, where
            ``images`` and ``labels`` are tensors indexable by a list of positions.
        model_name: Name understood by ``get_model``.

    Returns:
        Tuple ``(features, labels)`` of numpy arrays concatenated over all batches.

    Raises:
        ValueError: If no batch contained a single valid sample.
    """
    print('Called Feature Extraction Function')
    model = get_model(model_name)
    model = model.to(device)
    model.eval()
    features = []
    labels = []
    total_batches = len(loader)
    with torch.no_grad():
        print('Starting feature extraction process...')
        # `batch` is already 1-based; the former `batch + 1` reported [2/N] ... [N+1/N].
        for batch, (images, lbls, gbifids, _) in enumerate(loader, start=1):
            logging.info(f'[FEATURE EXTRACTION] Batch [{batch}/{total_batches}] with {len(images)} samples')
            if batch % 2 == 0:  # only every second batch is echoed to stdout
                print(f'Batch [{batch}/{total_batches}] with {len(images)} samples')

            # sorting out images that could not be read (signalled by a negative label)
            valid_indices = []
            for idx, lbl in enumerate(lbls):
                if int(lbl) >= 0:
                    valid_indices.append(idx)
                else:
                    logging.error(f'[FEATURE EXTRACTION] Image with gbifID {gbifids[idx]} couldn\'t be read.')
                    print(f'[FEATURE EXTRACTION] Image with gbifID {gbifids[idx]} couldn\'t be read.')

            # Skip the forward pass when nothing in the batch is usable.
            if valid_indices:
                images = images[valid_indices].to(device)
                outputs = model(images).cpu().numpy()
                features.append(outputs)
                # Labels are only stored, so they never need to visit the device.
                labels.append(lbls[valid_indices].cpu().numpy())

            if batch % 50 == 0:
                gc.collect()
                torch.cuda.empty_cache()

    if not features:
        raise ValueError('No valid samples found - nothing to extract features from.')

    features = np.concatenate(features)
    labels = np.concatenate(labels)
    return features, labels

def save_features(features, labels, filename):
    """Store ``features`` and ``labels`` in one compressed .npz archive at ``filename``.

    The arrays are written under the keys 'features' and 'labels'.
    """
    arrays = {'features': features, 'labels': labels}
    np.savez_compressed(filename, **arrays)
    print(f"Features and labels saved to {filename}")
    
def load_features(filename):
    """Load the 'features' and 'labels' arrays from an .npz archive.

    Args:
        filename: Path of an archive containing the keys 'features' and 'labels'.

    Returns:
        Tuple ``(features, labels)`` of numpy arrays.
    """
    # np.load returns a lazy archive object that keeps the file open; the context
    # manager closes it once both arrays have been read into memory.
    with np.load(filename) as data:
        features = data['features']
        labels = data['labels']
    print(f"Features and labels loaded from {filename}")
    return features, labels


In [None]:

# Hold out 10% of the samples as the final test split; the seed is fixed so the split is repeatable.
# (Author's note: no stratification required, no class imbalance.)
cv_train_val_indices, test_indices = train_test_split(
    range(len(full_dataset)),
    test_size=0.1,
    shuffle=True,
    random_state=42,
)
cv_train_val_dataset = torch.utils.data.Subset(full_dataset, cv_train_val_indices)
test_dataset = torch.utils.data.Subset(full_dataset, test_indices)

# Both loaders use the same settings; shuffling stays off.
loader_options = dict(batch_size=1000, shuffle=False, num_workers=24, prefetch_factor=2, pin_memory=True)
cv_train_val_loader = DataLoader(cv_train_val_dataset, **loader_options)
test_loader = DataLoader(test_dataset, **loader_options)

# A loader over the unsplit dataset is currently not used:
#full_loader = DataLoader(full_dataset, batch_size=1000, shuffle=False, num_workers=24, prefetch_factor=2, pin_memory=True)

# Output files, one per split, named after the selected model and dataset.
PATH_TO_FEATURES_CV_TRAIN_VAL = f'{PATH_TO_FEATURES}cv_train_val_features_{MODEL_NAME}_{DATASET_NAME}.npz'
PATH_TO_FEATURES_TEST = f'{PATH_TO_FEATURES}test_features_{MODEL_NAME}_{DATASET_NAME}.npz'

print(f'Model: {MODEL_NAME}')
print(f'Dataset: {DATASET_NAME}')

# Extract and store the features of the train/validation split ...
cv_train_val_features, cv_train_val_labels = extract_features_resnet(cv_train_val_loader, MODEL_NAME)
save_features(cv_train_val_features, cv_train_val_labels, PATH_TO_FEATURES_CV_TRAIN_VAL)

# ... then do the same for the held-out test split.
test_features, test_labels = extract_features_resnet(test_loader, MODEL_NAME)
save_features(test_features, test_labels, PATH_TO_FEATURES_TEST)


Model: ResNet50_ImageNet1KV2
Dataset: top277_max3000
Called Feature Extraction Function
Starting feature extraction process...
[DATASET][ERROR] cannot identify image file '/mnt/data/lgierz/moth_dataset_top589_max3000/4887899402_89095802.jpg'
[DATASET][ERROR] cannot identify image file '/mnt/data/lgierz/moth_dataset_top589_max3000/4889518310_90140725.jpg'Batch [2/84] with 1000 samples

Batch [4/84] with 1000 samples
Batch [6/84] with 1000 samples
Batch [8/84] with 1000 samples
Batch [10/84] with 1000 samples
Batch [12/84] with 1000 samples




Batch [14/84] with 1000 samples
Batch [16/84] with 1000 samples
Batch [18/84] with 1000 samples
Batch [20/84] with 1000 samples
Batch [22/84] with 1000 samples
[FEATURE EXTRACTION] Image with gbifID 4887899402 couldn't be read.
[DATASET][ERROR] cannot identify image file '/mnt/data/lgierz/moth_dataset_top589_max3000/3730297094_35249935.jpg'
Batch [24/84] with 1000 samples
Batch [26/84] with 1000 samples
Batch [28/84] with 1000 samples
Batch [30/84] with 1000 samples
Batch [32/84] with 1000 samples
Batch [34/84] with 1000 samples
Batch [36/84] with 1000 samples
Batch [38/84] with 1000 samples
Batch [40/84] with 1000 samples
Batch [42/84] with 1000 samples
Batch [44/84] with 1000 samples
Batch [46/84] with 1000 samples
Batch [48/84] with 1000 samples
[FEATURE EXTRACTION] Image with gbifID 4889518310 couldn't be read.
Batch [50/84] with 1000 samples
Batch [52/84] with 1000 samples
Batch [54/84] with 1000 samples
Batch [56/84] with 1000 samples
Batch [58/84] with 1000 samples
Batch [60/84]

In [4]:
# Archive holding the cached train/validation features for the selected model and dataset.
PATH_TO_FEATURES_CV_TRAIN_VAL = f'{PATH_TO_FEATURES}cv_train_val_features_{MODEL_NAME}_{DATASET_NAME}.npz'

# The grid search is disabled in this cell; only the cached arrays are loaded for inspection.
# skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
# knn = KNeighborsClassifier()
# grid_search = GridSearchCV(estimator=knn, param_grid=KNN_PARAM_GRID, scoring='accuracy', cv=skf, verbose=2, n_jobs=24)
# grid_search.fit(loaded_features, loaded_labels)
loaded_features, loaded_labels = load_features(PATH_TO_FEATURES_CV_TRAIN_VAL)

# Shape of a single label entry (displayed as the cell result).
loaded_labels[0].shape

Features and labels loaded from /home/lgierz/BA_MothClassification/data/processed/features/cv_train_val_features_ResNet50_ImageNet1KV2_top277_max3000.npz


()

In [24]:
# Count distinct feature vectors: each row becomes a hashable tuple, so identical rows
# collapse into a single set entry.
feature_set = {tuple(feature_row) for feature_row in loaded_features}
len(feature_set)

746787

In [6]:
# Spot check: display the label stored at position 35 of the cached label array.
loaded_labels[35]

201

In [5]:
# How often does each distinct feature vector occur, and which label does it carry?
# The previous version stored every vector as a Python list in a DataFrame column and called
# value_counts() on it; lists are unhashable, so that raises TypeError, and .tolist() on the
# full feature matrix is extremely memory hungry. np.unique over the rows does the counting
# directly on the array.
# Assumes loaded_features is 2-D (one row per sample) - TODO confirm.
_, first_index, occurrences = np.unique(
    loaded_features, axis=0, return_index=True, return_counts=True
)

df = pd.DataFrame({
    'first_index': first_index,           # position of the first sample with this vector
    'label': loaded_labels[first_index],  # label of that first sample
    'count': occurrences,                 # number of samples sharing the vector
})

# Most frequently repeated feature vectors first.
df.sort_values('count', ascending=False).head(10)

: 

: 

In [None]:

# KNN hyper-parameter search on the cached train/validation features, followed by a final
# evaluation on the cached test features.
# The cell used to rely on `grid_search`, `cv_train_val_features`, `test_features` and
# PATH_TO_FEATURES_TEST being left over from other cells (`grid_search` existed only in
# commented-out code). Everything is now built from the saved .npz files, so the cell also
# works on a fresh kernel once the features have been extracted.

PATH_TO_FEATURES_CV_TRAIN_VAL = PATH_TO_FEATURES + f'cv_train_val_features_{MODEL_NAME}_{DATASET_NAME}.npz'
PATH_TO_FEATURES_TEST = PATH_TO_FEATURES + f'test_features_{MODEL_NAME}_{DATASET_NAME}.npz'

cv_train_val_features, cv_train_val_labels = load_features(PATH_TO_FEATURES_CV_TRAIN_VAL)

# Stratified K-fold grid search over KNN_PARAM_GRID, scored by accuracy.
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=KNN_PARAM_GRID,
    scoring='accuracy',
    cv=skf,
    verbose=2,
    n_jobs=24,
)
grid_search.fit(cv_train_val_features, cv_train_val_labels)

print(f'Cross Validation performed with {len(cv_train_val_features)} features and {len(cv_train_val_labels)} labels.')


best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validation Accuracy: {best_score:.4f}")


# Evaluate the best estimator found by GridSearchCV on the held-out test split

test_features, test_labels = load_features(PATH_TO_FEATURES_TEST)
print(f'Final test on best parameters performed with {len(test_features)} features and {len(test_labels)} labels.')

test_predictions = grid_search.best_estimator_.predict(test_features)
accuracy = accuracy_score(test_labels, test_predictions)
print(f"Testing Accuracy: {accuracy:.4f}")

# save results of runs in processed/results

### TODO
- implement extensive logging and data gathering for later visualization
- separate PARAMS für Linear o.ä.? -> epochs
- wie kann ich results in csv katalogisieren/kategorisieren
- direkt Ergebnisse anschaulich darstellen?
- einzelne zellenergebnisse zwischenspeichern wenn auf workstation in einem script executed?
- andere datensätze erstellen
- DINOv2 Implementieren
- welche andere Optionen gibt es noch anstatt von DINO2 oder ResNet?
- welche params kommen für KNN CV in frage und warum?
- umap
- gewichtung ähnliche klassen nur bei linear?

### Todo Today:
- create dataset resized for experiment
- dataset status updates (MISSING)
- dataset sort out dark samples (BLACK)

- send to device (GPU on Workstation)
- implement logging
- welche Versionen von ResNet und DINOv2?