In [4]:
import time
from sklearn.decomposition import PCA
from umap import UMAP
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import sys
import os
import logging
import torch
from torch import nn
import numpy as np
import pandas as pd
from torch.optim.lr_scheduler import ReduceLROnPlateau

sys.path.append(os.path.abspath(os.path.join('..', 'src')))
from utils_helpers import check_folder_exists, load_features, save_features


# Filesystem layout: every location below is derived from the data root.
PATH_TO_DATA = '/home/lgierz/BA_MothClassification/data/'
PATH_TO_CA = f'{PATH_TO_DATA}confidance_analysis/'
PATH_TO_DATASETS = f'{PATH_TO_DATA}processed/cv_datasets/'
PATH_TO_LOGFILE = f'{PATH_TO_DATA}status/hyperparameter_tuning_dino_chunky_additionalOOPLRscheduler.log'
# Default feature dataset; the dataset-split loop rebinds this name per dataset.
feature_file = f'{PATH_TO_DATASETS}dino_feature_dataset_top277_max1000.npz'
csv_file_path = f'{PATH_TO_DATA}status/hyperparameter_tuning_dino_chunky_additionalOOPLRscheduler.csv'

# Classifier kinds dispatched on by the training cell, and the feature
# extractors whose datasets are split.
model_names = ['KNN', 'Linear Classifier']
fm_names = ['resnet', 'dino']


# Hyperparameters, keyed as '<component>__<parameter>'.
config = dict(
    pca__reduced_fe_size=512,      # n_components handed to PCA
    knn__neighbors=60,             # n_neighbors of the KNeighborsClassifier
    linear__learning_rate=0.001,   # initial learning rate of the Adam optimizer
    linear__epochs=1500,           # number of training iterations of the linear model
    linear__patience=3,            # patience of ReduceLROnPlateau
    linear__gamma=0.8,             # factor of ReduceLROnPlateau
)

# dataset name -> (class amount, list of 'max' sample amounts); both numbers
# are interpolated into the feature-dataset file names (top<classes>_max<samples>).
dataset_configs = {
    f'top{class_count}': (class_count, sample_caps)
    for class_count, sample_caps in (
        (277, [3000, 2000, 1000, 500]),
        (387, [2000, 1000, 500]),
        (589, [1000, 500]),
    )
}


# logging.basicConfig(
#     filename=PATH_TO_LOGFILE,
#     level=logging.INFO,
#     format='[%(asctime)s][%(levelname)s] - %(message)s',
# )

# console_handler = logging.StreamHandler() 
# console_handler.setLevel(logging.INFO) 
# console_handler.setFormatter(logging.Formatter('[%(asctime)s][%(levelname)s] - %(message)s')) 
# logger = logging.getLogger() 
# logger.addHandler(console_handler)


### Dataset Split

In [5]:

# Build a stratified 80/20 train/test split for every feature-extractor /
# dataset combination and store both halves under PATH_TO_CA/split_datasets/.
# NOTE: the names bound inside the loop stay defined afterwards and hold the
# values of the LAST combination processed.
for fm in fm_names:
    for dataset_name, (class_amount, sample_amounts) in dataset_configs.items():
        for sample_amount in sample_amounts:
            # Stem shared by the input file and the two output files
            dataset_stem = f'{fm}_feature_dataset_top{class_amount}_max{sample_amount}'
            feature_file = f'{PATH_TO_DATASETS}{dataset_stem}.npz'

            features, labels, gbifids = load_features(feature_file)

            # Fixed random_state keeps the split reproducible; stratifying on
            # the labels keeps the class proportions equal in both halves.
            split_parts = train_test_split(
                features, labels, gbifids,
                test_size=0.2, random_state=42, stratify=labels,
            )
            X_train, X_test, y_train, y_test, gbifids_train, gbifids_test = split_parts

            print(f'Lowest label: {np.min(y_train)}, highest label: {np.max(y_train)}, unique labels in training ds: {len(np.unique(y_train))}, unique labels in testing ds: {len(np.unique(y_test))}')
            print(f'X_train shape: {X_train.shape}, X_test shape: {X_test.shape}, y_train shape: {y_train.shape}, y_test shape: {y_test.shape}, gbifids_train shape: {gbifids_train.shape}, gbifids_test shape: {gbifids_test.shape}')

            split_dir = f'{PATH_TO_CA}split_datasets/'
            save_features(X_train, y_train, gbifids_train, f'{split_dir}{dataset_stem}_train.npz')
            save_features(X_test, y_test, gbifids_test, f'{split_dir}{dataset_stem}_test.npz')



Features and labels loaded from /home/lgierz/BA_MothClassification/data/processed/cv_datasets/resnet_feature_dataset_top277_max3000.npz
Lowest label: 2, highest label: 588, unique labels in training ds: 277, unique labels in testing ds: 277
X_train shape: (664696, 2048), X_test shape: (166174, 2048), y_train shape: (664696,), y_test shape: (166174,), gbifids_train shape: (664696,), gbifids_test shape: (166174,)
Features and labels saved to /home/lgierz/BA_MothClassification/data/confidance_analysis/split_datasets/resnet_feature_dataset_top277_max3000_train.npz
Features and labels saved to /home/lgierz/BA_MothClassification/data/confidance_analysis/split_datasets/resnet_feature_dataset_top277_max3000_test.npz
Features and labels loaded from /home/lgierz/BA_MothClassification/data/processed/cv_datasets/resnet_feature_dataset_top277_max2000.npz
Lowest label: 2, highest label: 588, unique labels in training ds: 277, unique labels in testing ds: 277
X_train shape: (443200, 2048), X_test sha

### Model Training and Extraction

In [1]:

# The class ids in `labels` are not contiguous, so every label is translated to
# its position among the sorted unique ids (0 .. number of classes - 1).
# NOTE: y_train / y_test are overwritten in place, so running this cell a second
# time would look up already-remapped values.
unique_labels = np.unique(labels)
label_mapping = dict(zip(unique_labels, range(len(unique_labels))))
y_train = np.array(list(map(label_mapping.__getitem__, y_train)))
y_test = np.array(list(map(label_mapping.__getitem__, y_test)))

# Use the GPU whenever CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


class LinearClassifier(nn.Module):
    """Single fully connected layer that maps a feature vector to one raw score per class."""

    def __init__(self, input_dim, num_classes):
        """Create the layer mapping `input_dim` input features to `num_classes` outputs."""
        super().__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=num_classes)

    def forward(self, x):
        """Apply the linear layer to `x` and return its output unchanged."""
        logits = self.fc(x)
        return logits

NameError: name 'load_features' is not defined

In [None]:
import pickle

# Project the features onto the leading principal components. The projection
# is fitted on the training split only and reused unchanged for the test
# split, so both splits live in the same reduced space.
reducer = PCA(n_components=config['pca__reduced_fe_size'])

start_time = time.time()
X_train_reduced = reducer.fit_transform(X_train)
# Never re-fit on the test split: a second fit would yield a projection that
# differs from the one the classifiers are trained on.
X_test_reduced = reducer.transform(X_test)
reduction_time = time.time() - start_time

print(f'SHAPES: normal train: {X_train.shape} | reduced train: {X_train_reduced.shape}')
print(f'SHAPES: normal test: {X_test.shape} | reduced test: {X_test_reduced.shape}')

# Train, evaluate and persist one model per entry of `model_names`.
for model_name in model_names:

    if model_name == "Linear Classifier":

        # Per-epoch training loss / accuracy history (rounded to 4 decimals)
        losses, accuracies = [], []

        # PyTorch model setup
        input_dim = X_train_reduced.shape[1]
        # CrossEntropyLoss needs every target index to be < num_classes, so the
        # output size is derived from the labels instead of being hard-coded
        # for a single dataset (e.g. 277 would break for top387 / top589).
        num_classes = int(np.max(y_train)) + 1

        start_time = time.time()

        linear_model = LinearClassifier(input_dim, num_classes).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(linear_model.parameters(), lr=config['linear__learning_rate'])
        # Scale the learning rate by `factor` when the training loss stops
        # improving for `patience` epochs, but never go below min_lr.
        scheduler = ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=config['linear__gamma'],
            patience=config['linear__patience'],
            min_lr=0.0001,
        )

        X_train_tensor = torch.tensor(X_train_reduced, dtype=torch.float32).to(device)
        y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)  # CrossEntropyLoss expects long targets

        # Full-batch training: every epoch is one optimizer step on the whole training set
        for epoch in range(config['linear__epochs']):
            linear_model.train()
            optimizer.zero_grad()

            outputs = linear_model(X_train_tensor)
            loss = criterion(outputs, y_train_tensor)
            loss.backward()
            optimizer.step()
            scheduler.step(loss)

            # Training accuracy of this epoch, computed on the pre-step outputs
            _, predicted = torch.max(outputs.detach(), 1)  # predicted class indices
            correct = (predicted == y_train_tensor).sum().item()
            accuracy = correct / y_train_tensor.size(0)

            losses.append(round(loss.item(), 4))
            accuracies.append(round(accuracy, 4))

        # Evaluate the linear classifier on the held-out split
        linear_model.eval()
        X_test_tensor = torch.tensor(X_test_reduced, dtype=torch.float32).to(device)
        y_test_tensor = torch.tensor(y_test, dtype=torch.long).to(device)
        with torch.no_grad():
            outputs = linear_model(X_test_tensor)
            _, y_pred = torch.max(outputs, 1)  # predicted class indices
            y_pred_numpy = y_pred.cpu().numpy()

        # Measured span covers model setup, training and the test-set prediction
        training_time = time.time() - start_time

        # Save the linear model.
        # NOTE(review): pickle stores the whole module object, so loading it
        # requires the LinearClassifier class to be defined; only unpickle
        # files from a trusted source.
        with open(PATH_TO_DATA + 'linear_model.pkl', 'wb') as f:
            pickle.dump(linear_model, f)

    elif model_name == 'KNN':
        start_time = time.time()

        model = KNeighborsClassifier(n_neighbors=config['knn__neighbors'])
        model.fit(X_train_reduced, y_train)
        y_pred_numpy = model.predict(X_test_reduced)
        training_time = time.time() - start_time

        # Save the KNN model
        with open(PATH_TO_DATA + 'knn_model.pkl', 'wb') as f:
            pickle.dump(model, f)

    else:
        print(f'INVALID MODEL NAME: {model_name}')
        sys.exit(1)


### Confidence tests

### Implement:
- Class lookup using gbifid
- Extraction of Models for each Dataset
- Save confidences during training!