In [1]:
import time
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import KernelPCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import sys
import os
import logging
import torch
from torch import nn
import numpy as np
import pandas as pd
# import cuml
# from cuml.manifold import UMAP



sys.path.append(os.path.abspath(os.path.join('..', 'src')))
from utils import check_folder_exists, load_features, save_features


In [7]:

# --- Paths -------------------------------------------------------------------
# Everything lives below PATH_TO_DATA; the trailing slash is part of the value.
PATH_TO_DATA = '/home/lgierz/BA_MothClassification/data/'
PATH_TO_FEATURES = f'{PATH_TO_DATA}processed/features/'
PATH_TO_LOGFILE = f'{PATH_TO_DATA}status/hyperparameter_tuning_resnet_chunky.log'
feature_file = f'{PATH_TO_DATA}processed/cv_datasets/resnet_feature_dataset_top277_max3000.npz'
csv_file_path = f'{PATH_TO_DATA}status/hyperparameter_tuning_resnet_chunky.csv'

# --- Experiment setup --------------------------------------------------------
folds = 5
model_names = ["Linear Classifier", "KNN"]

# Hyperparameter grid. Keys follow the pattern '<component>__<parameter>'.
config = {
    # PCA: target dimensionality
    'pca__reduced_fe_size': [256, 512, 1024],

    # UMAP: target dimensionality and neighbourhood settings
    'umap__reduced_fe_size': [32, 64, 128, 256, 512, 1024],
    'umap__neighbors': [10, 50, 100, 300, 500],
    # NOTE(review): confirm the UMAP implementation in use accepts a leaf_size argument
    'umap__leaf_size': [10, 50, 100],

    # Linear classifier: optimizer learning rate and number of training epochs
    'linear__learning_rate': [0.1, 0.01, 0.001, 0.0001],
    'linear__epochs': [250, 500, 750, 1000, 1500],

    # KNN: number of neighbours
    'knn__neighbors': [10, 50, 100, 300, 500, 700, 1000],
}

In [4]:
# Log every record to the log file and mirror it on the console.
LOG_FORMAT = '[%(asctime)s][%(levelname)s] - %(message)s'
CONSOLE_HANDLER_NAME = 'notebook_console'

# File logging on the root logger. basicConfig() is a no-op when the root
# logger already has handlers, so re-running this cell does not add a second
# file handler.
logging.basicConfig(
    filename=PATH_TO_LOGFILE,
    level=logging.INFO,
    format=LOG_FORMAT,
)

logger = logging.getLogger()

# Attach the console handler only once. Without this guard, every re-run of
# the cell stacks another StreamHandler on the root logger and each record is
# printed multiple times. The handler is identified by name because
# FileHandler is itself a StreamHandler subclass.
if not any(handler.get_name() == CONSOLE_HANDLER_NAME for handler in logger.handlers):
    console_handler = logging.StreamHandler()
    console_handler.set_name(CONSOLE_HANDLER_NAME)
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(logging.Formatter(LOG_FORMAT))
    logger.addHandler(console_handler)

In [5]:
features, labels, _ = load_features(feature_file)

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

print(f'Lowest label: {np.min(y_train)}, highest label: {np.max(y_train)}, unique labels in training ds: {len(np.unique(y_train))}, unique labels in testing ds: {len(np.unique(y_test))}')

# The raw label ids are not contiguous, so map every distinct label (in sorted
# order) onto 0..n_classes-1 before handing the targets to the classifiers.
unique_labels = np.unique(labels)
label_mapping = dict(zip(unique_labels, range(len(unique_labels))))
y_train = np.array(list(map(label_mapping.__getitem__, y_train)))
y_test = np.array(list(map(label_mapping.__getitem__, y_test)))

# Run the PyTorch model on the GPU when one is available.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

Features and labels loaded from /home/lgierz/BA_MothClassification/data/processed/cv_datasets/resnet_feature_dataset_top277_max3000.npz
Lowest label: 2, highest label: 588, unique labels in training ds: 277, unique labels in testing ds: 277


In [10]:



# TODO:
# - limit to important tests
# - add other feature reduction tests
# - separate feature reduction tests from other tests?


def handle_results(csv_path, test, pred, method, model, params, reduction_time, training_time, neighbors=None, lr=None, epochs=None, losses=None, accuracies=None):
    """Score one run, append it as a row to the results CSV and log a summary.

    Args:
        csv_path: Path of the CSV file the row is appended to. The header row
            is written when the file does not exist yet or is empty.
        test: Ground-truth labels, passed to the sklearn metrics as y_true.
        pred: Predicted labels, passed to the sklearn metrics as y_pred.
        method: Name of the feature reduction method (stored as "Method").
        model: Name of the classifier; "KNN" and "Linear Classifier" decide
            which of the model-specific columns are filled.
        params: Parameters of the feature reduction (stored as "Parameters").
        reduction_time: Duration of the feature reduction in seconds.
        training_time: Duration of training/evaluation in seconds.
        neighbors: Number of neighbours; only stored when model == "KNN".
        lr: Learning rate; only stored when model == "Linear Classifier".
        epochs: Epoch count; only stored when model == "Linear Classifier".
        losses: Per-epoch losses; only stored when model == "Linear Classifier".
        accuracies: Per-epoch accuracies; only stored when
            model == "Linear Classifier".
    """
    is_knn = model == 'KNN'
    is_linear = model == "Linear Classifier"

    acc = accuracy_score(test, pred)
    # zero_division=0 yields the same value as the default ("warn" acts as 0)
    # but does not emit a warning for every run in which some class is never
    # predicted.
    prec = precision_score(test, pred, average='weighted', zero_division=0)
    rec = recall_score(test, pred, average='weighted', zero_division=0)
    f1 = f1_score(test, pred, average='weighted', zero_division=0)

    # Store Results
    run_results = {
        "Method": method,
        "Model": model,
        "Parameters": params,
        "Reduction Time (s)": round(reduction_time, 2),
        "Training Time (s)": round(training_time, 2),
        "Accuracy": round(acc, 4),
        "Precision": round(prec, 4),
        "Recall": round(rec, 4),
        "F1-Score": round(f1, 4),
        "Neighbors": neighbors if is_knn else None,
        "Learning Rate": lr if is_linear else None,
        "Epochs": epochs if is_linear else None,
        "Epoch Losses": losses if is_linear else None,
        "Epoch Accuracies": accuracies if is_linear else None
    }

    # Write the header when the file is missing OR exists but is empty;
    # otherwise an empty pre-created file would end up without column names.
    write_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0
    # Wrap the dict in a list to get a one-row DataFrame; mode='a' creates the
    # file when it does not exist yet.
    pd.DataFrame([run_results]).to_csv(csv_path, mode='a', header=write_header, index=False)

    logging.info(f"[{run_results['Method']} ({run_results['Parameters']})][{run_results['Model']}] Reduction time: {run_results['Reduction Time (s)']}s, Training time: {run_results['Training Time (s)']}s, Accuracy: {run_results['Accuracy']}, Precision: {run_results['Precision']}, Recall: {run_results['Recall']}, F1-Score: {run_results['F1-Score']}")


class LinearClassifier(nn.Module):
    """Single fully connected layer mapping feature vectors to class logits."""

    def __init__(self, input_dim, num_classes):
        """Create the layer.

        Args:
            input_dim: Size of each input feature vector.
            num_classes: Number of output logits (one per class).
        """
        super().__init__()
        self.fc = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        """Return the raw (un-normalised) logits for the batch `x`."""
        return self.fc(x)

In [None]:


# Keyword arguments for every UMAP run (cartesian product of the grid).
# `leaf_size` is intentionally not forwarded: umap.UMAP has no such constructor
# argument, so UMAP(**params) would raise a TypeError on the first UMAP run.
# Dropping that axis also avoids fitting identical configurations repeatedly.
# The loop variables carry full names so they cannot be mistaken for the
# `torch.nn` import.
umap_configs = [
    {'n_components': n_components, 'n_neighbors': n_neighbors, 'random_state': 42}
    for n_components in config['umap__reduced_fe_size']
    for n_neighbors in config['umap__neighbors']
]

# Keyword arguments for every PCA run.
pca_configs = [
    {'n_components': n_components, 'random_state': 42}
    for n_components in config['pca__reduced_fe_size']
]

# Reduction method name -> list of constructor kwargs to try.
# Insertion order decides the run order (PCA first, then UMAP).
fe_reduction_configs = {
    'PCA': pca_configs,
    'UMAP': umap_configs
}


# Estimator class for every supported reduction method. Looking the class up
# here makes an unknown name fail loudly instead of silently reusing the
# reducer left over from the previous iteration.
reducer_classes = {'PCA': PCA, 'UMAP': UMAP}

# Output size of the linear classifier, derived from the label remapping
# instead of being hard-coded, so a different dataset/class subset still works.
num_classes = len(label_mapping)

for reducer_name, param_list in fe_reduction_configs.items():
    if reducer_name not in reducer_classes:
        raise ValueError(f'Unknown feature reduction method: {reducer_name}')

    for params in param_list:
        reducer = reducer_classes[reducer_name](**params)
        print(f"Applying {reducer_name}...")
        start_time = time.time()
        X_train_reduced = reducer.fit_transform(X_train)
        # Always project the test set with the reducer fitted on the training
        # set. Re-fitting on the test data would embed it in a different space
        # than the one the classifiers are trained in.
        X_test_reduced = reducer.transform(X_test)
        reduction_time = time.time() - start_time

        print(f'SHAPES: normal train: {X_train.shape} | reduced train: {X_train_reduced.shape}')
        print(f'SHAPES: normal test: {X_test.shape} | reduced test: {X_test_reduced.shape}')

        for model_name in model_names:

            if model_name == "Linear Classifier":
                for epochs in config['linear__epochs']:
                    for lr in config['linear__learning_rate']:

                        print(f"Training {model_name} with {reducer_name} feature embeddings [LR: {lr} | Epochs: {epochs}]")
                        losses, accuracies = [], []
                        # PyTorch Model Setup
                        input_dim = X_train_reduced.shape[1]

                        # The timer covers model setup, training and evaluation.
                        start_time = time.time()

                        linear_model = LinearClassifier(input_dim, num_classes).to(device)
                        criterion = nn.CrossEntropyLoss()
                        optimizer = torch.optim.Adam(linear_model.parameters(), lr=lr)
                        X_train_tensor = torch.tensor(X_train_reduced, dtype=torch.float32).to(device)
                        y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)  # CrossEntropyLoss expects long targets

                        # Full-batch training: one optimizer step per epoch on the whole training set.
                        for epoch in range(epochs):
                            linear_model.train()
                            optimizer.zero_grad()

                            outputs = linear_model(X_train_tensor)
                            loss = criterion(outputs, y_train_tensor)
                            loss.backward()
                            optimizer.step()

                            # Training accuracy of this epoch (computed from the pre-step outputs)
                            _, predicted = torch.max(outputs.detach(), 1)  # predicted class indices
                            correct = (predicted == y_train_tensor).sum().item()
                            accuracy = correct / y_train_tensor.size(0)

                            losses.append(round(loss.item(), 4))
                            accuracies.append(round(accuracy, 4))

                        # Evaluate Linear Classifier
                        linear_model.eval()
                        X_test_tensor = torch.tensor(X_test_reduced, dtype=torch.float32).to(device)
                        with torch.no_grad():
                            outputs = linear_model(X_test_tensor)
                            _, y_pred = torch.max(outputs, 1)  # predicted class indices
                            y_pred_numpy = y_pred.cpu().numpy()

                        training_time = time.time() - start_time

                        handle_results(csv_file_path, y_test, y_pred_numpy, reducer_name, model_name,
                                    params, reduction_time, training_time, neighbors=None,
                                    lr=lr, epochs=epochs, accuracies=accuracies, losses=losses)

            elif model_name == 'KNN':
                for neighbors in config['knn__neighbors']:
                    print(f"Training {model_name} with {reducer_name} feature embeddings [K: {neighbors}]")
                    # The timer covers fitting and predicting on the test set.
                    start_time = time.time()

                    model = KNeighborsClassifier(n_neighbors=neighbors)
                    model.fit(X_train_reduced, y_train)
                    y_pred_numpy = model.predict(X_test_reduced)
                    training_time = time.time() - start_time

                    handle_results(csv_file_path, y_test, y_pred_numpy, reducer_name, model_name,
                                params, reduction_time, training_time, neighbors=neighbors,
                                lr=None, epochs=None, accuracies=None, losses=None)

            else:
                # Raise instead of sys.exit(): exiting the interpreter from a
                # notebook cell only produces a confusing SystemExit.
                raise ValueError(f'UNVALID MODEL NAME: {model_name}')




Applying PCA...
SHAPES: normal train: (664696, 2048) | reduced train: (664696, 256)
SHAPES: normal test: (166174, 2048) | reduced test: (166174, 256)
Training Linear Classifier with PCA feature embeddings [LR: 0.1 | Epochs: 250]


[2024-12-19 14:13:30,218][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 13.23s, Accuracy: 0.6692, Precision: 0.6665, Recall: 0.6692, F1-Score: 0.6672


Training Linear Classifier with PCA feature embeddings [LR: 0.01 | Epochs: 250]


[2024-12-19 14:13:43,434][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 12.96s, Accuracy: 0.6614, Precision: 0.6592, Recall: 0.6614, F1-Score: 0.6595


Training Linear Classifier with PCA feature embeddings [LR: 0.001 | Epochs: 250]


[2024-12-19 14:13:56,669][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 12.97s, Accuracy: 0.5475, Precision: 0.5463, Recall: 0.5475, F1-Score: 0.5431


Training Linear Classifier with PCA feature embeddings [LR: 0.0001 | Epochs: 250]


[2024-12-19 14:14:09,965][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 13.02s, Accuracy: 0.1963, Precision: 0.2074, Recall: 0.1963, F1-Score: 0.1809


Training Linear Classifier with PCA feature embeddings [LR: 0.1 | Epochs: 500]


[2024-12-19 14:14:36,321][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 26.1s, Accuracy: 0.6693, Precision: 0.6666, Recall: 0.6693, F1-Score: 0.6673


Training Linear Classifier with PCA feature embeddings [LR: 0.01 | Epochs: 500]


[2024-12-19 14:15:03,466][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 26.88s, Accuracy: 0.6664, Precision: 0.6638, Recall: 0.6664, F1-Score: 0.6644


Training Linear Classifier with PCA feature embeddings [LR: 0.001 | Epochs: 500]


[2024-12-19 14:15:30,025][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 26.31s, Accuracy: 0.6041, Precision: 0.6024, Recall: 0.6041, F1-Score: 0.6013


Training Linear Classifier with PCA feature embeddings [LR: 0.0001 | Epochs: 500]


[2024-12-19 14:15:56,587][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 26.31s, Accuracy: 0.3369, Precision: 0.3607, Recall: 0.3369, F1-Score: 0.3233


Training Linear Classifier with PCA feature embeddings [LR: 0.1 | Epochs: 750]


[2024-12-19 14:16:36,322][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 39.48s, Accuracy: 0.6691, Precision: 0.6666, Recall: 0.6691, F1-Score: 0.6671


Training Linear Classifier with PCA feature embeddings [LR: 0.01 | Epochs: 750]


[2024-12-19 14:17:16,165][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 39.56s, Accuracy: 0.6678, Precision: 0.6652, Recall: 0.6678, F1-Score: 0.6659


Training Linear Classifier with PCA feature embeddings [LR: 0.001 | Epochs: 750]


[2024-12-19 14:17:56,944][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 40.52s, Accuracy: 0.6264, Precision: 0.6245, Recall: 0.6264, F1-Score: 0.6241


Training Linear Classifier with PCA feature embeddings [LR: 0.0001 | Epochs: 750]


[2024-12-19 14:18:36,713][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 39.52s, Accuracy: 0.4059, Precision: 0.4192, Recall: 0.4059, F1-Score: 0.395


Training Linear Classifier with PCA feature embeddings [LR: 0.1 | Epochs: 1000]


[2024-12-19 14:19:29,689][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 52.7s, Accuracy: 0.669, Precision: 0.6665, Recall: 0.669, F1-Score: 0.6669


Training Linear Classifier with PCA feature embeddings [LR: 0.01 | Epochs: 1000]


[2024-12-19 14:20:22,647][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 52.7s, Accuracy: 0.6687, Precision: 0.666, Recall: 0.6687, F1-Score: 0.6667


Training Linear Classifier with PCA feature embeddings [LR: 0.001 | Epochs: 1000]


[2024-12-19 14:21:15,664][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 52.76s, Accuracy: 0.6391, Precision: 0.6371, Recall: 0.6391, F1-Score: 0.6369


Training Linear Classifier with PCA feature embeddings [LR: 0.0001 | Epochs: 1000]


[2024-12-19 14:22:08,629][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 52.71s, Accuracy: 0.4444, Precision: 0.4536, Recall: 0.4444, F1-Score: 0.4349


Training Linear Classifier with PCA feature embeddings [LR: 0.1 | Epochs: 1500]


[2024-12-19 14:23:30,000][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 81.12s, Accuracy: 0.6688, Precision: 0.6662, Recall: 0.6688, F1-Score: 0.6666


Training Linear Classifier with PCA feature embeddings [LR: 0.01 | Epochs: 1500]


[2024-12-19 14:24:49,286][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 79.03s, Accuracy: 0.6691, Precision: 0.6664, Recall: 0.6691, F1-Score: 0.6671


Training Linear Classifier with PCA feature embeddings [LR: 0.001 | Epochs: 1500]


[2024-12-19 14:26:10,366][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 80.8s, Accuracy: 0.6526, Precision: 0.6506, Recall: 0.6526, F1-Score: 0.6506


Training Linear Classifier with PCA feature embeddings [LR: 0.0001 | Epochs: 1500]


[2024-12-19 14:27:29,610][INFO] - [PCA ({'n_components': 256, 'random_state': 42})][Linear Classifier] Reduction time: 42.02s, Training time: 78.97s, Accuracy: 0.4942, Precision: 0.4943, Recall: 0.4942, F1-Score: 0.4876


Training KNN with PCA feature embeddings [K: 10]


In [None]:
import ast

import matplotlib.pyplot as plt

# The runs are only persisted to the results CSV (there is no in-memory
# `results` list in the visible cells), so read them back from disk.
results_df = pd.read_csv(csv_file_path)

# Only linear-classifier rows carry per-epoch curves; KNN rows store no lists.
linear_runs = results_df[results_df['Model'] == 'Linear Classifier']

for _, run in linear_runs.iterrows():
    # The CSV stores each list as its string representation; parse it back.
    # NOTE(review): a diverged run containing nan/inf would not parse here.
    epoch_losses = ast.literal_eval(run['Epoch Losses'])
    epoch_accuracies = ast.literal_eval(run['Epoch Accuracies'])

    # Loss as a percentage of the first epoch's loss, accuracy in percent,
    # so both curves share one axis.
    loss = [x / epoch_losses[0] * 100 for x in epoch_losses]
    acc = [x * 100 for x in epoch_accuracies]

    plt.plot(loss, label='Loss (% of first epoch)')
    plt.plot(acc, label='Training accuracy (%)')
    plt.title(f"{run['Method']} {run['Parameters']} | LR: {run['Learning Rate']} | Epochs: {run['Epochs']}")
    plt.xlabel('Epoch')
    plt.legend()
    plt.show()
    plt.close()

In [None]:
# TODO: evaluate Optuna for the hyperparameter search.
# Reference for further reduction methods: https://en.wikipedia.org/wiki/Nonlinear_dimensionality_reduction