In [13]:
import numpy as np 
import pandas as pd 
import sys
import os

sys.path.append(os.path.abspath(os.path.join('..', 'src')))
from lepidoptera_dataset import LepidopteraDataset, get_labelencoding
from utils_helpers import show_sample, check_folder_exists, save_features, load_features

# Root folder of the project data. NOTE(review): absolute, machine-specific path.
PATH_TO_DATA = "/home/lgierz/BA_MothClassification/data/"
# Folder holding the extracted feature files (keeps the trailing slash so that
# file names can be appended directly).
PATH_TO_FEATURES = os.path.join(PATH_TO_DATA, 'processed/features/')



In [3]:
# The DINOv2 features are stored in three separate NPZ part files.
feature_file1 = PATH_TO_FEATURES + 'features_DINOv2_top589_max3000_File1.npz'
feature_file2 = PATH_TO_FEATURES + 'features_DINOv2_top589_max3000_File2.npz'
feature_file3 = PATH_TO_FEATURES + 'features_DINOv2_top589_max3000_File3.npz'
part_files = (feature_file1, feature_file2, feature_file3)

# For every part print: number of rows, array shape, number of distinct labels
# and number of distinct gbifIDs.
for part_no in range(1, len(part_files) + 1):
    print('Attributes of feature file Nr: ', part_no)
    f, l, g = load_features(part_files[part_no - 1])
    distinct_labels = set(l)
    distinct_gbifids = set(g)
    print(len(f), f.shape, len(distinct_labels), len(distinct_gbifids))


Attributes of feature file Nr:  1
Features and labels loaded from /home/lgierz/BA_MothClassification/data/processed/features/features_DINOv2_top589_max3000_File1.npz
220000 (220000, 1536) 92 220000
Attributes of feature file Nr:  2
Features and labels loaded from /home/lgierz/BA_MothClassification/data/processed/features/features_DINOv2_top589_max3000_File2.npz
760000 (760000, 1536) 325 760000
Attributes of feature file Nr:  3
Features and labels loaded from /home/lgierz/BA_MothClassification/data/processed/features/features_DINOv2_top589_max3000_File3.npz
407808 (407808, 1536) 174 407808


In [6]:
# Merge the three feature part files into one NPZ file.
feature_file1 = PATH_TO_FEATURES + 'features_DINOv2_top589_max3000_File1.npz'
feature_file2 = PATH_TO_FEATURES + 'features_DINOv2_top589_max3000_File2.npz'
feature_file3 = PATH_TO_FEATURES + 'features_DINOv2_top589_max3000_File3.npz'

f1, l1, g1 = load_features(feature_file1)
f2, l2, g2 = load_features(feature_file2)
f3, l3, g3 = load_features(feature_file3)

# Stack the parts row-wise; the three arrays stay aligned because every part
# is appended in the same order.
features = np.concatenate((f1, f2, f3), axis=0)
labels = np.concatenate((l1, l2, l3), axis=0)
gbifids = np.concatenate((g1, g2, g3), axis=0)

# Guard against a corrupt part file producing misaligned arrays.
assert len(features) == len(labels) == len(gbifids), \
    'features, labels and gbifids must have the same number of rows'

# Save next to the part files. The following cell loads the merged file from
# PATH_TO_FEATURES, so a bare relative file name would write to the current
# working directory and the load would read a different (or stale) file.
full_feature_file = PATH_TO_FEATURES + 'DINOv2_Features_Full.npz'
save_features(features, labels, gbifids, full_feature_file)


Features and labels loaded from /home/lgierz/BA_MothClassification/data/processed/features/features_DINOv2_top589_max3000_File1.npz
Features and labels loaded from /home/lgierz/BA_MothClassification/data/processed/features/features_DINOv2_top589_max3000_File2.npz
Features and labels loaded from /home/lgierz/BA_MothClassification/data/processed/features/features_DINOv2_top589_max3000_File3.npz
Features and labels saved to DINOv2_Features_Full.npz


In [8]:
# Reload the merged file and print: number of rows, array shape, number of
# distinct labels and number of distinct gbifIDs.
merged_feature_file = PATH_TO_FEATURES + 'DINOv2_Features_Full.npz'
f, l, g = load_features(merged_feature_file)
distinct_labels = set(l)
distinct_gbifids = set(g)
print(len(f), f.shape, len(distinct_labels), len(distinct_gbifids))


Features and labels loaded from /home/lgierz/BA_MothClassification/data/processed/features/DINOv2_Features_Full.npz
1387808 (1387808, 1536) 325 1387808


In [4]:
# CSV that records a processing status for every sample.
DATASET = PATH_TO_FEATURES + 'dataset_top589_max3000_fex_statusupdate.csv'

# Read the table and store the status column as strings.
csv_file = pd.read_csv(DATASET).assign(status=lambda frame: frame['status'].astype('str'))

# Keep only the rows whose status is exactly 'DFEX1'.
has_dfex1_status = csv_file['status'] == 'DFEX1'
csv_file_filtered = csv_file[has_dfex1_status]

### Correct the labels using the gbifID and the original CSV file

In [14]:
def update_labels(features, gbifids, df, output_file):
    """Recompute the label of every sample from its gbifID and save the result.

    Each gbifID is looked up in ``df`` to get its scientific name, the name is
    translated through the encoding returned by ``get_labelencoding(df)``, and
    the features are written together with the new labels via ``save_features``.

    Args:
        features: Feature rows, one per entry of ``gbifids``.
        gbifids: Iterable of gbifIDs, aligned row-by-row with ``features``.
        df: DataFrame with the columns ``gbifID`` and ``scientificName``.
        output_file: Destination path handed to ``save_features``.

    Raises:
        ValueError: If ``features`` and ``gbifids`` differ in length.
        KeyError: If a gbifID is missing from ``df`` or a scientific name is
            missing from the label encoding.
    """
    if len(features) != len(gbifids):
        raise ValueError(
            f'features ({len(features)}) and gbifids ({len(gbifids)}) must have the same length'
        )

    # gbifID -> scientific name, taken from the reference table
    name_by_gbifid = df.set_index('gbifID')['scientificName'].to_dict()
    # scientific name -> encoded label (presumably an integer; see get_labelencoding)
    encoding = get_labelencoding(df)

    try:
        new_labels = np.array([encoding[name_by_gbifid[gbifid]] for gbifid in gbifids])
    except KeyError as err:
        # Same exception type as before, but with enough context to debug.
        raise KeyError(
            f'no entry for {err} in the DataFrame (gbifID) or in the label encoding (scientificName)'
        ) from err

    # Save the features together with the corrected labels
    save_features(features, new_labels, gbifids, output_file)



In [15]:
# Write a relabelled copy of the merged features; labels are recomputed from
# the filtered CSV table.
new_feature_file = PATH_TO_FEATURES + 'DINOv2_Features_Full_CorrectlyLabeled.npz'
update_labels(
    features=f,
    gbifids=g,
    df=csv_file_filtered,
    output_file=new_feature_file,
)

Features and labels saved to /home/lgierz/BA_MothClassification/data/processed/features/DINOv2_Features_Full_CorrectlyLabeled.npz


In [18]:
# Reload the relabelled file and print: number of rows, array shape, number of
# distinct labels and number of distinct gbifIDs.
relabeled_feature_file = PATH_TO_FEATURES + 'DINOv2_Features_Full_CorrectlyLabeled.npz'
f, l, g = load_features(relabeled_feature_file)
distinct_labels = set(l)
distinct_gbifids = set(g)
print(len(f), f.shape, len(distinct_labels), len(distinct_gbifids))

Features and labels loaded from /home/lgierz/BA_MothClassification/data/processed/features/DINOv2_Features_Full_CorrectlyLabeled.npz
1387808 (1387808, 1536) 589 1387808


### Create feature datasets

In [20]:
import numpy as np
import pandas as pd

# Dataset configuration: name -> (number of classes, per-class sample caps).
# Every (classes, cap) pair corresponds to one CSV file in processed/cv_datasets/.
dataset_configs = {
    'top277': (277, [3000, 2000, 1000, 500]),
    'top387': (387, [2000, 1000, 500]),
    'top589': (589, [1000, 500])
}

# Path to the existing features NPZ file
feature_file = PATH_TO_FEATURES + 'DINOv2_Features_Full_CorrectlyLabeled.npz'
features, labels, gbifids = load_features(feature_file)

# Write one feature subset per dataset CSV.
for class_amount, sample_amounts in dataset_configs.values():
    for amount in sample_amounts:
        # Use a dedicated name for the path so that the `csv_file` DataFrame
        # created earlier in the notebook is not overwritten by a string.
        dataset_csv_path = PATH_TO_DATA + f'processed/cv_datasets/dataset_top{class_amount}_max{amount}.csv'
        dataset_df = pd.read_csv(dataset_csv_path)

        # gbifIDs that belong to this dataset variant
        filtered_gbifids = dataset_df['gbifID'].values

        # Boolean mask over the full feature set; the same mask is applied to
        # all three arrays so that they stay aligned row-by-row.
        indices = np.isin(gbifids, filtered_gbifids)
        new_features = features[indices]
        new_labels = labels[indices]
        new_gbifids = gbifids[indices]

        # Save the new filtered dataset
        output_file = PATH_TO_DATA + f'processed/cv_datasets/dino_feature_dataset_top{class_amount}_max{amount}.npz'
        save_features(new_features, new_labels, new_gbifids, output_file)
        print(f'Labels: {len(set(new_labels))} | gbifIDs: {len(set(new_gbifids))} | Samples: {len(new_features)} | Saved to: {output_file}')


Features and labels loaded from /home/lgierz/BA_MothClassification/data/processed/features/DINOv2_Features_Full_CorrectlyLabeled.npz
Features and labels saved to /home/lgierz/BA_MothClassification/data/processed/cv_datasets/dino_feature_dataset_top277_max3000.npz
Labels: 277 | gbifIDs: 830870 | Samples: 830870 | Saved to: /home/lgierz/BA_MothClassification/data/processed/cv_datasets/dino_feature_dataset_top277_max3000.npz
Features and labels saved to /home/lgierz/BA_MothClassification/data/processed/cv_datasets/dino_feature_dataset_top277_max2000.npz
Labels: 277 | gbifIDs: 554000 | Samples: 554000 | Saved to: /home/lgierz/BA_MothClassification/data/processed/cv_datasets/dino_feature_dataset_top277_max2000.npz
Features and labels saved to /home/lgierz/BA_MothClassification/data/processed/cv_datasets/dino_feature_dataset_top277_max1000.npz
Labels: 277 | gbifIDs: 277000 | Samples: 277000 | Saved to: /home/lgierz/BA_MothClassification/data/processed/cv_datasets/dino_feature_dataset_top277_