In [1]:
import numpy as np
import pandas as pd

# Root of the data directory; every other path below is derived from it.
PATH_TO_DATA = '/home/lgierz/BA_MothClassification/data/'
PATH_TO_CA = f'{PATH_TO_DATA}confidence_analysis/'
PATH_TO_DATASETS = f'{PATH_TO_DATA}processed/cv_datasets/'
PATH_TO_LOGFILE = f'{PATH_TO_DATA}status/confidence_tests1.log'
# Combined table of all prediction files (labels still uncorrected).
csv_file_path = f'{PATH_TO_CA}results/all_results_uncorrected_labels.csv'

# Classifier names and foundation-model names; both are part of the
# prediction file names and are stored as columns in the combined table.
model_names = ["Linear Classifier", "KNN"]
fm_names = ['resnet', 'dino']

# dataset key -> (number of species, sample amounts evaluated for that dataset)
dataset_configs = dict(
    top277=(277, [3000, 2000, 1000, 500]),
    top387=(387, [2000, 1000, 500]),
    top589=(589, [1000, 500]),
)

### Combining all npz confidence result files into a single csv
- with the sorted top 10 confidences for each prediction
- labels have not been corrected yet: they are still dataset-local indices (e.g. 0-276 for the 277-class dataset) and still have to be adjusted to indices over all species

In [3]:
        

def load_predictions(filename):
    """Load the prediction arrays stored in one ``.npz`` results file.

    Parameters
    ----------
    filename : str or path-like
        Path of an ``.npz`` archive containing the arrays ``gbifids``,
        ``labels``, ``predicted_labels`` and ``confidances``.

    Returns
    -------
    tuple
        ``(gbifids, labels, predicted_labels, confidences)`` exactly as
        stored in the archive.

    Raises
    ------
    KeyError
        If one of the expected arrays is missing from the archive.
    """
    # np.load on an .npz returns a lazy NpzFile that keeps the file open;
    # the context manager closes it once the arrays have been read.
    with np.load(filename) as data:
        gbifids = data['gbifids']
        labels = data['labels']
        predicted_labels = data['predicted_labels']
        # The key is spelled 'confidances' (sic); it has to match the archive.
        confidences = data['confidances']

    print(f"Features and labels loaded from {filename}")
    return gbifids, labels, predicted_labels, confidences

TOP_K = 10             # number of highest-confidence classes kept per prediction
WRITE_RESULTS = False  # True -> (re)create csv_file_path; off by default so a re-run leaves the file untouched


def top_k_confidences(class_confidences, k=TOP_K):
    """Return the ``k`` largest ``(class_index, confidence)`` pairs, highest first.

    Ties keep their original class order because ``sorted`` is stable.
    """
    return sorted(enumerate(class_confidences), key=lambda item: item[1], reverse=True)[:k]


counter = 0
final_length = 0
for dataset, (species, sample_amounts) in dataset_configs.items():
    for samples in sample_amounts:
        for fm in fm_names:
            for model in model_names:
                counter += 1
                gbifids, labels, predicted_labels, confidences = load_predictions(
                    PATH_TO_CA + f"predictions/predictions_max{species}top{samples}_{fm}_{model}.npz"
                )

                # Flatten once so every per-sample array is 1-D.
                gbifids = gbifids.flatten()
                labels = labels.flatten()
                predicted_labels = predicted_labels.flatten()

                # All four arrays must describe the same samples. Fail with a
                # clear message instead of building a frame from arrays of
                # different lengths.
                length = len(gbifids)
                if not (len(labels) == len(predicted_labels) == len(confidences) == length):
                    raise ValueError(
                        f"Inconsistent array lengths in results {counter} "
                        f"(top{species}max{samples}, {fm}, {model}): "
                        f"gbifids={length}, labels={len(labels)}, "
                        f"predicted_labels={len(predicted_labels)}, confidences={len(confidences)}"
                    )

                final_length += length

                # One row per prediction. 'Confidence' holds the TOP_K
                # (class_index, confidence) pairs of that prediction.
                # NOTE(review): the confidence values stay NumPy scalars; with
                # NumPy >= 2 their repr inside a list looks like
                # `np.float32(...)`, and that text is what ends up in the CSV.
                # Confirm the downstream parser handles the installed version.
                df_predictions = pd.DataFrame({
                    'Species': [species] * length,
                    'Samples': [samples] * length,
                    'FoundationalModel': [fm] * length,
                    'Model': [model] * length,
                    'GbifID': gbifids,
                    'Label': labels,
                    'PredictedLabel': predicted_labels,
                    'Confidence': [top_k_confidences(row) for row in confidences],
                })

                run_description = f'top{species}max{samples} for {fm.upper()} and {model}'
                if WRITE_RESULTS:
                    # The first chunk creates the file and writes the header;
                    # later chunks are appended without one.
                    is_first_chunk = counter == 1
                    df_predictions.to_csv(
                        csv_file_path,
                        mode='w' if is_first_chunk else 'a',
                        header=is_first_chunk,
                        index=False,
                    )
                    print(f'Added results {counter} ({run_description}) containing {length} rows to {csv_file_path}')
                else:
                    print(f'Prepared results {counter} ({run_description}) containing {length} rows (not written: WRITE_RESULTS is False)')
                print()
print(f'\nThe final length of the DataFrame should be {final_length}. Exiting...')


Features and labels loaded from /home/lgierz/BA_MothClassification/data/confidence_analysis/predictions/predictions_max277top3000_resnet_Linear Classifier.npz
Added results 1 (top277max3000 for RESNET and Linear Classifier) contianing 166174 rows to /home/lgierz/BA_MothClassification/data/confidence_analysis/results/all_results_uncorrected_labels.csv

Features and labels loaded from /home/lgierz/BA_MothClassification/data/confidence_analysis/predictions/predictions_max277top3000_resnet_KNN.npz
Added results 2 (top277max3000 for RESNET and KNN) contianing 166174 rows to /home/lgierz/BA_MothClassification/data/confidence_analysis/results/all_results_uncorrected_labels.csv

Features and labels loaded from /home/lgierz/BA_MothClassification/data/confidence_analysis/predictions/predictions_max277top3000_dino_Linear Classifier.npz
Added results 3 (top277max3000 for DINO and Linear Classifier) contianing 166174 rows to /home/lgierz/BA_MothClassification/data/confidence_analysis/results/all_re

### Correct labels

In [5]:
PATH_TO_LABELS = PATH_TO_DATA + 'processed/dataset_top589_max3000.csv'
PATH_TO_CONFS = PATH_TO_CA + 'results/all_results_uncorrected_labels.csv'
PATH_TO_NEW_CONFS = PATH_TO_CA + 'results/all_results_corrected_labels.csv'
WRITE_CORRECTED_RESULTS = False  # True -> overwrite PATH_TO_NEW_CONFS with the corrected table


def build_correction_mapping(predictions, gbifid_to_name, species_mapping):
    """Derive, for one dataset configuration, which stored label each global label corresponds to.

    Parameters
    ----------
    predictions : pd.DataFrame
        Rows of one (Species, Samples) configuration with columns
        'Label' (stored, dataset-local label) and 'GbifID'.
    gbifid_to_name : pd.Series
        scientificName indexed by unique gbifID.
    species_mapping : dict
        scientificName -> global label index.

    Returns
    -------
    list of int
        ``mapping[global_label] == stored_label``; -1 where no stored label
        was seen for that global label.

    Raises
    ------
    KeyError
        If a GbifID is not present in ``gbifid_to_name``.
    ValueError
        If two different stored labels resolve to the same global label.
    """
    correction_mapping = [-1] * len(species_mapping)
    # The first row of every distinct stored label is enough to resolve it.
    first_rows = predictions.drop_duplicates(subset='Label')
    for wrong_label, gbifid in zip(first_rows['Label'], first_rows['GbifID']):
        true_label = species_mapping[gbifid_to_name.loc[gbifid]]
        if correction_mapping[true_label] != -1:
            raise ValueError(
                f'Stored labels {correction_mapping[true_label]} and {int(wrong_label)} '
                f'both resolve to global label {true_label}'
            )
        correction_mapping[true_label] = int(wrong_label)
    return correction_mapping


def apply_correction(predictions, correction_mapping):
    """Return a copy of ``predictions`` with all stored labels replaced by global labels.

    'Label', 'PredictedLabel' and the class indices inside 'Confidence' are
    translated. Rows are ordered by corrected 'Label', keeping the original
    order within a label.

    Raises
    ------
    KeyError
        If a stored label has no entry in ``correction_mapping``.
    """
    # Invert the list once so each translation is a dict lookup.
    wrong_to_true = {wrong: true for true, wrong in enumerate(correction_mapping) if wrong != -1}

    def parse_and_remap(cell):
        # NOTE(review): eval() executes whatever text is in the CSV cell. It
        # is kept because the cell may contain NumPy scalar reprs, but it
        # must only ever be run on files produced by this notebook.
        return [(wrong_to_true[i], x) for i, x in eval(cell)]

    corrected = predictions.copy()  # never write into a slice of confidences_csv
    corrected['Label'] = corrected['Label'].map(lambda label: wrong_to_true[label])
    corrected['PredictedLabel'] = corrected['PredictedLabel'].map(lambda label: wrong_to_true[label])
    corrected['Confidence'] = corrected['Confidence'].map(parse_and_remap)
    return corrected.sort_values('Label', kind='mergesort')  # mergesort is stable


labels_csv = pd.read_csv(PATH_TO_LABELS)
confidences_csv = pd.read_csv(PATH_TO_CONFS)

# Global label space: position of each species name in the sorted list of
# all species found in the label file.
unique_species = sorted(labels_csv['scientificName'].unique())
species_mapping = {name: idx for idx, name in enumerate(unique_species)}

print(unique_species)
print(species_mapping)
print(len(unique_species))

# gbifID -> scientificName lookup, built once; the first occurrence wins.
gbifid_to_name = labels_csv.drop_duplicates(subset='gbifID').set_index('gbifID')['scientificName']

corrected_parts = []
for dataset, (species, sample_amounts) in dataset_configs.items():
    for samples in sample_amounts:
        confidences_csv_filtered = confidences_csv[
            (confidences_csv['Species'] == species) & (confidences_csv['Samples'] == samples)
        ]
        correction_mapping = build_correction_mapping(confidences_csv_filtered, gbifid_to_name, species_mapping)

        # Only correct a configuration once every one of its labels has been resolved.
        n_mapped = len(correction_mapping) - correction_mapping.count(-1)
        if n_mapped < species:
            print(f'Skipping top{species}max{samples}: only {n_mapped} of {species} labels could be resolved')
            continue

        print(correction_mapping)
        corrected_parts.append(apply_correction(confidences_csv_filtered, correction_mapping))

# Corrected table over all configurations.
new_df = pd.concat(corrected_parts, ignore_index=True) if corrected_parts else pd.DataFrame()

if WRITE_CORRECTED_RESULTS:
    new_df.to_csv(PATH_TO_NEW_CONFS, header=True, index=False)



['Abraxas grossulariata', 'Abraxas sylvata', 'Abrostola triplasia', 'Acasis viretata', 'Acentria ephemerella', 'Acleris forsskaleana', 'Acleris notana/ferrugana', 'Acleris variegana', 'Acontia lucida', 'Acontia trabealis', 'Acrobasis advenella', 'Acrobasis repandana', 'Acrobasis tumidana', 'Acronicta aceris', 'Acronicta leporina', 'Acronicta rumicis', 'Acronicta tridens / psi', 'Adela reaumurella', 'Adscita statices', 'Aethalura punctulata', 'Agapeta hamana', 'Agapeta zoegana', 'Aglais io', 'Aglais urticae', 'Aglossa pinguinalis', 'Agonopterix arenella', 'Agonopterix heracliana / ciliella', 'Agriopis aurantiaria', 'Agriopis leucophaearia', 'Agriopis marginaria', 'Agriphila geniculea', 'Agriphila inquinatella', 'Agriphila selasella', 'Agriphila straminella', 'Agriphila tristella', 'Agrius convolvuli', 'Agrochola circellaris', 'Agrochola lota', 'Agrochola lunosa', 'Agrochola lychnidis', 'Agrochola macilenta', 'Agrotis clavis', 'Agrotis exclamationis', 'Agrotis ipsilon', 'Agrotis puta', '

### Rerun with corrected labels to ensure correctness

In [6]:
PATH_TO_LABELS = PATH_TO_DATA + 'processed/dataset_top589_max3000.csv'
PATH_TO_CORRECTED_CONFS = PATH_TO_CA + 'results/all_results_corrected_labels.csv'
PATH_TO_NEW_CONFS = PATH_TO_CORRECTED_CONFS  # same file; this cell only reads it


labels_csv = pd.read_csv(PATH_TO_LABELS)
confidences_csv = pd.read_csv(PATH_TO_CORRECTED_CONFS)

# Global label space: position of each species name in the sorted list of
# all species found in the label file.
unique_species = sorted(labels_csv['scientificName'].unique())
species_mapping = {name: idx for idx, name in enumerate(unique_species)}

print(unique_species)
print(species_mapping)
print(len(unique_species))

# gbifID -> scientificName lookup, built once; the first occurrence wins.
gbifid_to_name = labels_csv.drop_duplicates(subset='gbifID').set_index('gbifID')['scientificName']

# (species, samples, stored_label, expected_label) for every label that is
# still not the global index of its species.
label_mismatches = []

for dataset, (species, sample_amounts) in dataset_configs.items():
    for samples in sample_amounts:
        correction_mapping = [-1] * len(unique_species)
        confidences_csv_filtered = confidences_csv[
            (confidences_csv['Species'] == species) & (confidences_csv['Samples'] == samples)
        ]

        # The first row of every distinct stored label is enough to resolve it.
        first_rows = confidences_csv_filtered.drop_duplicates(subset='Label')
        for stored_label, gbifid in zip(first_rows['Label'], first_rows['GbifID']):
            true_label = species_mapping[gbifid_to_name.loc[gbifid]]
            correction_mapping[true_label] = int(stored_label)
            if true_label != stored_label:
                label_mismatches.append((species, samples, int(stored_label), true_label))

        if correction_mapping.count(-1) <= (len(unique_species) - species):
            print(correction_mapping)

# With corrected labels every stored label must already equal its global
# index, i.e. each derived mapping is the identity on the labels it covers.
assert not label_mismatches, (
    f'{len(label_mismatches)} labels are still not corrected, '
    f'e.g. (species, samples, stored, expected) = {label_mismatches[:5]}'
)



['Abraxas grossulariata', 'Abraxas sylvata', 'Abrostola triplasia', 'Acasis viretata', 'Acentria ephemerella', 'Acleris forsskaleana', 'Acleris notana/ferrugana', 'Acleris variegana', 'Acontia lucida', 'Acontia trabealis', 'Acrobasis advenella', 'Acrobasis repandana', 'Acrobasis tumidana', 'Acronicta aceris', 'Acronicta leporina', 'Acronicta rumicis', 'Acronicta tridens / psi', 'Adela reaumurella', 'Adscita statices', 'Aethalura punctulata', 'Agapeta hamana', 'Agapeta zoegana', 'Aglais io', 'Aglais urticae', 'Aglossa pinguinalis', 'Agonopterix arenella', 'Agonopterix heracliana / ciliella', 'Agriopis aurantiaria', 'Agriopis leucophaearia', 'Agriopis marginaria', 'Agriphila geniculea', 'Agriphila inquinatella', 'Agriphila selasella', 'Agriphila straminella', 'Agriphila tristella', 'Agrius convolvuli', 'Agrochola circellaris', 'Agrochola lota', 'Agrochola lunosa', 'Agrochola lychnidis', 'Agrochola macilenta', 'Agrotis clavis', 'Agrotis exclamationis', 'Agrotis ipsilon', 'Agrotis puta', '

### Adding Scores to runs

In [None]:
# CSV file holding the scores of each run
PATH_TO_SCORES = PATH_TO_DATA + 'status/confidence_tests1.csv'

# Columns that identify a run, followed by the score columns to attach.
RUN_KEYS = ['Species', 'Samples', 'FoundationalModel', 'Model']
SCORE_COLUMNS = RUN_KEYS + ['Training Time (s)', 'Accuracy', 'Precision', 'Recall', 'F1-Score']

# Load the scores and keep only the columns listed above, in that order.
scores_csv = pd.read_csv(PATH_TO_SCORES)[SCORE_COLUMNS]

# Attach the run-level scores to every prediction row of the matching run.
# NOTE(review): how='right' keeps every row of scores_csv, so a run without
# predictions appears as a row of NaNs, and predictions without a scores row
# are dropped. Confirm this is intended.
merged_df = confidences_csv.merge(scores_csv, how='right', on=RUN_KEYS)

# Show the first rows of the merged table
merged_df.head(3)

In [10]:
# Column order for the exported table: run identifiers, run-level scores,
# then the per-prediction fields.
columns = [
    'Species', 'Samples', 'FoundationalModel', 'Model',
    'Training Time (s)', 'Accuracy', 'Precision', 'Recall', 'F1-Score',
    'GbifID', 'Label', 'PredictedLabel', 'Confidence',
]
# Export is disabled; uncomment the next line to write the table to disk.
#merged_df[columns].to_csv(PATH_TO_CA + 'results/all_results_corrected_labels_w_scores.csv', header=True, index=False)