In [1]:
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import tensorflow as tf
import os
from PIL import Image
import pandas as pd

In [2]:
# Load dataset and preprocess images
def load_images_and_labels(image_path, image_size=(224, 224)):
    """Load every image under ``image_path`` and derive taxonomy labels from folder names.

    The directory tree is walked recursively. The immediate parent folder of
    each image must be named ``family_genus_species``; that name is split on
    ``_`` to produce the label columns.

    Args:
        image_path: Root directory to search for images.
        image_size: Size passed to ``PIL.Image.resize`` for every image.

    Returns:
        A tuple ``(images, labels)``. ``images`` is a NumPy array stacking the
        resized RGB images, one entry per image file. ``labels`` is a
        ``pandas.DataFrame`` with the columns ``family``, ``genus`` and
        ``species`` (``species`` holds ``"<genus> <species>"``), row-aligned
        with ``images``.

    Raises:
        ValueError: If a folder that contains images is not named
            ``family_genus_species``.
    """
    # Leading dots so that names such as "thumbjpg" are not mistaken for images.
    image_extensions = ('.jpg', '.jpeg', '.png')
    images, families, genera, species = [], [], [], []

    for root, dirs, files in os.walk(image_path):
        # Sorting in place fixes the traversal order, so the row order of the
        # result (and any seeded split built on it) is reproducible.
        dirs.sort()
        image_files = sorted(
            name for name in files if name.lower().endswith(image_extensions)
        )
        if not image_files:
            continue

        # Extract labels from folder structure: once per folder, and before
        # any image is decoded so that a bad name fails fast.
        folder_name = os.path.basename(root)
        try:
            family, genus, specie = folder_name.split("_")
        except ValueError:
            raise ValueError(
                f"Folder name '{folder_name}' does not follow 'family_genus_species' format."
            ) from None

        for file in image_files:
            img_path = os.path.join(root, file)
            # The context manager closes the file handle; converting to RGB
            # gives every image the same channel layout so the final
            # np.array() can stack them.
            with Image.open(img_path) as img:
                images.append(np.array(img.convert("RGB").resize(image_size)))

            families.append(family)
            genera.append(genus)
            species.append(f"{genus} {specie}")

    images = np.array(images)
    labels = pd.DataFrame({
        'family': families,
        'genus': genera,
        'species': species
    })
    return images, labels

In [12]:
# Dataset location. Set the CRYPTOVISION_DATA_DIR environment variable to point
# at your own copy; the fallback below is the original author's local path.
data_path = os.environ.get(
    "CRYPTOVISION_DATA_DIR",
    "/Users/leonardo/Library/CloudStorage/Box-Box/CryptoVision/Data/fish_functions/Species_v03",
)
image_size = (64, 64)  # Adjust based on dataset

# os.walk() silently yields nothing for a missing directory, which would
# produce an empty dataset and confusing failures in later cells.
if not os.path.isdir(data_path):
    raise FileNotFoundError(f"Dataset directory not found: {data_path}")

# Load images and labels
images, labels = load_images_and_labels(data_path, image_size=image_size)

KeyboardInterrupt: 

In [None]:
# Encode labels: for each taxonomic level, number the distinct class names in
# sorted order and translate the label column into those integer ids.
label_encoders, encoded_labels = {}, {}
for level in ('family', 'genus', 'species'):
    class_names = sorted(labels[level].unique())
    name_to_id = dict(zip(class_names, range(len(class_names))))
    label_encoders[level] = name_to_id
    encoded_labels[level] = labels[level].map(name_to_id).values


In [None]:
# Split the dataset: one flat feature row per image, one target column per
# taxonomic level (family, genus, species), stratified on the species ids.
flat_images = images.reshape(len(images), -1)
taxonomy_targets = np.column_stack(
    [encoded_labels[level] for level in ['family', 'genus', 'species']]
)
X_train, X_test, y_train, y_test = train_test_split(
    flat_images,
    taxonomy_targets,
    test_size=0.2,
    random_state=42,
    stratify=encoded_labels['species'],
)

In [9]:
# Baseline: fit one DummyClassifier (strategy="stratified") per taxonomic level
# and record its weighted metrics in `metrics`, keyed by level name.
metrics = {}
for i, level in enumerate(['family', 'genus', 'species']):
    print(f"\nTraining DummyClassifier for {level} level")
    dummy_clf = DummyClassifier(strategy="stratified", random_state=42)
    dummy_clf.fit(X_train, y_train[:, i])

    # Make predictions
    y_true = y_test[:, i]  # column i holds the targets for this level
    y_pred = dummy_clf.predict(X_test)

    # Evaluate performance. zero_division=0 is passed to all three
    # precision/recall-based scores so that classes which never get predicted
    # are scored as 0 consistently, without undefined-metric warnings.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_true, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0)

    # Store metrics
    metrics[level] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

    # Print results
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")


Training DummyClassifier for family level
Accuracy: 0.1153
Precision: 0.1130
Recall: 0.1153
F1 Score: 0.1141

Training DummyClassifier for genus level
Accuracy: 0.0244
Precision: 0.0245
Recall: 0.0244
F1 Score: 0.0243

Training DummyClassifier for species level
Accuracy: 0.0266
Precision: 0.0269
Recall: 0.0266
F1 Score: 0.0266


In [10]:
# Recap: print every stored metric, grouped by taxonomic level.
print("\nOverall Metrics:")
for level in metrics:
    metric = metrics[level]
    print(f"{level.capitalize()} Level:")
    for metric_name in metric:
        value = metric[metric_name]
        print(f"  {metric_name.capitalize()}: {value:.4f}")


Overall Metrics:
Family Level:
  Accuracy: 0.1153
  Precision: 0.1130
  Recall: 0.1153
  F1: 0.1141
Genus Level:
  Accuracy: 0.0244
  Precision: 0.0245
  Recall: 0.0244
  F1: 0.0243
Species Level:
  Accuracy: 0.0266
  Precision: 0.0269
  Recall: 0.0266
  F1: 0.0266


In [2]:
import os

source_path = '/Users/leonardo/Documents/Projects/cryptovision/data/processed/cv_images_dataset'
new_path = '/Volumes/T7_shield/CryptoVision/Data/others/hemingson_reviewed'

# Recreate the top-level folder names of source_path (empty) under new_path.
for folder in sorted(os.listdir(source_path)):
    # Skip hidden entries, and skip plain files: os.listdir() returns both
    # files and directories, and a file must not be recreated as a directory.
    if folder.startswith(".") or not os.path.isdir(os.path.join(source_path, folder)):
        continue

    os.makedirs(os.path.join(new_path, folder), exist_ok=True)