In [2]:
import math
from pathlib import Path

import kagglehub
import numpy as np
import tensorflow as tf
from scipy.spatial import distance

# Function to partition dataset
def get_dataset_partitions_tf(ds, train_split=0.8, val_split=0.1, test_split=0.1, shuffle=True, shuffle_size=10000):
    """Split a sized dataset into consecutive train/validation/test partitions.

    The partitions are carved out with ``take``/``skip``: the first
    ``int(train_split * len(ds))`` elements form the training partition, the
    next ``int(val_split * len(ds))`` the validation partition, and everything
    that remains forms the test partition.

    Args:
        ds: Dataset object supporting ``len()``, ``shuffle``, ``take`` and
            ``skip`` (e.g. a batched ``tf.data.Dataset``, in which case the
            sizes are counted in batches).
        train_split: Fraction of elements assigned to the training partition.
        val_split: Fraction of elements assigned to the validation partition.
        test_split: Fraction for the test partition; only used to validate
            that the three fractions sum to 1.
        shuffle: Whether to shuffle ``ds`` (fixed seed 12) before splitting.
        shuffle_size: Shuffle buffer size passed to ``ds.shuffle``.

    Returns:
        A ``(train_ds, val_ds, test_ds)`` tuple.

    Raises:
        ValueError: If a fraction is negative or the fractions do not sum
            to 1 (within floating-point tolerance).
    """
    if min(train_split, val_split, test_split) < 0:
        raise ValueError("train_split, val_split and test_split must be non-negative")
    total = train_split + val_split + test_split
    # Exact float equality rejects valid inputs such as 0.7 + 0.2 + 0.1.
    if not math.isclose(total, 1.0, rel_tol=0.0, abs_tol=1e-9):
        raise ValueError(
            f"train_split + val_split + test_split must equal 1, got {total}"
        )

    ds_size = len(ds)
    if shuffle:
        # Freeze the shuffled order: with the default per-iteration reshuffle,
        # every pass over a partition sees a different ordering, so elements
        # migrate between the train/val/test partitions. Callers that want a
        # fresh order per epoch should shuffle the returned train partition.
        ds = ds.shuffle(shuffle_size, seed=12, reshuffle_each_iteration=False)

    train_size = int(train_split * ds_size)
    val_size = int(val_split * ds_size)

    train_ds = ds.take(train_size)
    remainder = ds.skip(train_size)
    val_ds = remainder.take(val_size)
    test_ds = remainder.skip(val_size)
    return train_ds, val_ds, test_ds

# Load dataset: download (or reuse the cached copy of) the PlantVillage data
path = kagglehub.dataset_download("emmarex/plantdisease")
base_path = Path(path) / "PlantVillage"
# Keep only the tomato class directories. pathlib is used because `os` is not
# imported in this file; stray files whose name starts with "Tomato" are
# skipped since class names must be sub-directories. Listing order is left
# untouched so class indices stay consistent with any code that relies on it.
tomato_folders = [
    entry.name
    for entry in base_path.iterdir()
    if entry.is_dir() and entry.name.startswith("Tomato")
]

# Batched image dataset restricted to the tomato classes
dataset = tf.keras.utils.image_dataset_from_directory(
    base_path,
    seed=123,
    shuffle=True,
    image_size=(256, 256),
    batch_size=32,
    class_names=tomato_folders
)

# Only the validation partition is needed to calibrate the OOD statistics
_, val_ds, _ = get_dataset_partitions_tf(dataset, train_split=0.8, val_split=0.1, test_split=0.1)

# Restore the trained classifier from disk
MODEL_PATH = "models/tomato_main.keras"
disease_model = tf.keras.models.load_model(MODEL_PATH)
# Build explicitly for batches of 256x256 3-channel images so the model's
# input/output are defined before they are referenced below
disease_model.build(input_shape=(None, 256, 256, 3))

# Feature extractor: shares the classifier's input and exposes the output of
# the second-to-last layer (presumably the one feeding the final Dense layer)
feature_extractor = tf.keras.Model(
    disease_model.input,
    disease_model.layers[-2].output,
)

# Collect penultimate-layer features and the maximum class score (MSP) for
# every validation image. NOTE(review): "MSP" assumes the classifier's output
# is a softmax distribution -- confirm against the model definition.
features = []
msp_values = []
for images, _ in val_ds:
    # predict_on_batch does one forward pass on the given batch and returns
    # NumPy arrays; Model.predict is meant for whole datasets and carries
    # per-call overhead (and progress output) when invoked inside a loop.
    preds = disease_model.predict_on_batch(images)
    features.append(feature_extractor.predict_on_batch(images))
    msp_values.extend(np.max(preds, axis=1))

if not features:
    # Fail with a clear message instead of np.concatenate's generic error
    raise ValueError("Validation split is empty; cannot compute OOD statistics")

features = np.concatenate(features, axis=0)
msp_values = np.array(msp_values)

# Compute mean and covariance of the validation features for Mahalanobis
mean_feature = np.mean(features, axis=0)
cov_feature = np.cov(features, rowvar=False)  # rows are samples, columns are feature dimensions

# Set thresholds (5th percentile for MSP, 95th for Mahalanobis)
msp_threshold = np.percentile(msp_values, 5)  # Low MSP indicates OOD

# Invert the covariance once: it is identical for every sample, so inverting
# it inside the comprehension repeats an O(d^3) operation per feature vector.
# NOTE(review): np.linalg.inv raises LinAlgError when the covariance is
# singular (e.g. fewer samples than feature dimensions) -- consider
# regularising the covariance if that happens.
inv_cov_feature = np.linalg.inv(cov_feature)
mahala_distances = [distance.mahalanobis(f, mean_feature, inv_cov_feature) for f in features]
mahala_threshold = np.percentile(mahala_distances, 95)  # High distance indicates OOD

# Save statistics next to the model so they can be reloaded later.
# pathlib is used to create the directory because `os` is not imported here.
Path("models").mkdir(parents=True, exist_ok=True)
np.save("models/msp_threshold.npy", msp_threshold)
np.save("models/mahala_threshold.npy", mahala_threshold)
np.save("models/mean_feature.npy", mean_feature)
np.save("models/cov_feature.npy", cov_feature)

print(f"MSP Threshold: {msp_threshold}")
print(f"Mahalanobis Threshold: {mahala_threshold}")

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'scipy'