In [9]:
import numpy as np
import os
from skimage import io
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Reshape, Conv2D, Conv2DTranspose, MaxPooling2D, UpSampling2D
from tensorflow.keras import backend as K
from scipy.stats import ks_2samp
import tensorflow as tf

# Function to load images from directory
def load_images_from_directory(directory_path):
    images = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.tiff'):
            img_path = os.path.join(directory_path, filename)
            img = io.imread(img_path).astype(np.float32)  # Assuming images are already normalized
            images.append(img.reshape((207, 243, 1)))  # Add channel dimension for grayscale
    return np.array(images)

# Define an untrained autoencoder architecture suitable for your image dimensions
def build_autoencoder(input_shape=(207, 243, 1)):
    input_img = Input(shape=input_shape)
    
    # Encoder
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(input_img)
    x = MaxPooling2D((2, 2), padding='same')(x)
    x = Conv2D(16, (3, 3), activation='relu', padding='same')(x)
    encoded = MaxPooling2D((2, 2), padding='same')(x)
    
    # Decoder
    x = Conv2DTranspose(16, (3, 3), strides=2, activation='relu', padding='same')(encoded)
    x = Conv2DTranspose(32, (3, 3), strides=2, activation='relu', padding='same')(x)
    decoded = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)
    
    autoencoder = Model(input_img, decoded)
    encoder = Model(input_img, encoded)
    
    return encoder, autoencoder

def detect_drift(encoded_current, encoded_new):
    # Flatten encoded representations to apply K-S test
    flat_current = encoded_current.reshape((encoded_current.shape[0], -1))
    flat_new = encoded_new.reshape((encoded_new.shape[0], -1))
    
    # Perform K-S test on flattened representations
    p_values = [ks_2samp(flat_current[:, i], flat_new[:, i]).pvalue for i in range(flat_current.shape[1])]
    
    return np.array(p_values)

# Set seed for reproducibility
tf.random.set_seed(0)

# Load images
current_data = load_images_from_directory('data/test_current_data')
new_data = load_images_from_directory('data/test_new_data')

# Build autoencoder
encoder, _ = build_autoencoder()

# Encode images (using the untrained encoder)
encoded_current = encoder.predict(current_data)
encoded_new = encoder.predict(new_data)

# Detect drift
p_values = detect_drift(encoded_current, encoded_new)
drift_detected = np.any(p_values < 0.05)

# Output results
print(f"Drift detected: {drift_detected}")
print(f"Number of features with drift detected: {np.sum(p_values < 0.05)} / {len(p_values)}")


2024-03-21 22:27:48.645499: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-21 22:27:48.658180: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-21 22:27:48.666843: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

Drift detected: True
Number of features with drift detected: 20124 / 50752


In [11]:
import numpy as np
import os
from skimage import io
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Reshape, Conv2D, Conv2DTranspose, MaxPooling2D, UpSampling2D
from tensorflow.keras import backend as K
from scipy.stats import ks_2samp
import tensorflow as tf

# Function to load images from directory
def load_images_from_directory(directory_path):
    images = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.tiff'):
            img_path = os.path.join(directory_path, filename)
            img = io.imread(img_path).astype(np.float32)  # Assuming images are already normalized
            images.append(img.reshape((207, 243, 1)))  # Add channel dimension for grayscale
    return np.array(images)

# Define an untrained autoencoder architecture suitable for your image dimensions
def build_autoencoder(input_shape=(207, 243, 1)):
    input_img = Input(shape=input_shape)
    
    # Encoder
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(input_img)
    x = MaxPooling2D((2, 2), padding='same')(x)
    x = Conv2D(16, (3, 3), activation='relu', padding='same')(x)
    encoded = MaxPooling2D((2, 2), padding='same')(x)
    
    # Decoder
    x = Conv2DTranspose(16, (3, 3), strides=2, activation='relu', padding='same')(encoded)
    x = Conv2DTranspose(32, (3, 3), strides=2, activation='relu', padding='same')(x)
    decoded = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)
    
    autoencoder = Model(input_img, decoded)
    encoder = Model(input_img, encoded)
    
    return encoder, autoencoder

def detect_drift(encoded_current, encoded_new):
    # Flatten encoded representations to apply K-S test
    flat_current = encoded_current.reshape((encoded_current.shape[0], -1))
    flat_new = encoded_new.reshape((encoded_new.shape[0], -1))
    
    # Perform K-S test on flattened representations
    p_values = [ks_2samp(flat_current[:, i], flat_new[:, i]).pvalue for i in range(flat_current.shape[1])]
    
    return np.array(p_values)

# Set seed for reproducibility
tf.random.set_seed(0)

# Load images
current_data = load_images_from_directory('data/processed/sorted_imgs')
new_data = load_images_from_directory('data/oasis/ct_norm_resized')

# Build autoencoder
encoder, _ = build_autoencoder()

# Encode images (using the untrained encoder)
encoded_current = encoder.predict(current_data)
encoded_new = encoder.predict(new_data)

# Detect drift
p_values = detect_drift(encoded_current, encoded_new)
drift_detected = np.any(p_values < 0.05)

# Output results
print(f"Drift detected: {drift_detected}")
print(f"Number of features with drift detected: {np.sum(p_values < 0.05)} / {len(p_values)}")


In [2]:
import numpy as np
from skimage import io
from scipy.stats import ks_2samp
from sklearn.decomposition import PCA
import os
from statsmodels.stats.multitest import multipletests  # For Bonferroni correction

def load_images_from_directory(directory_path):
    """
    Load all .tiff images from a directory and return them as a numpy array.
    Ensure images are loaded; otherwise, return None to indicate an issue.
    """
    images = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.tiff'):
            img_path = os.path.join(directory_path, filename)
            img = io.imread(img_path).astype(np.float32) / 255.0
            images.append(img.flatten())  # Flatten each image to treat all pixels as features
    if images:  # Check if any images were loaded
        return np.array(images)
    else:
        print(f"No images found in {directory_path}.")
        return None

def apply_dimensionality_reduction(data, n_components=None):
    """
    Apply PCA to the data. If n_components is not specified or exceeds the limit,
    it defaults to the min(n_samples, n_features).
    """
    if data is not None:
        if n_components is None or n_components > min(data.shape):
            n_components = min(data.shape)
        pca = PCA(n_components=n_components)
        return pca.fit_transform(data)
    else:
        return None

def detect_drift(current_data, new_data, alpha=0.05):
    """
    Perform feature-wise K-S tests and apply Bonferroni correction to the results.
    """
    if current_data is not None and new_data is not None:
        ks_results = [ks_2samp(current_data[:, i], new_data[:, i]).pvalue for i in range(current_data.shape[1])]
        reject_list, corrected_p_values, _, _ = multipletests(ks_results, alpha=alpha, method='bonferroni')
        return reject_list, corrected_p_values
    else:
        return None, None

# Load images
current_data = load_images_from_directory('data/test_current_data')
new_data = load_images_from_directory('data/test_new_data')

if current_data is not None and new_data is not None:
    # Apply dimensionality reduction
    current_data_reduced = apply_dimensionality_reduction(current_data)
    new_data_reduced = apply_dimensionality_reduction(new_data)

    # Detect drift
    reject_list, corrected_p_values = detect_drift(current_data_reduced, new_data_reduced)
    
    if reject_list is not None:
        # Interpret and aggregate results
        drift_detected = any(reject_list)
        print("Drift detected:" if drift_detected else "No significant drift detected.")
        print(f"Number of features with drift detected: {np.sum(reject_list)} / {len(reject_list)}")
    else:
        print("Drift detection could not be performed due to data issues.")
else:
    print("Image loading failed. Drift detection skipped.")


  explained_variance_ratio_ = explained_variance_ / total_var


Drift detected:
Number of features with drift detected: 1 / 10


In [1]:
import numpy as np
from skimage import io
from scipy.stats import ks_2samp
from sklearn.decomposition import PCA
import os
from statsmodels.stats.multitest import multipletests  # For Bonferroni correction
import random

def load_images_from_directory(directory_path):
    images = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.tiff'):
            img_path = os.path.join(directory_path, filename)
            img = io.imread(img_path).astype(np.float32) / 255.0
            images.append(img.flatten())
    if images:
        return np.array(images)
    else:
        print(f"No images found in {directory_path}.")
        return None

def apply_dimensionality_reduction(data, n_components=None):
    if data is not None:
        if n_components is None or n_components > min(data.shape):
            n_components = min(data.shape)
        pca = PCA(n_components=n_components)
        return pca.fit_transform(data)
    else:
        return None

def detect_drift(current_data, new_data, alpha=0.05):
    if current_data is not None and new_data is not None:
        ks_results = [ks_2samp(current_data[:, i], new_data[:, i]).pvalue for i in range(current_data.shape[1])]
        reject_list, corrected_p_values, _, _ = multipletests(ks_results, alpha=alpha, method='bonferroni')
        return reject_list, corrected_p_values
    else:
        return None, None

def random_sampling(data, sample_size):
    if data is None:
        return None
    if sample_size >= len(data):
        return data  # Return original data if sample size is greater than dataset size
    sample_indices = random.sample(range(len(data)), sample_size)
    return data[sample_indices]

# Parameters
sample_size = 10  # Define your sample size here

# Load images
current_data = load_images_from_directory('data/processed/sorted_imgs/train')
new_data = load_images_from_directory('data/oasis/ct_norm_resized')

# Random sampling
current_data_sampled = random_sampling(current_data, sample_size)
new_data_sampled = random_sampling(new_data, sample_size)

if current_data_sampled is not None and new_data_sampled is not None:
    # Apply dimensionality reduction
    current_data_reduced = apply_dimensionality_reduction(current_data_sampled)
    new_data_reduced = apply_dimensionality_reduction(new_data_sampled)

    # Detect drift
    reject_list, corrected_p_values = detect_drift(current_data_reduced, new_data_reduced)
    
    if reject_list is not None:
        drift_detected = any(reject_list)
        print("Drift detected:" if drift_detected else "No significant drift detected.")
        print(f"Number of features with drift detected: {np.sum(reject_list)} / {len(reject_list)}")
    else:
        print("Drift detection could not be performed due to data issues.")
else:
    print("Image loading or sampling failed. Drift detection skipped.")


In [3]:
import numpy as np
from skimage import io
import os
import random
from scipy.stats import ks_2samp
from sklearn.decomposition import PCA
from statsmodels.stats.multitest import multipletests
import shutil  # For copying files

def clear_directory(directory_path):
    """
    Removes all files in the specified directory.
    """
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f'Failed to delete {file_path}. Reason: {e}')

def sample_and_transfer_images(source_dir, target_dir, sample_size=100):
    """
    Sample a number of images from the source directory and transfer them to the target directory.
    Ensure the target directory is empty before transferring.
    """
    # Clear the target directory first
    clear_directory(target_dir)

    filenames = [filename for filename in os.listdir(source_dir) if filename.endswith('.tiff')]
    sampled_filenames = random.sample(filenames, min(sample_size, len(filenames)))

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    
    for filename in sampled_filenames:
        shutil.copy(os.path.join(source_dir, filename), os.path.join(target_dir, filename))
        
    print(f"Transferred {len(sampled_filenames)} images from {source_dir} to {target_dir}.")


def load_images_from_directory(directory_path):
    images = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.tiff'):
            img_path = os.path.join(directory_path, filename)
            img = io.imread(img_path).astype(np.float32) / 255.0
            images.append(img.flatten())
            
    if images:
        return np.array(images)
    else:
        print(f"No images found in {directory_path}.")
        return None

def apply_dimensionality_reduction(data, n_components=None):
    if data is not None:
        if n_components is None or n_components > min(data.shape):
            n_components = min(data.shape)
        pca = PCA(n_components=n_components)
        return pca.fit_transform(data)
    else:
        return None

def detect_drift(current_data, new_data, alpha=0.05):
    if current_data is not None and new_data is not None:
        ks_results = [ks_2samp(current_data[:, i], new_data[:, i]).pvalue for i in range(current_data.shape[1])]
        reject_list, corrected_p_values, _, _ = multipletests(ks_results, alpha=alpha, method='bonferroni')
        return reject_list, corrected_p_values
    else:
        return None, None

# Sample and transfer images to new directories for processing
current_data_dir = 'data/processed/sorted_imgs/train'
new_data_dir = 'data/oasis/ct_norm_resized'
sample_and_transfer_images(current_data_dir, 'data/test_current_data')
sample_and_transfer_images(new_data_dir, 'data/test_new_data')

# Load images
current_data = load_images_from_directory('data/test_current_data')
new_data = load_images_from_directory('data/test_new_data')

# Proceed with processing if data was successfully loaded
if current_data is not None and new_data is not None:
    current_data_reduced = apply_dimensionality_reduction(current_data)
    new_data_reduced = apply_dimensionality_reduction(new_data)

    reject_list, corrected_p_values = detect_drift(current_data_reduced, new_data_reduced)

    drift_detected = any(reject_list)
    print("Drift detected:" if drift_detected else "No significant drift detected.")
    print(f"Number of features with drift detected: {np.sum(reject_list)} / {len(reject_list)}")
else:
    print("Image loading failed. Drift detection skipped.")


Transferred 100 images from data/processed/sorted_imgs/train to data/test_current_data.
Transferred 100 images from data/oasis/ct_norm_resized to data/test_new_data.
Drift detected:
Number of features with drift detected: 46 / 100
