In [9]:
import os
import glob
import numpy as np
from PIL import Image
from scipy.linalg import sqrtm
from scipy.stats import entropy
from skimage.metrics import peak_signal_noise_ratio, structural_similarity
import torch
from pytorch_msssim import ms_ssim

# ---------------
# Helper Functions
# ---------------

def load_image(image_path, target_size=None):
    """
    Load an image and optionally resize it.
    Returns the image as a numpy array normalized to [0,1].
    """
    img = Image.open(image_path).convert('RGB')
    if target_size is not None:
        img = img.resize(target_size)
    return np.array(img) / 255.0

def dummy_feature_extraction(image, feature_dim=2048):
    """
    Dummy feature extraction by flattening the image.
    In practice, replace with a pretrained network (e.g., Inception).
    """
    features = image.flatten()
    if features.shape[0] < feature_dim:
        pad_width = feature_dim - features.shape[0]
        features = np.pad(features, (0, pad_width), mode='constant')
    else:
        features = features[:feature_dim]
    return features

def dummy_classifier_probs(image, num_classes=10, bias=None):
    """
    Simulate classifier probabilities using a Dirichlet distribution.
    In a real-world setting, these would be the softmax outputs of a trained classifier.
    Optionally, bias the distribution toward a particular class.
    """
    alpha = np.ones(num_classes) * 0.5
    if bias is not None:
        alpha[bias] = 5.0
    return np.random.dirichlet(alpha)

def polynomial_kernel(X, Y, degree=3, coef0=1):
    """
    A simple polynomial kernel used in the KID computation.
    """
    return (X.dot(Y.T) + coef0) ** degree

def mse_score(img1, img2):
    """
    Compute the Mean Squared Error (MSE) between two images.
    Lower values indicate higher similarity.
    """
    return np.mean((img1 - img2) ** 2)

def mae_score(img1, img2):
    """
    Compute the Mean Absolute Error (MAE) between two images.
    Lower values indicate higher similarity.
    """
    return np.mean(np.abs(img1 - img2))

def ms_ssim_score_pytorch(real_img, gen_img):
    """
    Compute the Multi-Scale Structural Similarity (MS-SSIM) between two images using PyTorch.
    Both images should be NumPy arrays with values in [0, 1] and shape (H, W, C).
    The function converts them to PyTorch tensors with shape (1, C, H, W).
    """
    # Convert images from NumPy to torch tensor and rearrange dimensions to (B, C, H, W)
    real_tensor = torch.tensor(real_img.transpose(2, 0, 1), dtype=torch.float32).unsqueeze(0)
    gen_tensor = torch.tensor(gen_img.transpose(2, 0, 1), dtype=torch.float32).unsqueeze(0)
    # Compute MS-SSIM; returns a tensor of shape (1,)
    msssim_val = ms_ssim(gen_tensor, real_tensor, data_range=1.0)
    return msssim_val.item()

# --------------------------
# Evaluation Class Definition
# --------------------------

class SupervisedGANMetricsEvaluator:
    def __init__(self, real_features, gen_features, gen_probs):
        """
        real_features: np.array of shape (N, D) for label images.
        gen_features: np.array of shape (N, D) for generated images.
        gen_probs: np.array of shape (N, C) simulated classifier probabilities.
        """
        self.real_features = real_features
        self.gen_features = gen_features
        self.gen_probs = gen_probs

    def inception_score(self):
        """
        Compute the Inception Score (IS). Higher is better.
        """
        p_y = np.mean(self.gen_probs, axis=0)
        kl_divs = [entropy(p, p_y) for p in self.gen_probs]
        return np.exp(np.mean(kl_divs))
    
    def fid_score(self):
        """
        Compute the Fréchet Inception Distance (FID). Lower is better; ideal is 0.
        """
        mu_r = np.mean(self.real_features, axis=0)
        mu_g = np.mean(self.gen_features, axis=0)
        cov_r = np.cov(self.real_features, rowvar=False)
        cov_g = np.cov(self.gen_features, rowvar=False)
        
        diff = mu_r - mu_g
        diff_squared = diff.dot(diff)
        covmean, _ = sqrtm(cov_r.dot(cov_g), disp=False)
        if np.iscomplexobj(covmean):
            covmean = covmean.real
        return diff_squared + np.trace(cov_r + cov_g - 2 * covmean)
    
    def kid_score(self, kernel_func, n_subsets=10, subset_size=5):
        """
        Compute the Kernel Inception Distance (KID). Lower is better; ideal is 0.
        """
        mmds = []
        n_real = self.real_features.shape[0]
        n_gen = self.gen_features.shape[0]
        for _ in range(n_subsets):
            idx_real = np.random.choice(n_real, subset_size, replace=True)
            idx_gen = np.random.choice(n_gen, subset_size, replace=True)
            X = self.real_features[idx_real]
            Y = self.gen_features[idx_gen]
            
            k_xx = kernel_func(X, X)
            k_yy = kernel_func(Y, Y)
            k_xy = kernel_func(X, Y)
            
            mmd = np.mean(k_xx) + np.mean(k_yy) - 2 * np.mean(k_xy)
            mmds.append(mmd)
        return np.mean(mmds)
    
    @staticmethod
    def psnr_score(real_img, gen_img):
        """
        Compute the Peak Signal-to-Noise Ratio (PSNR) between two images.
        Higher is better; perfect match gives infinity.
        """
        return peak_signal_noise_ratio(real_img, gen_img, data_range=real_img.max() - real_img.min())
    
    @staticmethod
    def ssim_score(real_img, gen_img):
        """
        Compute the Structural Similarity Index (SSIM) between two images.
        Higher is better, with a maximum of 1.0 for a perfect match.
        """
        real_gray = np.mean(real_img, axis=2)
        gen_gray = np.mean(gen_img, axis=2)
        score, _ = structural_similarity(real_gray, gen_gray, full=True)
        return score

    @staticmethod
    def ms_ssim_score(real_img, gen_img):
        """
        Compute the Multi-Scale Structural Similarity (MS-SSIM) between two images using PyTorch.
        Higher is better, with a maximum of 1.0 for a perfect match.
        """
        return ms_ssim_score_pytorch(real_img, gen_img)

# --------------------------
# Main Simulation Script
# --------------------------

# Define the folder containing the image triples.
folder_path = '/media/irfan/data/Download/example_epoch1000'  # Replace with your actual folder path

# Use glob to find all input images (files ending with '_input.png').
input_files = glob.glob(os.path.join(folder_path, '*_input.png'))
input_files = sorted(input_files)

# Set a target size for consistency.
target_size = (256, 256)  # Modify as needed

# Initialize lists to store image-based and feature-based metrics.
psnr_list = []
ssim_list = []
ms_ssim_list = []
mse_list = []   # backup metric: Mean Squared Error
mae_list = []   # backup metric: Mean Absolute Error

label_features_list = []  # Using label image as the ground-truth (real) features
gen_features_list = []    # Features extracted from the generated image
gen_probs_list = []       # Simulated classifier probabilities

# Loop through each sample based on the input image file.
for input_file in input_files:
    # Get the common prefix by removing '_input.png'
    prefix = os.path.basename(input_file).replace('_input.png', '')
    # Construct the corresponding filenames.
    gen_file = os.path.join(folder_path, prefix + '_generated_fake_B.png')
    label_file = os.path.join(folder_path, prefix + '_label.png')
    
    # Check if both generated and label files exist.
    if not os.path.exists(gen_file) or not os.path.exists(label_file):
        continue
    
    # Load images.
    gen_img = load_image(gen_file, target_size=target_size)
    label_img = load_image(label_file, target_size=target_size)
    
    # Compute image quality metrics between generated and label images.
    psnr_val = SupervisedGANMetricsEvaluator.psnr_score(label_img, gen_img)
    ssim_val = SupervisedGANMetricsEvaluator.ssim_score(label_img, gen_img)
    ms_ssim_val = SupervisedGANMetricsEvaluator.ms_ssim_score(label_img, gen_img)
    mse_val = mse_score(label_img, gen_img)
    mae_val = mae_score(label_img, gen_img)
    
    psnr_list.append(psnr_val)
    ssim_list.append(ssim_val)
    ms_ssim_list.append(ms_ssim_val)
    mse_list.append(mse_val)
    mae_list.append(mae_val)
    
    # Extract dummy features from label and generated images.
    label_feat = dummy_feature_extraction(label_img)
    gen_feat = dummy_feature_extraction(gen_img)
    label_features_list.append(label_feat)
    gen_features_list.append(gen_feat)
    
    # Simulate classifier probabilities for the generated image.
    prob = dummy_classifier_probs(gen_img, num_classes=10, bias=None)
    gen_probs_list.append(prob)

# Convert lists to numpy arrays.
label_features = np.array(label_features_list)
gen_features = np.array(gen_features_list)
gen_probs = np.array(gen_probs_list)

# Initialize the evaluator using label features as "real" features.
evaluator = SupervisedGANMetricsEvaluator(label_features, gen_features, gen_probs)

# Compute feature-based metrics.
is_score = evaluator.inception_score()
fid = evaluator.fid_score()
kid = evaluator.kid_score(polynomial_kernel)

# Compute average image quality and backup metrics.
avg_psnr = np.mean(psnr_list) if psnr_list else float('nan')
avg_ssim = np.mean(ssim_list) if ssim_list else float('nan')
avg_ms_ssim = np.mean(ms_ssim_list) if ms_ssim_list else float('nan')
avg_mse = np.mean(mse_list) if mse_list else float('nan')
avg_mae = np.mean(mae_list) if mae_list else float('nan')

# Output the computed metrics.
print("Number of samples processed:", len(psnr_list))
print("Inception Score (higher is better):", is_score)
print("FID Score (lower is better, ideal=0):", fid)
print("KID Score (lower is better, ideal=0):", kid)
print("Average PSNR (higher is better):", avg_psnr)
print("Average SSIM (max=1.0):", avg_ssim)
print("Average MS-SSIM (max=1.0):", avg_ms_ssim)
print("Average MSE (lower is better):", avg_mse)
print("Average MAE (lower is better):", avg_mae)


Number of samples processed: 4
Inception Score (higher is better): 1.6691280749367483
FID Score (lower is better, ideal=0): 6.143022010332744
KID Score (lower is better, ideal=0): 44072681.86723969
Average PSNR (higher is better): 23.85274452069749
Average SSIM (max=1.0): 0.902271412650104
Average MS-SSIM (max=1.0): 0.8673726916313171
Average MSE (lower is better): 0.004246755050234836
Average MAE (lower is better): 0.05111410602245456


In [10]:
import os
import glob
import numpy as np
from PIL import Image
from scipy.linalg import sqrtm
from scipy.stats import entropy
from skimage.metrics import peak_signal_noise_ratio, structural_similarity
import torch
from pytorch_msssim import ms_ssim

# ---------------
# Helper Functions
# ---------------

def load_image(image_path, target_size=None):
    """
    Load an image and optionally resize it.
    Returns the image as a numpy array normalized to [0,1].
    """
    img = Image.open(image_path).convert('RGB')
    if target_size is not None:
        img = img.resize(target_size)
    return np.array(img) / 255.0

def dummy_feature_extraction(image, feature_dim=2048):
    """
    Dummy feature extraction by flattening the image.
    In practice, replace with a pretrained network (e.g., Inception).
    """
    features = image.flatten()
    if features.shape[0] < feature_dim:
        pad_width = feature_dim - features.shape[0]
        features = np.pad(features, (0, pad_width), mode='constant')
    else:
        features = features[:feature_dim]
    return features

def dummy_classifier_probs(image, num_classes=10, bias=None):
    """
    Simulate classifier probabilities using a Dirichlet distribution.
    In a real-world setting, these would be the softmax outputs of a trained classifier.
    Optionally, bias the distribution toward a particular class.
    """
    alpha = np.ones(num_classes) * 0.5
    if bias is not None:
        alpha[bias] = 5.0
    return np.random.dirichlet(alpha)

def polynomial_kernel(X, Y, degree=3, coef0=1):
    """
    A simple polynomial kernel used in the KID computation.
    """
    return (X.dot(Y.T) + coef0) ** degree

def mse_score(img1, img2):
    """
    Compute the Mean Squared Error (MSE) between two images.
    Lower values indicate higher similarity.
    """
    return np.mean((img1 - img2) ** 2)

def mae_score(img1, img2):
    """
    Compute the Mean Absolute Error (MAE) between two images.
    Lower values indicate higher similarity.
    """
    return np.mean(np.abs(img1 - img2))

def ms_ssim_score_pytorch(real_img, gen_img):
    """
    Compute the Multi-Scale Structural Similarity (MS-SSIM) between two images using PyTorch.
    Both images should be NumPy arrays with values in [0, 1] and shape (H, W, C).
    The function converts them to PyTorch tensors with shape (1, C, H, W).
    """
    # Convert images from NumPy to torch tensor and rearrange dimensions to (B, C, H, W)
    real_tensor = torch.tensor(real_img.transpose(2, 0, 1), dtype=torch.float32).unsqueeze(0)
    gen_tensor = torch.tensor(gen_img.transpose(2, 0, 1), dtype=torch.float32).unsqueeze(0)
    # Compute MS-SSIM; returns a tensor of shape (1,)
    msssim_val = ms_ssim(gen_tensor, real_tensor, data_range=1.0)
    return msssim_val.item()

# --------------------------
# Evaluation Class Definition
# --------------------------

class SupervisedGANMetricsEvaluator:
    def __init__(self, real_features, gen_features, gen_probs):
        """
        real_features: np.array of shape (N, D) for label images.
        gen_features: np.array of shape (N, D) for generated images.
        gen_probs: np.array of shape (N, C) simulated classifier probabilities.
        """
        self.real_features = real_features
        self.gen_features = gen_features
        self.gen_probs = gen_probs

    def inception_score(self):
        """
        Compute the Inception Score (IS). Higher is better.
        """
        p_y = np.mean(self.gen_probs, axis=0)
        kl_divs = [entropy(p, p_y) for p in self.gen_probs]
        return np.exp(np.mean(kl_divs))
    
    def fid_score(self):
        """
        Compute the Fréchet Inception Distance (FID). Lower is better; ideal is 0.
        """
        mu_r = np.mean(self.real_features, axis=0)
        mu_g = np.mean(self.gen_features, axis=0)
        cov_r = np.cov(self.real_features, rowvar=False)
        cov_g = np.cov(self.gen_features, rowvar=False)
        
        diff = mu_r - mu_g
        diff_squared = diff.dot(diff)
        covmean, _ = sqrtm(cov_r.dot(cov_g), disp=False)
        if np.iscomplexobj(covmean):
            covmean = covmean.real
        return diff_squared + np.trace(cov_r + cov_g - 2 * covmean)
    
    def kid_score(self, kernel_func, n_subsets=10, subset_size=5):
        """
        Compute the Kernel Inception Distance (KID). Lower is better; ideal is 0.
        """
        mmds = []
        n_real = self.real_features.shape[0]
        n_gen = self.gen_features.shape[0]
        for _ in range(n_subsets):
            idx_real = np.random.choice(n_real, subset_size, replace=True)
            idx_gen = np.random.choice(n_gen, subset_size, replace=True)
            X = self.real_features[idx_real]
            Y = self.gen_features[idx_gen]
            
            k_xx = kernel_func(X, X)
            k_yy = kernel_func(Y, Y)
            k_xy = kernel_func(X, Y)
            
            mmd = np.mean(k_xx) + np.mean(k_yy) - 2 * np.mean(k_xy)
            mmds.append(mmd)
        return np.mean(mmds)
    
    @staticmethod
    def psnr_score(real_img, gen_img):
        """
        Compute the Peak Signal-to-Noise Ratio (PSNR) between two images.
        Higher is better; perfect match gives infinity.
        """
        return peak_signal_noise_ratio(real_img, gen_img, data_range=real_img.max() - real_img.min())
    
    @staticmethod
    def ssim_score(real_img, gen_img):
        """
        Compute the Structural Similarity Index (SSIM) between two images.
        Higher is better, with a maximum of 1.0 for a perfect match.
        """
        real_gray = np.mean(real_img, axis=2)
        gen_gray = np.mean(gen_img, axis=2)
        score, _ = structural_similarity(real_gray, gen_gray, full=True)
        return score

    @staticmethod
    def ms_ssim_score(real_img, gen_img):
        """
        Compute the Multi-Scale Structural Similarity (MS-SSIM) between two images using PyTorch.
        Higher is better, with a maximum of 1.0 for a perfect match.
        """
        return ms_ssim_score_pytorch(real_img, gen_img)

# --------------------------
# Main Simulation Script
# --------------------------

# Define the folder containing the image triples.
folder_path = '/media/irfan/data/Download/example_epoch1000'  # Replace with your actual folder path

# Use glob to find all input images (files ending with '_input.png').
input_files = glob.glob(os.path.join(folder_path, '*_input.png'))
input_files = sorted(input_files)

# Set a target size for consistency.
target_size = (256, 256)  # Modify as needed

# Initialize lists to store image-based and feature-based metrics.
psnr_list = []
ssim_list = []
ms_ssim_list = []
mse_list = []   # backup metric: Mean Squared Error
mae_list = []   # backup metric: Mean Absolute Error

label_features_list = []  # Using label image as the ground-truth (real) features
gen_features_list = []    # Features extracted from the generated image
gen_probs_list = []       # Simulated classifier probabilities

# Loop through each sample based on the input image file.
for input_file in input_files:
    # Get the common prefix by removing '_input.png'
    prefix = os.path.basename(input_file).replace('_input.png', '')
    # Construct the corresponding filenames.
    gen_file = os.path.join(folder_path, prefix + '_label.png')
    label_file = os.path.join(folder_path, prefix + '_label.png')
    
    # Check if both generated and label files exist.
    if not os.path.exists(gen_file) or not os.path.exists(label_file):
        continue
    
    # Load images.
    gen_img = load_image(gen_file, target_size=target_size)
    label_img = load_image(label_file, target_size=target_size)
    
    # Compute image quality metrics between generated and label images.
    psnr_val = SupervisedGANMetricsEvaluator.psnr_score(label_img, gen_img)
    ssim_val = SupervisedGANMetricsEvaluator.ssim_score(label_img, gen_img)
    ms_ssim_val = SupervisedGANMetricsEvaluator.ms_ssim_score(label_img, gen_img)
    mse_val = mse_score(label_img, gen_img)
    mae_val = mae_score(label_img, gen_img)
    
    psnr_list.append(psnr_val)
    ssim_list.append(ssim_val)
    ms_ssim_list.append(ms_ssim_val)
    mse_list.append(mse_val)
    mae_list.append(mae_val)
    
    # Extract dummy features from label and generated images.
    label_feat = dummy_feature_extraction(label_img)
    gen_feat = dummy_feature_extraction(gen_img)
    label_features_list.append(label_feat)
    gen_features_list.append(gen_feat)
    
    # Simulate classifier probabilities for the generated image.
    prob = dummy_classifier_probs(gen_img, num_classes=10, bias=None)
    gen_probs_list.append(prob)

# Convert lists to numpy arrays.
label_features = np.array(label_features_list)
gen_features = np.array(gen_features_list)
gen_probs = np.array(gen_probs_list)

# Initialize the evaluator using label features as "real" features.
evaluator = SupervisedGANMetricsEvaluator(label_features, gen_features, gen_probs)

# Compute feature-based metrics.
is_score = evaluator.inception_score()
fid = evaluator.fid_score()
kid = evaluator.kid_score(polynomial_kernel)

# Compute average image quality and backup metrics.
avg_psnr = np.mean(psnr_list) if psnr_list else float('nan')
avg_ssim = np.mean(ssim_list) if ssim_list else float('nan')
avg_ms_ssim = np.mean(ms_ssim_list) if ms_ssim_list else float('nan')
avg_mse = np.mean(mse_list) if mse_list else float('nan')
avg_mae = np.mean(mae_list) if mae_list else float('nan')

# Output the computed metrics.
print("Number of samples processed:", len(psnr_list))
print("Inception Score (higher is better):", is_score)
print("FID Score (lower is better, ideal=0):", fid)
print("KID Score (lower is better, ideal=0):", kid)
print("Average PSNR (higher is better):", avg_psnr)
print("Average SSIM (max=1.0):", avg_ssim)
print("Average MS-SSIM (max=1.0):", avg_ms_ssim)
print("Average MSE (lower is better):", avg_mse)
print("Average MAE (lower is better):", avg_mae)


  return 10 * np.log10((data_range ** 2) / err)


Number of samples processed: 4
Inception Score (higher is better): 1.3661684420359368
FID Score (lower is better, ideal=0): -5.507491678952205e-05
KID Score (lower is better, ideal=0): 17131311.990887284
Average PSNR (higher is better): inf
Average SSIM (max=1.0): 1.0
Average MS-SSIM (max=1.0): 1.0
Average MSE (lower is better): 0.0
Average MAE (lower is better): 0.0
