# New Matrix Code - Baseline

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd /content/drive/MyDrive/Work2024PhD/NewMetricsDatasets

/content/drive/MyDrive/Work2024PhD/NewMetricsDatasets


In [None]:
# !unzip Faces_Balance_Imbalance_Datasets

# Baseline - Balance and Imbalance datasets

In [None]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input
from sklearn.decomposition import PCA
from glob import glob
import cv2
from sklearn.metrics.pairwise import euclidean_distances
from numpy.linalg import norm

# Saliency Function (Provided)
def saliency_bbox(img):
    beta = 1
    lam = np.random.beta(beta, beta)
    size = img.shape
    W = size[1]
    H = size[0]
    cut_rat = np.sqrt(1. - lam)
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)

    temp_img = img.copy()
    saliency = cv2.saliency.StaticSaliencyFineGrained_create()
    (success, saliencyMap) = saliency.computeSaliency(temp_img)
    saliencyMap = (saliencyMap * 255).astype("uint8")
    maximum_indices = np.unravel_index(np.argmax(saliencyMap, axis=None), saliencyMap.shape)
    x = maximum_indices[0]
    y = maximum_indices[1]

    bbx1 = np.clip(x - cut_w // 2, 0, W)
    bby1 = np.clip(y - cut_h // 2, 0, H)
    bbx2 = np.clip(x + cut_w // 2, 0, W)
    bby2 = np.clip(y + cut_h // 2, 0, H)
    x1, y1, x2, y2 = bbx1, bby1, bbx2, bby2
    return img[x1: x2, y1: y2, :]


# Load VGG16 for feature extraction
model = VGG16(weights="imagenet", include_top=False)

def extract_features(img_path):
    img = image.load_img(img_path, target_size=(650, 500))
    x = saliency_bbox(image.img_to_array(img).astype('uint8'))

    # Check if the saliency_bbox returned a valid region
    if x is None or x.size == 0:
        print(f"Warning: No valid saliency region found for {img_path}. Using original image.")
        x = image.img_to_array(img)  # Use the original image if saliency fails

    x = cv2.resize(x, (500, 650))
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return model.predict(x)

def reduce_dimensions(features):
    pca = PCA()
    features = features.reshape(20, 15 * 512)
    pca.fit(features)
    features_trans = pca.transform(features)
    return np.squeeze(features_trans.reshape(1, 400))

# Saliency-Based Diversity with Fairness integration (with normalization)
def saliency_based_diversity(X):
    pairwise_distances = euclidean_distances(X)
    N = X.shape[0]
    diversity = np.sum(pairwise_distances) / (N * (N - 1))

    # Normalization step: divide by max possible distance
    max_distance = np.sqrt(np.sum((np.max(X, axis=0) - np.min(X, axis=0)) ** 2))
    normalized_diversity = diversity / max_distance if max_distance > 0 else diversity
    return normalized_diversity


from sklearn.metrics.pairwise import euclidean_distances

def calculate_normalized_within_group_diversity(groups, saliency_features):
    unique_groups = np.unique(groups)
    group_diversities = {}

    for group in unique_groups:
        group_indices = np.where(groups == group)[0]
        group_features = saliency_features[group_indices]
        pairwise_distances = euclidean_distances(group_features)

        # Normalize distances between 0 and 1
        norm_distances = (pairwise_distances - np.min(pairwise_distances)) / np.ptp(pairwise_distances)
        group_diversities[group] = np.mean(norm_distances[np.triu_indices(len(group_features), k=1)]) if len(group_features) > 1 else 0

    return group_diversities

def calculate_inter_group_diversity(groups, saliency_features):
    unique_groups = np.unique(groups)
    inter_group_dists = []

    # Get the number of samples per group to handle imbalance
    group_sizes = {group: np.sum(groups == group) for group in unique_groups}

    for i in range(len(unique_groups)):
        for j in range(i + 1, len(unique_groups)):
            group_i_indices = np.where(groups == unique_groups[i])[0]
            group_j_indices = np.where(groups == unique_groups[j])[0]
            group_i_features = saliency_features[group_i_indices]
            group_j_features = saliency_features[group_j_indices]


            pairwise_distances = euclidean_distances(group_i_features, group_j_features)

            weighted_dist = np.mean(pairwise_distances) * group_sizes[unique_groups[i]] * group_sizes[unique_groups[j]]
            inter_group_dists.append(weighted_dist)

    denominator = (sum(group_sizes.values()) ** 2 - sum(group_sizes.values()))
    return np.sum(inter_group_dists) / denominator if denominator > 0 else 0.0

def combined_fairness_diversity_metrics(groups, saliency_features, alpha=0.5, beta=0.5):
    # Calculate group sizes for weighting within-group diversity
    group_sizes = {group: np.sum(groups == group) for group in np.unique(groups)}
    total_samples = sum(group_sizes.values())

    # Calculate within-group diversity, weighted by group size
    group_diversities = calculate_normalized_within_group_diversity(groups, saliency_features)
    within_group_avg_diversity = np.sum([group_sizes[group] * group_diversities[group] for group in group_diversities]) / total_samples

    # Calculate inter-group diversity with class imbalance weighting
    inter_group_avg_diversity = calculate_inter_group_diversity(groups, saliency_features)

    # Combined metric using weighted within-group and inter-group diversities
    combined_metric = alpha * within_group_avg_diversity + beta * inter_group_avg_diversity
    return combined_metric, within_group_avg_diversity, inter_group_avg_diversity

def normalize_features(features):
    return features / np.linalg.norm(features, axis=1, keepdims=True)

def saliency_fairness_diversity_metric(image_set, groups):
    feat_list = []
    for img in image_set:
        feat_list.append(reduce_dimensions(extract_features(img)))

    # Normalize features before computing the diversity metric
    normalized_features = normalize_features(np.array(feat_list))
    return combined_fairness_diversity_metrics(groups, normalized_features, alpha=0.5, beta=0.5)

# Load datasets and compute diversity and fairness
print(os.listdir('Faces_Balance_Imbalance_Datasets'))
for run in range(5):
  for main_folder in os.listdir('Faces_Balance_Imbalance_Datasets'):
      if '.' not in main_folder:
          combined_metrics_results = []
          dataset_names = []
          diversity = []
          fairness = []
          for dataset_name in os.listdir('Faces_Balance_Imbalance_Datasets/' + main_folder):
              if '.' not in dataset_name:
                  print(main_folder, ' = ', dataset_name)
                  dataset_m = [f for f in glob(f'Faces_Balance_Imbalance_Datasets/' + main_folder + '/' + dataset_name + '/Male/*')]
                  dataset_f = [f for f in glob(f'Faces_Balance_Imbalance_Datasets/' + main_folder + '/' + dataset_name + '/Female/*')]
                  group_m = [0] * len(dataset_m)
                  group_f = [1] * len(dataset_f)
                  dataset = dataset_m + dataset_f
                  groups = group_m + group_f
                  combined_metric, within_group_avg_diversity, inter_group_avg_diversity = saliency_fairness_diversity_metric(dataset, groups)
                  combined_metrics_results.append(combined_metric)
                  dataset_names.append(dataset_name)
                  diversity.append(within_group_avg_diversity)
                  fairness.append(inter_group_avg_diversity)


          result_df = pd.DataFrame(list(zip(dataset_names, diversity, fairness, combined_metrics_results)),
                                  columns=['Dataset', 'WithinGroupDiversity', 'InterGroupDiversity', 'CombinedFairnessDiversityScore'])
          result_df.to_csv('/content/drive/MyDrive/Work2024PhD/NewMetricsDatasets/' + main_folder + "_all_dataset"+str(run)+".csv")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step

In [None]:
import pandas as pd

 diversity_fairness_metrics.tex                 'Gender Imbalance_all_dataset0.csv'
 [0m[01;34mFaces_Balance_Imbalance_Datasets[0m/              'Gender Imbalance_all_dataset1.csv'
 Faces_Balance_Imbalance_Datasets.zip           'Gender Imbalance_all_dataset2.csv'
'Gender Balance_all_dataset0.csv'               'Gender Imbalance_all_dataset3.csv'
'Gender Balance_all_dataset1.csv'               'Gender Imbalance_all_dataset4.csv'
'Gender Balance_all_dataset2.csv'               'Gender Imbalance_all_dataset_augmentation0.csv'
'Gender Balance_all_dataset3.csv'               'Gender Imbalance_all_dataset_augmentation1.csv'
'Gender Balance_all_dataset4.csv'               'Gender Imbalance_all_dataset_augmentation2.csv'
'Gender Balance_all_dataset_augmentation0.csv'  'Gender Imbalance_all_dataset_augmentation3.csv'
'Gender Balance_all_dataset_augmentation1.csv'  'Gender Imbalance_all_dataset_augmentation4.csv'
'Gender Balance_all_dataset_augmentation2.csv'   [01;34m__MACOSX[0m/
'Gender 

In [None]:
import os
import pandas as pd
import numpy as np
from glob import glob

# Load datasets and compute diversity and fairness
print(os.listdir('Faces_Balance_Imbalance_Datasets'))

# Initialize lists to store DataFrames for balance and imbalance datasets
balanced_results = []
imbalanced_results = []

# Read the existing CSV files
for run in range(3):  # Adjusted to read 5 runs
    for main_folder in os.listdir('Faces_Balance_Imbalance_Datasets'):
        if '.' not in main_folder:
            # Read the CSV file
            csv_file_path = f'{main_folder}_all_dataset{run}.csv'
            if os.path.exists(csv_file_path):
                result_df = pd.read_csv(csv_file_path)

                # Check if the dataset is balanced or imbalanced based on folder naming
                if 'Balance' in csv_file_path:
                    balanced_results.append(result_df)
                elif 'Imbalance' in csv_file_path:
                    imbalanced_results.append(result_df)

# Combine results across all runs into single DataFrames
balanced_df = pd.concat(balanced_results, ignore_index=True)
imbalanced_df = pd.concat(imbalanced_results, ignore_index=True)

# Group by Dataset and calculate mean and std for the metrics for balanced datasets
balanced_summary_df = balanced_df.groupby('Dataset').agg({
    'WithinGroupDiversity': ['mean', 'std'],
    'InterGroupDiversity': ['mean', 'std'],
    'CombinedFairnessDiversityScore': ['mean', 'std']
}).reset_index()

# Group by Dataset and calculate mean and std for the metrics for imbalanced datasets
imbalanced_summary_df = imbalanced_df.groupby('Dataset').agg({
    'WithinGroupDiversity': ['mean', 'std'],
    'InterGroupDiversity': ['mean', 'std'],
    'CombinedFairnessDiversityScore': ['mean', 'std']
}).reset_index()

# Create a new DataFrame to hold the formatted results
final_summary_df = pd.DataFrame()

# Format mean ± std for balanced datasets
final_summary_df['Dataset'] = balanced_summary_df['Dataset']
final_summary_df['WithinGroupDiversity (Balanced)'] = balanced_summary_df.apply(
    lambda row: f"{row['WithinGroupDiversity']['mean']:.2f} \pm {row['WithinGroupDiversity']['std']:.2f}", axis=1
)
final_summary_df['InterGroupDiversity (Balanced)'] = balanced_summary_df.apply(
    lambda row: f"{row['InterGroupDiversity']['mean']:.2f} \pm {row['InterGroupDiversity']['std']:.2f}", axis=1
)
final_summary_df['CombinedFairnessDiversityScore (Balanced)'] = balanced_summary_df.apply(
    lambda row: f"{row['CombinedFairnessDiversityScore']['mean']:.2f} \pm {row['CombinedFairnessDiversityScore']['std']:.2f}", axis=1
)

# Prepare to format imbalanced datasets
imbalanced_formatted = []

# Append imbalanced datasets formatted results
for idx, row in imbalanced_summary_df.iterrows():
    imbalanced_formatted.append({
        'WithinGroupDiversity (Imbalanced)': f"{row['WithinGroupDiversity']['mean']:.2f} \pm {row['WithinGroupDiversity']['std']:.2f}",
        'InterGroupDiversity (Imbalanced)': f"{row['InterGroupDiversity']['mean']:.2f} \pm {row['InterGroupDiversity']['std']:.2f}",
        'CombinedFairnessDiversityScore (Imbalanced)': f"{row['CombinedFairnessDiversityScore']['mean']:.2f} \pm {row['CombinedFairnessDiversityScore']['std']:.2f}",
    })

# Convert the imbalanced formatted list to DataFrame
imbalanced_df_formatted = pd.DataFrame(imbalanced_formatted)

# Concatenate balanced and imbalanced results without causing extra rows
final_summary_df = pd.concat([final_summary_df.reset_index(drop=True),
                               imbalanced_df_formatted.reset_index(drop=True)], axis=1)

# Create LaTeX table format with column spans
latex_table = r"""
\begin{table}
\caption{Summary of Diversity and Fairness Metrics for Balanced and Imbalanced Datasets}
\label{tab:diversity_fairness_metrics}
\begin{tabular}{lccc|ccc}
\toprule
Dataset & \multicolumn{3}{c|}{Balanced} & \multicolumn{3}{c}{Imbalanced} \\
\cmidrule(lr){2-4} \cmidrule(lr){5-7}
 & WithinGroupDiversity & InterGroupDiversity & CombinedFairnessDiversityScore & WithinGroupDiversity & InterGroupDiversity & CombinedFairnessDiversityScore \\
\midrule
"""

# Append data rows
for index, row in final_summary_df.iterrows():
    latex_table += f"{row['Dataset']} & {row['WithinGroupDiversity (Balanced)']} & {row['InterGroupDiversity (Balanced)']} & {row['CombinedFairnessDiversityScore (Balanced)']} & {row['WithinGroupDiversity (Imbalanced)']} & {row['InterGroupDiversity (Imbalanced)']} & {row['CombinedFairnessDiversityScore (Imbalanced)']} \\\\\n"

latex_table += r"""\bottomrule
\end{tabular}
\end{table}
"""

# Save the LaTeX table to a .tex file
# with open('diversity_fairness_metrics.tex', 'w') as f:
#     f.write(latex_table)

print("LaTeX table saved successfully.")
print(latex_table)

# With Augmentation- Balance and Imbalance datasets

In [None]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input
from sklearn.decomposition import PCA
from glob import glob
import cv2
from sklearn.metrics.pairwise import euclidean_distances
from numpy.linalg import norm
import random
import numpy as np
import torch
from PIL import Image
import cv2
from imgaug import augmenters as iaa
from PIL import Image, ImageEnhance, ImageOps
import albumentations as A

import warnings
warnings.filterwarnings('ignore')

import random
import numpy as np
from PIL import Image, ImageEnhance, ImageOps

def apply_random_augmentation(image, num_transforms=3, magnitude=7):
    """
    Apply random augmentations to an image.

    Args:
        image (PIL.Image.Image): Input image.
        num_transforms (int): Number of random transformations to apply.
        magnitude (int): Magnitude of the transformations.

    Returns:
        PIL.Image.Image: Augmented image.
    """
    augmentations = [
        ('flip_lr', lambda img: img.transpose(Image.FLIP_LEFT_RIGHT)),
        ('flip_ud', lambda img: img.transpose(Image.FLIP_TOP_BOTTOM)),
        ('rotate', lambda img: img.rotate(random.uniform(-magnitude*3, magnitude*3))),
        ('brightness', lambda img: ImageEnhance.Brightness(img).enhance(random.uniform(0.5, 1.5))),
        ('contrast', lambda img: ImageEnhance.Contrast(img).enhance(random.uniform(0.5, 1.5))),
        ('color', lambda img: ImageEnhance.Color(img).enhance(random.uniform(0.5, 1.5))),
        ('sharpness', lambda img: ImageEnhance.Sharpness(img).enhance(random.uniform(0.5, 2.0))),
        ('autocontrast', lambda img: ImageOps.autocontrast(img)),
        ('solarize', lambda img: ImageOps.solarize(img, threshold=random.uniform(128 - magnitude * 10, 128))),
        ('invert', lambda img: ImageOps.invert(img)),
    ]

    # Randomly select augmentations
    selected_augmentations = random.sample(augmentations, num_transforms)

    # Apply augmentations
    for name, aug_fn in selected_augmentations:
        image = aug_fn(image)

    return image

def augment_batch(images, num_transforms=3, magnitude=7):
    """
    Apply random augmentations to a batch of images.

    Args:
        images (numpy.ndarray): Array of images (H, W, C).
        num_transforms (int): Number of random transformations to apply per image.
        magnitude (int): Magnitude of the transformations.
    Returns:
        numpy.ndarray: Array of augmented images.
    """
    augmented_images = []
    for img_array in images:
        img_pil = Image.fromarray(img_array.astype('uint8'))  # Convert numpy array to PIL Image
        augmented_img_pil = apply_random_augmentation(img_pil, num_transforms, magnitude)
        augmented_images.append(np.array(augmented_img_pil))  # Convert back to numpy array

    return np.array(augmented_images)

# Example usage:
# Assume images is a numpy array of shape (batch_size, height, width, channels)
# augmented_images = augment_batch(images, num_transforms=3, magnitude=7)


def KeepOriginalAug(img):


  ori_h1,ori_w1,c = img.shape
  img  = cv2.resize(img, (512, 512))
  h1,w1,c = img.shape
  mask_zero = np.zeros((h1,w1, c), img.dtype)


  beta=1
  lam = np.random.beta(beta, beta)
  x1, y1,x2,y2 = saliency_bbox_return(img, lam)


  # if no saliency detected, then pick the next image
  if x1==x2 or y1==y2:
      return img



  mask_zero[x1: x2, y1: y2, :] = img[x1: x2, y1: y2,:]


  sizes= [(w1-((w1-y2)+(y2-y1)), h1-((h1-x2)+(x2-x1))),
  (w1-((w1-y2)+(y2-y1)), x2-x1),
  (w1-((w1-y2)+(y2-y1)), h1-x2),
  (y2-y1, h1-((h1-x2)+(x2-x1))),
  (y2-y1, h1-x2),
  (w1-y2, h1-((h1-x2)+(x2-x1))),
  (w1-y2, x2-x1),
  (w1-y2, h1-x2)
  ]
  areas=[]
  for sz in sizes:
      areas.append(sz[0]*sz[1])

  areas= np.array(areas)
  sizes = np.array(sizes)


  big  = 512
  #  size with its bound box (x1,x2,y1,y2)
  boxes = [[0,x1,0,y1],
              [x1,x2,0,y1],
              [x2,big,0,y1],
              [0,x1,y1,y2],
              [x2,big,y1,y2],
              [0,x1,y2,big],
              [x1,x2,y2,big],
              [x2,big,y2,big]
              ]
  boxes = np.array(boxes)



  idx = np.random.choice(list(range(len(areas[areas!=0]))))
  h2, w2 = sizes[areas!=0][idx]
  x12, x22, y12,y22= boxes[areas!=0][idx]

  resized  = cv2.resize(mask_zero[x1: x2, y1: y2, : ], (h2,w2))

  img = augment_batch(img.copy())
  img = np.reshape(img, (img.shape[0],img.shape[1],img.shape[2]))
  resized = augment_batch(resized.copy())
  # print(resized.shape)
  # resized = np.reshape(resized, (resized.shape[:-1]))
  img[x12: x22, y12: y22] = resized
  # augment(resized.copy())
  # self.tensor(self.auto(self.pil(resized)))

  return cv2.resize(img, (ori_h1,ori_w1))

def saliency_bbox_return(img, lam):
    size = img.shape
    W = size[1]
    H = size[0]
    cut_rat = np.sqrt(1. - lam)
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)

    # initialize OpenCV's static fine grained saliency detector and
    # compute the saliency map
    temp_img = img.copy()

    saliency = cv2.saliency.StaticSaliencyFineGrained_create()

    (success, saliencyMap) = saliency.computeSaliency(temp_img)
    saliencyMap = (saliencyMap * 255).astype("uint8")

    maximum_indices = np.unravel_index(np.argmax(saliencyMap, axis=None), saliencyMap.shape)
    x = maximum_indices[0]
    y = maximum_indices[1]

    bbx1 = np.clip(x - cut_w // 2, 0, W)
    bby1 = np.clip(y - cut_h // 2, 0, H)
    bbx2 = np.clip(x + cut_w // 2, 0, W)
    bby2 = np.clip(y + cut_h // 2, 0, H)

    return bbx1, bby1, bbx2, bby2





# def augment(images):
#     # Input to `augment()` is a TensorFlow tensor which
#     # is not supported by `imgaug`. This is why we first
#     # convert it to its `numpy` variant.
#     rand_aug = iaa.RandAugment(n=3, m=7)
#     images = np.reshape(images, (1, images.shape[0], images.shape[1], images.shape[2]))

#     return rand_aug(images=images)

def saliency_bbox(img):
    beta = 1
    lam = np.random.beta(beta, beta)
    size = img.shape
    W = size[1]
    H = size[0]
    cut_rat = np.sqrt(1. - lam)
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)

    temp_img = img.copy()
    saliency = cv2.saliency.StaticSaliencyFineGrained_create()
    (success, saliencyMap) = saliency.computeSaliency(temp_img)
    saliencyMap = (saliencyMap * 255).astype("uint8")
    maximum_indices = np.unravel_index(np.argmax(saliencyMap, axis=None), saliencyMap.shape)
    x = maximum_indices[0]
    y = maximum_indices[1]

    bbx1 = np.clip(x - cut_w // 2, 0, W)
    bby1 = np.clip(y - cut_h // 2, 0, H)
    bbx2 = np.clip(x + cut_w // 2, 0, W)
    bby2 = np.clip(y + cut_h // 2, 0, H)
    x1, y1, x2, y2 = bbx1, bby1, bbx2, bby2
    return img[x1: x2, y1: y2, :]


# Load VGG16 for feature extraction
model = VGG16(weights="imagenet", include_top=False)

def extract_features(img_path):
    img = image.load_img(img_path, target_size=(650, 500))
    r= np.random.uniform()
    if r>0.5:
      x= KeepOriginalAug(image.img_to_array(img).astype('uint8'))
    else:
      x= saliency_bbox(image.img_to_array(img).astype('uint8'))
    # Check if the saliency_bbox returned a valid region
    if x is None or x.size == 0:
        print(f"Warning: No valid saliency region found for {img_path}. Using original image.")
        x = image.img_to_array(img)  # Use the original image if saliency fails

    x = cv2.resize(x, (500, 650))
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return model.predict(x)

def reduce_dimensions(features):
    pca = PCA()
    features = features.reshape(20, 15 * 512)
    pca.fit(features)
    features_trans = pca.transform(features)
    return np.squeeze(features_trans.reshape(1, 400))

# Saliency-Based Diversity with Fairness integration (with normalization)
def saliency_based_diversity(X):
    pairwise_distances = euclidean_distances(X)
    N = X.shape[0]
    diversity = np.sum(pairwise_distances) / (N * (N - 1))

    # Normalization step: divide by max possible distance
    max_distance = np.sqrt(np.sum((np.max(X, axis=0) - np.min(X, axis=0)) ** 2))
    normalized_diversity = diversity / max_distance if max_distance > 0 else diversity
    return normalized_diversity

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
def calculate_normalized_within_group_diversity(groups, saliency_features):
    unique_groups = np.unique(groups)
    group_diversities = {}

    for group in unique_groups:
        group_indices = np.where(groups == group)[0]
        group_features = saliency_features[group_indices]
        pairwise_distances = euclidean_distances(group_features)

        # Normalize distances between 0 and 1
        norm_distances = (pairwise_distances - np.min(pairwise_distances)) / np.ptp(pairwise_distances)
        group_diversities[group] = np.mean(norm_distances[np.triu_indices(len(group_features), k=1)]) if len(group_features) > 1 else 0

    return group_diversities

def calculate_inter_group_diversity(groups, saliency_features):
    unique_groups = np.unique(groups)
    inter_group_dists = []

    # Get the number of samples per group to handle imbalance
    group_sizes = {group: np.sum(groups == group) for group in unique_groups}

    for i in range(len(unique_groups)):
        for j in range(i + 1, len(unique_groups)):
            group_i_indices = np.where(groups == unique_groups[i])[0]
            group_j_indices = np.where(groups == unique_groups[j])[0]
            group_i_features = saliency_features[group_i_indices]
            group_j_features = saliency_features[group_j_indices]


            pairwise_distances = euclidean_distances(group_i_features, group_j_features)

            weighted_dist = np.mean(pairwise_distances) * group_sizes[unique_groups[i]] * group_sizes[unique_groups[j]]
            inter_group_dists.append(weighted_dist)

    denominator = (sum(group_sizes.values()) ** 2 - sum(group_sizes.values()))
    return np.sum(inter_group_dists) / denominator if denominator > 0 else 0.0
# def calculate_inter_group_diversity(groups, saliency_features):
#     unique_groups = np.unique(groups)
#     inter_group_dists = []

#     # Get the number of samples per group to handle imbalance
#     group_sizes = {group: np.sum(groups == group) for group in unique_groups}

#     for i in range(len(unique_groups)):
#         for j in range(i + 1, len(unique_groups)):
#             group_i_indices = np.where(groups == unique_groups[i])[0]
#             group_j_indices = np.where(groups == unique_groups[j])[0]
#             group_i_features = saliency_features[group_i_indices]
#             group_j_features = saliency_features[group_j_indices]
#             pairwise_distances = euclidean_distances(group_i_features, group_j_features)

#             # Normalize distances between 0 and 1
#             norm_distances = (pairwise_distances - np.min(pairwise_distances)) / np.ptp(pairwise_distances)
#             weighted_dist = np.mean(norm_distances) * group_sizes[unique_groups[i]] * group_sizes[unique_groups[j]]
#             inter_group_dists.append(weighted_dist)

#     return np.sum(inter_group_dists) / (sum(group_sizes.values()) ** 2 - sum(group_sizes.values())) if inter_group_dists else 0.0

def combined_fairness_diversity_metrics(groups, saliency_features, alpha=0.5, beta=0.5):
    # Calculate group sizes for weighting within-group diversity
    group_sizes = {group: np.sum(groups == group) for group in np.unique(groups)}
    total_samples = sum(group_sizes.values())

    # Calculate within-group diversity, weighted by group size
    group_diversities = calculate_normalized_within_group_diversity(groups, saliency_features)
    within_group_avg_diversity = np.sum([group_sizes[group] * group_diversities[group] for group in group_diversities]) / total_samples

    # Calculate inter-group diversity with class imbalance weighting
    inter_group_avg_diversity = calculate_inter_group_diversity(groups, saliency_features)

    # Combined metric using weighted within-group and inter-group diversities
    combined_metric = alpha * within_group_avg_diversity + beta * inter_group_avg_diversity
    return combined_metric, within_group_avg_diversity, inter_group_avg_diversity

def normalize_features(features):
    return features / np.linalg.norm(features, axis=1, keepdims=True)

def saliency_fairness_diversity_metric(image_set, groups):
    feat_list = []
    for img in image_set:
        feat_list.append(reduce_dimensions(extract_features(img)))

    # Normalize features before computing the diversity metric
    normalized_features = normalize_features(np.array(feat_list))
    return combined_fairness_diversity_metrics(groups, normalized_features, alpha=0.5, beta=0.5)


# Load datasets and compute diversity and fairness
print(os.listdir('Faces_Balance_Imbalance_Datasets'))
for run in range(5):

  for main_folder in os.listdir('Faces_Balance_Imbalance_Datasets'):
      if '.' not in main_folder:
          combined_metrics_results = []
          dataset_names = []
          diversity = []
          fairness = []
          for dataset_name in os.listdir('Faces_Balance_Imbalance_Datasets/' + main_folder):
              if '.' not in dataset_name:
                  print(main_folder, ' = ', dataset_name)
                  dataset_m = [f for f in glob(f'Faces_Balance_Imbalance_Datasets/' + main_folder + '/' + dataset_name + '/Male/*')]
                  dataset_f = [f for f in glob(f'Faces_Balance_Imbalance_Datasets/' + main_folder + '/' + dataset_name + '/Female/*')]
                  group_m = [0] * len(dataset_m)
                  group_f = [1] * len(dataset_f)
                  dataset = dataset_m + dataset_f
                  groups = group_m + group_f
                  combined_metric, within_group_avg_diversity, inter_group_avg_diversity = saliency_fairness_diversity_metric(dataset, groups)
                  combined_metrics_results.append(combined_metric)
                  dataset_names.append(dataset_name)
                  diversity.append(within_group_avg_diversity)
                  fairness.append(inter_group_avg_diversity)

          result_df = pd.DataFrame(list(zip(dataset_names, diversity, fairness, combined_metrics_results)),
                                  columns=['Dataset', 'WithinGroupDiversity', 'InterGroupDiversity', 'CombinedFairnessDiversityScore'])
          result_df.to_csv('/content/drive/MyDrive/Work2024PhD/NewMetricsDatasets/' + main_folder + "_all_dataset_augmentation"+str(run)+".csv")


['.DS_Store', 'Gender Balance', 'Gender Imbalance']
Gender Balance  =  DiverseDataset
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 968ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

 [0m[01;34mFaces_Balance_Imbalance_Datasets[0m/              'Gender Imbalance_all_dataset1.csv'
 Faces_Balance_Imbalance_Datasets.zip           'Gender Imbalance_all_dataset2.csv'
'Gender Balance_all_dataset1.csv'               'Gender Imbalance_all_dataset3.csv'
'Gender Balance_all_dataset2.csv'               'Gender Imbalance_all_dataset4.csv'
'Gender Balance_all_dataset3.csv'               'Gender Imbalance_all_dataset_augmentation1.csv'
'Gender Balance_all_dataset4.csv'               'Gender Imbalance_all_dataset_augmentation2.csv'
'Gender Balance_all_dataset_augmentation1.csv'  'Gender Imbalance_all_dataset_augmentation3.csv'
'Gender Balance_all_dataset_augmentation2.csv'  'Gender Imbalance_all_dataset_augmentation4.csv'
'Gender Balance_all_dataset_augmentation3.csv'  'Gender Imbalance_all_dataset_augmentation.csv'
'Gender Balance_all_dataset_augmentation4.csv'  'Gender Imbalance_all_dataset.csv'
'Gender Balance_all_dataset_augmentation.csv'    [01;34m__MACOSX[0m/
'Gender Ba

In [None]:
import os
import pandas as pd
import numpy as np
from glob import glob

# Load datasets and compute diversity and fairness
print(os.listdir('Faces_Balance_Imbalance_Datasets'))

# Initialize lists to store DataFrames for balance and imbalance datasets
balanced_results = []
imbalanced_results = []

# Read the existing CSV files
for run in range(3):  # Adjusted to read 5 runs
    for main_folder in os.listdir('Faces_Balance_Imbalance_Datasets'):
        if '.' not in main_folder:
            # Read the CSV file
            csv_file_path = f'{main_folder}_all_dataset_augmentation{run}.csv'
            if os.path.exists(csv_file_path):
                result_df = pd.read_csv(csv_file_path)

                # Check if the dataset is balanced or imbalanced based on folder naming
                if 'Balance' in csv_file_path:
                    balanced_results.append(result_df)
                elif 'Imbalance' in csv_file_path:
                    imbalanced_results.append(result_df)

# Combine results across all runs into single DataFrames
balanced_df = pd.concat(balanced_results, ignore_index=True)
imbalanced_df = pd.concat(imbalanced_results, ignore_index=True)

# Group by Dataset and calculate mean and std for the metrics for balanced datasets
balanced_summary_df = balanced_df.groupby('Dataset').agg({
    'WithinGroupDiversity': ['mean', 'std'],
    'InterGroupDiversity': ['mean', 'std'],
    'CombinedFairnessDiversityScore': ['mean', 'std']
}).reset_index()

# Group by Dataset and calculate mean and std for the metrics for imbalanced datasets
imbalanced_summary_df = imbalanced_df.groupby('Dataset').agg({
    'WithinGroupDiversity': ['mean', 'std'],
    'InterGroupDiversity': ['mean', 'std'],
    'CombinedFairnessDiversityScore': ['mean', 'std']
}).reset_index()

# Create a new DataFrame to hold the formatted results
final_summary_df = pd.DataFrame()

# Format mean ± std for balanced datasets
final_summary_df['Dataset'] = balanced_summary_df['Dataset']
final_summary_df['WithinGroupDiversity (Balanced)'] = balanced_summary_df.apply(
    lambda row: f"{row['WithinGroupDiversity']['mean']:.2f} \pm {row['WithinGroupDiversity']['std']:.2f}", axis=1
)
final_summary_df['InterGroupDiversity (Balanced)'] = balanced_summary_df.apply(
    lambda row: f"{row['InterGroupDiversity']['mean']:.2f} \pm {row['InterGroupDiversity']['std']:.2f}", axis=1
)
final_summary_df['CombinedFairnessDiversityScore (Balanced)'] = balanced_summary_df.apply(
    lambda row: f"{row['CombinedFairnessDiversityScore']['mean']:.2f} \pm {row['CombinedFairnessDiversityScore']['std']:.2f}", axis=1
)

# Prepare to format imbalanced datasets
imbalanced_formatted = []

# Append imbalanced datasets formatted results
for idx, row in imbalanced_summary_df.iterrows():
    imbalanced_formatted.append({
        'WithinGroupDiversity (Imbalanced)': f"{row['WithinGroupDiversity']['mean']:.2f} \pm {row['WithinGroupDiversity']['std']:.2f}",
        'InterGroupDiversity (Imbalanced)': f"{row['InterGroupDiversity']['mean']:.2f} \pm {row['InterGroupDiversity']['std']:.2f}",
        'CombinedFairnessDiversityScore (Imbalanced)': f"{row['CombinedFairnessDiversityScore']['mean']:.2f} \pm {row['CombinedFairnessDiversityScore']['std']:.2f}",
    })

# Convert the imbalanced formatted list to DataFrame
imbalanced_df_formatted = pd.DataFrame(imbalanced_formatted)

# Concatenate balanced and imbalanced results without causing extra rows
final_summary_df = pd.concat([final_summary_df.reset_index(drop=True),
                               imbalanced_df_formatted.reset_index(drop=True)], axis=1)

# Create LaTeX table format with column spans
latex_table = r"""
\begin{table}
\caption{Summary of Diversity and Fairness Metrics for Balanced and Imbalanced Datasets}
\label{tab:diversity_fairness_metrics}
\begin{tabular}{lccc|ccc}
\toprule
Dataset & \multicolumn{3}{c|}{Balanced} & \multicolumn{3}{c}{Imbalanced} \\
\cmidrule(lr){2-4} \cmidrule(lr){5-7}
 & WithinGroupDiversity & InterGroupDiversity & CombinedFairnessDiversityScore & WithinGroupDiversity & InterGroupDiversity & CombinedFairnessDiversityScore \\
\midrule
"""

# Append data rows
for index, row in final_summary_df.iterrows():
    latex_table += f"{row['Dataset']} & {row['WithinGroupDiversity (Balanced)']} & {row['InterGroupDiversity (Balanced)']} & {row['CombinedFairnessDiversityScore (Balanced)']} & {row['WithinGroupDiversity (Imbalanced)']} & {row['InterGroupDiversity (Imbalanced)']} & {row['CombinedFairnessDiversityScore (Imbalanced)']} \\\\\n"

latex_table += r"""\bottomrule
\end{tabular}
\end{table}
"""

# Save the LaTeX table to a .tex file
# with open('diversity_fairness_metrics.tex', 'w') as f:
#     f.write(latex_table)

print("LaTeX table saved successfully.")
print(latex_table)

In [None]:
import os
csv_path = 'Gender Imbalance_all_dataset_augmentation4.csv'
print(os.path.exists(csv_file_path), csv_path)
print(csv_path)

False Gender Imbalance_all_dataset_augmentation4.csv
Gender Imbalance_all_dataset_augmentation4.csv


In [None]:
cd /content/drive/MyDrive/Work2024PhD/Face Dataset/

/content/drive/MyDrive/Work2024PhD/Face Dataset


#  Baseline - Langague-Location to Gender!

In [None]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input
from sklearn.decomposition import PCA
from glob import glob
import cv2
from sklearn.metrics.pairwise import euclidean_distances
from numpy.linalg import norm

# Saliency Function (Provided)
def saliency_bbox(img):
    beta = 1
    lam = np.random.beta(beta, beta)
    size = img.shape
    W = size[1]
    H = size[0]
    cut_rat = np.sqrt(1. - lam)
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)

    temp_img = img.copy()
    saliency = cv2.saliency.StaticSaliencyFineGrained_create()
    (success, saliencyMap) = saliency.computeSaliency(temp_img)
    saliencyMap = (saliencyMap * 255).astype("uint8")
    maximum_indices = np.unravel_index(np.argmax(saliencyMap, axis=None), saliencyMap.shape)
    x = maximum_indices[0]
    y = maximum_indices[1]

    bbx1 = np.clip(x - cut_w // 2, 0, W)
    bby1 = np.clip(y - cut_h // 2, 0, H)
    bbx2 = np.clip(x + cut_w // 2, 0, W)
    bby2 = np.clip(y + cut_h // 2, 0, H)
    x1, y1, x2, y2 = bbx1, bby1, bbx2, bby2
    return img[x1: x2, y1: y2, :]


# Load VGG16 for feature extraction
model = VGG16(weights="imagenet", include_top=False)

def extract_features(img_path):
    img = image.load_img(img_path, target_size=(650, 500))
    x = saliency_bbox(image.img_to_array(img).astype('uint8'))

    # Check if the saliency_bbox returned a valid region
    if x is None or x.size == 0:
        print(f"Warning: No valid saliency region found for {img_path}. Using original image.")
        x = image.img_to_array(img)  # Use the original image if saliency fails

    x = cv2.resize(x, (500, 650))
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return model.predict(x)

def reduce_dimensions(features):
    pca = PCA()
    features = features.reshape(20, 15 * 512)
    pca.fit(features)
    features_trans = pca.transform(features)
    return np.squeeze(features_trans.reshape(1, 400))

# Saliency-Based Diversity with Fairness integration (with normalization)
def saliency_based_diversity(X):
    pairwise_distances = euclidean_distances(X)
    N = X.shape[0]
    diversity = np.sum(pairwise_distances) / (N * (N - 1))

    # Normalization step: divide by max possible distance
    max_distance = np.sqrt(np.sum((np.max(X, axis=0) - np.min(X, axis=0)) ** 2))
    normalized_diversity = diversity / max_distance if max_distance > 0 else diversity
    return normalized_diversity

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def calculate_normalized_within_group_diversity(groups, saliency_features):
    unique_groups = np.unique(groups)
    group_diversities = {}

    for group in unique_groups:
        group_indices = np.where(groups == group)[0]
        group_features = saliency_features[group_indices]
        pairwise_distances = euclidean_distances(group_features)

        # Normalize distances between 0 and 1
        norm_distances = (pairwise_distances - np.min(pairwise_distances)) / np.ptp(pairwise_distances)
        group_diversities[group] = np.mean(norm_distances[np.triu_indices(len(group_features), k=1)]) if len(group_features) > 1 else 0

    return group_diversities



def calculate_inter_group_diversity(groups, saliency_features):
    unique_groups = np.unique(groups)
    inter_group_dists = []

    # Get the number of samples per group to handle imbalance
    group_sizes = {group: np.sum(groups == group) for group in unique_groups}

    for i in range(len(unique_groups)):
        for j in range(i + 1, len(unique_groups)):
            group_i_indices = np.where(groups == unique_groups[i])[0]
            group_j_indices = np.where(groups == unique_groups[j])[0]
            group_i_features = saliency_features[group_i_indices]
            group_j_features = saliency_features[group_j_indices]


            pairwise_distances = euclidean_distances(group_i_features, group_j_features)

            weighted_dist = np.mean(pairwise_distances) * group_sizes[unique_groups[i]] * group_sizes[unique_groups[j]]
            inter_group_dists.append(weighted_dist)

    denominator = (sum(group_sizes.values()) ** 2 - sum(group_sizes.values()))
    return np.sum(inter_group_dists) / denominator if denominator > 0 else 0.0


def combined_fairness_diversity_metrics(groups, saliency_features, alpha=0.5, beta=0.5):
    # Calculate group sizes for weighting within-group diversity
    group_sizes = {group: np.sum(groups == group) for group in np.unique(groups)}
    total_samples = sum(group_sizes.values())

    # Calculate within-group diversity, weighted by group size
    group_diversities = calculate_normalized_within_group_diversity(groups, saliency_features)
    within_group_avg_diversity = np.sum([group_sizes[group] * group_diversities[group] for group in group_diversities]) / total_samples

    # Calculate inter-group diversity with class imbalance weighting
    inter_group_avg_diversity = calculate_inter_group_diversity(groups, saliency_features)

    # Combined metric using weighted within-group and inter-group diversities
    combined_metric = alpha * within_group_avg_diversity + beta * inter_group_avg_diversity
    return combined_metric, within_group_avg_diversity, inter_group_avg_diversity

def normalize_features(features):
    return features / np.linalg.norm(features, axis=1, keepdims=True)

def saliency_fairness_diversity_metric(image_set, groups):
    feat_list = []
    for img in image_set:
        feat_list.append(reduce_dimensions(extract_features(img)))

    # Normalize features before computing the diversity metric
    normalized_features = normalize_features(np.array(feat_list))
    return combined_fairness_diversity_metrics(groups, normalized_features, alpha=0.5, beta=0.5)


# Load datasets and compute diversity and fairness
print(os.listdir('./FacesCropped'))
# /content/drive/MyDrive/Work2024PhD/Face Dataset/FacesCropped
for run in range(1,5,1):
  combined_metrics_results = []
  dataset_names = []
  diversity = []
  fairness = []
  for main_folder in os.listdir('FacesCropped'):
      if '.' not in main_folder:


          print(main_folder, ' = ')
          dataset_m = [f for f in glob(f'FacesCropped/' + main_folder  + '/Man/*')]
          dataset_f = [f for f in glob(f'FacesCropped/' + main_folder + '/Woman/*')]
          group_m = [0] * len(dataset_m)
          group_f = [1] * len(dataset_f)
          dataset = dataset_m + dataset_f
          groups = group_m + group_f
          combined_metric, within_group_avg_diversity, inter_group_avg_diversity = saliency_fairness_diversity_metric(dataset, groups)
          combined_metrics_results.append(combined_metric)
          dataset_names.append(main_folder)
          diversity.append(within_group_avg_diversity)
          fairness.append(inter_group_avg_diversity)

  result_df = pd.DataFrame(list(zip(dataset_names, diversity, fairness, combined_metrics_results)),
                            columns=['Dataset', 'WithinGroupDiversity', 'InterGroupDiversity', 'CombinedFairnessDiversityScore'])
  result_df.to_csv('./FacesCropped/' + "Baseline_all_dataset"+str(run)+".csv")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step

In [None]:
cd /content/drive/MyDrive/Work2024PhD/Face Dataset

/content/drive/MyDrive/Work2024PhD/Face Dataset


In [None]:
import os
import pandas as pd

# Initialize lists to store DataFrames for all runs
results = []

# Loop through each run to read metrics from existing CSVs for Baseline and Augmentation
for run in range(5):
    # Define the path to the CSV file for the current run (Baseline)
    # csv_file_path_baseline = f'./FacesCropped/Baseline_all_dataset{run}.csv'
    # # Define the path to the CSV file for the current run (Augmentation)
    # csv_file_path_aug = f'./FacesCropped/Augmentation_all_dataset{run}.csv'

    # Define the path to the CSV file for the current run (Baseline)
    csv_file_path_baseline = f'./FacesCropped/Augmentation_all_dataset{run}.csv'
    # Define the path to the CSV file for the current run (Augmentation)
    csv_file_path_aug = f'./FacesCropped/Baseline_all_dataset{run}.csv'


    # Read the Baseline CSV file into a DataFrame
    if os.path.exists(csv_file_path_baseline):
        result_df_baseline = pd.read_csv(csv_file_path_baseline)

    # Read the Augmentation CSV file into a DataFrame
    if os.path.exists(csv_file_path_aug):
        result_df_aug = pd.read_csv(csv_file_path_aug)

    # Combine metrics from Baseline and Augmentation
    if os.path.exists(csv_file_path_baseline) and os.path.exists(csv_file_path_aug):
        # Merge the two DataFrames on the 'Dataset' column
        combined_df = pd.merge(result_df_baseline, result_df_aug, on='Dataset', suffixes=('_baseline', '_aug'))

        # Append to the results list
        results.append(combined_df)

# Combine results from all runs into a single DataFrame
all_results_df = pd.concat(results, ignore_index=True)

# Group by Dataset and calculate mean and std for the metrics for both Baseline and Augmentation
summary_df = all_results_df.groupby('Dataset').agg({
    'WithinGroupDiversity_baseline': ['mean', 'std'],
    'InterGroupDiversity_baseline': ['mean', 'std'],
    'CombinedFairnessDiversityScore_baseline': ['mean', 'std'],
    'WithinGroupDiversity_aug': ['mean', 'std'],
    'InterGroupDiversity_aug': ['mean', 'std'],
    'CombinedFairnessDiversityScore_aug': ['mean', 'std']
}).reset_index()

# Create a new DataFrame to hold the formatted results
final_summary_df = pd.DataFrame()

# Format mean ± std for each metric
final_summary_df['Dataset'] = summary_df['Dataset']
final_summary_df['WithinGroupDiversity (Baseline)'] = summary_df.apply(
    lambda row: f"{row['WithinGroupDiversity_baseline']['mean']:.2f} \pm {row['WithinGroupDiversity_baseline']['std']:.2f}", axis=1
)
final_summary_df['InterGroupDiversity (Baseline)'] = summary_df.apply(
    lambda row: f"{row['InterGroupDiversity_baseline']['mean']:.2f} \pm {row['InterGroupDiversity_baseline']['std']:.2f}", axis=1
)
final_summary_df['CombinedFairnessDiversityScore (Baseline)'] = summary_df.apply(
    lambda row: f"{row['CombinedFairnessDiversityScore_baseline']['mean']:.2f} \pm {row['CombinedFairnessDiversityScore_baseline']['std']:.2f}", axis=1
)
final_summary_df['WithinGroupDiversity (Augmentation)'] = summary_df.apply(
    lambda row: f"{row['WithinGroupDiversity_aug']['mean']:.2f} \pm {row['WithinGroupDiversity_aug']['std']:.2f}", axis=1
)
final_summary_df['InterGroupDiversity (Augmentation)'] = summary_df.apply(
    lambda row: f"{row['InterGroupDiversity_aug']['mean']:.2f} \pm {row['InterGroupDiversity_aug']['std']:.2f}", axis=1
)
final_summary_df['CombinedFairnessDiversityScore (Augmentation)'] = summary_df.apply(
    lambda row: f"{row['CombinedFairnessDiversityScore_aug']['mean']:.2f} \pm {row['CombinedFairnessDiversityScore_aug']['std']:.2f}", axis=1
)

# Create LaTeX table format
latex_table = r"""
\begin{table*}
\centering
\caption{Gender Diversity and Fairness Metrics across different Language Location pair query}
\label{tab:diversity_fairness_metrics}
\begin{tabular}{lccc|ccc}
\toprule
Language Location Pair Dataset &
\multicolumn{3}{c|}{Baseline} & \multicolumn{3}{c}{With FaceKeepOriginalAugmentaiton} \\
\cmidrule(lr){2-4} \cmidrule(lr){5-7}
 & $D_{within}$ & $D_{inter}$ & $M_{\text{fairness-diversity}}$ & $D_{within}$ & $D_{inter}$ & $M_{\text{fairness-diversity}}$ \\
 \midrule
"""

# Append data rows
for index, row in final_summary_df.iterrows():
    latex_table += f"{row['Dataset']} & {row['WithinGroupDiversity (Baseline)']} & {row['InterGroupDiversity (Baseline)']} & {row['CombinedFairnessDiversityScore (Baseline)']} & {row['WithinGroupDiversity (Augmentation)']} & {row['InterGroupDiversity (Augmentation)']} & {row['CombinedFairnessDiversityScore (Augmentation)']} \\\\\n"

latex_table += r"""\bottomrule
\end{tabular}
\end{table*}
"""

# Save the LaTeX table to a .tex file
# with open('diversity_fairness_metrics.tex', 'w') as f:
#     f.write(latex_table)

# print("LaTeX table saved successfully.")
print(latex_table)


\begin{table*}
\centering
\caption{Gender Diversity and Fairness Metrics across different Language Location pair query}
\label{tab:diversity_fairness_metrics}
\begin{tabular}{lccc|ccc}
\toprule
Language Location Pair Dataset &
\multicolumn{3}{c|}{Baseline} & \multicolumn{3}{c}{With FaceKeepOriginalAugmentaiton} \\
\cmidrule(lr){2-4} \cmidrule(lr){5-7}
 & $D_{within}$ & $D_{inter}$ & $M_{\text{fairness-diversity}}$ & $D_{within}$ & $D_{inter}$ & $M_{\text{fairness-diversity}}$ \\
 \midrule
Arabic-West Asia & North Africa & 0.59 \pm 0.27 & 0.49 \pm 0.16 & 0.54 \pm 0.06 & 0.79 \pm 0.01 & 0.35 \pm 0.00 & 0.57 \pm 0.01 \\
English - North America & 0.59 \pm 0.27 & 0.49 \pm 0.16 & 0.54 \pm 0.06 & 0.81 \pm 0.01 & 0.35 \pm 0.00 & 0.58 \pm 0.01 \\
English-West Europe & 0.59 \pm 0.28 & 0.50 \pm 0.17 & 0.55 \pm 0.06 & 0.80 \pm 0.02 & 0.35 \pm 0.00 & 0.57 \pm 0.01 \\
Hindi-South Asia & 0.59 \pm 0.27 & 0.48 \pm 0.15 & 0.53 \pm 0.06 & 0.80 \pm 0.01 & 0.35 \pm 0.00 & 0.57 \pm 0.00 \\
Indonesian-South

# Augmentation - Langague-Location to Gender!

In [None]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input
from sklearn.decomposition import PCA
from glob import glob
import cv2
from sklearn.metrics.pairwise import euclidean_distances
from numpy.linalg import norm
import random


import numpy as np
import torch
from PIL import Image
import cv2
from imgaug import augmenters as iaa
from PIL import Image, ImageEnhance, ImageOps
import albumentations as A

import warnings
warnings.filterwarnings('ignore')

import random
import numpy as np
from PIL import Image, ImageEnhance, ImageOps

def apply_random_augmentation(image, num_transforms=3, magnitude=7):
    """
    Apply random augmentations to an image.

    Args:
        image (PIL.Image.Image): Input image.
        num_transforms (int): Number of random transformations to apply.
        magnitude (int): Magnitude of the transformations.

    Returns:
        PIL.Image.Image: Augmented image.
    """
    augmentations = [
        ('flip_lr', lambda img: img.transpose(Image.FLIP_LEFT_RIGHT)),
        ('flip_ud', lambda img: img.transpose(Image.FLIP_TOP_BOTTOM)),
        ('rotate', lambda img: img.rotate(random.uniform(-magnitude*3, magnitude*3))),
        ('brightness', lambda img: ImageEnhance.Brightness(img).enhance(random.uniform(0.5, 1.5))),
        ('contrast', lambda img: ImageEnhance.Contrast(img).enhance(random.uniform(0.5, 1.5))),
        ('color', lambda img: ImageEnhance.Color(img).enhance(random.uniform(0.5, 1.5))),
        ('sharpness', lambda img: ImageEnhance.Sharpness(img).enhance(random.uniform(0.5, 2.0))),
        ('autocontrast', lambda img: ImageOps.autocontrast(img)),
        ('solarize', lambda img: ImageOps.solarize(img, threshold=random.uniform(128 - magnitude * 10, 128))),
        ('invert', lambda img: ImageOps.invert(img)),
    ]

    # Randomly select augmentations
    selected_augmentations = random.sample(augmentations, num_transforms)

    # Apply augmentations
    for name, aug_fn in selected_augmentations:
        image = aug_fn(image)

    return image

def augment_batch(images, num_transforms=3, magnitude=7):
    """
    Apply random augmentations to a batch of images.

    Args:
        images (numpy.ndarray): Array of images (H, W, C).
        num_transforms (int): Number of random transformations to apply per image.
        magnitude (int): Magnitude of the transformations.

    Returns:
        numpy.ndarray: Array of augmented images.
    """
    augmented_images = []
    for img_array in images:
        img_pil = Image.fromarray(img_array.astype('uint8'))  # Convert numpy array to PIL Image
        augmented_img_pil = apply_random_augmentation(img_pil, num_transforms, magnitude)
        augmented_images.append(np.array(augmented_img_pil))  # Convert back to numpy array

    return np.array(augmented_images)

# Example usage:
# Assume images is a numpy array of shape (batch_size, height, width, channels)
# augmented_images = augment_batch(images, num_transforms=3, magnitude=7)


def KeepOriginalAug(img):


  ori_h1,ori_w1,c = img.shape
  img  = cv2.resize(img, (512, 512))
  h1,w1,c = img.shape
  mask_zero = np.zeros((h1,w1, c), img.dtype)


  beta=1
  lam = np.random.beta(beta, beta)
  x1, y1,x2,y2 = saliency_bbox_return(img, lam)


  # if no saliency detected, then pick the next image
  if x1==x2 or y1==y2:
      return img



  mask_zero[x1: x2, y1: y2, :] = img[x1: x2, y1: y2,:]


  sizes= [(w1-((w1-y2)+(y2-y1)), h1-((h1-x2)+(x2-x1))),
  (w1-((w1-y2)+(y2-y1)), x2-x1),
  (w1-((w1-y2)+(y2-y1)), h1-x2),
  (y2-y1, h1-((h1-x2)+(x2-x1))),
  (y2-y1, h1-x2),
  (w1-y2, h1-((h1-x2)+(x2-x1))),
  (w1-y2, x2-x1),
  (w1-y2, h1-x2)
  ]
  areas=[]
  for sz in sizes:
      areas.append(sz[0]*sz[1])

  areas= np.array(areas)
  sizes = np.array(sizes)


  big  = 512
  #  size with its bound box (x1,x2,y1,y2)
  boxes = [[0,x1,0,y1],
              [x1,x2,0,y1],
              [x2,big,0,y1],
              [0,x1,y1,y2],
              [x2,big,y1,y2],
              [0,x1,y2,big],
              [x1,x2,y2,big],
              [x2,big,y2,big]
              ]
  boxes = np.array(boxes)



  idx = np.random.choice(list(range(len(areas[areas!=0]))))
  h2, w2 = sizes[areas!=0][idx]
  x12, x22, y12,y22= boxes[areas!=0][idx]

  resized  = cv2.resize(mask_zero[x1: x2, y1: y2, : ], (h2,w2))

  img = augment_batch(img.copy())
  img = np.reshape(img, (img.shape[0],img.shape[1],img.shape[2]))
  resized = augment_batch(resized.copy())
  # print(resized.shape)
  # resized = np.reshape(resized, (resized.shape[:-1]))
  img[x12: x22, y12: y22] = resized
  # augment(resized.copy())
  # self.tensor(self.auto(self.pil(resized)))

  return cv2.resize(img, (ori_h1,ori_w1))

def saliency_bbox_return(img, lam):
    size = img.shape
    W = size[1]
    H = size[0]
    cut_rat = np.sqrt(1. - lam)
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)

    # initialize OpenCV's static fine grained saliency detector and
    # compute the saliency map
    temp_img = img.copy()

    saliency = cv2.saliency.StaticSaliencyFineGrained_create()

    (success, saliencyMap) = saliency.computeSaliency(temp_img)
    saliencyMap = (saliencyMap * 255).astype("uint8")

    maximum_indices = np.unravel_index(np.argmax(saliencyMap, axis=None), saliencyMap.shape)
    x = maximum_indices[0]
    y = maximum_indices[1]

    bbx1 = np.clip(x - cut_w // 2, 0, W)
    bby1 = np.clip(y - cut_h // 2, 0, H)
    bbx2 = np.clip(x + cut_w // 2, 0, W)
    bby2 = np.clip(y + cut_h // 2, 0, H)

    return bbx1, bby1, bbx2, bby2





# def augment(images):
#     # Input to `augment()` is a TensorFlow tensor which
#     # is not supported by `imgaug`. This is why we first
#     # convert it to its `numpy` variant.
#     rand_aug = iaa.RandAugment(n=3, m=7)
#     images = np.reshape(images, (1, images.shape[0], images.shape[1], images.shape[2]))

#     return rand_aug(images=images)

def saliency_bbox(img):
    beta = 1
    lam = np.random.beta(beta, beta)
    size = img.shape
    W = size[1]
    H = size[0]
    cut_rat = np.sqrt(1. - lam)
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)

    temp_img = img.copy()
    saliency = cv2.saliency.StaticSaliencyFineGrained_create()
    (success, saliencyMap) = saliency.computeSaliency(temp_img)
    saliencyMap = (saliencyMap * 255).astype("uint8")
    maximum_indices = np.unravel_index(np.argmax(saliencyMap, axis=None), saliencyMap.shape)
    x = maximum_indices[0]
    y = maximum_indices[1]

    bbx1 = np.clip(x - cut_w // 2, 0, W)
    bby1 = np.clip(y - cut_h // 2, 0, H)
    bbx2 = np.clip(x + cut_w // 2, 0, W)
    bby2 = np.clip(y + cut_h // 2, 0, H)
    x1, y1, x2, y2 = bbx1, bby1, bbx2, bby2
    return img[x1: x2, y1: y2, :]


# Load VGG16 for feature extraction
model = VGG16(weights="imagenet", include_top=False)

def extract_features(img_path):
    img = image.load_img(img_path, target_size=(650, 500))
    r= np.random.uniform()
    x= KeepOriginalAug(image.img_to_array(img).astype('uint8'))
    # x= saliency_bbox(image.img_to_array(img).astype('uint8'))
    x= saliency_bbox(x)
    # Check if the saliency_bbox returned a valid region
    if x is None or x.size == 0:
        print(f"Warning: No valid saliency region found for {img_path}. Using original image.")
        x = image.img_to_array(img)  # Use the original image if saliency fails

    x = cv2.resize(x, (500, 650))
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return model.predict(x)

def reduce_dimensions(features):
    pca = PCA()
    features = features.reshape(20, 15 * 512)
    pca.fit(features)
    features_trans = pca.transform(features)
    return np.squeeze(features_trans.reshape(1, 400))

# Saliency-Based Diversity with Fairness integration (with normalization)
def saliency_based_diversity(X):
    pairwise_distances = euclidean_distances(X)
    N = X.shape[0]
    diversity = np.sum(pairwise_distances) / (N * (N - 1))

    # Normalization step: divide by max possible distance
    max_distance = np.sqrt(np.sum((np.max(X, axis=0) - np.min(X, axis=0)) ** 2))
    normalized_diversity = diversity / max_distance if max_distance > 0 else diversity
    return normalized_diversity

# # Normalizing group contributions based on their size
# def calculate_normalized_within_group_diversity(groups, saliency_features):
#     unique_groups, group_counts = np.unique(groups, return_counts=True)
#     total_samples = len(groups)
#     group_diversities = {}

#     for group, count in zip(unique_groups, group_counts):
#         group_indices = np.where(groups == group)[0]
#         group_saliency_features = saliency_features[group_indices]
#         diversity = saliency_based_diversity(group_saliency_features)

#         # Normalize by the proportion of the group in the dataset
#         group_diversities[group] = (count / total_samples) * diversity

#     return group_diversities

# def calculate_inter_group_diversity(groups, saliency_features):
#     unique_groups = np.unique(groups)
#     inter_group_dists = []

#     for i in range(len(unique_groups)):
#         for j in range(i + 1, len(unique_groups)):
#             group_i_indices = np.where(groups == unique_groups[i])[0]
#             group_j_indices = np.where(groups == unique_groups[j])[0]
#             group_i_features = saliency_features[group_i_indices]
#             group_j_features = saliency_features[group_j_indices]
#             pairwise_distances = euclidean_distances(group_i_features, group_j_features)

#             # Normalize distances between 0 and 1
#             norm_distances = (pairwise_distances - np.min(pairwise_distances)) / np.ptp(pairwise_distances)
#             inter_group_dists.append(np.mean(norm_distances))

#     return np.mean(inter_group_dists) if inter_group_dists else 0.0

# def combined_fairness_diversity_metrics(groups, saliency_features, alpha=0.5, beta=0.5):
#     group_diversities = calculate_normalized_within_group_diversity(groups, saliency_features)
#     within_group_avg_diversity = np.sum(list(group_diversities.values()))  # Sum of normalized diversities
#     inter_group_avg_diversity = calculate_inter_group_diversity(groups, saliency_features)
#     combined_metric = alpha * within_group_avg_diversity + beta * inter_group_avg_diversity
#     return combined_metric, within_group_avg_diversity, inter_group_avg_diversity

# def normalize_features(features):
#     return features / np.linalg.norm(features, axis=1, keepdims=True)

# def saliency_fairness_diversity_metric(image_set, groups):
#     feat_list = []
#     for img in image_set:
#         feat_list.append(reduce_dimensions(extract_features(img)))

#     # Normalize features before computing the diversity metric
#     normalized_features = normalize_features(np.array(feat_list))
#     return combined_fairness_diversity_metrics(groups, normalized_features, alpha=0.5, beta=0.5)

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def calculate_normalized_within_group_diversity(groups, saliency_features):
    unique_groups = np.unique(groups)
    group_diversities = {}

    for group in unique_groups:
        group_indices = np.where(groups == group)[0]
        group_features = saliency_features[group_indices]
        pairwise_distances = euclidean_distances(group_features)

        # Normalize distances between 0 and 1
        norm_distances = (pairwise_distances - np.min(pairwise_distances)) / np.ptp(pairwise_distances)
        group_diversities[group] = np.mean(norm_distances[np.triu_indices(len(group_features), k=1)]) if len(group_features) > 1 else 0

    return group_diversities
def calculate_inter_group_diversity(groups, saliency_features):
    unique_groups = np.unique(groups)
    inter_group_dists = []

    # Get the number of samples per group to handle imbalance
    group_sizes = {group: np.sum(groups == group) for group in unique_groups}

    for i in range(len(unique_groups)):
        for j in range(i + 1, len(unique_groups)):
            group_i_indices = np.where(groups == unique_groups[i])[0]
            group_j_indices = np.where(groups == unique_groups[j])[0]
            group_i_features = saliency_features[group_i_indices]
            group_j_features = saliency_features[group_j_indices]


            pairwise_distances = euclidean_distances(group_i_features, group_j_features)

            weighted_dist = np.mean(pairwise_distances) * group_sizes[unique_groups[i]] * group_sizes[unique_groups[j]]
            inter_group_dists.append(weighted_dist)

    denominator = (sum(group_sizes.values()) ** 2 - sum(group_sizes.values()))
    return np.sum(inter_group_dists) / denominator if denominator > 0 else 0.0



def combined_fairness_diversity_metrics(groups, saliency_features, alpha=0.5, beta=0.5):
    # Calculate group sizes for weighting within-group diversity
    group_sizes = {group: np.sum(groups == group) for group in np.unique(groups)}
    total_samples = sum(group_sizes.values())

    # Calculate within-group diversity, weighted by group size
    group_diversities = calculate_normalized_within_group_diversity(groups, saliency_features)
    within_group_avg_diversity = np.sum([group_sizes[group] * group_diversities[group] for group in group_diversities]) / total_samples

    # Calculate inter-group diversity with class imbalance weighting
    inter_group_avg_diversity = calculate_inter_group_diversity(groups, saliency_features)

    # Combined metric using weighted within-group and inter-group diversities
    combined_metric = alpha * within_group_avg_diversity + beta * inter_group_avg_diversity
    return combined_metric, within_group_avg_diversity, inter_group_avg_diversity

def normalize_features(features):
    return features / np.linalg.norm(features, axis=1, keepdims=True)

def saliency_fairness_diversity_metric(image_set, groups):
    feat_list = []
    for img in image_set:
        feat_list.append(reduce_dimensions(extract_features(img)))

    # Normalize features before computing the diversity metric
    normalized_features = normalize_features(np.array(feat_list))
    return combined_fairness_diversity_metrics(groups, normalized_features, alpha=0.5, beta=0.5)


print(os.listdir('./FacesCropped'))
for run in range(5):
  combined_metrics_results = []
  dataset_names = []
  diversity = []
  fairness = []
  for main_folder in os.listdir('FacesCropped'):
      if '.' not in main_folder:


          print(main_folder, ' = ')
          dataset_m = [f for f in glob(f'FacesCropped/' + main_folder  + '/Man/*')]
          dataset_f = [f for f in glob(f'FacesCropped/' + main_folder + '/Woman/*')]
          group_m = [0] * len(dataset_m)
          group_f = [1] * len(dataset_f)
          dataset = dataset_m + dataset_f
          groups = group_m + group_f
          combined_metric, within_group_avg_diversity, inter_group_avg_diversity = saliency_fairness_diversity_metric(dataset, groups)
          combined_metrics_results.append(combined_metric)
          dataset_names.append(main_folder)
          diversity.append(within_group_avg_diversity)
          fairness.append(inter_group_avg_diversity)

  result_df = pd.DataFrame(list(zip(dataset_names, diversity, fairness, combined_metrics_results)),
                            columns=['Dataset', 'WithinGroupDiversity', 'InterGroupDiversity', 'CombinedFairnessDiversityScore'])
  result_df.to_csv('./FacesCropped/Augmentation_all_dataset'+str(run)+".csv")


['Arabic-West Asia & North Africa', 'English - North America', 'English-West Europe', 'Hindi-South Asia', 'Indonesian-South East Asia', 'Mandarin-East Asia', 'Russian-East Europe', 'Spanish-Latin America', 'Swahili-Sub Saharan Africa', '.ipynb_checkpoints', '.ipynb_checkpoints_all_dataset.csv', 'Baseline_all_dataset_0.csv', 'Augmentation_all_dataset2.csv', 'Augmentation_all_dataset1.csv', 'Augmentation_all_dataset3.csv', 'Augmentation_all_dataset4.csv', 'Baseline_all_dataset1.csv', 'Baseline_all_dataset2.csv', 'Baseline_all_dataset3.csv', 'Baseline_all_dataset4.csv', 'Augmentation_all_dataset0.csv']
Arabic-West Asia & North Africa  = 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 312ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/s

# Baseline - Professions to Language_Location

In [None]:
cd /content/drive/MyDrive/Face Dataset

/content/drive/.shortcut-targets-by-id/1sw-jjmBRYby1-CEhKKOt6Tba4zKTnI2L/Face Dataset


In [None]:

import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input
from sklearn.decomposition import PCA
from glob import glob
import cv2
from sklearn.metrics.pairwise import euclidean_distances
from numpy.linalg import norm

# Saliency Function (Provided)
def saliency_bbox(img):
    beta = 1
    lam = np.random.beta(beta, beta)
    size = img.shape
    W = size[1]
    H = size[0]
    cut_rat = np.sqrt(1. - lam)
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)

    temp_img = img.copy()
    saliency = cv2.saliency.StaticSaliencyFineGrained_create()
    (success, saliencyMap) = saliency.computeSaliency(temp_img)
    saliencyMap = (saliencyMap * 255).astype("uint8")
    maximum_indices = np.unravel_index(np.argmax(saliencyMap, axis=None), saliencyMap.shape)
    x = maximum_indices[0]
    y = maximum_indices[1]

    bbx1 = np.clip(x - cut_w // 2, 0, W)
    bby1 = np.clip(y - cut_h // 2, 0, H)
    bbx2 = np.clip(x + cut_w // 2, 0, W)
    bby2 = np.clip(y + cut_h // 2, 0, H)
    x1, y1, x2, y2 = bbx1, bby1, bbx2, bby2
    return img[x1: x2, y1: y2, :]


# Load VGG16 for feature extraction
model = VGG16(weights="imagenet", include_top=False)

def extract_features(img_path):
    img = image.load_img(img_path, target_size=(650, 500))
    x = saliency_bbox(image.img_to_array(img).astype('uint8'))

    # Check if the saliency_bbox returned a valid region
    if x is None or x.size == 0:
        print(f"Warning: No valid saliency region found for {img_path}. Using original image.")
        x = image.img_to_array(img)  # Use the original image if saliency fails

    x = cv2.resize(x, (500, 650))
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return model.predict(x)

def reduce_dimensions(features):
    pca = PCA()
    features = features.reshape(20, 15 * 512)
    pca.fit(features)
    features_trans = pca.transform(features)
    return np.squeeze(features_trans.reshape(1, 400))

# Saliency-Based Diversity with Fairness integration (with normalization)
def saliency_based_diversity(X):
    pairwise_distances = euclidean_distances(X)
    N = X.shape[0]
    diversity = np.sum(pairwise_distances) / (N * (N - 1))

    # Normalization step: divide by max possible distance
    max_distance = np.sqrt(np.sum((np.max(X, axis=0) - np.min(X, axis=0)) ** 2))
    normalized_diversity = diversity / max_distance if max_distance > 0 else diversity
    return normalized_diversity

# # Normalizing group contributions based on their size
# def calculate_normalized_within_group_diversity(groups, saliency_features):
#     unique_groups, group_counts = np.unique(groups, return_counts=True)
#     total_samples = len(groups)
#     group_diversities = {}

#     for group, count in zip(unique_groups, group_counts):
#         group_indices = np.where(groups == group)[0]
#         group_saliency_features = saliency_features[group_indices]
#         diversity = saliency_based_diversity(group_saliency_features)

#         # Normalize by the proportion of the group in the dataset
#         group_diversities[group] = (count / total_samples) * diversity

#     return group_diversities

# def calculate_inter_group_diversity(groups, saliency_features):
#     unique_groups = np.unique(groups)
#     inter_group_dists = []

#     for i in range(len(unique_groups)):
#         for j in range(i + 1, len(unique_groups)):
#             group_i_indices = np.where(groups == unique_groups[i])[0]
#             group_j_indices = np.where(groups == unique_groups[j])[0]
#             group_i_features = saliency_features[group_i_indices]
#             group_j_features = saliency_features[group_j_indices]
#             pairwise_distances = euclidean_distances(group_i_features, group_j_features)

#             # Normalize distances between 0 and 1
#             norm_distances = (pairwise_distances - np.min(pairwise_distances)) / np.ptp(pairwise_distances)
#             inter_group_dists.append(np.mean(norm_distances))

#     return np.mean(inter_group_dists) if inter_group_dists else 0.0

# def combined_fairness_diversity_metrics(groups, saliency_features, alpha=0.5, beta=0.5):
#     group_diversities = calculate_normalized_within_group_diversity(groups, saliency_features)
#     within_group_avg_diversity = np.sum(list(group_diversities.values()))  # Sum of normalized diversities
#     inter_group_avg_diversity = calculate_inter_group_diversity(groups, saliency_features)
#     combined_metric = alpha * within_group_avg_diversity + beta * inter_group_avg_diversity
#     return combined_metric, within_group_avg_diversity, inter_group_avg_diversity

# def normalize_features(features):
#     return features / np.linalg.norm(features, axis=1, keepdims=True)

# def saliency_fairness_diversity_metric(image_set, groups):
#     feat_list = []
#     print(len(image_set))
#     for img in image_set:
#         feat_list.append(reduce_dimensions(extract_features(img)))
#     print('Feature list ', len(feat_list))
#     # Normalize features before computing the diversity metric

#     normalized_features = normalize_features(np.array(feat_list))
#     return combined_fairness_diversity_metrics(groups, normalized_features, alpha=0.5, beta=0.5)

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def calculate_normalized_within_group_diversity(groups, saliency_features):
    unique_groups = np.unique(groups)
    group_diversities = {}

    for group in unique_groups:
        group_indices = np.where(groups == group)[0]
        group_features = saliency_features[group_indices]
        pairwise_distances = euclidean_distances(group_features)

        # Normalize distances between 0 and 1
        norm_distances = (pairwise_distances - np.min(pairwise_distances)) / np.ptp(pairwise_distances)
        group_diversities[group] = np.mean(norm_distances[np.triu_indices(len(group_features), k=1)]) if len(group_features) > 1 else 0

    return group_diversities

def calculate_inter_group_diversity(groups, saliency_features):
    unique_groups = np.unique(groups)
    inter_group_dists = []

    # Get the number of samples per group to handle imbalance
    group_sizes = {group: np.sum(groups == group) for group in unique_groups}

    for i in range(len(unique_groups)):
        for j in range(i + 1, len(unique_groups)):
            group_i_indices = np.where(groups == unique_groups[i])[0]
            group_j_indices = np.where(groups == unique_groups[j])[0]
            group_i_features = saliency_features[group_i_indices]
            group_j_features = saliency_features[group_j_indices]


            pairwise_distances = euclidean_distances(group_i_features, group_j_features)

            weighted_dist = np.mean(pairwise_distances) * group_sizes[unique_groups[i]] * group_sizes[unique_groups[j]]
            inter_group_dists.append(weighted_dist)

    denominator = (sum(group_sizes.values()) ** 2 - sum(group_sizes.values()))
    return np.sum(inter_group_dists) / denominator if denominator > 0 else 0.0


def combined_fairness_diversity_metrics(groups, saliency_features, alpha=0.5, beta=0.5):
    # Calculate group sizes for weighting within-group diversity
    group_sizes = {group: np.sum(groups == group) for group in np.unique(groups)}
    total_samples = sum(group_sizes.values())

    # Calculate within-group diversity, weighted by group size
    group_diversities = calculate_normalized_within_group_diversity(groups, saliency_features)
    within_group_avg_diversity = np.sum([group_sizes[group] * group_diversities[group] for group in group_diversities]) / total_samples

    # Calculate inter-group diversity with class imbalance weighting
    inter_group_avg_diversity = calculate_inter_group_diversity(groups, saliency_features)

    # Combined metric using weighted within-group and inter-group diversities
    combined_metric = alpha * within_group_avg_diversity + beta * inter_group_avg_diversity
    return combined_metric, within_group_avg_diversity, inter_group_avg_diversity

def normalize_features(features):
    return features / np.linalg.norm(features, axis=1, keepdims=True)

def saliency_fairness_diversity_metric(image_set, groups):
    feat_list = []
    for img in image_set:
        feat_list.append(reduce_dimensions(extract_features(img)))

    # Normalize features before computing the diversity metric
    normalized_features = normalize_features(np.array(feat_list))
    return combined_fairness_diversity_metrics(groups, normalized_features, alpha=0.5, beta=0.5)

# Load datasets and compute diversity and fairness
print(os.listdir('Faces'))
for run in range(5):
  combined_metrics_results = []
  dataset_names = []
  diversity = []
  fairness = []
  for profession in os.listdir('Faces'):
      if '.' not in profession:
        groups =[]
        datasets = []
        count=0
        for LangLoc in os.listdir('Faces/' + profession):

          dataset_ = [f for f in glob(f'Faces/' + profession+'/'+LangLoc+'/*')]
          datasets = datasets+ dataset_


          group_ = [count] * len(dataset_)
          groups = groups+group_
          count+=1

        combined_metric, within_group_avg_diversity, inter_group_avg_diversity = saliency_fairness_diversity_metric(datasets, groups)
        combined_metrics_results.append(combined_metric)
        dataset_names.append(profession)
        diversity.append(within_group_avg_diversity)
        fairness.append(inter_group_avg_diversity)

  result_df = pd.DataFrame(list(zip(dataset_names, diversity, fairness, combined_metrics_results)),
                            columns=['Profession', 'WithinGroupDiversity', 'InterGroupDiversity', 'CombinedFairnessDiversityScore'])
  result_df.to_csv('FaceResults/' + "Baseline_profession_LangLoc"+str(run)+".csv")


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 0us/step
['Nurse', 'Engineer', 'Politician', 'CEO', 'School Teacher']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/st

In [None]:
os.listdir('FaceResults')

['Baseline_profession_LangLoc1.csv',
 'Baseline_profession_LangLoc2.csv',
 'Baseline_profession_LangLoc3.csv',
 'Baseline_profession_LangLoc4.csv']

# Data Augmentation  -  Professions to Language_Location

In [None]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input
from sklearn.decomposition import PCA
from glob import glob
import cv2
from sklearn.metrics.pairwise import euclidean_distances
from numpy.linalg import norm
import random


import numpy as np
import torch
from PIL import Image
import cv2
from imgaug import augmenters as iaa
from PIL import Image, ImageEnhance, ImageOps
import albumentations as A

import warnings
warnings.filterwarnings('ignore')

import random
import numpy as np
from PIL import Image, ImageEnhance, ImageOps

def apply_random_augmentation(image, num_transforms=3, magnitude=7):
    """
    Apply random augmentations to an image.

    Args:
        image (PIL.Image.Image): Input image.
        num_transforms (int): Number of random transformations to apply.
        magnitude (int): Magnitude of the transformations.

    Returns:
        PIL.Image.Image: Augmented image.
    """
    augmentations = [
        ('flip_lr', lambda img: img.transpose(Image.FLIP_LEFT_RIGHT)),
        ('flip_ud', lambda img: img.transpose(Image.FLIP_TOP_BOTTOM)),
        ('rotate', lambda img: img.rotate(random.uniform(-magnitude*3, magnitude*3))),
        ('brightness', lambda img: ImageEnhance.Brightness(img).enhance(random.uniform(0.5, 1.5))),
        ('contrast', lambda img: ImageEnhance.Contrast(img).enhance(random.uniform(0.5, 1.5))),
        ('color', lambda img: ImageEnhance.Color(img).enhance(random.uniform(0.5, 1.5))),
        ('sharpness', lambda img: ImageEnhance.Sharpness(img).enhance(random.uniform(0.5, 2.0))),
        ('autocontrast', lambda img: ImageOps.autocontrast(img)),
        ('solarize', lambda img: ImageOps.solarize(img, threshold=random.uniform(128 - magnitude * 10, 128))),
        ('invert', lambda img: ImageOps.invert(img)),
    ]

    # Randomly select augmentations
    selected_augmentations = random.sample(augmentations, num_transforms)

    # Apply augmentations
    for name, aug_fn in selected_augmentations:
        image = aug_fn(image)

    return image

def augment_batch(images, num_transforms=3, magnitude=7):
    """
    Apply random augmentations to a batch of images.

    Args:
        images (numpy.ndarray): Array of images (H, W, C).
        num_transforms (int): Number of random transformations to apply per image.
        magnitude (int): Magnitude of the transformations.

    Returns:
        numpy.ndarray: Array of augmented images.
    """
    augmented_images = []
    for img_array in images:
        img_pil = Image.fromarray(img_array.astype('uint8'))  # Convert numpy array to PIL Image
        augmented_img_pil = apply_random_augmentation(img_pil, num_transforms, magnitude)
        augmented_images.append(np.array(augmented_img_pil))  # Convert back to numpy array

    return np.array(augmented_images)

# Example usage:
# Assume images is a numpy array of shape (batch_size, height, width, channels)
# augmented_images = augment_batch(images, num_transforms=3, magnitude=7)


def KeepOriginalAug(img):


  ori_h1,ori_w1,c = img.shape
  img  = cv2.resize(img, (512, 512))
  h1,w1,c = img.shape
  mask_zero = np.zeros((h1,w1, c), img.dtype)


  beta=1
  lam = np.random.beta(beta, beta)
  x1, y1,x2,y2 = saliency_bbox_return(img, lam)


  # if no saliency detected, then pick the next image
  if x1==x2 or y1==y2:
      return img



  mask_zero[x1: x2, y1: y2, :] = img[x1: x2, y1: y2,:]


  sizes= [(w1-((w1-y2)+(y2-y1)), h1-((h1-x2)+(x2-x1))),
  (w1-((w1-y2)+(y2-y1)), x2-x1),
  (w1-((w1-y2)+(y2-y1)), h1-x2),
  (y2-y1, h1-((h1-x2)+(x2-x1))),
  (y2-y1, h1-x2),
  (w1-y2, h1-((h1-x2)+(x2-x1))),
  (w1-y2, x2-x1),
  (w1-y2, h1-x2)
  ]
  areas=[]
  for sz in sizes:
      areas.append(sz[0]*sz[1])

  areas= np.array(areas)
  sizes = np.array(sizes)


  big  = 512
  #  size with its bound box (x1,x2,y1,y2)
  boxes = [[0,x1,0,y1],
              [x1,x2,0,y1],
              [x2,big,0,y1],
              [0,x1,y1,y2],
              [x2,big,y1,y2],
              [0,x1,y2,big],
              [x1,x2,y2,big],
              [x2,big,y2,big]
              ]
  boxes = np.array(boxes)



  idx = np.random.choice(list(range(len(areas[areas!=0]))))
  h2, w2 = sizes[areas!=0][idx]
  x12, x22, y12,y22= boxes[areas!=0][idx]

  resized  = cv2.resize(mask_zero[x1: x2, y1: y2, : ], (h2,w2))

  img = augment_batch(img.copy())
  img = np.reshape(img, (img.shape[0],img.shape[1],img.shape[2]))
  resized = augment_batch(resized.copy())
  # print(resized.shape)
  # resized = np.reshape(resized, (resized.shape[:-1]))
  img[x12: x22, y12: y22] = resized
  # augment(resized.copy())
  # self.tensor(self.auto(self.pil(resized)))

  return cv2.resize(img, (ori_h1,ori_w1))

def saliency_bbox_return(img, lam):
    size = img.shape
    W = size[1]
    H = size[0]
    cut_rat = np.sqrt(1. - lam)
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)

    # initialize OpenCV's static fine grained saliency detector and
    # compute the saliency map
    temp_img = img.copy()

    saliency = cv2.saliency.StaticSaliencyFineGrained_create()

    (success, saliencyMap) = saliency.computeSaliency(temp_img)
    saliencyMap = (saliencyMap * 255).astype("uint8")

    maximum_indices = np.unravel_index(np.argmax(saliencyMap, axis=None), saliencyMap.shape)
    x = maximum_indices[0]
    y = maximum_indices[1]

    bbx1 = np.clip(x - cut_w // 2, 0, W)
    bby1 = np.clip(y - cut_h // 2, 0, H)
    bbx2 = np.clip(x + cut_w // 2, 0, W)
    bby2 = np.clip(y + cut_h // 2, 0, H)

    return bbx1, bby1, bbx2, bby2





# def augment(images):
#     # Input to `augment()` is a TensorFlow tensor which
#     # is not supported by `imgaug`. This is why we first
#     # convert it to its `numpy` variant.
#     rand_aug = iaa.RandAugment(n=3, m=7)
#     images = np.reshape(images, (1, images.shape[0], images.shape[1], images.shape[2]))

#     return rand_aug(images=images)

def saliency_bbox(img):
    beta = 1
    lam = np.random.beta(beta, beta)
    size = img.shape
    W = size[1]
    H = size[0]
    cut_rat = np.sqrt(1. - lam)
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)

    temp_img = img.copy()
    saliency = cv2.saliency.StaticSaliencyFineGrained_create()
    (success, saliencyMap) = saliency.computeSaliency(temp_img)
    saliencyMap = (saliencyMap * 255).astype("uint8")
    maximum_indices = np.unravel_index(np.argmax(saliencyMap, axis=None), saliencyMap.shape)
    x = maximum_indices[0]
    y = maximum_indices[1]

    bbx1 = np.clip(x - cut_w // 2, 0, W)
    bby1 = np.clip(y - cut_h // 2, 0, H)
    bbx2 = np.clip(x + cut_w // 2, 0, W)
    bby2 = np.clip(y + cut_h // 2, 0, H)
    x1, y1, x2, y2 = bbx1, bby1, bbx2, bby2
    return img[x1: x2, y1: y2, :]


# Load VGG16 for feature extraction
model = VGG16(weights="imagenet", include_top=False)

def extract_features(img_path):
    img = image.load_img(img_path, target_size=(650, 500))
    r= np.random.uniform()
    x= KeepOriginalAug(image.img_to_array(img).astype('uint8'))
    # x= saliency_bbox(image.img_to_array(img).astype('uint8'))
    x= saliency_bbox(x)
    # Check if the saliency_bbox returned a valid region
    if x is None or x.size == 0:
        print(f"Warning: No valid saliency region found for {img_path}. Using original image.")
        x = image.img_to_array(img)  # Use the original image if saliency fails

    x = cv2.resize(x, (500, 650))
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return model.predict(x)

def reduce_dimensions(features):
    pca = PCA()
    features = features.reshape(20, 15 * 512)
    pca.fit(features)
    features_trans = pca.transform(features)
    return np.squeeze(features_trans.reshape(1, 400))

# Saliency-Based Diversity with Fairness integration (with normalization)
def saliency_based_diversity(X):
    pairwise_distances = euclidean_distances(X)
    N = X.shape[0]
    diversity = np.sum(pairwise_distances) / (N * (N - 1))

    # Normalization step: divide by max possible distance
    max_distance = np.sqrt(np.sum((np.max(X, axis=0) - np.min(X, axis=0)) ** 2))
    normalized_diversity = diversity / max_distance if max_distance > 0 else diversity
    return normalized_diversity

# # Normalizing group contributions based on their size
# def calculate_normalized_within_group_diversity(groups, saliency_features):
#     unique_groups, group_counts = np.unique(groups, return_counts=True)
#     total_samples = len(groups)
#     group_diversities = {}

#     for group, count in zip(unique_groups, group_counts):
#         group_indices = np.where(groups == group)[0]
#         group_saliency_features = saliency_features[group_indices]
#         diversity = saliency_based_diversity(group_saliency_features)

#         # Normalize by the proportion of the group in the dataset
#         group_diversities[group] = (count / total_samples) * diversity

#     return group_diversities

# def calculate_inter_group_diversity(groups, saliency_features):
#     unique_groups = np.unique(groups)
#     inter_group_dists = []

#     for i in range(len(unique_groups)):
#         for j in range(i + 1, len(unique_groups)):
#             group_i_indices = np.where(groups == unique_groups[i])[0]
#             group_j_indices = np.where(groups == unique_groups[j])[0]
#             group_i_features = saliency_features[group_i_indices]
#             group_j_features = saliency_features[group_j_indices]
#             pairwise_distances = euclidean_distances(group_i_features, group_j_features)

#             # Normalize distances between 0 and 1
#             norm_distances = (pairwise_distances - np.min(pairwise_distances)) / np.ptp(pairwise_distances)
#             inter_group_dists.append(np.mean(norm_distances))

#     return np.mean(inter_group_dists) if inter_group_dists else 0.0

# def combined_fairness_diversity_metrics(groups, saliency_features, alpha=0.5, beta=0.5):
#     group_diversities = calculate_normalized_within_group_diversity(groups, saliency_features)
#     within_group_avg_diversity = np.sum(list(group_diversities.values()))  # Sum of normalized diversities
#     inter_group_avg_diversity = calculate_inter_group_diversity(groups, saliency_features)
#     combined_metric = alpha * within_group_avg_diversity + beta * inter_group_avg_diversity
#     return combined_metric, within_group_avg_diversity, inter_group_avg_diversity

# def normalize_features(features):
#     return features / np.linalg.norm(features, axis=1, keepdims=True)

# def saliency_fairness_diversity_metric(image_set, groups):
#     feat_list = []
#     for img in image_set:
#         feat_list.append(reduce_dimensions(extract_features(img)))

#     # Normalize features before computing the diversity metric
#     normalized_features = normalize_features(np.array(feat_list))
#     return combined_fairness_diversity_metrics(groups, normalized_features, alpha=0.5, beta=0.5)

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def calculate_normalized_within_group_diversity(groups, saliency_features):
    unique_groups = np.unique(groups)
    group_diversities = {}

    for group in unique_groups:
        group_indices = np.where(groups == group)[0]
        group_features = saliency_features[group_indices]
        pairwise_distances = euclidean_distances(group_features)

        # Normalize distances between 0 and 1
        norm_distances = (pairwise_distances - np.min(pairwise_distances)) / np.ptp(pairwise_distances)
        group_diversities[group] = np.mean(norm_distances[np.triu_indices(len(group_features), k=1)]) if len(group_features) > 1 else 0

    return group_diversities

# def calculate_inter_group_diversity(groups, saliency_features):
#     unique_groups = np.unique(groups)
#     inter_group_dists = []

#     # Get the number of samples per group to handle imbalance
#     group_sizes = {group: np.sum(groups == group) for group in unique_groups}

#     for i in range(len(unique_groups)):
#         for j in range(i + 1, len(unique_groups)):
#             group_i_indices = np.where(groups == unique_groups[i])[0]
#             group_j_indices = np.where(groups == unique_groups[j])[0]
#             group_i_features = saliency_features[group_i_indices]
#             group_j_features = saliency_features[group_j_indices]
#             pairwise_distances = euclidean_distances(group_i_features, group_j_features)

#             # Normalize distances between 0 and 1
#             norm_distances = (pairwise_distances - np.min(pairwise_distances)) / np.ptp(pairwise_distances)
#             weighted_dist = np.mean(norm_distances) * group_sizes[unique_groups[i]] * group_sizes[unique_groups[j]]
#             inter_group_dists.append(weighted_dist)

#     return np.sum(inter_group_dists) / (sum(group_sizes.values()) ** 2 - sum(group_sizes.values())) if inter_group_dists else 0.0

def calculate_inter_group_diversity(groups, saliency_features):
    unique_groups = np.unique(groups)
    inter_group_dists = []

    # Get the number of samples per group to handle imbalance
    group_sizes = {group: np.sum(groups == group) for group in unique_groups}

    for i in range(len(unique_groups)):
        for j in range(i + 1, len(unique_groups)):
            group_i_indices = np.where(groups == unique_groups[i])[0]
            group_j_indices = np.where(groups == unique_groups[j])[0]
            group_i_features = saliency_features[group_i_indices]
            group_j_features = saliency_features[group_j_indices]


            pairwise_distances = euclidean_distances(group_i_features, group_j_features)

            weighted_dist = np.mean(pairwise_distances) * group_sizes[unique_groups[i]] * group_sizes[unique_groups[j]]
            inter_group_dists.append(weighted_dist)

    denominator = (sum(group_sizes.values()) ** 2 - sum(group_sizes.values()))
    return np.sum(inter_group_dists) / denominator if denominator > 0 else 0.0


def combined_fairness_diversity_metrics(groups, saliency_features, alpha=0.5, beta=0.5):
    # Calculate group sizes for weighting within-group diversity
    group_sizes = {group: np.sum(groups == group) for group in np.unique(groups)}
    total_samples = sum(group_sizes.values())

    # Calculate within-group diversity, weighted by group size
    group_diversities = calculate_normalized_within_group_diversity(groups, saliency_features)
    within_group_avg_diversity = np.sum([group_sizes[group] * group_diversities[group] for group in group_diversities]) / total_samples

    # Calculate inter-group diversity with class imbalance weighting
    inter_group_avg_diversity = calculate_inter_group_diversity(groups, saliency_features)

    # Combined metric using weighted within-group and inter-group diversities
    combined_metric = alpha * within_group_avg_diversity + beta * inter_group_avg_diversity
    return combined_metric, within_group_avg_diversity, inter_group_avg_diversity

def normalize_features(features):
    return features / np.linalg.norm(features, axis=1, keepdims=True)

def saliency_fairness_diversity_metric(image_set, groups):
    feat_list = []
    for img in image_set:
        feat_list.append(reduce_dimensions(extract_features(img)))

    # Normalize features before computing the diversity metric
    normalized_features = normalize_features(np.array(feat_list))
    return combined_fairness_diversity_metrics(groups, normalized_features, alpha=0.5, beta=0.5)

# Load datasets and compute diversity and fairness
print(os.listdir('Faces'))
for run in range(5):
  combined_metrics_results = []
  dataset_names = []
  diversity = []
  fairness = []
  for profession in os.listdir('Faces'):
      if '.' not in profession:
        groups =[]
        datasets = []
        count=0
        for LangLoc in os.listdir('Faces/' + profession):

          dataset_ = [f for f in glob(f'Faces/' + profession+'/'+LangLoc+'/*')]
          datasets = datasets+ dataset_


          group_ = [count] * len(dataset_)
          groups = groups+group_
          count+=1

        combined_metric, within_group_avg_diversity, inter_group_avg_diversity = saliency_fairness_diversity_metric(datasets, groups)
        combined_metrics_results.append(combined_metric)
        dataset_names.append(profession)
        diversity.append(within_group_avg_diversity)
        fairness.append(inter_group_avg_diversity)

  result_df = pd.DataFrame(list(zip(dataset_names, diversity, fairness, combined_metrics_results)),
                            columns=['Profession', 'WithinGroupDiversity', 'InterGroupDiversity', 'CombinedFairnessDiversityScore'])
  result_df.to_csv('FaceResults/' + "Augmentation_profession_LangLoc"+str(run)+".csv")


  check_for_updates()


['Nurse', 'Engineer', 'Politician', 'CEO', 'School Teacher']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 303ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[

In [None]:

import os
import pandas as pd

# Initialize lists to store DataFrames for all runs
results = []

# Loop through each run to read metrics from existing CSVs for Baseline and Augmentation
for run in range(5):
    # Define the path to the CSV file for the current run (Baseline)
    csv_file_path_baseline = f'FaceResults/Baseline_profession_LangLoc{run}.csv'
    # Define the path to the CSV file for the current run (Augmentation)
    csv_file_path_aug = f'FaceResults/Augmentation_profession_LangLoc{run}.csv'

    # Read the Baseline CSV file into a DataFrame
    if os.path.exists(csv_file_path_baseline):
        result_df_baseline = pd.read_csv(csv_file_path_baseline)

    # Read the Augmentation CSV file into a DataFrame
    if os.path.exists(csv_file_path_aug):
        result_df_aug = pd.read_csv(csv_file_path_aug)

    # Combine metrics from Baseline and Augmentation
    if os.path.exists(csv_file_path_baseline) and os.path.exists(csv_file_path_aug):
        # Merge the two DataFrames on the 'Profession' column
        combined_df = pd.merge(result_df_baseline, result_df_aug, on='Profession', suffixes=('_baseline', '_aug'))

        # Append to the results list
        results.append(combined_df)

# Combine results from all runs into a single DataFrame
all_results_df = pd.concat(results, ignore_index=True)

# Group by Profession and calculate mean and std for the metrics for both Baseline and Augmentation
summary_df = all_results_df.groupby('Profession').agg({
    'WithinGroupDiversity_baseline': ['mean', 'std'],
    'InterGroupDiversity_baseline': ['mean', 'std'],
    'CombinedFairnessDiversityScore_baseline': ['mean', 'std'],
    'WithinGroupDiversity_aug': ['mean', 'std'],
    'InterGroupDiversity_aug': ['mean', 'std'],
    'CombinedFairnessDiversityScore_aug': ['mean', 'std']
}).reset_index()

# Create a new DataFrame to hold the formatted results
final_summary_df = pd.DataFrame()

# Format mean ± std for each metric
final_summary_df['Profession'] = summary_df['Profession']
final_summary_df['WithinGroupDiversity (Baseline)'] = summary_df.apply(
    lambda row: f"{row['WithinGroupDiversity_baseline']['mean']:.2f} \pm {row['WithinGroupDiversity_baseline']['std']:.2f}", axis=1
)
final_summary_df['InterGroupDiversity (Baseline)'] = summary_df.apply(
    lambda row: f"{row['InterGroupDiversity_baseline']['mean']:.2f} \pm {row['InterGroupDiversity_baseline']['std']:.2f}", axis=1
)
final_summary_df['CombinedFairnessDiversityScore (Baseline)'] = summary_df.apply(
    lambda row: f"{row['CombinedFairnessDiversityScore_baseline']['mean']:.2f} \pm {row['CombinedFairnessDiversityScore_baseline']['std']:.2f}", axis=1
)
final_summary_df['WithinGroupDiversity (Augmentation)'] = summary_df.apply(
    lambda row: f"{row['WithinGroupDiversity_aug']['mean']:.2f} \pm {row['WithinGroupDiversity_aug']['std']:.2f}", axis=1
)
final_summary_df['InterGroupDiversity (Augmentation)'] = summary_df.apply(
    lambda row: f"{row['InterGroupDiversity_aug']['mean']:.2f} \pm {row['InterGroupDiversity_aug']['std']:.2f}", axis=1
)
final_summary_df['CombinedFairnessDiversityScore (Augmentation)'] = summary_df.apply(
    lambda row: f"{row['CombinedFairnessDiversityScore_aug']['mean']:.2f} \pm {row['CombinedFairnessDiversityScore_aug']['std']:.2f}", axis=1
)

# Create LaTeX table format
latex_table = r"""
\begin{table*}
\centering
\caption{Gender Diversity and Fairness Metrics for Different Professions across Language Location pairs}
\label{tab:diversity_fairness_professions}
\begin{tabular}{lccc|ccc}
\toprule
Profession &
\multicolumn{3}{c|}{Baseline} & \multicolumn{3}{c}{With FaceKeepOriginalAugmentaiton} \\
\cmidrule(lr){2-4} \cmidrule(lr){5-7}
 & $D_{within}$ & $D_{inter}$ & $M_{\text{fairness-diversity}}$ & $D_{within}$ & $D_{inter}$ & $M_{\text{fairness-diversity}}$ \\
 \midrule
"""

# Append data rows
for index, row in final_summary_df.iterrows():
    latex_table += f"{row['Profession']} & {row['WithinGroupDiversity (Baseline)']} & {row['InterGroupDiversity (Baseline)']} & {row['CombinedFairnessDiversityScore (Baseline)']} & {row['WithinGroupDiversity (Augmentation)']} & {row['InterGroupDiversity (Augmentation)']} & {row['CombinedFairnessDiversityScore (Augmentation)']} \\\\\n"

latex_table += r"""\bottomrule
\end{tabular}
\end{table*}
"""

# Save the LaTeX table to a .tex file
# with open('diversity_fairness_professions.tex', 'w') as f:
#     f.write(latex_table)

print("LaTeX table saved successfully.")
print(latex_table)

LaTeX table saved successfully.

\begin{table*}
\centering
\caption{Gender Diversity and Fairness Metrics for Different Professions across Language Location pairs}
\label{tab:diversity_fairness_professions}
\begin{tabular}{lccc|ccc}
\toprule
Profession &
\multicolumn{3}{c|}{Baseline} & \multicolumn{3}{c}{With FaceKeepOriginalAugmentaiton} \\
\cmidrule(lr){2-4} \cmidrule(lr){5-7}
 & $D_{within}$ & $D_{inter}$ & $M_{\text{fairness-diversity}}$ & $D_{within}$ & $D_{inter}$ & $M_{\text{fairness-diversity}}$ \\
 \midrule
CEO & 0.83 \pm 0.01 & 0.61 \pm 0.00 & 0.72 \pm 0.00 & 0.86 \pm 0.00 & 0.63 \pm 0.00 & 0.74 \pm 0.00 \\
Engineer & 0.83 \pm 0.01 & 0.62 \pm 0.00 & 0.73 \pm 0.00 & 0.85 \pm 0.00 & 0.63 \pm 0.00 & 0.74 \pm 0.00 \\
Nurse & 0.82 \pm 0.01 & 0.61 \pm 0.00 & 0.72 \pm 0.01 & 0.86 \pm 0.00 & 0.63 \pm 0.00 & 0.74 \pm 0.00 \\
Politician & 0.82 \pm 0.00 & 0.61 \pm 0.00 & 0.72 \pm 0.00 & 0.86 \pm 0.00 & 0.62 \pm 0.00 & 0.74 \pm 0.00 \\
School Teacher & 0.83 \pm 0.00 & 0.62 \pm 0.00 & 0.7