In [14]:
import clip
import torch
import numpy as np
from tqdm import tqdm
import os
from PIL import Image

In [15]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
device

'cpu'

In [16]:
labels = ['male', 'female']
tkns = ['A photo of a person of sex ' + label for label in labels]
text = clip.tokenize(tkns).to(device)

In [17]:
BATCH_SIZE = 100000

dir_path = r'/Users/hanselblanco/Documents/4to/ML/UTKFace/UTKFace'
ln = 0
photo_paths = os.listdir(dir_path)

for path in photo_paths:
    if os.path.isfile(os.path.join(dir_path, path)):
        ln += 1


In [18]:
results = []
photos_to_analize = 4000

for i in tqdm(range(0, ln, BATCH_SIZE)):
    images = [preprocess(Image.open(dir_path + '/' + photo_paths[j])) for j in range(photos_to_analize)]
    image_input = torch.tensor(np.stack(images)).to(device)
    with torch.no_grad():
        image_features = model.encode_image(image_input)
        logits_per_image, logits_per_text = model(image_input, text)
        # The softmax function takes the original confidence and applys a transform to make all the confidence add up to one
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()
        results.append(probs)

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
res = np.concatenate(results,axis=0)
choices = np.argmax(res,axis=1)
choices.shape

(2000,)

In [None]:
getlabel = lambda x:labels[x]
vgetlabel = np.vectorize(getlabel)
genders = vgetlabel(choices)
genders

array(['female', 'male', 'female', ..., 'female', 'male', 'female'],
      dtype='<U6')

### Selection Rate (Positive results / N)

In [None]:
gender_code = { 0 : 'male', 1 : 'female'}

positive_males, positive_females = 0, 0

total_males, total_females = 0, 0

for i in range(photos_to_analize):
    data = photo_paths[i].split('_')
    gender_number = int(data[1])
    match gender_code[gender_number]:
        case 'male':
            total_males += 1
            if genders[i] == 'male':
                positive_males += 1
        case 'female':
            total_females += 1
            if genders[i] == 'female':
                positive_females += 1
                
males_sr, females_sr = positive_males/ total_males, positive_females/ total_females

males_sr, females_sr
    

(0.9498580889309366, 0.9554612937433722)

#### Disparate impact

In [None]:

# disparate impact ratio = underprivileged group SR / privileged group SR
disp_impact = females_sr / males_sr
disp_impact

0.9941356025104031

In [None]:
if disp_impact < 0.8:
    print('Disparate impact present in female group / male group')
