In [11]:
import clip
import torch
import numpy as np
from tqdm import tqdm
import os
from PIL import Image

In [12]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load returns the model together with its matching image preprocessing callable.
model, preprocess = clip.load("ViT-B/32", device=device)

# Display the selected device as the cell output.
device

'cpu'

In [13]:
# Candidate class names; the position in this list is the class index used later.
labels = ['black', 'white', 'asian', 'indian']

# One text prompt per label, built by appending the label to a shared prefix.
prompt_prefix = 'A photo of a person of color '
tkns = [prompt_prefix + name for name in labels]

# Tokenise all prompts and move them to the device the model runs on.
text = clip.tokenize(tkns)
text = text.to(device)

In [14]:
# Upper bound on the number of photos sent through the model per forward pass.
BATCH_SIZE = 100000

# NOTE: machine-specific absolute path; adjust it when running elsewhere.
dir_path = r'/Users/hanselblanco/Documents/4to/ML/UTKFace/UTKFace'

# Keep only regular files, in the order os.listdir returns them, so that every
# index into photo_paths refers to something that can be opened and
# ln == len(photo_paths). Sub-directories are dropped instead of merely being
# left out of the count.
photo_paths = [
    entry
    for entry in os.listdir(dir_path)
    if os.path.isfile(os.path.join(dir_path, entry))
]

# Number of files available for analysis.
ln = len(photo_paths)


In [15]:
# Per-batch probability arrays; row k of their concatenation belongs to photo_paths[k].
results = []
photos_to_analize = 20000

# Fail fast instead of silently scoring fewer photos than requested.
if photos_to_analize > len(photo_paths):
    raise ValueError(
        f'photos_to_analize={photos_to_analize} exceeds the '
        f'{len(photo_paths)} entries in photo_paths'
    )

# Walk the first `photos_to_analize` entries of photo_paths in consecutive
# chunks of at most BATCH_SIZE, so each photo is scored exactly once.
# (Previously every iteration re-processed the same first photos.)
# NOTE(review): a whole chunk is held in memory at once; lower BATCH_SIZE if
# memory is a concern.
for start in tqdm(range(0, photos_to_analize, BATCH_SIZE)):
    stop = min(start + BATCH_SIZE, photos_to_analize)

    images = []
    for file_name in photo_paths[start:stop]:
        # Close each image file as soon as it has been preprocessed.
        with Image.open(os.path.join(dir_path, file_name)) as img:
            images.append(preprocess(img))

    image_input = torch.stack(images).to(device)

    with torch.no_grad():
        # The forward pass encodes the images itself, so no separate
        # encode_image call is needed.
        logits_per_image, logits_per_text = model(image_input, text)
        # Softmax over the text prompts turns each image's logits into
        # probabilities that add up to one.
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()
        results.append(probs)

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Stack the per-batch probability arrays row-wise into one (photos x labels) array.
res = np.concatenate(results)

# Index of the most probable label for every photo.
choices = res.argmax(axis=1)

# Display how many predictions were produced.
choices.shape

(2000,)

In [None]:
def getlabel(x):
    """Return the entry of `labels` at index `x`."""
    return labels[x]


# Element-wise version of getlabel, applied to the array of predicted indices.
vgetlabel = np.vectorize(getlabel)
colors = vgetlabel(choices)

# Display the predicted label names.
colors

array(['asian', 'black', 'white', ..., 'asian', 'white', 'asian'],
      dtype='<U6')

### Selection Rate (Positive results / N)

In [None]:
# Maps the numeric race code found in each file name to a group name.
race_code = { 0 : 'white', 1 : 'black', 2 : 'asian', 3 : 'indian', 4 : 'others'}

# Groups for which a selection rate is reported; any other group is skipped.
evaluated_groups = ('white', 'black', 'asian', 'indian')
group_totals = dict.fromkeys(evaluated_groups, 0)
group_positives = dict.fromkeys(evaluated_groups, 0)

for i in range(photos_to_analize):
    # The third underscore-separated field of the file name is the race code.
    data = photo_paths[i].split('_')
    try:
        group = race_code[int(data[2])]
    except (IndexError, ValueError, KeyError):
        # The name carries no usable race code, so the photo cannot be scored.
        continue
    if group not in group_totals:
        continue
    group_totals[group] += 1
    # A "positive" is a photo whose predicted label equals its annotated group.
    if colors[i] == group:
        group_positives[group] += 1

# Selection rate = positives / group size; NaN when a group has no samples,
# instead of raising ZeroDivisionError.
selection_rates = {
    group: (group_positives[group] / group_totals[group]
            if group_totals[group] else float('nan'))
    for group in evaluated_groups
}

# Expose the per-group results under the individual names used elsewhere.
positive_whites, positive_blacks, positive_asians, positive_indians = (
    group_positives[group] for group in evaluated_groups
)
total_whites, total_blacks, total_asians, total_indians = (
    group_totals[group] for group in evaluated_groups
)
whites_sr, blacks_sr, asians_sr, indians_sr = (
    selection_rates[group] for group in evaluated_groups
)

whites_sr, blacks_sr, asians_sr, indians_sr
    

(0.9336437718277066, 0.787012987012987, 0.9480968858131488, 0.8012048192771084)

#### Disparate impact

In [None]:

# Disparate impact ratio: selection rate of the underprivileged group divided
# by the selection rate of the privileged group. The black group's rate is
# compared against the white, asian and indian groups in turn.
disp_impact_b_w, disp_impact_b_a, disp_impact_b_i = (
    blacks_sr / reference_sr
    for reference_sr in (whites_sr, asians_sr, indians_sr)
)

# Display the three ratios.
disp_impact_b_w, disp_impact_b_a, disp_impact_b_i

(0.8429478252420897, 0.8300976395866907, 0.9822868860462847)

In [None]:
# A ratio below this threshold is reported as disparate impact.
DISPARATE_IMPACT_THRESHOLD = 0.8

# Each message names the ratio that was actually tested (black group over the
# reference group); previously the asian and indian checks printed mislabeled
# comparisons.
for ratio, comparison in (
    (disp_impact_b_w, 'black group / white group'),
    (disp_impact_b_a, 'black group / asian group'),
    (disp_impact_b_i, 'black group / indian group'),
):
    if ratio < DISPARATE_IMPACT_THRESHOLD:
        print('Disparate impact present in ' + comparison)
