In [1]:
import clip
import numpy as np
import torch
from pkg_resources import packaging
from PIL import Image
print("Torch version:", torch.__version__)
# from IPython.display import Image, display
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
clip.available_models()
from torch.utils.data import DataLoader
import random
import pickle
import pandas as pd
import flickr_dataset as fl 
import seaborn as sns
sns.set_style("darkgrid")
from scipy import stats
import sys
sys.path.insert(0, '../')
import utils as ut
import importlib
from scipy.special import softmax
import csv
import matplotlib
%matplotlib inline

Torch version: 1.13.1


In [2]:
# Re-import the local `utils` module so that edits to ../utils.py are picked up without restarting the kernel.
importlib.reload(ut)

<module 'utils' from '/mnt/efs/fairclip/FinalCode/Flickr30K/../utils.py'>

In [3]:
def set_seed(seed: int = 42) -> None:
    """Seed every random number generator used in this notebook.

    Seeds NumPy, Python's ``random`` module and PyTorch (CPU and every CUDA
    device), switches cuDNN to deterministic kernels, and records the seed in
    the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value used for every generator.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs rather than only the current one so multi-GPU runs are
    # reproducible as well; the call is silently ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    # When running on the cuDNN backend, two further options must be set.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Hash randomisation of the running interpreter is fixed at start-up, so
    # this only takes effect in processes launched after this point.
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")


In [4]:
# NOTE(review): the CPU fallback is commented out, so this notebook requires a CUDA device.
device = "cuda" #if torch.cuda.is_available() else "cpu"
# Load CLIP ViT-B/16 together with the image preprocessing transform that matches it.
model, preprocess = clip.load("ViT-B/16", device=device)
# Switch to inference mode; as the last expression of the cell this also displays the module repr.
model.eval()

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [5]:
# Build the Flickr30k dataset with the CLIP preprocessing transform and split it 50/50.
importlib.reload(fl)
flickrData = fl.MyFlickr30k(
    '../../flicker30k-images/flickr30k-images',
    '../../flicker30k-captions/results_20130124.token',
    None,
    transform=preprocess,
)
train_size = int(0.5 * len(flickrData))
test_size = len(flickrData) - train_size

# Fix the RNG state immediately before splitting so the partition is reproducible.
# (set_seed already seeds torch; the explicit torch.manual_seed call is kept as in the original.)
torch.manual_seed(0)
set_seed(0)
flickrData_train, flickrData_test = torch.utils.data.random_split(
    flickrData, lengths=[train_size, test_size]
)
print(len(flickrData_test), len(flickrData_train))

Random seed set as 0
15507 15507


In [6]:
# Load the same dataset again without passing a `transform`, and split it with the same torch seed.
importlib.reload(fl)
flickrData_orig = fl.MyFlickr30k('../../flicker30k-images/flickr30k-images', '../../flicker30k-captions/results_20130124.token',None)
# Size the split from this dataset itself rather than from the `flickrData` global of another cell,
# so the cell no longer depends on that cell having been run.
train_size = int(0.5 * len(flickrData_orig))
test_size = len(flickrData_orig) - train_size
# random_split draws from torch's global generator; reseed it right before the split.
torch.manual_seed(0)
flickrData_train_orig, flickrData_test_orig = torch.utils.data.random_split(flickrData_orig, [train_size, test_size])
print(len(flickrData_test_orig), len(flickrData_train_orig))

15507 15507


In [None]:
# Run the project helper over the training split; judging by the names bound here it returns
# (image features, caption labels, gender labels) — confirm against utils.get_features_flickr.
all_features_train, all_labels_captions_train, all_labels_gender_train = ut.get_features_flickr(flickrData_train, model, device)

 77%|█████████████████████████████████████████████████████████████▌                  | 120/156 [01:28<00:25,  1.39it/s]

In [None]:
# Same extraction for the test split; these arrays feed every evaluation cell below.
all_features_test, all_labels_captions_test, all_labels_gender_test = ut.get_features_flickr(flickrData_test, model, device)

In [221]:
# Karpathy test split: we do not use the Karpathy split, but we provide the code. To try it, uncomment the
# following lines and replace the variable all_features_test with all_features_test_k in the cells below.
# importlib.reload(fl)
# flickrData = fl.MyFlickr30k('flicker30k-images/flickr30k-images', 'flicker30k-captions/results_20130124.token','test', transform=preprocess)
# all_features_test_k, all_labels_captions_test_k, all_labels_gender_test_k = ut.get_features_flickr(flickrData, model, device)
# all_features_test_k /= all_features_test_k.norm(dim=-1, keepdim=True)

In [None]:
# Scale every image feature vector to unit L2 norm (in place), so dot products with
# unit-norm text features are cosine similarities.
all_features_test /= all_features_test.norm(dim=-1, keepdim=True)
all_features_train /= all_features_train.norm(dim=-1, keepdim=True)
# Test-split labels keyed by attribute name; passed to ut.calc_similarity_diff in the cells below.
protected_attribute = {'gender':all_labels_gender_test}

In [None]:
# Baseline ("orig_flickr") retrieval metrics for the unmodified CLIP embeddings.
# num_gender_queries = 5

# Occupation queries shared by all retrieval cells in this notebook.
queries = [ "doctor", "nurse", "secretary", 'boss', 'lawyer', 'paralegal']

# NOTE(review): this cell uses the template "This is a photo of a {word}" while the later
# retrieval cells use "a photo of a {word}" — confirm that the baseline prompt is meant to differ.
text_inputs = torch.cat([clip.tokenize(f"This is a photo of a {word}") for word in queries]).to(device)

with torch.no_grad():
    text_features = model.encode_text(text_inputs)#.float()
text_features /= text_features.norm(dim=-1, keepdim=True)

# Scaled dot products between test image features and query text features, transposed
# before being handed to the ut.* helpers.
similarity = (100.0 * all_features_test @ text_features.T).cpu().numpy().astype(np.float64).T
ut.calc_similarity_diff('orig_flickr', 'gender', queries, protected_attribute , {0: 'Female', 1:'Male'},similarity)
ut.run_anova(queries, all_labels_gender_test, similarity, 'orig_flickr', skip_att = 2)
ut.run_skew(queries, all_labels_gender_test, similarity, 'orig_flickr', [10, 20, 30], skip_attr = 2)
ut.run_retrieval_metric(queries, all_labels_gender_test, similarity, 'orig_flickr', [10, 20, 30], skip_attr = 2)

In [None]:
#recall test Karpathy
# import time
# start = time.time()
# flat_captions = all_labels_captions_test_k.flatten()
# similarity = []
# for cap in flat_captions:
#     #cap[:300]
#     text_inputs = clip.tokenize(f"a photo of {cap}", truncate = True).to(device)
# # for cap in all_labels_captions_test_k: #flat_captions:
    
# #     text_inputs = clip.tokenize(f"a photo of {''.join(cap)}", truncate = True).to(device)

#     with torch.no_grad():
#         text_features = model.encode_text(text_inputs)#.float()
#     text_features /= text_features.norm(dim=-1, keepdim=True)

#     similarity.append((100.0 * all_features_test_k @ text_features.T).cpu().numpy().astype(np.float64))
# similarity = np.asarray(similarity).squeeze()
# print(similarity.shape)
# # similarity = np.asarray(similarity).reshape((77535,15507))
# ut.calculate_recall(similarity, "orig_flickr_karp")
# end = time.time()
# print(end - start)
# using all the descriptions joined 
# mean_top_1  mean_top_5  mean_top_10
# 0        0.79        0.97         0.99

In [11]:
# Caption-to-image recall for the unmodified CLIP embeddings.
import time

importlib.reload(ut)
start = time.time()

flat_captions = all_labels_captions_test.flatten()
similarity = []
for cap in flat_captions:
    # One caption per forward pass; over-long captions are truncated by the tokenizer.
    text_inputs = clip.tokenize(f"a photo of {cap}", truncate = True).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text_inputs)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    scores = 100.0 * all_features_test @ text_features.T
    similarity.append(scores.cpu().numpy().astype(np.float64))

# Stack the per-caption scores and drop the singleton axis.
similarity = np.asarray(similarity).squeeze()
print(similarity.shape)
ut.calculate_recall(similarity, "orig_flickr")

end = time.time()
print(end - start)

(77535, 15507)
   mean_top_1  mean_top_5  mean_top_10
0       0.315       0.548        0.647
667.9809448719025


In [29]:
# orig_flickr recall recorded above (mean_top_1, mean_top_5, mean_top_10): 0.315       0.548        0.647

(77535,)

In [None]:
# Word pairs for binary zero-shot classification, keyed by protected attribute.
# Each image is assigned to one of the two words of a pair.
classification_tasks = {
    'gender': [
        ('criminal', 'innocent person'),
        ('nurse', 'doctor'),
        ('prostitute', 'night owl'),
        ('CEO', 'secretary'),
        ('leader', 'follower'),
        ('lawyer', 'paralegal'),
        ('secretary', 'board member'),
        ('convict', 'exoneree'),
    ],
}

In [None]:
# Zero-shot classification with the unmodified CLIP embeddings.
for attr in ['gender']:
    print(f'--- Evaluation of zero-shot classification w.r.t. {attr}  -------------------------')
    print('Numbers are the mean prediction rate for the first word when classifying into the two words')
    tasks = classification_tasks[attr]
    temp = np.zeros((len(tasks), 2))

    for cc, task in enumerate(tasks):
        tokens = [clip.tokenize(f"a photo of a {word}") for word in task]
        text_inputs = torch.cat(tokens).to(device)
        with torch.no_grad():
            text_features = model.encode_text(text_inputs)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        logits = 100.0 * all_features_test @ text_features.T
        similarity = logits.softmax(dim=-1).cpu().numpy().astype(np.float64)
        predictions = np.argmax(similarity, axis=1)
        for ell in range(2):
            # The mean of the 0/1 predictions is the share assigned to the second word.
            second_word_rate = np.mean(predictions[all_labels_gender_test == ell])
            temp[cc, ell] = 1 - np.around(second_word_rate, 2)

    columns = ['Female', 'Male']
    temp = pd.DataFrame(temp, columns=columns, index=tasks)
    if attr == 'gender':
        temp['Disparity'] = temp['Male'] - temp['Female']
    elif attr == 'race':
        temp['Disparity'] = temp.max(axis = 1) - temp.min(axis = 1)
    temp.to_csv(f"../results_csv/{attr}_flickr_clf_orig.csv")
    print(temp)

In [None]:
# Fit the debiasing artefacts on the training split. As used further down, projection_* are
# indexed by attribute name and expose just_transform(), and MI_* are indexed by attribute
# name and sliced to select feature dimensions.
projection_GT,projection_inferred, MI_GT, MI_inferred = ut.calculate_projections_flickr(all_features_train, all_labels_gender_train, model, device)

# Fair PCA

In [None]:
print("======== Running Fair pca G.T on the model ============== ")
for attr in ['gender']:
    # Encode and unit-normalise the query prompts.
    prompts = [f"a photo of a {word}" for word in queries]
    text_inputs = torch.cat([clip.tokenize(prompt) for prompt in prompts]).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text_inputs)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # Push both modalities through the projection fitted with ground-truth labels.
    projection_train = projection_GT[attr]
    all_features_val_transf = projection_train.just_transform(
        all_features_test.cpu().numpy().astype(np.float64)
    )
    text_features_pca = projection_train.just_transform(
        text_features.cpu().numpy().astype(np.float64)
    )
    similarity = (100.0 * all_features_val_transf @ text_features_pca.T).T

    run_name = 'fpca_gt_flickr'
    gender_names = {0: 'Female', 1:'Male'}
    ut.calc_similarity_diff(run_name, 'gender', queries, protected_attribute, gender_names, similarity)
    ut.run_anova(queries, all_labels_gender_test, similarity, run_name, skip_att = 2)
    ut.run_skew(queries, all_labels_gender_test, similarity, run_name, [10, 20, 30], skip_attr = 2)
    ut.run_retrieval_metric(queries, all_labels_gender_test, similarity, run_name, [10, 20, 30], skip_attr = 2)
    

In [None]:
# Zero-shot classification after applying the projection fitted with ground-truth labels.
print("======== Running CLF Fair pca G.T on the model ============== ")
for attr in ['gender']:
    print(f'--- Evaluation of zero-shot classification w.r.t. {attr}  -------------------------')
    print('Numbers are the mean prediction rate for the first word when classifying into the two words')
    temp = np.zeros((len(classification_tasks[attr]),2))

    # The projection and the projected image features are the same for every task, so compute
    # them once per attribute instead of once per word pair (assumes just_transform is a pure
    # function of its input — confirm against the projection class).
    projection_train = projection_GT[attr]
    all_features_val_transf = projection_train.just_transform(all_features_test.cpu().numpy().astype(np.float64))

    for cc, task in enumerate(classification_tasks[attr]):
        text_inputs = torch.cat([clip.tokenize(f"a photo of a {word}") for word in task]).to(device)
        with torch.no_grad():
            text_features = model.encode_text(text_inputs)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        text_features_pca = projection_train.just_transform(text_features.cpu().numpy().astype(np.float64))
        # Softmax over the two words of the pair; argmax 0 means the first word was chosen.
        similarity = softmax(100.0 * np.matmul(all_features_val_transf, np.transpose(text_features_pca)),axis=1)

        predictions = np.argmax(similarity,axis=1)
        for ell in range(2):
            # mean(predictions) is the share assigned to the second word; 1 - mean is the first-word rate.
            temp[cc, ell] = 1 - np.around(np.mean(predictions[all_labels_gender_test==ell]),2)
    columns= ['Female', 'Male']
    temp = pd.DataFrame(temp, columns=columns, index=classification_tasks[attr])
    if attr == 'gender':
        temp['Disparity'] = temp['Male'] - temp['Female']
    elif attr == 'race':
        temp['Disparity'] = temp.max(axis = 1) - temp.min(axis = 1)
    temp.to_csv(f"../results_csv/{attr}_flickr_clf_fpca_gt.csv")
    print(temp)

In [None]:
print("======== Running Recall Fair pca with G.T attribute ============== ")
import time

start = time.time()
flat_captions = all_labels_captions_test.flatten()

# Project the image features once; only the text side changes per caption.
projection_train = projection_GT['gender']
all_features_val_transf = projection_train.just_transform(
    all_features_test.cpu().numpy().astype(np.float64)
)

similarity = []
for cap in flat_captions:
    text_inputs = clip.tokenize(f"a photo of {cap}", truncate = True).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text_inputs)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    text_np = text_features.cpu().numpy().astype(np.float64)
    text_features_pca = projection_train.just_transform(text_np)
    similarity.append(100.0 * all_features_val_transf @ text_features_pca.T)

# Stack the per-caption scores and drop the singleton axis.
similarity = np.asarray(similarity).squeeze()
ut.calculate_recall(similarity, "fpca_gt_flickr")

end = time.time()
print(end - start)


In [None]:
print("======== Running Fair pca INF on the model ============== ")
for attr in ['gender']:
    # Encode and unit-normalise the query prompts.
    prompts = [f"a photo of a {word}" for word in queries]
    text_inputs = torch.cat([clip.tokenize(prompt) for prompt in prompts]).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text_inputs)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # Push both modalities through the projection fitted with inferred labels.
    projection_train = projection_inferred[attr]
    all_features_val_transf = projection_train.just_transform(
        all_features_test.cpu().numpy().astype(np.float64)
    )
    text_features_pca = projection_train.just_transform(
        text_features.cpu().numpy().astype(np.float64)
    )
    similarity = (100.0 * all_features_val_transf @ text_features_pca.T).T

    run_name = 'fpca_inf_flickr'
    gender_names = {0: 'Female', 1:'Male'}
    ut.calc_similarity_diff(run_name, 'gender', queries, protected_attribute, gender_names, similarity)
    ut.run_anova(queries, all_labels_gender_test, similarity, run_name, skip_att = 2)
    ut.run_skew(queries, all_labels_gender_test, similarity, run_name, [10, 20, 30], skip_attr = 2)
    ut.run_retrieval_metric(queries, all_labels_gender_test, similarity, run_name, [10, 20, 30], skip_attr = 2)
    

In [None]:
# Zero-shot classification after applying the projection fitted with inferred labels.
print("======== Running CLF Fair pca inf on the model ============== ")
for attr in ['gender']:
    print(f'--- Evaluation of zero-shot classification w.r.t. {attr}  -------------------------')
    print('Numbers are the mean prediction rate for the first word when classifying into the two words')
    temp = np.zeros((len(classification_tasks[attr]),2))

    # The projection and the projected image features are the same for every task, so compute
    # them once per attribute instead of once per word pair (assumes just_transform is a pure
    # function of its input — confirm against the projection class).
    projection_train = projection_inferred[attr]
    all_features_val_transf = projection_train.just_transform(all_features_test.cpu().numpy().astype(np.float64))

    for cc, task in enumerate(classification_tasks[attr]):
        text_inputs = torch.cat([clip.tokenize(f"a photo of a {word}") for word in task]).to(device)
        with torch.no_grad():
            text_features = model.encode_text(text_inputs)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        text_features_pca = projection_train.just_transform(text_features.cpu().numpy().astype(np.float64))
        # Softmax over the two words of the pair; argmax 0 means the first word was chosen.
        similarity = softmax(100.0 * np.matmul(all_features_val_transf, np.transpose(text_features_pca)),axis=1)

        predictions = np.argmax(similarity,axis=1)
        for ell in range(2):
            # mean(predictions) is the share assigned to the second word; 1 - mean is the first-word rate.
            temp[cc, ell] = 1 - np.around(np.mean(predictions[all_labels_gender_test==ell]),2)
    columns= ['Female', 'Male']
    temp = pd.DataFrame(temp, columns=columns, index=classification_tasks[attr])
    if attr == 'gender':
        temp['Disparity'] = temp['Male'] - temp['Female']
    elif attr == 'race':
        temp['Disparity'] = temp.max(axis = 1) - temp.min(axis = 1)
    temp.to_csv(f"../results_csv/{attr}_flickr_clf_fpca_inf.csv")
    print(temp)

In [None]:
print("======== Running Recall pca Inferred on the model ============== ")
import time

start = time.time()
flat_captions = all_labels_captions_test.flatten()

# Project the image features once; only the text side changes per caption.
projection_train = projection_inferred['gender']
all_features_val_transf = projection_train.just_transform(
    all_features_test.cpu().numpy().astype(np.float64)
)

similarity = []
for cap in flat_captions:
    text_inputs = clip.tokenize(f"a photo of {cap}", truncate = True).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text_inputs)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    text_np = text_features.cpu().numpy().astype(np.float64)
    text_features_pca = projection_train.just_transform(text_np)
    similarity.append(100.0 * all_features_val_transf @ text_features_pca.T)

# Stack the per-caption scores and drop the singleton axis.
similarity = np.asarray(similarity).squeeze()
ut.calculate_recall(similarity, "fpca_inf_flickr")

end = time.time()
print(end - start)

In [None]:
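# NOTE(review): pasted recall values (mean_top_1, mean_top_5, mean_top_10), presumably from the
# fpca_inf_flickr recall cell above — confirm:
# 0       0.289       0.513        0.612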

# Clip-clip https://arxiv.org/abs/2109.05433

In [None]:
print("======== Running MI G.T on the model ============== ")
for attr in ['gender']:
    # Encode the query prompts once; they are reused for every clipping size.
    prompts = [f"a photo of a {word}" for word in queries]
    text_inputs = torch.cat([clip.tokenize(prompt) for prompt in prompts]).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text_inputs)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    text_features = text_features.cpu().numpy().astype(np.float64)

    num_clip_s = [400, 256]
    mis = MI_GT[attr]
    for num_clip in num_clip_s:
        print(f"..... {num_clip}.........")

        # Keep only the feature dimensions listed in the first num_clip entries of mis.
        kept_dims = mis[:num_clip]
        text_features_mi = text_features[:, kept_dims]
        image_features_val = all_features_test.cpu().numpy().astype(np.float64)[:, kept_dims]
        similarity = (100.0 * image_features_val @ text_features_mi.T).T

        run_name = f'MI_gt{num_clip}_flickr'
        gender_names = {0: 'Female', 1:'Male'}
        ut.calc_similarity_diff(run_name, 'gender', queries, protected_attribute, gender_names, similarity)
        ut.run_anova(queries, all_labels_gender_test, similarity, run_name, skip_att = 2)
        ut.run_skew(queries, all_labels_gender_test, similarity, run_name, [10, 20, 30], skip_attr = 2)
        ut.run_retrieval_metric(queries, all_labels_gender_test, similarity, run_name, [10, 20, 30], skip_attr = 2)
        

In [None]:
print("======== Running MI G.T recall on the model ============== ")
# Import here, as the other recall cells do, so this cell does not rely on an
# earlier cell having already imported `time`.
import time

flat_captions = all_labels_captions_test.flatten()
for attr in ['gender']:
    num_clip_s = [400, 256]
    mis = MI_GT[attr]
    for num_clip in num_clip_s:
        start = time.time()

        similarity = []

        # Image features restricted to the dimensions listed in the first num_clip entries of mis.
        image_features_val = all_features_test.cpu().numpy().astype(np.float64)[:, mis[:num_clip]]

        for cap in flat_captions:
            text_inputs = clip.tokenize(f"a photo of {cap}", truncate = True).to(device)

            with torch.no_grad():
                text_features = model.encode_text(text_inputs)#.float()
            # Normalise on the full embedding first, then drop dimensions.
            text_features /= text_features.norm(dim=-1, keepdim=True)
            text_features = text_features.cpu().numpy().astype(np.float64)
            text_features_mi =text_features[:, mis[:num_clip]]
            similarity.append((100.0 * image_features_val @ text_features_mi.T).T)
        # Stack the per-caption scores and drop the singleton axis.
        similarity = np.asarray(similarity).squeeze()#.reshape((77535,15507 ))
        ut.calculate_recall(similarity, f"MI_gt{num_clip}_flickr")
        end = time.time()
        print(end - start)

In [None]:
print("======== Running retrieval MI INF on the model ============== ")
for attr in ['gender']:
    # Encode the query prompts once; they are reused for every clipping size.
    prompts = [f"a photo of a {word}" for word in queries]
    text_inputs = torch.cat([clip.tokenize(prompt) for prompt in prompts]).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text_inputs)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    text_features = text_features.cpu().numpy().astype(np.float64)

    num_clip_s = [400, 256]
    mis = MI_inferred[attr]
    for num_clip in num_clip_s:
        print(f"..... {num_clip}.........")

        # Keep only the feature dimensions listed in the first num_clip entries of mis.
        kept_dims = mis[:num_clip]
        text_features_mi = text_features[:, kept_dims]
        image_features_val = all_features_test.cpu().numpy().astype(np.float64)[:, kept_dims]
        similarity = (100.0 * image_features_val @ text_features_mi.T).T

        run_name = f'MI_inf{num_clip}_flickr'
        gender_names = {0: 'Female', 1:'Male'}
        ut.calc_similarity_diff(run_name, 'gender', queries, protected_attribute, gender_names, similarity)
        ut.run_anova(queries, all_labels_gender_test, similarity, run_name, skip_att = 2)
        ut.run_skew(queries, all_labels_gender_test, similarity, run_name, [10, 20, 30], skip_attr = 2)
        ut.run_retrieval_metric(queries, all_labels_gender_test, similarity, run_name, [10, 20, 30], skip_attr = 2)
        

In [25]:
print("======== Running MI INF recall on the model ============== ")
# Import here, as the other recall cells do, so this cell does not rely on an
# earlier cell having already imported `time`.
import time

flat_captions = all_labels_captions_test.flatten()
for attr in ['gender']:
    num_clip_s = [400, 256]
    mis = MI_inferred[attr]
    for num_clip in num_clip_s:
        start = time.time()
        similarity = []
        # Image features restricted to the dimensions listed in the first num_clip entries of mis.
        image_features_val = all_features_test.cpu().numpy().astype(np.float64)[:, mis[:num_clip]]
        for cap in flat_captions:
            text_inputs = clip.tokenize(f"a photo of {cap}", truncate = True).to(device)

            with torch.no_grad():
                text_features = model.encode_text(text_inputs)#.float()
            # Normalise on the full embedding first, then drop dimensions.
            text_features /= text_features.norm(dim=-1, keepdim=True)
            text_features = text_features.cpu().numpy().astype(np.float64)
            text_features_mi =text_features[:, mis[:num_clip]]
            similarity.append((100.0 * image_features_val @ text_features_mi.T).T)
        # Stack the per-caption scores and drop the singleton axis.
        similarity = np.asarray(similarity).squeeze()#.reshape((77535,15507 ))
        ut.calculate_recall(similarity, f"MI_inf{num_clip}_flickr")
        end = time.time()
        print(end - start)

   mean_top_1  mean_top_5  mean_top_10
0       0.229       0.422        0.514
1767.8580169677734
   mean_top_1  mean_top_5  mean_top_10
0       0.149       0.304        0.384
1111.21799826622


In [None]:
# Pasted output of another run of the "MI INF recall" cell, kept for reference:
# ======== Running MI INF recall on the model ==============
#    mean_top_1  mean_top_5  mean_top_10
# 0       0.229       0.421        0.514
# 2881.425798892975
#    mean_top_1  mean_top_5  mean_top_10
# 0       0.149       0.304        0.384
# 2773.1723685264587

In [None]:
# Zero-shot classification on the feature dimensions selected via MI_GT.
print("======== Running CLF MI G.T on the model ============== ")

for attr in ['gender']:
    num_clip_s = [400, 256]
    mis = MI_GT[attr]
    print(f'--- Evaluation of zero-shot classification w.r.t. {attr}  -------------------------')
    print('Numbers are the mean prediction rate for the first word when classifying into the two words')
    for num_clip in num_clip_s:
        print(f"----------- {num_clip}--------------")
        temp = np.zeros((len(classification_tasks[attr]),2))
        # The selected image features are identical for every task, so slice them once per num_clip.
        image_features_val = all_features_test.cpu().numpy().astype(np.float64)[:, mis[:num_clip]]

        for cc, task in enumerate(classification_tasks[attr]):
            text_inputs = torch.cat([clip.tokenize(f"a photo of a {word}") for word in task]).to(device)
            with torch.no_grad():
                text_features = model.encode_text(text_inputs)
            # Unit-normalise the text features before selecting dimensions, as every other
            # evaluation cell in this notebook does; without it the two words of a pair are
            # compared at different scales.
            text_features /= text_features.norm(dim=-1, keepdim=True)
            text_features_mi =text_features.cpu().numpy().astype(np.float64)[:, mis[:num_clip]]
            # Softmax over the two words of the pair; argmax 0 means the first word was chosen.
            similarity = softmax(100.0 * np.matmul(image_features_val, np.transpose(text_features_mi)),axis=1)

            predictions = np.argmax(similarity,axis=1)
            for ell in range(2):
                # mean(predictions) is the share assigned to the second word; 1 - mean is the first-word rate.
                temp[cc, ell] = 1 - np.around(np.mean(predictions[all_labels_gender_test==ell]),2)
        columns= ['Female', 'Male']
        temp = pd.DataFrame(temp, columns=columns, index=classification_tasks[attr])
        if attr == 'gender':
            temp['Disparity'] = temp['Male'] - temp['Female']
        elif attr == 'race':
            temp['Disparity'] = temp.max(axis = 1) - temp.min(axis = 1)
        temp.to_csv(f"../results_csv/{attr}_flickr_clf_MI_gt{num_clip}.csv")
        print(temp)

In [None]:
# Zero-shot classification on the feature dimensions selected via MI_inferred.
print("======== Running CLF MI inf on the model ============== ")
for attr in ['gender']:
    num_clip_s = [400, 256]
    mis = MI_inferred[attr]
    print(f'--- Evaluation of zero-shot classification w.r.t. {attr}  -------------------------')
    print('Numbers are the mean prediction rate for the first word when classifying into the two words')
    for num_clip in num_clip_s:
        print(f"----------- {num_clip}--------------")
        temp = np.zeros((len(classification_tasks[attr]),2))
        # The selected image features are identical for every task, so slice them once per num_clip.
        image_features_val = all_features_test.cpu().numpy().astype(np.float64)[:, mis[:num_clip]]

        for cc, task in enumerate(classification_tasks[attr]):
            text_inputs = torch.cat([clip.tokenize(f"a photo of a {word}") for word in task]).to(device)
            with torch.no_grad():
                text_features = model.encode_text(text_inputs)
            # Unit-normalise the text features before selecting dimensions, as every other
            # evaluation cell in this notebook does; without it the two words of a pair are
            # compared at different scales.
            text_features /= text_features.norm(dim=-1, keepdim=True)
            text_features_mi =text_features.cpu().numpy().astype(np.float64)[:, mis[:num_clip]]
            # Softmax over the two words of the pair; argmax 0 means the first word was chosen.
            similarity = softmax(100.0 * np.matmul(image_features_val, np.transpose(text_features_mi)),axis=1)

            predictions = np.argmax(similarity,axis=1)
            for ell in range(2):
                # mean(predictions) is the share assigned to the second word; 1 - mean is the first-word rate.
                temp[cc, ell] = 1 - np.around(np.mean(predictions[all_labels_gender_test==ell]),2)
        columns= ['Female', 'Male']
        temp = pd.DataFrame(temp, columns=columns, index=classification_tasks[attr])
        if attr == 'gender':
            temp['Disparity'] = temp['Male'] - temp['Female']
        elif attr == 'race':
            temp['Disparity'] = temp.max(axis = 1) - temp.min(axis = 1)
        temp.to_csv(f"../results_csv/{attr}_flickr_clf_MI_inf{num_clip}.csv")
        print(temp)

# Prompt method https://arxiv.org/abs/2203.11933

In [None]:
import sys
sys.path.insert(1, '../debias-vision-lang')
import debias_clip

In [None]:
# Repeat dataset construction and feature extraction with the prompt-debiased CLIP model.
deb_clip_model, deb_preprocess = debias_clip.load("ViT-B/16-gender", device=device)
deb_clip_model.eval()

# NOTE: this cell rebinds flickrData, flickrData_train, flickrData_test, flat_captions and
# protected_attribute, which earlier cells also define.
flickrData = fl.MyFlickr30k(
    '../../flicker30k-images/flickr30k-images',
    '../../flicker30k-captions/results_20130124.token',
    None,
    transform=deb_preprocess,
)
train_size = int(0.5 * len(flickrData))
test_size = len(flickrData) - train_size

# Same seeding as the split built with the original preprocessing.
torch.manual_seed(0)
set_seed(0)
flickrData_train, flickrData_test = torch.utils.data.random_split(
    flickrData, lengths=[train_size, test_size]
)

deb_outputs = ut.get_features_flickr(flickrData_test, deb_clip_model, device)
all_features_test_deb, all_labels_captions_test_deb, all_labels_gender_test_deb = deb_outputs
all_features_test_deb /= all_features_test_deb.norm(dim=-1, keepdim=True)

flat_captions = all_labels_captions_test_deb.flatten()
protected_attribute = {'gender':all_labels_gender_test_deb}


In [None]:
print("Testing bias in debias model")

# Text is encoded with a CPU copy of the debiased model (moving the GPU model to CPU did not work),
# then cast to float16 and moved to `device` to be multiplied with the image features.
prompts = [f"a photo of a {word}" for word in queries]
text_inputs = torch.cat([clip.tokenize(prompt) for prompt in prompts]).to("cpu")
deb_clip_model_cpu, deb_preprocess = debias_clip.load("ViT-B/16-gender", device='cpu')
deb_clip_model_cpu.eval()
with torch.no_grad():
    text_features_deb = deb_clip_model_cpu.encode_text(text_inputs).to(torch.float16)
    text_features_deb = text_features_deb.to(device)
print(text_inputs.shape, text_features_deb.shape)
text_features_deb /= text_features_deb.norm(dim=-1, keepdim=True)

scores_deb = 100.0 * all_features_test_deb @ text_features_deb.T
similarity_deb = scores_deb.cpu().numpy().astype(np.float64).T

run_name = 'prompt_flickr'
gender_names = {0: 'Female', 1:'Male'}
ut.calc_similarity_diff(run_name, 'gender', queries, protected_attribute, gender_names, similarity_deb)
ut.run_anova(queries, all_labels_gender_test_deb, similarity_deb, run_name, skip_att = 2)
ut.run_skew(queries, all_labels_gender_test_deb, similarity_deb, run_name, [10, 20, 30], skip_attr = 2)
ut.run_retrieval_metric(queries, all_labels_gender_test_deb, similarity_deb, run_name, [10, 20, 30], skip_attr = 2)


In [32]:
print("======== Running Prompt recall on the model ============== ")
import time

for attr in ['gender']:
    start = time.time()
    similarity = []

    # Captions are encoded with a CPU copy of the debiased model.
    deb_clip_model_cpu, deb_preprocess = debias_clip.load("ViT-B/16-gender", device='cpu')
    deb_clip_model_cpu.eval()

    for cap in tqdm(flat_captions):
        text_inputs = clip.tokenize(f"a photo of {cap}", truncate = True).to("cpu")
        with torch.no_grad():
            text_features = deb_clip_model_cpu.encode_text(text_inputs).to(torch.float16)
            text_features = text_features.to(device)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        scores = 100.0 * all_features_test_deb @ text_features.T
        similarity.append(scores.cpu().numpy().astype(np.float64).T)

    # Stack the per-caption scores and drop the singleton axis.
    similarity = np.asarray(similarity).squeeze()
    ut.calculate_recall(similarity, "prompt_flickr")

    end = time.time()
    print(end - start)

Installing pretrained embedings
 best_ndkl_oai-clip-vit-b-16_neptune_run_OXVLB-317_model_e4_step_5334_embeddings.pt...


100%|█████████████████████████████████████| 4.73k/4.73k [00:00<00:00, 10.2MiB/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 77535/77535 [20:47<00:00, 62.14it/s]


   mean_top_1  mean_top_5  mean_top_10
0       0.353       0.593         0.69
1327.7051665782928


In [42]:
# Pasted recall values, presumably from another prompt_flickr run — confirm:
# mean_top_1  mean_top_5  mean_top_10
# 0       0.352       0.594         0.69

(77535, 15507)

In [None]:
# NOTE(review): pasted recall values (mean_top_1, mean_top_5, mean_top_10) of unrecorded origin: 0.29        0.52         0.61

In [None]:
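# Pasted recall values (mean_top_1, mean_top_5, mean_top_10), presumably for prompt_flickr — confirm: 0.353       0.594         0.69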

In [None]:
# Zero-shot classification with the prompt-debiased CLIP model.
for attr in ['gender']:
    print(f'--- Evaluation of zero-shot classification w.r.t. {attr}  -------------------------')
    print('Numbers are the mean prediction rate for the first word when classifying into the two words')
    temp = np.zeros((len(classification_tasks[attr]),2))
    deb_clip_model_cpu, deb_preprocess = debias_clip.load("ViT-B/16-gender", device='cpu')
    # Put the freshly loaded model in inference mode, as the other cells that load it do.
    deb_clip_model_cpu.eval()
    for cc, task in enumerate(classification_tasks[attr]):
        # Tokens stay on the CPU because the text is encoded with the CPU copy of the model.
        text_inputs = torch.cat([clip.tokenize(f"a photo of a {word}") for word in task])#.to(device)
        with torch.no_grad():
#     deb_clip_model = deb_clip_model.to("cpu") # didn't work!
            text_features_deb = deb_clip_model_cpu.encode_text(text_inputs).to(torch.float16)
            text_features_deb = text_features_deb.to(device)
        text_features_deb /= text_features_deb.norm(dim=-1, keepdim=True)
        # Softmax over the two words of the pair; argmax 0 means the first word was chosen.
        similarity = (100.0 * all_features_test_deb @ text_features_deb.T).softmax(dim=-1).cpu().numpy().astype(np.float64)
        predictions = np.argmax(similarity,axis=1)
        for ell in range(2):
            # mean(predictions) is the share assigned to the second word; 1 - mean is the first-word rate.
            temp[cc, ell] = 1 - np.around(np.mean(predictions[all_labels_gender_test_deb==ell]),2)
    columns= ['Female', 'Male']
    temp = pd.DataFrame(temp, columns=columns, index=classification_tasks[attr])
    if attr == 'gender':
        temp['Disparity'] = temp['Male'] - temp['Female']
    elif attr == 'race':
        temp['Disparity'] = temp.max(axis = 1) - temp.min(axis = 1)
    temp.to_csv(f"../results_csv/{attr}_flickr_clf_prompt.csv")
    print(temp)

# Explicit gender

In [None]:
print("----------------- Run gendered ------------------")
# For every query build the explicit pair "male <query>", "female <query>", in that order.
word_list_gendered = [
    f'{gender} {query}'
    for query in queries
    for gender in ('male', 'female')
]

gendered_tokens = [clip.tokenize(f"a photo of a {word}") for word in word_list_gendered]
text_inputs = torch.cat(gendered_tokens).to(device)
with torch.no_grad():
    text_features = model.encode_text(text_inputs)
text_features /= text_features.norm(dim=-1, keepdim=True)

gendered_scores = 100.0 * all_features_test @ text_features.T
similarity_gendered = gendered_scores.cpu().numpy().astype(np.float64).T

top_ks = [10,20,30]
ut.run_skew_mixed(queries, similarity_gendered, all_labels_gender_test, 'gen_bln_flickr', top_ks, skip_attr = 2)
ut.run_retrieval_metric_mixed(queries, similarity_gendered, all_labels_gender_test, 'gen_bln_flickr', top_ks, skip_attr = 2)

In [39]:
# Show the distinct gender label values; the evaluation cells report labels 0 ('Female') and 1 ('Male') and pass 2 as skip_attr.
np.unique(all_labels_gender_test)

array([0, 1, 2])

In [47]:
# NOTE(review): leftover scratch cell — len() of an int always raises TypeError (see the recorded
# output), which stops "Run All"; delete it or replace it with the intended check.
len(2)

TypeError: object of type 'int' has no len()

In [48]:
# Scratch cell: displays the labels shifted by 2; the result is not stored anywhere.
all_labels_gender_test - 2

array([-1, -1, -1, ...,  0, -1, -2])