In [1]:
import sys

import clip
import numpy as np
import pandas as pd
import torch
from sklearn import metrics

sys.path.append('/Users/hanselblanco/Documents/4to/ML/project/bias-project-ML')
from clip_execution import run_clip

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)

In [5]:
model.load_state_dict(torch.load("/Users/hanselblanco/Documents/4to/ML/project/bias-project-ML/debiased_clip/finetuning/pytorch/best_model/best_model_1.pt"))

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [39]:
TP = 'tp'
FP = 'fp'
FN = 'fn'
MALE = 'male'
FEMALE = 'female'
GENDER_LABELS = [MALE, FEMALE]
GENDER_TKNS = ['This is a person of' + gender_label + 'gender.' for gender_label in GENDER_LABELS ]
TEST_DF = pd.read_json('test_data_df.json')

In [None]:
df = run_clip(['gender'], [GENDER_LABELS], [GENDER_TKNS], TEST_DF, model)

In [41]:
df.sample(20)

Unnamed: 0,filename,filepath,age,gender,race,predicted_gender
3932,45_1_0_20170109221158465.jpg.chip.jpg,/Users/hanselblanco/Documents/4to/ML/project/b...,45,female,white,female
1843,9_0_0_20170110215848132.jpg.chip.jpg,/Users/hanselblanco/Documents/4to/ML/project/b...,9,male,white,male
4648,50_0_3_20170119154108905.jpg.chip.jpg,/Users/hanselblanco/Documents/4to/ML/project/b...,50,male,indian,female
3698,27_0_1_20170113000907378.jpg.chip.jpg,/Users/hanselblanco/Documents/4to/ML/project/b...,27,male,black,female
1897,9_1_0_20170109202813775.jpg.chip.jpg,/Users/hanselblanco/Documents/4to/ML/project/b...,9,female,white,female
28,25_1_4_20170103230304689.jpg.chip.jpg,/Users/hanselblanco/Documents/4to/ML/project/b...,25,female,other,male
3035,9_0_0_20170110221716630.jpg.chip.jpg,/Users/hanselblanco/Documents/4to/ML/project/b...,9,male,white,male
752,51_1_1_20170112213230359.jpg.chip.jpg,/Users/hanselblanco/Documents/4to/ML/project/b...,51,female,black,female
3510,27_0_0_20170117175613066.jpg.chip.jpg,/Users/hanselblanco/Documents/4to/ML/project/b...,27,male,white,male
3815,25_1_3_20170119171949009.jpg.chip.jpg,/Users/hanselblanco/Documents/4to/ML/project/b...,25,female,indian,male


In [42]:
k=0
for i in range(len(df)):
    if(df['gender'][i]==df['predicted_gender'][i]): k+=1
k

3710

In [43]:
y_true=df['gender'].tolist()
y_pred=df['predicted_gender'].tolist()

In [44]:
male_total=df['gender'].value_counts()[MALE]
female_total= df['gender'].value_counts()[FEMALE]

#### Balanced Accuracy

In [45]:
macro_accuracy= metrics.balanced_accuracy_score(y_true, y_pred)
macro_accuracy

0.7808705817906364

#### F1 score (macro)

F1 score can be interpreted as a measure of overall model performance from 0 to 1, where 1 is the best. To be more specific, F1 score can be interpreted as the model’s balanced ability to both capture positive cases (recall) and be accurate with the cases it does capture (precision).

the model’s ability to both capture positive cases and be accurate with the cases it does capture

In [46]:
metrics.f1_score(y_true, y_pred, labels = GENDER_LABELS, average='macro')

0.7813977896448396

#### Confusion Matrix

In [47]:

confusion_matrix= metrics.confusion_matrix(y_true, y_pred, labels = GENDER_LABELS)
# metrics.plot_confusion_matrix(confusion_matrix, labels=labels)

#### Computing FP TP FN

In [48]:
sex_rates = {'male':{'tp': 0, 'fp': 0, 'fn': 0}, 'female':{'tp': 0, 'fp': 0, 'fn': 0}}
totals = {'male': 0, 'female': 0}

In [49]:
false_positives = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)  
false_negatives = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
true_positives = np.diag(confusion_matrix)
true_negatives = confusion_matrix.sum() - (false_positives + false_negatives + true_positives)


In [50]:
# Filling dictionaries
for i in range (len(GENDER_LABELS)):
    sex_rates[GENDER_LABELS[i]][TP]= true_positives[i]
    sex_rates[GENDER_LABELS[i]][FP]= false_positives[i]
    sex_rates[GENDER_LABELS[i]][FN]= false_negatives[i]

#### Selection rate

In [51]:
males_sr = sex_rates[MALE][TP]/ male_total
females_sr = sex_rates[FEMALE][TP]/ female_total
males_sr, females_sr

(0.8179249091643117, 0.7438162544169611)

#### TPR

In [52]:
males_tpr = sex_rates[MALE][TP]/ (sex_rates[MALE][TP] + sex_rates[MALE][FN])
females_tpr = sex_rates[FEMALE][TP]/ (sex_rates[FEMALE][TP] + sex_rates[FEMALE][FN])

males_tpr, females_tpr

(0.8179249091643117, 0.7438162544169611)

#### FPR

In [53]:
males_fpr= sex_rates[MALE][FP]/(sex_rates[MALE][FP]+ sex_rates[MALE][FN])
females_fpr= sex_rates[FEMALE][FP]/(sex_rates[FEMALE][FP]+ sex_rates[FEMALE][FN])

males_fpr, females_fpr

(0.562560620756547, 0.43743937924345294)

### `Fairness Metrics`

#### Equalized Odds

In [60]:
if abs(males_tpr - females_tpr) < 0.15:
    print('Equalized odds')
else:
    print('Not equalized odds')
print(abs(males_tpr - females_tpr))

Equalized odds
0.07410865474735062


In [61]:
if abs(males_fpr - females_fpr) < 0.15:
    print('Equalized odds')
else:
    print('Not equalized odds')
print(abs(males_fpr - females_fpr))

Equalized odds
0.12512124151309406


#### Disparate impact

In [62]:
# disparate impact ratio = underprivileged group SR / privileged group SR
disp_impact = females_sr / males_sr
if disp_impact < 0.8:
    print('Disparate impact present in female group / male group')
else:
    print('No disparate impact present in female group / male group')
disp_impact

No disparate impact present in female group / male group


0.9093943051287328