# Installing CLIP

In [53]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
!unzip gdrive/MyDrive/utkface.zip

Archive:  gdrive/MyDrive/utkface.zip
replace __MACOSX/._utkface? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
import os
import random as rd

import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Integer race code (as parsed from the image filename) -> readable label.
RACE_MAPPER = {
    0: 'white',
    1: 'black',
    2: 'asian',
    3: 'indian',
    4: 'other',
}

# Integer gender code (as parsed from the image filename) -> readable label.
GENDER_MAPPER = {
    0: 'male',
    1: 'female',
}


def data_selection(ds_path: str = 'utkface/', k: int = 5):
  """Load the dataset and return one stratified train/test fold.

  The images under ``ds_path`` are loaded with ``load_dataset`` and their
  codes are replaced by names with ``map_values``. The rows are then split
  into ``k`` stratified folds, stratifying on ``gender + race``. The
  StratifiedKFold shuffle uses a fixed ``random_state``, but which of the
  ``k`` folds is returned is drawn with ``rd.randint``, which is not seeded
  here.

  Args:
    ds_path: Directory prefix of the image files.
    k: Number of stratified folds.

  Returns:
    A ``(train_data, test_data)`` pair of DataFrames, each with a fresh
    0..n-1 index.
  """
  frame = load_dataset(ds_path)
  print(frame)

  frame = map_values(frame)
  print(frame)

  # Stratified KFold
  splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=1)
  paths = frame['filepath']
  strata = frame['gender'] + frame['race']

  # Pick which of the k folds will be used as the test split.
  chosen_fold = rd.randint(0, k - 1)
  train_rows, test_rows = [], []
  for fold_number, fold in enumerate(splitter.split(paths, strata)):
    if fold_number != chosen_fold:
      continue
    train_rows, test_rows = fold
    break

  train_data = frame.iloc[train_rows].reset_index(drop=True)
  test_data = frame.iloc[test_rows].reset_index(drop=True)

  print(train_data)
  print(test_data)

  return train_data, test_data


def load_dataset(ds_path: str):
  """Build a DataFrame describing every image file found in ``ds_path``.

  Filenames are split on ``'_'``: the first field is read as the age, the
  second as the gender code and the second-to-last as the race code.

  Args:
    ds_path: Directory holding the image files, with or without a trailing
      path separator.

  Returns:
    A DataFrame with the columns ``filename``, ``filepath``, ``age``,
    ``gender`` and ``race`` (the last three as integers).

  Raises:
    ValueError: If a filename field cannot be parsed as an integer.
    IndexError: If a filename has too few ``'_'``-separated fields.
  """
  # Loading filenames (the macOS Finder metadata file is not an image).
  filenames = [name for name in os.listdir(ds_path) if name != '.DS_Store']

  # Building the dataframe
  df = pd.DataFrame(filenames, columns=['filename'])
  # os.path.join inserts the separator only when ds_path lacks one, so both
  # 'utkface/' and 'utkface' produce valid paths.
  df['filepath'] = df.filename.apply(lambda name: os.path.join(ds_path, name))
  df['age'] = df.filename.apply(lambda name: int(name.split('_')[0]))
  df['gender'] = df.filename.apply(lambda name: int(name.split('_')[1]))
  df['race'] = df.filename.apply(lambda name: int(name.split('_')[-2]))

  return df

def map_values(df: pd.DataFrame):
  """Replace the numeric gender and race codes in ``df`` with their names.

  Whole columns are reassigned instead of writing cell by cell through
  chained indexing (``df[col][i] = ...``). The chained form depends on the
  index being 0..n-1, triggers pandas' SettingWithCopy warnings, and has no
  effect on ``df`` when pandas Copy-on-Write is enabled.

  Args:
    df: Frame with integer ``gender`` and ``race`` columns. It is modified
      in place.

  Returns:
    The same ``df`` object, with both columns holding label strings.

  Raises:
    KeyError: If a code is missing from GENDER_MAPPER / RACE_MAPPER.
  """
  # Look codes up with [] (not dict.get) so an unknown code still raises.
  df['gender'] = df['gender'].map(lambda code: GENDER_MAPPER[code])
  df['race'] = df['race'].map(lambda code: RACE_MAPPER[code])
  return df
    



In [None]:
# Install ftfy, regex and tqdm, then CLIP itself straight from its GitHub repository.
%pip install ftfy regex tqdm
%pip install git+https://github.com/openai/CLIP.git

In [None]:
from PIL import Image
import torch
from torch import nn, optim
import glob
import os
import pandas as pd
import json
import numpy as np
import clip
from torch.utils.data import Dataset, DataLoader, BatchSampler
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import random
from matplotlib.pyplot import imshow
import nltk, re, string, collections
from nltk.util import ngrams
import collections
from itertools import combinations

%matplotlib inline

# Number of samples per DataLoader batch.
BATCH_SIZE = 4
# Number of passes over the training set.
EPOCH = 30
# Upper bound on the TPR/FPR gap between two gender groups for a checkpoint to
# count as satisfying equalized odds.
EQ_ODDS_THRESHOLD = 0.15

# Preparing Model and Data

In [None]:
# Build the train/test split and persist the held-out test rows as JSON.
train_df, test_df = data_selection()
test_df.to_json(r'test_data_df.json')


## Splitting 20% for Validation

## Loading Pre-trained CLIP Model and Preprocessor

In [None]:
# Keep a random 80% of the training fold for training; the remaining 20%
# becomes the validation set.
train_df_temp = train_df.sample(frac=0.8)
validation_df = train_df.drop(train_df_temp.index).reset_index(drop=True)
train_df = train_df_temp.reset_index(drop=True)

print(len(train_df))
# Gender-balanced training subset: 1000 random rows per gender.
train_df_males = train_df.loc[train_df['gender'] == 'male'].sample(1000)
train_df_females = train_df.loc[train_df['gender'] == 'female'].sample(1000)
# pd.concat takes ONE sequence of frames; passing the frames as separate
# positional arguments raises a TypeError. ignore_index gives the result a
# fresh 0..n-1 index, like the reset_index(drop=True) calls above.
train_df = pd.concat([train_df_males, train_df_females], ignore_index=True)

# Gender-balanced validation subset: 200 random rows per gender.
valid_df_males = validation_df.loc[validation_df['gender'] == 'male'].sample(200)
valid_df_females = validation_df.loc[validation_df['gender'] == 'female'].sample(200)
validation_df = pd.concat([valid_df_males, valid_df_females], ignore_index=True)

# Notebook display: final sizes of the three splits.
len(train_df), len(validation_df), len(test_df)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the pre-trained ViT-B/16 CLIP model together with its image preprocessing callable.
# NOTE(review): jit=False presumably selects the non-JIT model so it can be
# fine-tuned below — confirm against the CLIP documentation.
model, preprocess = clip.load("ViT-B/16", device=device, jit=False)

## UTKFaceDataset

In [None]:
class UTKFaceDataset(Dataset):
    """Dataset over a UTKFace DataFrame with eagerly preprocessed images.

    Every image listed in the ``filepath`` column is opened and run through
    ``preprocess`` once, at construction time; ``__getitem__`` only performs
    lookups.
    """

    def __init__(self, dataframe, preprocess):
        """Copy the metadata columns and preprocess every image up front.

        Args:
            dataframe: Frame with ``filepath``, ``filename``, ``gender``,
                ``race`` and ``age`` columns.
            preprocess: Callable applied to each opened PIL image.
        """
        self.preprocess = preprocess
        # Plain Python lists, addressed by sample position.
        self.filepath = dataframe["filepath"].tolist()
        self.filename = dataframe["filename"].tolist()
        self.gender = dataframe["gender"].tolist()
        self.race = dataframe["race"].tolist()
        self.age = dataframe["age"].tolist()
        # Preprocessed image per file path, computed once here.
        self.preprocessed_cache = {
            path: self.preprocess(Image.open(path)) for path in self.filepath
        }

    def __len__(self):
        """Return the number of samples."""
        return len(self.filepath)

    def __getitem__(self, idx):
        """Return ``(filepath, filename, gender, race, age, image)`` for ``idx``."""
        path = self.filepath[idx]
        return (
            path,
            self.filename[idx],
            self.gender[idx],
            self.race[idx],
            self.age[idx],
            self.preprocessed_cache[path],
        )

# Constructing a dataset preprocesses and caches all of its images immediately.
train_dataset = UTKFaceDataset(train_df, preprocess)
validation_dataset = UTKFaceDataset(validation_df, preprocess)
# Notebook display: dataset sizes and the first training sample.
len(train_dataset), len(validation_dataset), train_dataset[0]

## DataLoaders

In [None]:
# Training batches are drawn in shuffled order; validation keeps the dataset order.
train_dataloader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size = BATCH_SIZE, shuffle=False)

In [None]:
# Sanity check: print the gender labels (position 2 of each sample tuple) of
# the first training batch only.
for batch in train_dataloader:
    print(batch[2])
    break

# Training

In [None]:
# Class labels for the two attributes.
race_labels = ['black', 'white', 'asian', 'indian', 'other']
gender_labels = ['male', 'female']

# One natural-language prompt per label, keyed by the label itself
# (race prompts are inserted first, then gender prompts).
attributes_queries = {}
for labels, suffix in ((race_labels, ' race.'), (gender_labels, ' gender.')):
    for label in labels:
        attributes_queries[label] = 'A photo of a person of ' + label + suffix

print(attributes_queries)

# Tokenized gender prompts, in gender_labels order, moved to the model's device.
gender_texts = clip.tokenize(
    [attributes_queries[lbl] for lbl in gender_labels]
).to(device)

In [None]:
#https://github.com/openai/CLIP/issues/57
def convert_models_to_fp32(model):
    """Cast every parameter of ``model``, and its gradient if any, to float32.

    Parameters that have no gradient (``p.grad is None``, e.g. before the first
    backward pass or when a parameter did not take part in the loss) only have
    their data cast; previously they raised AttributeError.

    Args:
        model: Module whose parameters are converted in place.
    """
    for p in model.parameters():
        p.data = p.data.float()
        if p.grad is not None:
            p.grad.data = p.grad.data.float()

# On CPU, cast the whole model to float32.
if device == "cpu":
    model.float()

# Cross-entropy criteria. NOTE(review): only loss_img is referenced by the
# training loop visible in this notebook; loss_txt looks unused — confirm.
loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=5e-5,betas=(0.9,0.98),eps=1e-6,weight_decay=0.2)
optimizer = optim.Adam(model.parameters(), lr=1e-5)
# Cosine annealing over the total number of optimizer steps
# (batches per epoch * number of epochs).
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(train_dataloader)*EPOCH)

In [None]:
def update_eq_odds_rates(rates, labels, logits, real_values):
    """Accumulate per-label tp / fn / fp counts for one batch of predictions.

    For each sample the predicted label is the entry of ``labels`` at the
    argmax of that sample's logits. A correct prediction adds one ``tp`` to
    the true label; a wrong one adds one ``fn`` to the true label and one
    ``fp`` to the predicted label.

    Args:
        rates: Dict ``label -> {'tp': int, 'fp': int, 'fn': int}``, updated
            in place.
        labels: Label names, ordered like the logit columns.
        logits: Indexable by sample position, yielding one row of scores per
            sample.
        real_values: True label of each sample.
    """
    for row, actual in enumerate(real_values):
        predicted = labels[torch.argmax(logits[row])]

        if actual == predicted:
            rates[actual]['tp'] += 1
            continue

        # Misclassification: a miss for the true label, a false alarm for
        # the predicted one.
        rates[actual]['fn'] += 1
        rates[predicted]['fp'] += 1

In [None]:
# Best validation loss seen so far among epochs that satisfy equalized odds,
# and the epoch it was reached in.
best_te_loss = 1e5
best_ep = -1

# Equalized odds vars
best_te_bias = -1


def tpr(tp, fn):
    """True-positive rate tp / (tp + fn); 0.0 when the group has no positives."""
    positives = tp + fn
    return tp / positives if positives else 0.0


def fpr(fp, tn):
    """False-positive rate fp / (fp + tn); 0.0 when the group has no negatives."""
    negatives = fp + tn
    return fp / negatives if negatives else 0.0


def _one_hot_gender_targets(batch_genders):
    """One-hot gender targets sized to the actual batch (the last one may be short)."""
    targets = torch.zeros((len(batch_genders), len(gender_labels)), device=device)
    for row, gender in enumerate(batch_genders):
        targets[row, gender_labels.index(gender)] = 1
    return targets


for epoch in range(EPOCH):
    print(f"running epoch {epoch}, best test loss {best_te_loss} after epoch {best_ep}")

    # ---- Training pass ----
    step = 0
    tr_loss = 0
    model.train()
    pbar = tqdm(train_dataloader, leave=False)
    for batch in pbar:
        step += 1
        optimizer.zero_grad()

        # Sample tuples are (filepath, filename, gender, race, age, image).
        images = batch[-1].to(device)
        logits_per_image, _ = model(images, gender_texts)
        ground_truth = _one_hot_gender_targets(batch[2])

        total_loss = loss_img(logits_per_image, ground_truth)
        total_loss.backward()
        tr_loss += total_loss.item()
        if device == "cpu":
            optimizer.step()
            scheduler.step()
        else:
            # Off-CPU, the optimizer step runs on float32 parameters and
            # gradients; afterwards CLIP's own helper converts the weights back.
            convert_models_to_fp32(model)
            optimizer.step()
            scheduler.step()
            clip.model.convert_weights(model)
        pbar.set_description(f"train batchCE: {total_loss.item()}", refresh=True)
    tr_loss /= step

    # ---- Validation pass ----
    step = 0
    te_loss = 0
    rates = {label: {'tp': 0, 'fp': 0, 'fn': 0} for label in gender_labels}
    with torch.no_grad():
        model.eval()
        val_pbar = tqdm(validation_dataloader, leave=False)
        for batch in val_pbar:
            step += 1
            images = batch[-1].to(device)

            logits_per_image, _ = model(images, gender_texts)
            ground_truth = _one_hot_gender_targets(batch[2])

            total_loss = loss_img(logits_per_image, ground_truth)
            te_loss += total_loss.item()

            update_eq_odds_rates(rates, gender_labels, logits_per_image, batch[2])

            val_pbar.set_description(f"test batchCE: {total_loss.item()}", refresh=True)
        te_loss /= step

    # Equalized odds calculation
    # Every sample adds exactly one 'tp' or one 'fn' to its true label, so the
    # sum of both over all labels is the number of evaluated samples.
    total_samples = sum(r['tp'] + r['fn'] for r in rates.values())
    tpr_values = {}
    fpr_values = {}
    for label, label_rates in rates.items():
        # True negatives are not counted directly: they are what remains once
        # this label's tp, fn and fp are removed from the total.
        true_negatives = (
            total_samples - label_rates['tp'] - label_rates['fn'] - label_rates['fp']
        )
        tpr_values[label] = tpr(label_rates['tp'], label_rates['fn'])
        fpr_values[label] = fpr(label_rates['fp'], true_negatives)

    # The model satisfies equalized odds when, for every pair of groups, both
    # the TPR gap and the FPR gap stay below the threshold.
    equalized_odds = True
    te_bias = [0.0, 0.0]  # defined even if there are fewer than two groups
    for first_label, second_label in combinations(tpr_values.keys(), 2):
        te_bias = [abs(tpr_values[first_label] - tpr_values[second_label]), abs(fpr_values[first_label] - fpr_values[second_label])]
        if max(te_bias[0], te_bias[1]) >= EQ_ODDS_THRESHOLD:
            equalized_odds = False
            break

    if te_loss < best_te_loss and equalized_odds: # maximize accuracy with fairness threshold
        best_te_loss = te_loss
        best_ep = epoch
        best_te_bias = te_bias
        torch.save(model.state_dict(), "best_model.pt")
    print(f"epoch {epoch}, tr_loss {tr_loss}, te_loss {te_loss}, te_bias {te_bias}")

# Always keep the weights of the final epoch as well.
torch.save(model.state_dict(), "last_model.pt")