# Evaluation of CLIP models (https://github.com/openai/CLIP)

In [5]:
# imports
import torch
import numpy as np

# Single seed shared by every RNG this notebook touches, so reruns are reproducible.
SEED = 42
# NumPy is imported above; seed its global RNG as well as torch's.
np.random.seed(SEED)
# Kept as the last expression so the cell still displays the torch Generator.
torch.manual_seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x7f462760aa70>

In [6]:
# Use the first CUDA GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

In [7]:
import clip
# Display the model names that clip.load() accepts in the installed package.
clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [8]:
# Only the image preprocessing transform (second element of the returned pair)
# is needed at this point. The model that comes with it is never bound to a
# name, so nothing has to be deleted afterwards.
preprocess = clip.load("ViT-B/32", jit=True, device=device)[1]

  if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):


# Load Dataset

In [9]:
import pandas as pd
import os

from torch.utils.data import Dataset, DataLoader
from PIL import Image

In [10]:
class FakedditDataset(Dataset):
    """Image / text / label samples read from a Fakeddit table.

    Columns are accessed by position: column 0 holds the text, column 1 the
    image file stem (the image is read from ``<root_dir>/<stem>.jpg``) and
    column 2 the integer class index that is turned into a one-hot label.
    """

    def __init__(self, dataset, root_dir, image_preprocess=None, num_classes=6):
        """
        Args:
            dataset (str | os.PathLike | pandas.DataFrame): Path to the csv file or a pandas DF
            root_dir (string): Directory with all the images.
            image_preprocess (callable, optional): Applied to the RGB-converted
                PIL image of every sample. When omitted, the decoded PIL image
                is returned as-is.
            num_classes (int, optional): Length of the one-hot label vector.
                Defaults to 6, the value that used to be hard-coded.
        """
        # isinstance (rather than an exact type check) also accepts str
        # subclasses and pathlib.Path objects as csv locations.
        if isinstance(dataset, (str, os.PathLike)):
            self.dataset = pd.read_csv(dataset)
        else:
            self.dataset = dataset
        self.root_dir = root_dir
        self.image_preprocess = image_preprocess
        self.num_classes = num_classes

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(image, text, label)`` for row ``idx``.

        ``label`` is a float tensor of length ``num_classes`` with a single 1
        at the class index stored in column 2.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        text = self.dataset.iloc[idx, 0]
        img_name = os.path.join(self.root_dir, f"{self.dataset.iloc[idx, 1]}.jpg")

        # The context manager closes the file handle deterministically, even
        # when decoding fails. convert() / copy() fully decode the pixels
        # first, so the returned image stays usable after the file is closed.
        with Image.open(img_name) as raw_image:
            if self.image_preprocess is not None:
                image = self.image_preprocess(raw_image.convert("RGB"))
            else:
                image = raw_image.copy()

        label = torch.zeros(self.num_classes)
        label[self.dataset.iloc[idx, 2]] = 1

        return image, text, label

In [11]:
# Only the test split is evaluated in this notebook; the training split
# ('../train_clean.csv') is deliberately not loaded.
batch_size = 32

testset = FakedditDataset(
    dataset='../test_clean.csv',
    root_dir='../data',
    image_preprocess=preprocess,
)
testloader = DataLoader(dataset=testset, batch_size=batch_size, num_workers=2)

# Model Definitions

In [12]:
import torch
import torch.nn as nn

In [13]:
class CLIPClassifier(nn.Module):
    """MLP head over concatenated CLIP image and text embeddings (6 output logits)."""

    def __init__(self, device='cpu') -> None:
        super().__init__()
        self.device = device

        # JIT-compiled ViT-B/32 backbone (inference only). clip.load also
        # returns a preprocessing transform, which is not needed here.
        self.clip_layer = clip.load("ViT-B/32", jit=True, device=device)[0]

        # Each CLIP encoder yields a 512-d embedding, so the concatenated
        # image + text input to the head is 1024-d.
        self.fc1 = nn.Linear(in_features=1024, out_features=512, device=device)
        self.fc2 = nn.Linear(in_features=512, out_features=128, device=device)
        self.fc3 = nn.Linear(in_features=128, out_features=6, device=device)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, image, text):
        """Return the class logits for a batch of preprocessed images and tokenized texts."""
        # The encoder outputs are cast to float32 before reaching the linear layers.
        embeddings = [
            self.clip_layer.encode_image(image).float(),
            self.clip_layer.encode_text(text).float(),
        ]

        hidden = self.fc1(torch.cat(embeddings, dim=1))
        hidden = self.fc2(self.relu(hidden))
        # Dropout sits between the second ReLU and the output layer.
        hidden = self.dropout(self.relu(hidden))
        return self.fc3(hidden)
    
# Location of the trained weights for the joint image + text classifier.
FULL_LOCATION = '../models/clipclassifier/1/'
MODEL_PATH = os.path.join(FULL_LOCATION, 'clipclassifier.pth')

clipclassifier = CLIPClassifier(device=device)

# The checkpoint is deserialized onto the CPU; its 'model_state_dict' entry
# is then loaded into the freshly built model.
checkpoint = torch.load(MODEL_PATH, map_location='cpu')
clipclassifier.load_state_dict(checkpoint['model_state_dict'])
# eval() switches dropout off and, as the last expression, displays the model.
clipclassifier.eval()

  if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):


CLIPClassifier(
  (clip_layer): RecursiveScriptModule(
    original_name=Multimodal
    (visual): RecursiveScriptModule(
      original_name=VisualTransformer
      (conv1): RecursiveScriptModule(original_name=Conv2d)
      (ln_pre): RecursiveScriptModule(original_name=LayerNorm)
      (transformer): RecursiveScriptModule(
        original_name=Transformer
        (resblocks): RecursiveScriptModule(
          original_name=Sequential
          (0): RecursiveScriptModule(
            original_name=ResidualAttentionBlock
            (attn): RecursiveScriptModule(
              original_name=MultiheadAttention
              (out_proj): RecursiveScriptModule(original_name=_LinearWithBias)
            )
            (ln_1): RecursiveScriptModule(original_name=LayerNorm)
            (mlp): RecursiveScriptModule(
              original_name=Sequential
              (c_fc): RecursiveScriptModule(original_name=Linear)
              (gelu): RecursiveScriptModule(original_name=QuickGELU)
         

In [14]:
class CLIPImageClassifier(nn.Module):
    """MLP head over the CLIP image embedding only (6 output logits)."""

    def __init__(self, device='cpu') -> None:
        super().__init__()
        self.device = device

        # JIT-compiled ViT-B/32 backbone (inference only). clip.load also
        # returns a preprocessing transform, which is not needed here.
        self.clip_layer = clip.load("ViT-B/32", jit=True, device=device)[0]

        # Only the 512-d image embedding feeds the head, so fc1 takes 512 inputs.
        self.fc1 = nn.Linear(in_features=512, out_features=512, device=device)
        self.fc2 = nn.Linear(in_features=512, out_features=128, device=device)
        self.fc3 = nn.Linear(in_features=128, out_features=6, device=device)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, image):
        """Return the class logits for a batch of preprocessed images."""
        # The encoder output is cast to float32 before reaching the linear layers.
        embedding = self.clip_layer.encode_image(image).float()

        hidden = self.fc1(embedding)
        hidden = self.fc2(self.relu(hidden))
        # Dropout sits between the second ReLU and the output layer.
        hidden = self.dropout(self.relu(hidden))
        return self.fc3(hidden)

# Location of the trained weights for the image-only classifier.
FULL_LOCATION = '../models/clipimageclassifier/1/'
MODEL_PATH = os.path.join(FULL_LOCATION, 'clipimageclassifier.pth')

clipimageclassifier = CLIPImageClassifier(device=device)

# The checkpoint is deserialized onto the CPU; its 'model_state_dict' entry
# is then loaded into the freshly built model.
checkpoint = torch.load(MODEL_PATH, map_location='cpu')
clipimageclassifier.load_state_dict(checkpoint['model_state_dict'])
# eval() switches dropout off and, as the last expression, displays the model.
clipimageclassifier.eval()

CLIPImageClassifier(
  (clip_layer): RecursiveScriptModule(
    original_name=Multimodal
    (visual): RecursiveScriptModule(
      original_name=VisualTransformer
      (conv1): RecursiveScriptModule(original_name=Conv2d)
      (ln_pre): RecursiveScriptModule(original_name=LayerNorm)
      (transformer): RecursiveScriptModule(
        original_name=Transformer
        (resblocks): RecursiveScriptModule(
          original_name=Sequential
          (0): RecursiveScriptModule(
            original_name=ResidualAttentionBlock
            (attn): RecursiveScriptModule(
              original_name=MultiheadAttention
              (out_proj): RecursiveScriptModule(original_name=_LinearWithBias)
            )
            (ln_1): RecursiveScriptModule(original_name=LayerNorm)
            (mlp): RecursiveScriptModule(
              original_name=Sequential
              (c_fc): RecursiveScriptModule(original_name=Linear)
              (gelu): RecursiveScriptModule(original_name=QuickGELU)
    

In [15]:
class CLIPTextClassifier(nn.Module):
    """MLP head over the CLIP text embedding only (6 output logits)."""

    def __init__(self, device='cpu') -> None:
        super().__init__()
        self.device = device

        # JIT-compiled ViT-B/32 backbone (inference only). clip.load also
        # returns a preprocessing transform, which is not needed here.
        self.clip_layer = clip.load("ViT-B/32", jit=True, device=device)[0]

        # Only the 512-d text embedding feeds the head, so fc1 takes 512 inputs.
        self.fc1 = nn.Linear(in_features=512, out_features=512, device=device)
        self.fc2 = nn.Linear(in_features=512, out_features=128, device=device)
        self.fc3 = nn.Linear(in_features=128, out_features=6, device=device)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, text):
        """Return the class logits for a batch of tokenized texts."""
        # The encoder output is cast to float32 before reaching the linear layers.
        embedding = self.clip_layer.encode_text(text).float()

        hidden = self.fc1(embedding)
        hidden = self.fc2(self.relu(hidden))
        # Dropout sits between the second ReLU and the output layer.
        hidden = self.dropout(self.relu(hidden))
        return self.fc3(hidden)

# Location of the trained weights for the text-only classifier.
FULL_LOCATION = '../models/cliptextclassifier/1/'
MODEL_PATH = os.path.join(FULL_LOCATION, 'cliptextclassifier.pth')

cliptextclassifier = CLIPTextClassifier(device=device)

# The checkpoint is deserialized onto the CPU; its 'model_state_dict' entry
# is then loaded into the freshly built model.
checkpoint = torch.load(MODEL_PATH, map_location='cpu')
cliptextclassifier.load_state_dict(checkpoint['model_state_dict'])
# eval() switches dropout off and, as the last expression, displays the model.
cliptextclassifier.eval()

CLIPTextClassifier(
  (clip_layer): RecursiveScriptModule(
    original_name=Multimodal
    (visual): RecursiveScriptModule(
      original_name=VisualTransformer
      (conv1): RecursiveScriptModule(original_name=Conv2d)
      (ln_pre): RecursiveScriptModule(original_name=LayerNorm)
      (transformer): RecursiveScriptModule(
        original_name=Transformer
        (resblocks): RecursiveScriptModule(
          original_name=Sequential
          (0): RecursiveScriptModule(
            original_name=ResidualAttentionBlock
            (attn): RecursiveScriptModule(
              original_name=MultiheadAttention
              (out_proj): RecursiveScriptModule(original_name=_LinearWithBias)
            )
            (ln_1): RecursiveScriptModule(original_name=LayerNorm)
            (mlp): RecursiveScriptModule(
              original_name=Sequential
              (c_fc): RecursiveScriptModule(original_name=Linear)
              (gelu): RecursiveScriptModule(original_name=QuickGELU)
     

In [16]:
import pickle

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: unpickling executes code embedded in the file, so these must stay
# trusted, locally produced artifacts.
image_classifier = _load_pickle('../models/linearimage/1/image_classifier.pkl')
text_classifier = _load_pickle('../models/lineartext/1/text_classifier.pkl')
concat_classifier = _load_pickle('../models/linearconcat/1/concat_classifier.pkl')

# Evaluation Table Data Collection

In [17]:
# Smoke test: run one batch through the joint classifier and print the
# predicted class indices.
images, texts, labels = next(iter(testloader))

# Dataset items are tensors, so the loader yields an already-stacked batch and
# no NumPy round-trip is needed before moving it to the device.
image_input = images.to(device)
# truncate=True: some titles exceed CLIP's 77-token context window. The limit
# is counted in tokens, not words; the first 77 tokens are assumed to carry
# enough context.
text_tokens = clip.tokenize(texts, truncate=True).to(device)

# Inference only, so no autograd graph is needed.
with torch.no_grad():
    preds = clipclassifier(image_input, text_tokens)
preds_label = torch.argmax(preds, dim=1)
print(preds_label)


tensor([2, 2, 4, 4, 0, 0, 2, 4, 0, 0, 0, 0, 4, 0, 0, 4, 2, 0, 0, 0, 0, 4, 0, 4,
        4, 4, 0, 4, 0, 4, 4, 4], device='cuda:0')


In [None]:
# Stand-alone CLIP backbone: its raw image / text embeddings are the inputs of
# the pickled linear classifiers. The transform returned alongside it rebinds
# `preprocess`.
clip_model, preprocess = clip.load(name="ViT-B/32", device=device, jit=True)
clip_model.eval()

In [27]:
from tqdm import tqdm

# One predicted class index per test sample, per model.
clip_predictions = []
clip_text_predictions = []
clip_image_predictions = []

linear_concat_predictions = []
linear_text_predictions = []
linear_image_predictions = []

# Pure inference: disabling autograd avoids building a graph for every sample.
with torch.no_grad():
    for image, text, _label in tqdm(testset):
        # Samples come one at a time, so add a batch dimension of 1.
        image_input = image.unsqueeze(0).to(device)
        text_tokens = clip.tokenize([text], truncate=True).to(device)

        # Fine-tuned CLIP heads: joint, text-only and image-only.
        preds = clipclassifier(image_input, text_tokens)
        clip_predictions.append(torch.argmax(preds, dim=1)[0].item())

        preds = cliptextclassifier(text_tokens)
        clip_text_predictions.append(torch.argmax(preds, dim=1)[0].item())

        preds = clipimageclassifier(image_input)
        clip_image_predictions.append(torch.argmax(preds, dim=1)[0].item())

        # Raw CLIP embeddings for the pickled linear classifiers.
        image_features = clip_model.encode_image(image_input)
        text_features = clip_model.encode_text(text_tokens)
        concat_features = torch.cat([image_features, text_features], dim=1)

        linear_concat_predictions.append(
            concat_classifier.predict(concat_features.detach().cpu().numpy())[0]
        )
        # Each single-modality classifier gets the features of its own
        # modality. The two inputs used to be swapped, and since both
        # embeddings are 512-d that raised no error.
        linear_text_predictions.append(
            text_classifier.predict(text_features.detach().cpu().numpy())[0]
        )
        linear_image_predictions.append(
            image_classifier.predict(image_features.detach().cpu().numpy())[0]
        )

100%|██████████| 7995/7995 [08:30<00:00, 15.67it/s]


In [28]:
# Reload the test table (the same csv the test dataset was built from) so the
# per-model predictions can be attached to it as columns.
df = pd.read_csv('../test_clean.csv')

In [29]:
# Output column name -> per-sample predictions collected in the evaluation loop.
# Each list must have exactly one entry per row of df.
prediction_columns = {
    'clip': clip_predictions,
    'clip_text': clip_text_predictions,
    'clip_image': clip_image_predictions,
    'linear_concat': linear_concat_predictions,
    'linear_text': linear_text_predictions,
    'linear_image': linear_image_predictions,
}
for column_name, column_values in prediction_columns.items():
    df[column_name] = column_values

In [30]:
# Save the test table together with the prediction columns; index=False leaves
# the row index out of the file.
df.to_csv('clip_predictions2.csv', index=False)

In [None]:
# Rebind df to an empty frame with a single column.
# NOTE(review): this replaces the predictions frame previously held in `df` —
# confirm that is intended.
correctness_columns = ['Text Classifier Correct']
df = pd.DataFrame({column: [] for column in correctness_columns})