# Classification with CLIP by only text or image through linear probing (https://github.com/openai/CLIP)



In [1]:
# imports
import torch
import numpy as np

SEED = 42
torch.manual_seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x7f712666eb90>

In [2]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [3]:
import clip
clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [4]:
model, preprocess = clip.load("ViT-B/32", jit=True, device=device)

  if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):


# Load Dataset

In [5]:
import pandas as pd
import os

from torch.utils.data import Dataset, DataLoader
from PIL import Image

In [6]:
class FakedditDataset(Dataset):
    """Subset of fake news dataset from """

    def __init__(self, dataset, root_dir, image_preprocess=None):
        """
        Args:
            dataset (string): Path to the csv file or a pandas DF
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        if type(dataset) is str:
            self.dataset = pd.read_csv(dataset)
        else:
            self.dataset = dataset
        self.root_dir = root_dir
        self.image_preprocess = image_preprocess

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        text = self.dataset.iloc[idx, 0]
        img_name = os.path.join(self.root_dir, f"{self.dataset.iloc[idx, 1]}.jpg")
        image = Image.open(img_name)
        if self.image_preprocess:
            image = self.image_preprocess(image.convert("RGB"))
            
        label = torch.zeros(6)
        label[self.dataset.iloc[idx, 2]] = 1
        
        return image, text, label

In [7]:
batch_size = 32

trainset = FakedditDataset('train_clean.csv', 'data', image_preprocess=preprocess)
testset = FakedditDataset('test_clean.csv', 'data', image_preprocess=preprocess)

# trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)
# testloader = DataLoader(testset, batch_size=batch_size, shuffle=True, num_workers=2)

# Training (Logistic Regression)
based on code from <https://github.com/openai/CLIP>

In [9]:
from tqdm import tqdm

def get_features(dataset):
    all_image_features = []
    all_text_features = []
    all_labels = []
    
    with torch.no_grad():
        for images, texts, labels in tqdm(DataLoader(dataset, batch_size=100)):
            image_input = torch.tensor(np.stack(images)).to(device)
            text_tokens = clip.tokenize(texts, truncate=True).to(device) # truncate: some titles are longer than 77, but I think there is more than enough context in 77 words
            labels = labels.float().to(device)
            labels = torch.argmax(labels, dim=1)
            
            image_features = model.encode_image(image_input)
            all_image_features.append(image_features)
            
            text_features = model.encode_text(text_tokens)
            all_text_features.append(text_features)
            
            all_labels.append(labels)

    return torch.cat(all_image_features).cpu().numpy(), torch.cat(all_text_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()

train_image_features, train_text_features, train_labels = get_features(trainset)
test_image_features, test_text_features, test_labels = get_features(testset)

100%|██████████| 240/240 [03:13<00:00,  1.24it/s]
100%|██████████| 80/80 [01:03<00:00,  1.27it/s]


In [10]:
from sklearn.linear_model import LogisticRegression

## CLIP Linear Probing with Image

In [16]:
image_classifier = LogisticRegression(random_state=42, C=0.316, max_iter=1000, verbose=1)
image_classifier.fit(train_image_features, train_labels)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         3078     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.29700D+04    |proj g|=  3.38006D+04

At iterate   50    f=  1.18839D+04    |proj g|=  9.98757D+02

At iterate  100    f=  1.16059D+04    |proj g|=  8.77270D+01

At iterate  150    f=  1.15723D+04    |proj g|=  3.08921D+01

At iterate  200    f=  1.15639D+04    |proj g|=  1.32429D+01

At iterate  250    f=  1.15611D+04    |proj g|=  2.02317D+01

At iterate  300    f=  1.15586D+04    |proj g|=  8.57007D+00

At iterate  350    f=  1.15570D+04    |proj g|=  1.48593D+01

At iterate  400    f=  1.15556D+04    |proj g|=  1.48559D+01

At iterate  450    f=  1.15546D+04    |proj g|=  3.76360D+00

At iterate  500    f=  1.15542D+04    |proj g|=  1.71134D+01

At iterate  550    f=  1.15540D+04    |proj g|=  1.73340D+00

At iterate  600    f=  1.15540D+04    |proj g|=  1.18047D+00

At iterate  650    f=  1.1

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   32.1s finished


## CLIP Linear Probing with Text

In [15]:
text_classifier = LogisticRegression(random_state=42, C=0.316, max_iter=1000, verbose=1)
text_classifier.fit(train_text_features, train_labels)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         3078     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.29700D+04    |proj g|=  2.53215D+04

At iterate   50    f=  1.55657D+04    |proj g|=  1.59561D+03

At iterate  100    f=  1.49454D+04    |proj g|=  3.57014D+02

At iterate  150    f=  1.48408D+04    |proj g|=  5.95925D+01

At iterate  200    f=  1.48180D+04    |proj g|=  2.83317D+01

At iterate  250    f=  1.48117D+04    |proj g|=  2.99108D+01

At iterate  300    f=  1.48084D+04    |proj g|=  1.90872D+01

At iterate  350    f=  1.48063D+04    |proj g|=  1.40376D+01

At iterate  400    f=  1.48055D+04    |proj g|=  3.38868D+01

At iterate  450    f=  1.48051D+04    |proj g|=  5.22364D+00

At iterate  500    f=  1.48048D+04    |proj g|=  9.55954D+00

At iterate  550    f=  1.48046D+04    |proj g|=  4.11585D+00

At iterate  600    f=  1.48045D+04    |proj g|=  5.90206D+00

At iterate  650    f=  1.4

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   35.0s finished


# Evaluation

In [17]:
preds = image_classifier.predict(test_image_features)
accuracy = np.mean((test_labels == preds).astype(np.float64)) * 100.
print(f"Accuracy = {accuracy:.3f}%")

Accuracy = 79.800%


In [18]:
preds = text_classifier.predict(test_image_features)
accuracy = np.mean((test_labels == preds).astype(np.float64)) * 100.
print(f"Accuracy = {accuracy:.3f}%")

Accuracy = 58.274%
