# Classification with CLIP features via linear probing: image-only, text-only, and concatenated image+text (https://github.com/openai/CLIP)



In [1]:
# imports
import torch
import numpy as np

# Fixed seed for reproducibility. Only PyTorch's global RNG is seeded in this cell;
# NumPy's RNG is not seeded here.
SEED = 42
torch.manual_seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x7f05f009cb50>

In [2]:
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [3]:
import clip
clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [4]:
# Load the "ViT-B/32" CLIP checkpoint in JIT mode onto `device`.
# `preprocess` is the second value returned by clip.load; it is passed to the datasets below
# as their image preprocessing callable.
model, preprocess = clip.load("ViT-B/32", jit=True, device=device)

  if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):


# Load Dataset

In [5]:
import pandas as pd
import os

from torch.utils.data import Dataset, DataLoader
from PIL import Image

In [7]:
class FakedditDataset(Dataset):
    """Image / title / label samples from a subset of the Fakeddit fake-news dataset.

    The backing table is read positionally:
      * column 0 - the text (post title),
      * column 1 - the image id; the file is ``<root_dir>/<id>.jpg``,
      * column 2 - the integer class index.
    """

    def __init__(self, dataset, root_dir, image_preprocess=None, num_classes=6):
        """
        Args:
            dataset (str | os.PathLike | pandas.DataFrame): Path to the csv file,
                or an already loaded DataFrame with the column layout described
                on the class.
            root_dir (str): Directory with all the images.
            image_preprocess (callable, optional): Applied to the RGB-converted
                PIL image of every sample. When omitted, the PIL image itself is
                returned.
            num_classes (int, optional): Length of the one-hot label vector.
                Defaults to 6, the value that was previously hard-coded.
        """
        # isinstance (rather than an exact type check) so pathlib paths are read too.
        if isinstance(dataset, (str, os.PathLike)):
            self.dataset = pd.read_csv(dataset)
        else:
            self.dataset = dataset
        self.root_dir = root_dir
        self.image_preprocess = image_preprocess
        self.num_classes = num_classes

    def __len__(self):
        """Number of rows in the backing table."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(image, text, one_hot_label)`` for row ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        text = self.dataset.iloc[idx, 0]
        img_name = os.path.join(self.root_dir, f"{self.dataset.iloc[idx, 1]}.jpg")

        # Image.open is lazy and keeps the file handle open; the context manager
        # releases it once the pixel data has been read, so iterating over the
        # whole dataset does not leak one descriptor per sample.
        with Image.open(img_name) as raw_image:
            if self.image_preprocess:
                image = self.image_preprocess(raw_image.convert("RGB"))
            else:
                # Force the pixel data into memory before the file is closed.
                raw_image.load()
                image = raw_image

        label = torch.zeros(self.num_classes)
        label[int(self.dataset.iloc[idx, 2])] = 1

        return image, text, label

In [8]:
# NOTE(review): batch_size is only referenced by the commented-out loaders at the end of this cell.
batch_size = 32

# Both splits read their images from the 'data' directory and use the `preprocess`
# callable as image preprocessing.
trainset = FakedditDataset('train_clean.csv', 'data', image_preprocess=preprocess)
testset = FakedditDataset('test_clean.csv', 'data', image_preprocess=preprocess)

# Unused shuffled loaders, kept for reference:
# trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)
# testloader = DataLoader(testset, batch_size=batch_size, shuffle=True, num_workers=2)

# Training (Logistic Regression)
Based on the linear-probe example code from <https://github.com/openai/CLIP>.

In [10]:
from tqdm import tqdm

def get_features(dataset, batch_size=100):
    """Encode every sample of `dataset` with CLIP and return the features as NumPy arrays.

    Relies on the notebook-level `model` (CLIP), `device`, and the `clip` module.

    Args:
        dataset: Map-style dataset yielding ``(image_tensor, text, one_hot_label)``
            samples, as ``FakedditDataset`` does when given an image preprocess.
        batch_size (int, optional): Samples encoded per forward pass. Defaults to
            100, the value that was previously hard-coded.

    Returns:
        tuple of four ``numpy.ndarray``: image features, text features, the
        per-sample concatenation ``[image | text]``, and integer class labels
        (argmax of the one-hot labels).
    """
    image_chunks = []
    text_chunks = []
    label_chunks = []

    with torch.no_grad():
        for images, texts, labels in tqdm(DataLoader(dataset, batch_size=batch_size)):
            # The DataLoader already collates the images into one batch tensor,
            # so it can be moved to the device directly.
            image_features = model.encode_image(images.to(device))

            # CLIP's context length is 77 tokens; longer titles are truncated
            # instead of raising.
            text_tokens = clip.tokenize(texts, truncate=True).to(device)
            text_features = model.encode_text(text_tokens)

            # Move results off the device per batch so the whole dataset's
            # features are never held in GPU memory at once.
            image_chunks.append(image_features.cpu())
            text_chunks.append(text_features.cpu())

            # One-hot -> class index; no need to ship the labels to the device.
            label_chunks.append(torch.argmax(labels.float(), dim=1).cpu())

    all_image_features = torch.cat(image_chunks)
    all_text_features = torch.cat(text_chunks)
    all_concat_features = torch.cat([all_image_features, all_text_features], dim=1)
    all_labels = torch.cat(label_chunks)

    return (
        all_image_features.numpy(),
        all_text_features.numpy(),
        all_concat_features.numpy(),
        all_labels.numpy(),
    )

# Encode each split once. Every call returns, in order: image features, text features,
# concatenated image+text features, and integer labels (all NumPy arrays).
train_image_features, train_text_features, train_concat_features, train_labels = get_features(trainset)
test_image_features, test_text_features, test_concat_features, test_labels = get_features(testset)

100%|██████████| 240/240 [03:07<00:00,  1.28it/s]
100%|██████████| 80/80 [01:03<00:00,  1.27it/s]


In [11]:
from sklearn.linear_model import LogisticRegression

## CLIP Linear Probing with Image

In [12]:
# Linear probe trained on the CLIP image features only.
# NOTE(review): C=0.316 / max_iter=1000 look like the values from the CLIP README's
# linear-probe example — confirm they suit this dataset.
image_classifier = LogisticRegression(random_state=42, C=0.316, max_iter=1000, verbose=1)
image_classifier.fit(train_image_features, train_labels)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         3078     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.29700D+04    |proj g|=  3.38006D+04

At iterate   50    f=  1.19017D+04    |proj g|=  7.50322D+02

At iterate  100    f=  1.16059D+04    |proj g|=  1.34348D+02

At iterate  150    f=  1.15707D+04    |proj g|=  3.71698D+01

At iterate  200    f=  1.15637D+04    |proj g|=  1.76459D+01

At iterate  250    f=  1.15608D+04    |proj g|=  2.15426D+01

At iterate  300    f=  1.15586D+04    |proj g|=  1.73510D+01

At iterate  350    f=  1.15569D+04    |proj g|=  8.60436D+00

At iterate  400    f=  1.15553D+04    |proj g|=  3.24305D+01

At iterate  450    f=  1.15545D+04    |proj g|=  7.38039D+00

At iterate  500    f=  1.15541D+04    |proj g|=  4.57593D+00

At iterate  550    f=  1.15539D+04    |proj g|=  3.40096D+00

At iterate  600    f=  1.15539D+04    |proj g|=  2.29648D+00

At iterate  650    f=  1.1

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   25.7s finished


## CLIP Linear Probing with Text

In [13]:
# Linear probe trained on the CLIP text features only; same hyper-parameters as the image probe.
text_classifier = LogisticRegression(random_state=42, C=0.316, max_iter=1000, verbose=1)
text_classifier.fit(train_text_features, train_labels)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         3078     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.29700D+04    |proj g|=  2.53215D+04

At iterate   50    f=  1.55261D+04    |proj g|=  1.44825D+03

At iterate  100    f=  1.49454D+04    |proj g|=  1.85337D+02

At iterate  150    f=  1.48468D+04    |proj g|=  1.04991D+02

At iterate  200    f=  1.48219D+04    |proj g|=  3.70467D+01

At iterate  250    f=  1.48124D+04    |proj g|=  4.84347D+01

At iterate  300    f=  1.48088D+04    |proj g|=  7.50601D+00

At iterate  350    f=  1.48071D+04    |proj g|=  1.22280D+01

At iterate  400    f=  1.48058D+04    |proj g|=  1.81465D+01

At iterate  450    f=  1.48052D+04    |proj g|=  2.50352D+00

At iterate  500    f=  1.48048D+04    |proj g|=  6.19947D+00

At iterate  550    f=  1.48046D+04    |proj g|=  1.18652D+01

At iterate  600    f=  1.48045D+04    |proj g|=  8.95061D+00

At iterate  650    f=  1.4

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   40.3s finished


## CLIP Linear Probing with Both Image and Text (Concatenation)

In [14]:
# Linear probe trained on the concatenated image+text features.
# NOTE(review): the recorded output of this cell ends with
# "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT", i.e. L-BFGS hit max_iter=1000 before
# converging. Consider a larger max_iter or feature scaling; doing so will change the
# reported accuracies.
concat_classifier = LogisticRegression(random_state=42, C=0.316, max_iter=1000, verbose=1)
concat_classifier.fit(train_concat_features, train_labels)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         6150     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.29700D+04    |proj g|=  3.38006D+04

At iterate   50    f=  7.28675D+03    |proj g|=  1.31749D+02

At iterate  100    f=  6.41206D+03    |proj g|=  1.24925D+02

At iterate  150    f=  6.26398D+03    |proj g|=  3.31211D+02

At iterate  200    f=  6.22455D+03    |proj g|=  4.44137D+01

At iterate  250    f=  6.21265D+03    |proj g|=  5.19687D+01

At iterate  300    f=  6.20928D+03    |proj g|=  5.53868D+00

At iterate  350    f=  6.20811D+03    |proj g|=  8.03352D+00

At iterate  400    f=  6.20755D+03    |proj g|=  3.39356D+00

At iterate  450    f=  6.20689D+03    |proj g|=  1.89719D+01

At iterate  500    f=  6.20610D+03    |proj g|=  7.94628D+00

At iterate  550    f=  6.20535D+03    |proj g|=  4.20252D+00

At iterate  600    f=  6.20457D+03    |proj g|=  6.29080D+00

At iterate  650    f=  6.2

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min finished


In [67]:
import pickle
import os

# Persist each fitted probe as a pickle inside its own versioned model directory.
# Each entry is (target directory, pickle path, fitted classifier).
classifier_artifacts = (
    ('models/linearimage/1', 'models/linearimage/1/image_classifier.pkl', image_classifier),
    ('models/lineartext/1', 'models/lineartext/1/text_classifier.pkl', text_classifier),
    ('models/linearconcat/1', 'models/linearconcat/1/concat_classifier.pkl', concat_classifier),
)

for artifact_dir, artifact_path, fitted_classifier in classifier_artifacts:
    os.makedirs(artifact_dir, exist_ok=True)
    with open(artifact_path, 'wb') as f:
        pickle.dump(fitted_classifier, f)

# Evaluation

In [68]:
# with open('models/linearimage/1/image_classifier.pkl', 'rb') as f:
#     image_classifier = pickle.load(f)

# with open('models/lineartext/1/text_classifier.pkl', 'rb') as f:
#     text_classifier = pickle.load(f)
    
# with open('models/linearconcat/1/concat_classifier.pkl', 'rb') as f:
#     concat_classifier = pickle.load(f)

In [69]:
def eval_acc(model, features, labels):
    """Accuracy in percent: share of `features` rows for which `model.predict` matches `labels`."""
    predicted = model.predict(features)
    is_correct = (labels == predicted).astype(np.float64)
    return np.mean(is_correct) * 100.

def eval_score(model, features, labels):
    """Return `model.score(features, labels)` scaled to a percentage."""
    fraction = model.score(features, labels)
    return fraction * 100.

In [70]:
# Accuracy (in percent) of the image-only probe on both splits, computed with eval_score.
train_image_acc = eval_score(image_classifier, train_image_features, train_labels)
test_image_acc = eval_score(image_classifier, test_image_features, test_labels)

print(f"Train Accuracy = {train_image_acc:.3f}%")

print(f"Test Accuracy = {test_image_acc:.3f}%")

Train Accuracy = 82.775%
Test Accuracy = 79.787%


In [47]:
# Accuracy (in percent) of the text-only probe on both splits, computed with eval_acc.
train_text_acc = eval_acc(text_classifier, train_text_features, train_labels)
test_text_acc = eval_acc(text_classifier, test_text_features, test_labels)

print(f"Train Accuracy = {train_text_acc:.3f}%")

print(f"Test Accuracy = {test_text_acc:.3f}%")

Train Accuracy = 77.892%
Test Accuracy = 74.459%


In [48]:
# Accuracy (in percent) of the concatenated image+text probe on both splits, computed with eval_acc.
train_concat_acc = eval_acc(concat_classifier, train_concat_features, train_labels)
test_concat_acc = eval_acc(concat_classifier, test_concat_features, test_labels)

print(f"Train Accuracy = {train_concat_acc:.3f}%")

print(f"Test Accuracy = {test_concat_acc:.3f}%")

Train Accuracy = 91.702%
Test Accuracy = 86.029%


In [72]:
# Summary table of the three probes; accuracies are stored as pre-formatted percentage
# strings, rows ordered concat / text / image.
df = pd.DataFrame({
    'model': ['concat', 'text', 'image'],
    'Train Accuracy': [
        f"{acc:.3f}%" for acc in (train_concat_acc, train_text_acc, train_image_acc)
    ],
    'Test Accuracy': [
        f"{acc:.3f}%" for acc in (test_concat_acc, test_text_acc, test_image_acc)
    ],
})

# Export the table for the report.
df.to_latex('linearprobe-results.tex', index=False)
# \begin{tabular}{lll}
# \toprule
#  model & Train Accuracy & Test Accuracy \\
# \midrule
# concat &        91.702\% &       86.029\% \\
#   text &        77.892\% &       74.459\% \\
#  image &        82.775\% &       79.787\% \\
# \bottomrule
# \end{tabular}


# CLIP
# [36] 	Train Loss: 0.14999 | Train Acc: 82.17381%
# [36] 	 Val. Loss: 0.21184 |  Val. Acc: 72.90648%
# CLIP Text
# [30] 	Train Loss: 0.24503 | Train Acc: 66.63274%
# [30] 	 Val. Loss: 0.27108 |  Val. Acc: 63.29861%
# CLIP Image
# [25] 	Train Loss: 0.24470 | Train Acc: 66.43988%
# [25] 	 Val. Loss: 0.26212 |  Val. Acc: 64.21204%

  df.to_latex('linearprobe-results.tex', index=False)
