In [1]:
#!conda install pytorch==1.7.1 torchvision
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

Collecting ftfy
  Downloading ftfy-6.0.3.tar.gz (64 kB)
[K     |████████████████████████████████| 64 kB 490 kB/s eta 0:00:011
Building wheels for collected packages: ftfy
  Building wheel for ftfy (setup.py) ... [?25ldone
[?25h  Created wheel for ftfy: filename=ftfy-6.0.3-py3-none-any.whl size=41913 sha256=068a45fccfd6d803e5788cb7108fa26847065687fed2ae02f6a40f06ef62fa6f
  Stored in directory: /home/ec2-user/.cache/pip/wheels/ff/2a/24/75041425faf3347ab146a4a3d0484f723b2c44a7966a06e3f0
Successfully built ftfy
Installing collected packages: ftfy
Successfully installed ftfy-6.0.3
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_latest_p36/bin/python -m pip install --upgrade pip' command.[0m
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-od4kezzl
  Running command git clone -q https://github.com/openai/CLIP.git /tmp/pip-req-build-od4kezzl
Building wheels for collected packages: clip
  Build

### clip zero-shot

In [14]:
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
input_classes = ["bakery", "bathroom", "bowling","computerroom","dining_room","gym","hospitalroom","library","poolinside","toystore"]
text = clip.tokenize(input_classes).to(device)

def single_infer(image_path, text, input_classes):
    image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
    text = clip.tokenize(["a barkery", "a bathroom", "a bowling","computerroom","dining_room","gym","hospitalroom","library","poolinside","toystore"]).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)

        logits_per_image, logits_per_text = model(image, text)
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()
        class_ls = probs[0].tolist()
        pred_class_name = input_classes[class_ls.index(max(class_ls))]
        true_class_name = image_path.split('/')[-2]
        
        if true_class_name==pred_class_name:
            pred_flag = 1
        else:
            pred_flag = 0
    return pred_class_name,pred_flag

In [15]:
class_name, flag = single_infer("./data/Test/bathroom/room311.jpg",text,input_classes)
print ("class_name: ", class_name)

class_name:  bathroom


In [16]:
import os
import tqdm

right = 0
total = 0
for i in input_classes:
    folder = os.path.join('./data/Validation',i)
    image = os.listdir(folder)
    #print (image)
    for j in image:
        input_path = os.path.join(folder,j)
        class_name, flag = single_infer(input_path,text,input_classes)
        right = right + flag
        total = total +1

In [17]:
print ("total accuracy: ", right/total)

total accuracy:  0.8901869158878505


### clip + logistic regression (sklearn)

In [24]:
from torchvision import transforms
from torchvision import datasets
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from tqdm import tqdm

organised_data_dir = "./data/"
transformation_train = transforms.Compose([
    transforms.Resize((256,256)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.RandomAffine(0, shear=10, scale=(0.8,1.2)),
    transforms.ColorJitter(brightness=1, contrast=1, saturation=1),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
 
transformation_valid = transforms.Compose([
    transforms.Resize((256,256)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

transformation_test = transforms.Compose([
    transforms.Resize((256,256)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_dataset = datasets.ImageFolder(root= str(organised_data_dir) + "Train",      transform=preprocess)
valid_dataset = datasets.ImageFolder(root= str(organised_data_dir) + "Validation", transform=preprocess)
test_dataset  = datasets.ImageFolder(root= str(organised_data_dir) + "Test",       transform=preprocess)

# constructing data loaders.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=100, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=100, shuffle=True)
test_loader  = torch.utils.data.DataLoader(test_dataset,  batch_size=100, shuffle=True)


In [25]:
%%time

def get_features(dataset):
    all_features = []
    all_labels = []
    
    with torch.no_grad():
        for images, labels in tqdm(dataset):
            features = model.encode_image(images.to(device))

            all_features.append(features)
            all_labels.append(labels)

    return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()

# Calculate the image features
train_features, train_labels = get_features(train_loader)
valid_features, valid_labels = get_features(valid_loader)

# Perform logistic regression
classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1)
classifier.fit(train_features, train_labels)

# Evaluate using the logistic regression classifier
predictions = classifier.predict(valid_features)
accuracy = np.mean((valid_labels == predictions).astype(np.float)) * 100.
print(f"Accuracy = {accuracy:.3f}")


100%|██████████| 16/16 [00:23<00:00,  1.46s/it]
100%|██████████| 5/5 [00:03<00:00,  1.28it/s]
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy = 97.897
CPU times: user 40.8 s, sys: 484 ms, total: 41.2 s
Wall time: 27.6 s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s finished
