## Linear probe

In [2]:
! pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-baa51z12
Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 2.1 MB/s  eta 0:00:01
[?25hCollecting regex
  Downloading regex-2022.9.11-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (752 kB)
[K     |████████████████████████████████| 752 kB 27.6 MB/s eta 0:00:01
Collecting dataclasses
  Downloading dataclasses-0.6-py3-none-any.whl (14 kB)
Collecting wcwidth>=0.2.5
  Downloading wcwidth-0.2.5-py2.py3-none-any.whl (30 kB)
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25ldone
[?25h  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369396 sha256=6a86809c8089d846593109186093f3eed2650b95ad23999a74249ea7621ba455
  Stored in directory: /tmp/pip-ephem-wheel-cache-nfc3b4xn/wheels/fd/b9/c3/5b4470e35ed76e174bff77c92f91da82098d5e35fd5bc8cdac
Successfu

In [1]:
import os
import clip
import torch

import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision import transforms, datasets

from tqdm import tqdm
from sklearn.metrics import f1_score

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-L/14', device)

# Load the dataset
data_dir = 'video/imgs/train/'
image_datasets = datasets.ImageFolder(data_dir, transform=preprocess)

train, val, test = torch.utils.data.random_split(image_datasets, [16000, 4000, 2424])
class_names = image_datasets.classes
# data_transforms ={ 'train': preprocess, 'test': preprocess}

# data_dir = 'video/imgs/'
# image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
#                                           data_transforms[x])
#                   for x in ['train', 'test']}
# dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=4,
#                                              shuffle=True, num_workers=4)
#               for x in ['train', 'test']}
# dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'test']}
# class_names = image_datasets['train'].classes



def get_features(dataset):
    all_features = []
    all_labels = []
    
    with torch.no_grad():
        for images, labels in tqdm(DataLoader(dataset, batch_size=100)):
            features = model.encode_image(images.to(device)) #feature shape([100,768])

            all_features.append(features)
            all_labels.append(labels)

    return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()

# Calculate the image features
train_features, train_labels = get_features(train)
val_features, val_labels = get_features(val)
test_features, test_labels = get_features(test)

# Perform logistic regression
classifier = LogisticRegression(random_state=0, C=0.616, max_iter=3000, verbose=1)
classifier.fit(train_features, train_labels)

# Evaluate using the logistic regression classifier
predictions = classifier.predict(test_features)
print(f1_score(test_labels, predictions, average=None))

accuracy = np.mean((test_labels == predictions).astype(np.float)) * 100.
print(f"Accuracy = {accuracy:.3f}")

100%|██████████| 160/160 [07:51<00:00,  2.95s/it]
100%|██████████| 40/40 [01:58<00:00,  2.96s/it]
100%|██████████| 25/25 [01:10<00:00,  2.83s/it]
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[0.96230599 0.99775281 0.96240602 0.98534799 0.98800959 0.99065421
 0.99557522 0.94420601 0.98624754 0.98989899]
Accuracy = 98.020


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min finished


## hyperparameter tuning grid search

In [20]:
def imshow(inp, title=None):
    """Imshow for Tensor."""
    inp = inp.numpy().transpose((1, 2, 0))
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    inp = std * inp + mean
    inp = np.clip(inp, 0, 1)
    figure(figsize=(18, 16))
    plt.imshow(inp)
    if title is not None:
        plt.title(title)
    plt.pause(0.001)  # pause a bit so that plots are updated


# Get a batch of training data
# inputs, classes = next(iter(dataloaders['train']))
inputs, classes = next(iter(tqdm(DataLoader(dataset, batch_size=5))))
print(classes.shape)
print(inputs.shape)

# Make a grid from batch
out = torchvision.utils.make_grid(inputs)

imshow(out, title=[class_names[x] for x in classes])

NameError: name 'dataset' is not defined

In [2]:
from sklearn.model_selection import GridSearchCV

logModel = LogisticRegression()
param_grid = [    
    {'penalty' : ['l1','l2'],
    'C' : np.logspace(-6, 6, 50),
    'solver' : ['lbfgs','newton-cg'],
    'max_iter' : [1000,2500,3000,4500]
    }
]
clf = GridSearchCV(logModel, param_grid = param_grid, verbose=True, n_jobs=-1)

In [3]:
best_clf = clf.fit(val_features,val_labels)

Fitting 5 folds for each of 800 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   22.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   59.8s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 52.7min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 101.0min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed: 119.3min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [4]:
best_clf.best_estimator_

LogisticRegression(C=212.09508879201925, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=3000, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [5]:
print (f'Accuracy - : {best_clf.score(test_features,test_labels):.3f}')

Accuracy - : 0.847


## Accuracies

In [10]:
from sklearn.metrics import average_precision_score
from sklearn.metrics import classification_report

y_true = test_labels
y_pred = predictions

# average_precision_score(y_true, y_pred) #mAP
# f1_score(y_true, y_pred, average=None) #f1
# precision_score(y_true, y_pred, average=None) #precision
# recall_score(y_true, y_pred, average=None) #recall

# target_names = ['class 0', 'class 1', 'class 2', 'class 3','class 4','class 5','class 6','class 7']
target_names = class_names
print(classification_report(y_true, y_pred, target_names=target_names))


                                                               precision    recall  f1-score   support

      driver is adjusting his or her hair while driving a car       0.85      0.80      0.82        44
   driver is drinking water from a bottle while driving a car       0.74      0.78      0.76        37
                         driver is eating while driving a car       0.74      0.74      0.74        38
   driver is picking something from floor while driving a car       0.93      0.90      0.92        30
driver is reaching behind to the backseat while driving a car       0.95      0.98      0.97        43
driver is singing a song with music and smiling while driving       0.73      0.85      0.79        39
   driver is talking to the phone on hand while driving a car       0.96      0.94      0.95        51
                        driver is yawning while driving a car       0.86      0.75      0.80        32

                                                     accuracy          

## ViT-B/16 backbone

In [5]:
import os
import clip
import torch

import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
from torchvision import transforms, datasets

from tqdm import tqdm
from sklearn.metrics import f1_score

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/16', device)

# Load the dataset
data_dir = 'new_data/'
image_datasets = datasets.ImageFolder(data_dir, transform=preprocess)

train, val, test = torch.utils.data.random_split(image_datasets, [9000, 2000, 1043])
class_names = image_datasets.classes

print(class_names)

def get_features(dataset):
    all_features = []
    all_labels = []
    
    with torch.no_grad():
        for images, labels in tqdm(DataLoader(dataset, batch_size=100)):
            features = model.encode_image(images.to(device)) #feature shape([100,768])

            all_features.append(features)
            all_labels.append(labels)

    return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()

# Calculate the image features
train_features, train_labels = get_features(train)
val_features, val_labels = get_features(val)
test_features, test_labels = get_features(test)

# Perform logistic regression
classifier = LogisticRegression(random_state=0, C=0.42919342601287785, max_iter=2500, verbose=1)
classifier.fit(train_features, train_labels)

# Evaluate using the logistic regression classifier
predictions = classifier.predict(test_features)
print(f1_score(test_labels, predictions, average=None))

accuracy = np.mean((test_labels == predictions).astype(np.float)) * 100.
print(f"Accuracy = {accuracy:.3f}")


100%|████████████████████████████████████████| 335M/335M [00:01<00:00, 184MiB/s]
  0%|          | 0/90 [00:00<?, ?it/s]

['driver is adjusting his or her hair while driving a car', 'driver is drinking water from a bottle while driving a car', 'driver is eating while driving a car', 'driver is picking something from floor while driving a car', 'driver is reaching behind to the backseat while driving a car', 'driver is singing a song with music and smiling while driving', 'driver is talking to the phone on hand while driving a car', 'driver is yawning while driving a car']


100%|██████████| 90/90 [01:21<00:00,  1.11it/s]
100%|██████████| 20/20 [00:18<00:00,  1.09it/s]
100%|██████████| 11/11 [00:09<00:00,  1.13it/s]
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[0.74404762 0.68679245 0.57142857 0.87136929 0.85714286 0.6970684
 0.87412587 0.55238095]
Accuracy = 74.497


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min finished


## ViT-B/32 backbone

In [30]:
import os
import clip
import torch

import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
from torchvision import transforms, datasets

from tqdm import tqdm
from sklearn.metrics import f1_score

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# Load the dataset
data_dir = 'new_data/'
image_datasets = datasets.ImageFolder(data_dir, transform=preprocess)

train, val, test = torch.utils.data.random_split(image_datasets, [8000, 2000, 314])
class_names = image_datasets.classes

def get_features(dataset):
    all_features = []
    all_labels = []
    
    with torch.no_grad():
        for images, labels in tqdm(DataLoader(dataset, batch_size=100)):
            features = model.encode_image(images.to(device)) #feature shape([100,768])

            all_features.append(features)
            all_labels.append(labels)

    return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()

# Calculate the image features
train_features, train_labels = get_features(train)
val_features, val_labels = get_features(val)
test_features, test_labels = get_features(test)

# Perform logistic regression
classifier = LogisticRegression(random_state=0, C=0.42919342601287785, max_iter=2500, verbose=1)
classifier.fit(train_features, train_labels)

# Evaluate using the logistic regression classifier
predictions = classifier.predict(test_features)
print(f1_score(test_labels, predictions, average=None))

accuracy = np.mean((test_labels == predictions).astype(np.float)) * 100.
print(f"Accuracy = {accuracy:.3f}")


100%|████████████████████████████████████████| 338M/338M [00:01<00:00, 233MiB/s]
100%|██████████| 80/80 [01:23<00:00,  1.05s/it]
100%|██████████| 20/20 [00:20<00:00,  1.04s/it]
100%|██████████| 4/4 [00:03<00:00,  1.18it/s]
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[0.66666667 0.6984127  0.73239437 0.88571429 0.91891892 0.7047619
 0.75862069 0.7012987 ]
Accuracy = 75.478


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.4min finished


## RN101 backbone

In [31]:
import os
import clip
import torch

import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
from torchvision import transforms, datasets

from tqdm import tqdm
from sklearn.metrics import f1_score

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('RN101', device)

# Load the dataset
data_dir = 'new_data/'
image_datasets = datasets.ImageFolder(data_dir, transform=preprocess)

train, val, test = torch.utils.data.random_split(image_datasets, [8000, 2000, 314])
class_names = image_datasets.classes

def get_features(dataset):
    all_features = []
    all_labels = []
    
    with torch.no_grad():
        for images, labels in tqdm(DataLoader(dataset, batch_size=100)):
            features = model.encode_image(images.to(device)) #feature shape([100,768])

            all_features.append(features)
            all_labels.append(labels)

    return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()

# Calculate the image features
train_features, train_labels = get_features(train)
val_features, val_labels = get_features(val)
test_features, test_labels = get_features(test)

# Perform logistic regression
classifier = LogisticRegression(random_state=0, C=0.42919342601287785, max_iter=2500, verbose=1)
classifier.fit(train_features, train_labels)

# Evaluate using the logistic regression classifier
predictions = classifier.predict(test_features)
print(f1_score(test_labels, predictions, average=None))

accuracy = np.mean((test_labels == predictions).astype(np.float)) * 100.
print(f"Accuracy = {accuracy:.3f}")


100%|███████████████████████████████████████| 278M/278M [00:03<00:00, 91.8MiB/s]
100%|██████████| 80/80 [01:01<00:00,  1.31it/s]
100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
100%|██████████| 4/4 [00:02<00:00,  1.47it/s]
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[0.57777778 0.65714286 0.41509434 0.80519481 0.79452055 0.52941176
 0.64285714 0.50632911]
Accuracy = 61.783


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.3s finished


In [3]:
import os
import clip
import torch

import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision import transforms, datasets

from tqdm import tqdm
from sklearn.metrics import f1_score

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-L/14', device)

# Load the dataset
data_dir = 'video/imgs/train/'
image_datasets = datasets.ImageFolder(data_dir, transform=preprocess)

train, val, test = torch.utils.data.random_split(image_datasets, [16000, 4000, 2424])
class_names = image_datasets.classes
print(class_names)

['driver adjusting hair and makeup while driving a car', 'driver drinking water from a bottle while driving a car', 'driver normally driving a car', 'driver operating the radio while driving a car', 'driver reaching behind while driving a car', 'driver talking on the phone in left hand while driving a car', 'driver talking on the phone in right hand while driving a car', 'driver talking to passenger while driving a car', 'driver texting on the phone in left hand while driving a car', 'driver texting on the phone in right hand while driving a car']


In [4]:
print(type(train))

<class 'torch.utils.data.dataset.Subset'>
