In [1]:
import torch
import clip
from PIL import Image
import pandas as pd

## Clip Testing

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image = preprocess(Image.open("../data/images/CLIP.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    
    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # prints: [[0.9927937  0.00421068 0.00299572]]

Label probs: [[0.9927   0.004185 0.002968]]


## Crowd Counting - Testing I (without fine-tuning) 

In [77]:
# reads tabular data and processes its data
df = pd.read_csv("../data/images/CrowdCountingKaggle/labels.csv")
df.id = df.id.map(lambda x: str(x) + ".jpg")
df = df.rename(columns={"count": "y_true"})

# create a sample with random id's to interact with CLIP model
df_sample = df.sample(20, random_state=42).sort_values(by="y_true")
display(df_sample)

# get the images ids and number of people in the sample
image_id_list = df_sample.id.to_list()
print(image_id_list)
y_true = (list(set(map(lambda x: x+' people', df_sample["y_true"].astype(str).to_list()))))
print(y_true)

Unnamed: 0,id,y_true
584,585.jpg,20
56,57.jpg,23
1289,1290.jpg,25
65,66.jpg,27
1292,1293.jpg,28
1118,1119.jpg,28
374,375.jpg,30
1860,1861.jpg,30
938,939.jpg,31
1273,1274.jpg,33


['585.jpg', '57.jpg', '1290.jpg', '66.jpg', '1293.jpg', '1119.jpg', '375.jpg', '1861.jpg', '939.jpg', '1274.jpg', '276.jpg', '354.jpg', '129.jpg', '747.jpg', '1324.jpg', '1647.jpg', '906.jpg', '1853.jpg', '1732.jpg', '1334.jpg']
['43 people', '23 people', '34 people', '30 people', '20 people', '42 people', '31 people', '33 people', '25 people', '45 people', '27 people', '44 people', '40 people', '28 people', '36 people']


In [78]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

images_path = "../data/images/CrowdCountingKaggle/frames/" 
output = []

for index in range(len(image_id_list)):
    image = preprocess(Image.open(images_path + image_id_list[index])).unsqueeze(0).to(device)
    text = clip.tokenize(y_true).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)
        
        logits_per_image, logits_per_text = model(image, text)
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()

    # print("Label probs:", probs)

    # max_value and first_position_of_max
    max_value = max(probs.tolist()[0])
    max_index_position = probs.tolist()[0].index(max_value)

    # store the results in the output list
    output.append([image_id_list[index], 
                   y_true[max_index_position],
                   max_index_position,
                   max_value, 
                   probs[0]])

In [None]:
# merging dataframe with known features and clip model outputs 
df_results = pd.DataFrame(output, columns=["id", "y_pred", "max_index_position", "probability", "probabilities"])
df_results.y_pred = df_results.y_pred.apply(lambda x: x.split(' ')[0])
df_sample = df_sample.merge(df_results, on="id")
df_sample

Unnamed: 0,id,y_true,y_pred,max_index_position,probability,probabilities
0,585.jpg,20,30,3,0.09082,"[0.06052, 0.05957, 0.05774, 0.0908, 0.0853, 0.06647, 0.06964, 0.05508, 0.073, 0.06964, 0.06964, ..."
1,57.jpg,23,20,4,0.100647,"[0.04984, 0.06604, 0.0539, 0.0991, 0.10065, 0.05646, 0.0671, 0.05142, 0.08887, 0.06107, 0.0748, ..."
2,1290.jpg,25,20,4,0.118896,"[0.04172, 0.06366, 0.04727, 0.1134, 0.1189, 0.0488, 0.0553, 0.04443, 0.0955, 0.05884, 0.07324, 0..."
3,66.jpg,27,20,4,0.08252,"[0.06323, 0.0623, 0.06036, 0.0763, 0.0825, 0.0673, 0.074, 0.05942, 0.074, 0.0613, 0.0728, 0.0567..."
4,1293.jpg,28,20,4,0.108215,"[0.04654, 0.0667, 0.04803, 0.0897, 0.1082, 0.05884, 0.05884, 0.04514, 0.0955, 0.06168, 0.07916, ..."
5,1119.jpg,28,20,4,0.096313,"[0.0532, 0.0622, 0.05402, 0.0877, 0.0963, 0.0632, 0.06122, 0.0532, 0.08636, 0.0727, 0.0683, 0.05..."
6,375.jpg,30,20,4,0.11792,"[0.04544, 0.0672, 0.04993, 0.1008, 0.1179, 0.0557, 0.06213, 0.04837, 0.09326, 0.0557, 0.07495, 0..."
7,1861.jpg,30,20,4,0.117249,"[0.0445, 0.0689, 0.04736, 0.1136, 0.11725, 0.05283, 0.0545, 0.0459, 0.0987, 0.05893, 0.0722, 0.0..."
8,939.jpg,31,25,8,0.091064,"[0.0626, 0.0687, 0.0588, 0.0588, 0.07916, 0.0698, 0.0656, 0.05612, 0.09106, 0.07434, 0.07434, 0...."
9,1274.jpg,33,20,4,0.119934,"[0.0421, 0.0662, 0.04922, 0.1092, 0.11993, 0.05078, 0.0549, 0.04483, 0.0994, 0.05405, 0.0751, 0...."


In [None]:
# explore about models and parameters for clip!