In [1]:
import torch
import clip
from PIL import Image
import pandas as pd

## Clip Testing

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image = preprocess(Image.open("../../data/images/CLIP.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    
    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # prints: [[0.9927937  0.00421068 0.00299572]]

Label probs: [[0.9927   0.004185 0.003016]]


## Mall Dataset [Kaggle]

In [4]:
# reads tabular data and processes its data
df = pd.read_csv("../../data/Mall/labels.csv")
df.id = df.id.map(lambda x: str(x) + ".jpg")
df = df.rename(columns={"count": "y_true"})

# create a sample with random id's to interact with CLIP model
df_sample = df.sample(20, random_state=42).sort_values(by="y_true")
display(df_sample)

# get the images ids and number of people in the sample
image_id_list = df_sample.id.to_list()
print(image_id_list)
y_true = (list(set(map(lambda x: x+' people', df_sample["y_true"].astype(str).to_list()))))
print(y_true)

Unnamed: 0,id,y_true
584,585.jpg,20
56,57.jpg,23
1289,1290.jpg,25
65,66.jpg,27
1292,1293.jpg,28
1118,1119.jpg,28
374,375.jpg,30
1860,1861.jpg,30
938,939.jpg,31
1273,1274.jpg,33


['585.jpg', '57.jpg', '1290.jpg', '66.jpg', '1293.jpg', '1119.jpg', '375.jpg', '1861.jpg', '939.jpg', '1274.jpg', '276.jpg', '354.jpg', '129.jpg', '747.jpg', '1324.jpg', '1647.jpg', '906.jpg', '1853.jpg', '1732.jpg', '1334.jpg']
['34 people', '45 people', '20 people', '23 people', '27 people', '40 people', '25 people', '43 people', '44 people', '33 people', '31 people', '36 people', '42 people', '30 people', '28 people']


### Testing I (without fine-tuning) 

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

images_path = "../../data/Mall/frames/" 
output = []

for index in range(len(image_id_list)):
    image = preprocess(Image.open(images_path + image_id_list[index])).unsqueeze(0).to(device)
    text = clip.tokenize(y_true).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)
        
        logits_per_image, logits_per_text = model(image, text)
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()

    # print("Label probs:", probs)

    # max_value and first_position_of_max
    max_value = max(probs.tolist()[0])
    max_index_position = probs.tolist()[0].index(max_value)

    # store the results in the output list
    output.append([image_id_list[index], 
                   y_true[max_index_position],
                   max_index_position,
                   max_value, 
                   probs[0]])
    
# merging dataframe with known features and clip model outputs 
df_results = pd.DataFrame(output, columns=["id", "y_pred", "max_index_position", "probability", "probabilities"])
df_results.y_pred = df_results.y_pred.apply(lambda x: x.split(' ')[0])
df_sample = df_sample.merge(df_results, on="id")
df_sample

Unnamed: 0,id,y_true,y_pred,max_index_position,probability,probabilities
0,585.jpg,20,30,13,0.090942,"[0.0578, 0.0697, 0.0854, 0.06058, 0.0697, 0.06..."
1,57.jpg,23,20,2,0.102051,"[0.05377, 0.06094, 0.10205, 0.0669, 0.07465, 0..."
2,1290.jpg,25,20,2,0.119324,"[0.04745, 0.05908, 0.1193, 0.0649, 0.0735, 0.0..."
3,66.jpg,27,20,2,0.083252,"[0.0609, 0.0609, 0.08325, 0.06287, 0.0723, 0.0..."
4,1293.jpg,28,20,2,0.109863,"[0.04797, 0.0616, 0.10986, 0.06665, 0.0791, 0...."
5,1119.jpg,28,20,2,0.096924,"[0.05438, 0.072, 0.0969, 0.06256, 0.0687, 0.06..."
6,375.jpg,30,20,2,0.118774,"[0.0495, 0.05524, 0.1188, 0.0677, 0.07434, 0.0..."
7,1861.jpg,30,20,2,0.117126,"[0.04733, 0.0589, 0.1171, 0.06995, 0.07214, 0...."
8,939.jpg,31,25,6,0.091614,"[0.05914, 0.0736, 0.0796, 0.07025, 0.07477, 0...."
9,1274.jpg,33,20,2,0.121338,"[0.0498, 0.05386, 0.12134, 0.067, 0.07477, 0.0..."


In [17]:
# explore about models and parameters for clip!

## Synthetic Crowd Counting