# <center>Critical AI</center>
<center>ENGL 54.41</center>
<center>Dartmouth College</center>
<center>Fall 2024</center>
<pre>Created: 06/19/2023; Revised: 10/02/2024</pre>

In [None]:
import numpy as np
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
import torch
from PIL import Image
import matplotlib.pyplot as plt
import torchvision.transforms as T

In [None]:
# Pick the compute backend that our neural networks will run on.
#   mps  == Apple Silicon device (M-series Macs)
#   cuda == Compute Unified Device Architecture, Nvidia's toolkit; it means we have a GPU
#   cpu  == just the general-purpose CPU for our calculations
# The checks run in this order: mps first, then cuda, with cpu as the fallback.

mps_ready = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
if mps_ready:
    backend_name = 'mps'
elif torch.cuda.is_available():
    backend_name = 'cuda'
else:
    backend_name = 'cpu'
device = torch.device(backend_name)
print('Using device: {0}'.format(device))

In [None]:
# Load a pretrained CLIP transformer (more on this architecture later this term).
# Two objects come from the same checkpoint: the model itself and a processor
# that prepares both the image and the text for it. Preparing the text involves
# tokenization, which we'll learn more about later.
# Both are requested in half precision (float16).

clip_checkpoint = "openai/clip-vit-large-patch14"

model = CLIPModel.from_pretrained(
    clip_checkpoint,
    torch_dtype=torch.float16,
    device_map="auto",
)
processor = CLIPProcessor.from_pretrained(
    clip_checkpoint,
    torch_dtype=torch.float16,
    clean_up_tokenization_spaces=True,
    device_map="auto",
)

In [None]:
def classify(image, labels):
    """Rank candidate text labels by how well CLIP matches them to an image.

    Args:
        image: Path to the image file (anything ``Image.open`` accepts).
        labels: List of candidate captions/labels to score against the image.

    Returns:
        Up to ten ``(label, probability)`` tuples ordered from most to least
        probable. Each probability is the softmax over ``labels`` of the
        image-to-text logits, rounded to three decimals.
    """
    # The context manager closes the file handle as soon as the processor
    # has turned the picture into tensors.
    with Image.open(image) as img:
        inputs = processor(text=labels,
                           images=img,
                           return_tensors="pt",
                           padding=True)

    # Inference only, so skip building the autograd graph.
    # The model was loaded with device_map="auto"; send the inputs to wherever
    # its weights actually ended up rather than assuming the global `device`.
    with torch.no_grad():
        outputs = model(**inputs.to(model.device))

    probs = outputs.logits_per_image.softmax(dim=1)
    # Only the ten best-scoring labels are reported.
    top_indices = torch.argsort(probs, descending=True)[0][:10]
    # fix for float16 data: move each value to the CPU and into numpy before rounding
    return [(labels[c], np.round(probs[0][c].detach().to('cpu').numpy(), 3))
            for c in top_indices]

In [None]:
# load a sample image
!wget https://raw.githubusercontent.com/jeddobson/ENGL54.41-24F/6ebd8c683c3b0d230f16e99fdf7baa2113d10822/img/hood-03-library.jpg
img = Image.open("hood-03-library.jpg")

In [None]:
# Display the sample image.
# Artwork: Lori Nix (American, born 1969) and Kathleen Gerber (American, born 1967),
# "Library", 2007. Hood Museum of Art, Dartmouth College, object 2018.37.261.
# https://hoodmuseum.dartmouth.edu/objects/2018.37.261
figure, axes = plt.subplots()
axes.imshow(img)
plt.show()

In [None]:
# As we did with VGG16, the image has to be preprocessed before the model
# can use it. The processor also expects some text, so for now we hand it
# a single stand-in caption.
stand_in_caption = ["Library"]
inputs = processor(
    text=stand_in_caption,
    images=img,
    return_tensors="pt",
    padding=True,
)

In [None]:
# Let's see what is returned: list the names of the entries that the
# processor produced for our caption and image.
inputs.keys()

In [None]:
# We will learn more about these 'input_ids' when we get into
# Transformers in more depth. For now, understand that we are
# getting back representations of the text (descriptions) as
# 'input_ids', the image (as 'pixel_values'), and something called
# an 'attention_mask' that tells the model which text tokens are
# real input and which are just padding.

In [None]:
# Let's look at the image after preprocessing. Do you notice any
# similarities to, or differences from, the preprocessed data we saw
# with the VGG16 CNN? What else do you see?
first_image = inputs['pixel_values'][0]
# Move axis 0 to the end before plotting (presumably channels-first ->
# channels-last, which is the layout imshow draws as a color image).
plt.imshow(first_image.permute(1, 2, 0))
plt.show()

In [None]:
# Okay, now we are going to define some possible labels, written as
# short descriptions of what the image might be.
captions = [
    "This is a photograph of a library",
    "This is a photograph of a library taken over by nature",
    "This is a realistic drawing of a library",
]

In [None]:
# Score each caption against the image.
# What do you see here? Anything of interest?
classify("hood-03-library.jpg",captions)

In [None]:
# Same captions as before, plus one that describes the image as a diorama.
captions = [
    "This is a photograph of a library",
    "This is a photograph of a library taken over by nature",
    "This is a realistic drawing of a library",
    "This is a photograph of a diorama of a library",
]

In [None]:
# Re-run the classification with the current list of captions.
classify("hood-03-library.jpg",captions)

In [None]:
# One more caption: the diorama description combined with "taken over by nature".
captions = [
    "This is a photograph of a library",
    "This is a photograph of a library taken over by nature",
    "This is a realistic drawing of a library",
    "This is a photograph of a diorama of a library",
    "This is a photograph of a diorama of a library taken over by nature",
]

In [None]:
# Classify once more with the current list of captions.
classify("hood-03-library.jpg",captions)

## Try it! 

Now use the image that you previously uploaded or upload another image and try some experiments using the classify() function.