In [1]:
# 1. Imports
from transformers import CLIPModel, CLIPProcessor
import torch
from PIL import Image
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os

device = "cuda" if torch.cuda.is_available() else "cpu"
device

BASE = "../../semiotics_in_tarot"   #  project root
DATA_DIR = os.path.join(BASE, "data")
OUTPUT_DIR = os.path.join(BASE, "outputs")

In [2]:
# 2. Load CLIP
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name).to(device)
processor = CLIPProcessor.from_pretrained(model_name)
model.eval()


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [3]:
# 3. Load metadata
cards = pd.read_csv("../data/tarot_metadata.csv")
cards

Unnamed: 0,deck,deck_id,card,file,motif,year
0,Rider–Waite,RWS,The Fool,RWS/m00.jpg,fool,1900
1,Rider–Waite,RWS,The Magician,RWS/m01.jpg,magician,1900
2,Rider–Waite,RWS,The High Priestess,RWS/m02.jpg,high_priestess,1900
3,Rider–Waite,RWS,The Empress,RWS/m03.jpg,empress,1900
4,Rider–Waite,RWS,The Emperor,RWS/m04.jpg,emperor,1900
5,Rider–Waite,RWS,The Hierophant,RWS/m05.jpg,hierophant,1900
6,Rider–Waite,RWS,The Lovers,RWS/m06.jpg,lovers,1900
7,Rider–Waite,RWS,The Chariot,RWS/m07.jpg,chariot,1900
8,Rider–Waite,RWS,Strength,RWS/m08.jpg,justice,1900
9,Rider–Waite,RWS,The Hermit,RWS/m09.jpg,hermit,1900


In [4]:
# 4. Encode all card images into CLIP image embeddings

embs = []

for idx, row in cards.iterrows():
    img_path = os.path.join(DATA_DIR, row["file"])
    img = Image.open(img_path).convert("RGB")
    
    inputs = processor(images=img, return_tensors="pt").to(device)
    
    with torch.no_grad():
        feats = model.get_image_features(**inputs)  # shape: (1, dim)
    
    # L2-normalize
    feats = feats / feats.norm(dim=-1, keepdim=True)
    
    embs.append(feats.cpu().numpy()[0])

image_embeddings = np.vstack(embs)  # shape: (N_cards, dim)
cards["emb_index"] = np.arange(len(cards))

print("Embeddings shape:", image_embeddings.shape)
cards.head()
norms = np.linalg.norm(image_embeddings, axis=1)
norms[:10]


Embeddings shape: (44, 512)


array([1.        , 1.        , 1.        , 1.        , 1.        ,
       1.0000001 , 1.        , 1.0000001 , 0.99999994, 0.99999994],
      dtype=float32)

In [5]:
# 6. Helper functions

def get_card_indices(deck=None, card_label=None):
    """Return indices (in cards DataFrame and embedding array) for the given filter."""
    df = cards.copy()
    if deck is not None:
        df = df[df["deck"] == deck]
    if card_label is not None:
        df = df[df["card"] == card_label]
    return df.index.to_list(), df["emb_index"].to_list(), df

def get_card_embedding(deck_id, motif):
    """
    Return the embedding (np.array) and metadata row
    for a unique card identified by deck_id + motif.
    """
    df = cards[(cards["deck_id"] == deck_id) & (cards["motif"] == motif)]
    
    if len(df) == 0:
        raise ValueError(f"No card found for deck_id={deck_id}, motif={motif}")
    if len(df) > 1:
        raise ValueError(f"More than one card found for deck_id={deck_id}, motif={motif}")
    
    emb_idx = df["emb_index"].iloc[0]
    return image_embeddings[emb_idx], df.iloc[0]


In [9]:
deck_ids = sorted(cards["deck_id"].unique())
print(deck_ids)

motifs_of_interest = sorted(cards["motif"].unique())
print(motifs_of_interest)


['MARS', 'RWS']
['chariot', 'death', 'devil', 'emperor', 'empress', 'fool', 'hanged_man', 'hermit', 'hierophant', 'high_priestess', 'judgement', 'justice', 'lovers', 'magician', 'moon', 'star', 'strength', 'sun', 'temperance', 'tower', 'wheel_of_fortune', 'world']


In [7]:
# 6. Image–image similarities across decks for selected motifs

motifs_of_interest = ["The Fool", "The Magician", "The High Priestess", "The Empress", "The Emperor", "The Hierophant", 
                      "The Lovers", "The Chariot", "Strength", "The Hermit", "Wheel of Fortune", "Justice", "The Hanged Man",
                      "Death", "Temperance", "The Devil", "The Tower", "The Star", "The Moon", "The Sun", "Judgement",
                      "The World", "The Fool", "The Juggler", "The Popess", "The Empress", "The Emperor", "The Pope", "The Lovers",
                      "The Chariot", "Justice", "The Hermit", "Wheel of Fortune", "Strength", "The Hanged Man", "unnamed",
                      "Temperance", "The Devil", "The House of God", "The Star", "The Moon", "The Sun", "Judgement", "The World"]
decks = ["RWS", "Marseille"]

rows = []
for card_label in motifs_of_interest:
    row = {"card_label": card_label}
    emb_by_deck = {}
    
    # get embedding per deck for this card (if available)
    for deck in decks:
        try:
            emb, meta_row = get_card_embedding(deck, card_label)
            emb_by_deck[deck] = emb
        except ValueError:
            emb_by_deck[deck] = None
    
    # compute similarities
    for deck_a in decks:
        for deck_b in decks:
            key = f"sim_{deck_a}_{deck_b}"
            ea = emb_by_deck[deck_a]
            eb = emb_by_deck[deck_b]
            if ea is None or eb is None:
                row[key] = np.nan
            else:
                sim = cosine_similarity(ea.reshape(1, -1), eb.reshape(1, -1))[0, 0]
                row[key] = sim
    rows.append(row)

sim_df = pd.DataFrame(rows)
sim_df


Unnamed: 0,card_label,sim_RWS_RWS,sim_RWS_Marseille,sim_Marseille_RWS,sim_Marseille_Marseille
0,The Fool,,,,
1,The Magician,,,,
2,The High Priestess,,,,
3,The Empress,,,,
4,The Emperor,,,,
5,The Hierophant,,,,
6,The Lovers,,,,
7,The Chariot,,,,
8,Strength,,,,
9,The Hermit,,,,


In [11]:
# 7. Define text prompts for image–text analysis

text_prompts = {
    "fool": [
        "a tarot card of The Fool",
        "a tarot card of The Fool, symbolizing naivety and new beginnings",
        "a tarot card of The Fool, representing madness and folly"
    ],
    "sun": [
        "a tarot card of The Sun",
        "a tarot card of The Sun, symbolizing joy and success",
        "a tarot card of The Sun, symbolizing enlightenment"
    ],
    "tower": [
        "a tarot card of The Tower",
        "a tarot card of The Tower, symbolizing sudden catastrophe",
        "a tarot card of The Tower, symbolizing liberation and rupture"
    ],
}

def encode_text_prompts(prompt_list):
    inputs = processor(text=prompt_list, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        feats = model.get_text_features(**inputs)
    feats = feats / feats.norm(dim=-1, keepdim=True)
    return feats.cpu().numpy()


In [12]:
# 8. Image–text similarity for a single motif (e.g. "fool")

def image_text_similarity_for_motif(motif_label, decks=("RWS", "Marseille")):
    prompts = text_prompts[motif_label]
    text_embs = encode_text_prompts(prompts)  # shape: (n_prompts, dim)
    
    rows = []
    for deck in decks:
        try:
            img_emb, meta_row = get_card_embedding(deck, motif_label)
        except ValueError:
            continue
        sims = cosine_similarity(img_emb.reshape(1, -1), text_embs)[0]  # shape: (n_prompts,)
        
        row = {"deck": deck}
        for i, prompt in enumerate(prompts):
            row[f"sim_prompt_{i+1}"] = sims[i]
        rows.append(row)
    
    df = pd.DataFrame(rows)
    df.insert(0, "motif", motif_label)
    return df, prompts

fool_df, fool_prompts = image_text_similarity_for_motif("fool")
fool_df


Unnamed: 0,motif


In [13]:
# 9. Example for "sun" and "tower"

sun_df, sun_prompts = image_text_similarity_for_motif("sun")
tower_df, tower_prompts = image_text_similarity_for_motif("tower")

sun_df, tower_df


(Empty DataFrame
 Columns: [motif]
 Index: [],
 Empty DataFrame
 Columns: [motif]
 Index: [])

In [14]:
# 10. Pretty-print similarities for a motif

def print_motif_text_results(motif_label):
    df, prompts = image_text_similarity_for_motif(motif_label)
    print(f"Motif: {motif_label}")
    print("Prompts:")
    for i, p in enumerate(prompts, start=1):
        print(f"  {i}. {p}")
    print("\nSimilarities:")
    display(df)

print_motif_text_results("fool")


Motif: fool
Prompts:
  1. a tarot card of The Fool
  2. a tarot card of The Fool, symbolizing naivety and new beginnings
  3. a tarot card of The Fool, representing madness and folly

Similarities:


Unnamed: 0,motif
