In [2]:
!pip install open_clip_torch torch torchvision torchaudio
!pip install scikit-learn pandas matplotlib tqdm pillow

Collecting open_clip_torch
  Downloading open_clip_torch-2.32.0-py3-none-any.whl.metadata (31 kB)
Collecting torchvision
  Downloading torchvision-0.22.0-cp313-cp313-win_amd64.whl.metadata (6.3 kB)
Collecting torchaudio
  Downloading torchaudio-2.7.0-cp313-cp313-win_amd64.whl.metadata (6.7 kB)
Collecting ftfy (from open_clip_torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting timm (from open_clip_torch)
  Downloading timm-1.0.15-py3-none-any.whl.metadata (52 kB)
Downloading open_clip_torch-2.32.0-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 737.4 kB/s eta 0:00:02
   ------------- -------------------------- 0.5/1.5 MB 737.4 kB/s eta 0:00:02
   -----------


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Step 1: Import necessary libraries
import os
import pandas as pd
import torch
import open_clip
from PIL import Image
from tqdm import tqdm
from sklearn.cluster import KMeans
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Step 2: Define paths
covers_path = r"C:\Users\tered\Desktop\recommender_system\data\images\covers"
clean_csv_path = r"C:\Users\tered\Desktop\recommender_system\data\processed\books_covers_clean.csv"
experiment_output = r"C:\Users\tered\Desktop\recommender_system\experiments\06_clip_kmeans"

In [5]:
# Step 3: Load the dataset
df = pd.read_csv(clean_csv_path)

In [6]:
# Step 4: Load the CLIP model from open_clip_torch
device = "cuda" if torch.cuda.is_available() else "cpu"
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
model.to(device)
model.eval()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [7]:
# Step 5: Extract image embeddings
image_embeddings = []
image_ids = []

print("Extracting CLIP embeddings using open_clip_torch...")

for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    book_id = row['ID']
    image_file = os.path.join(covers_path, f"{book_id}.jpg")

    if not os.path.exists(image_file):
        continue

    try:
        image = preprocess(Image.open(image_file).convert("RGB")).unsqueeze(0).to(device)
        with torch.no_grad():
            embedding = model.encode_image(image).cpu().numpy()
        image_embeddings.append(embedding.squeeze())
        image_ids.append(book_id)
    except Exception as e:
        print(f"Error processing image {book_id}: {e}")

Extracting CLIP embeddings using open_clip_torch...


 26%|██▋       | 1717/6481 [18:06<50:14,  1.58it/s]  


KeyboardInterrupt: 

In [None]:
# Step 6: Convert embeddings to NumPy array
embeddings_array = np.array(image_embeddings)

In [None]:
# Step 7: Apply KMeans clustering
num_clusters = 10  # You can change this value
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(embeddings_array)

In [None]:
# Step 8: Save results into a DataFrame
clip_df = pd.DataFrame(embeddings_array)
clip_df['ID'] = image_ids
clip_df['cluster'] = cluster_labels

In [None]:
# Save to CSV
output_file = os.path.join(experiment_output, "clip_embeddings_kmeans.csv")
clip_df.to_csv(output_file, index=False)

print(f"CLIP embeddings and clusters saved to: {output_file}")

In [None]:
#### PARTE 2

# Step 1: Additional imports
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Step 2: Prepare for similarity and lookup
from sklearn.metrics.pairwise import cosine_similarity

# Merge CLIP + original titles
merged_df = pd.merge(df, clip_df, on='ID', how='inner')

# Step 3: Get embeddings matrix
embedding_cols = [col for col in clip_df.columns if isinstance(col, int) or col.isdigit()]
features = clip_df[embedding_cols].values

# Step 4: Calculate similarity matrix
similarity_matrix = cosine_similarity(features)

# Step 5: Define target book index
target_index = list(merged_df.index[merged_df['ID'] == 2711])[0]
target_title = merged_df.iloc[target_index]['title']
target_id = merged_df.iloc[target_index]['ID']

print(f"Finding recommendations for book index: {target_index}")
print(f"Selected book [{target_id}]: {target_title}")

# Step 6: Get top 10 most similar
similarities = similarity_matrix[target_index]
top_indices = np.argsort(similarities)[::-1][1:11]  # Exclude itself

# Print recommendations
print("Top 10 Recommendations based on CLIP + Cosine Similarity:")
for i, idx in enumerate(top_indices):
    book_id = merged_df.iloc[idx]['ID']
    title = merged_df.iloc[idx]['title']
    score = similarities[idx]
    print(f"{i+1}. [{book_id}] {title} (Similarity: {score:.4f})")

# Step 7: Cosine Similarity Average
avg_sim = np.mean(similarities[top_indices])
print(f"\nAverage Similarity (Cosine) for top 10: {avg_sim:.4f}")

# Step 8: Silhouette Score
sil_score = silhouette_score(features, clip_df['cluster'])
print(f"Silhouette Score (KMeans Clustering): {sil_score:.4f}")

# Step 9: Intra-list Diversity (average pairwise cosine distance)
top_embeds = features[top_indices]
pairwise_dist = pairwise_distances(top_embeds, metric='cosine')
upper_triangle = pairwise_dist[np.triu_indices_from(pairwise_dist, k=1)]
intra_div = np.mean(upper_triangle)
print(f"Intra-list Diversity: {intra_div:.4f}")

# Step 10: Visualize with t-SNE
print("Generating t-SNE visualization...")
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(features)

plt.figure(figsize=(10, 7))
scatter = plt.scatter(tsne_results[:,0], tsne_results[:,1], c=clip_df['cluster'], cmap='tab10', s=15)
plt.title("t-SNE Visualization of Book Covers (CLIP Embeddings)")
plt.colorbar(scatter, label='Cluster')
plt.savefig(os.path.join(experiment_output, "tsne_visualization.png"))
plt.show()
