Collection of methods used to pull down openlibrary data dump and get cover image embeddings using OpenAI's Clip model


In [1]:
import pandas as pd
import requests
import os
import gzip
import json
import numpy as np

import pprint
import tempfile

from typing import Dict, Text

from ast import literal_eval

# import faiss
import torch
import skimage
# import pinecone
import numpy as np
import pandas as pd
from PIL import Image
from io import BytesIO
import IPython.display
import matplotlib.pyplot as plt
# from datasets import load_dataset
from collections import OrderedDict
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer

In [2]:
import PIL

PIL.Image.MAX_IMAGE_PIXELS = None

[link text](https://)Step 1: Download data dump from https://openlibrary.org/data; process in batches with pandas

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
def download_data_dump():
  chunk_size = 1000000  # Adjust the chunk size as needed
  file_path = '/content/drive/MyDrive/book_data/ol_dump_editions_latest.txt.gz'  # Replace with the path to your large file


  # Read and process each chunk, then output to separate files
  for i, chunk in enumerate(pd.read_csv(file_path, chunksize=chunk_size, compression='gzip', header=0, sep='\t', quotechar='"')):
      output_file_path = f'/content/drive/MyDrive/book_data/processed_batches/processed_chunk_{i + 1}.csv'
      chunk.to_csv(output_file_path, index=False)

      print(f"Processed chunk {i + 1}. Output saved to {output_file_path}")

In [None]:
download_data_dump()

Processed chunk 1. Output saved to /content/drive/MyDrive/book_data/processed_batches/processed_chunk_1.csv
Processed chunk 2. Output saved to /content/drive/MyDrive/book_data/processed_batches/processed_chunk_2.csv
Processed chunk 3. Output saved to /content/drive/MyDrive/book_data/processed_batches/processed_chunk_3.csv
Processed chunk 4. Output saved to /content/drive/MyDrive/book_data/processed_batches/processed_chunk_4.csv
Processed chunk 5. Output saved to /content/drive/MyDrive/book_data/processed_batches/processed_chunk_5.csv
Processed chunk 6. Output saved to /content/drive/MyDrive/book_data/processed_batches/processed_chunk_6.csv
Processed chunk 7. Output saved to /content/drive/MyDrive/book_data/processed_batches/processed_chunk_7.csv
Processed chunk 8. Output saved to /content/drive/MyDrive/book_data/processed_batches/processed_chunk_8.csv
Processed chunk 9. Output saved to /content/drive/MyDrive/book_data/processed_batches/processed_chunk_9.csv
Processed chunk 10. Output s

In [5]:

def process_raw_files():
  # Directory where processed chunk files are stored
  processed_chunks_dir = '/content/drive/MyDrive/book_data/processed_batches/'  # Replace with the path to your directory
  select_columns = ['title', 'isbn_10', 'isbn_13', 'publish_date', 'key', 'subjects', 'languages', 'description.value', 'genres']

  # List to store DataFrames of processed chunks
  processed_chunks = []

  # Column names for your DataFrame
  column_names = ['col1', 'col2', 'col3', 'col4', 'col5']  # original data tab separated columns

  # Read each processed chunk file and append to the list
  for i, filename in enumerate(os.listdir(processed_chunks_dir)):
      if filename.endswith('.csv'):
          print(f'Processing file {filename}....')
          file_path = os.path.join(processed_chunks_dir, filename)
          chunk_df = pd.read_csv(file_path, names=column_names)
          books_info_lst = chunk_df['col5'].tolist()

          # raw data stored in column as json
          books_info_lst = [json.loads(book_record) for book_record in books_info_lst]
          books_df = pd.json_normalize(books_info_lst)

          # narrow down amount of data by only outputting english language books and books greater than 100 pages
          books_df['languages_unpacked'] = books_df['languages'].apply(lambda lst: ', '.join(d['key'] for d in lst) if isinstance(lst, list) else '')
          df_languages_keys = books_df[books_df['languages_unpacked'] =='/languages/eng']
          df_page_cnt_filter = df_languages_keys[df_languages_keys['number_of_pages'] >= 100]

          final_df = df_page_cnt_filter
          final_df = final_df[select_columns]
          final_df.to_csv(f'/content/drive/MyDrive/book_data/all_raw/books_{i}.csv')



In [None]:
process_raw_files()

Processing file processed_chunk_1.csv....
Processing file processed_chunk_2.csv....
Processing file processed_chunk_3.csv....
Processing file processed_chunk_4.csv....
Processing file processed_chunk_5.csv....
Processing file processed_chunk_6.csv....
Processing file processed_chunk_7.csv....
Processing file processed_chunk_8.csv....
Processing file processed_chunk_9.csv....
Processing file processed_chunk_10.csv....
Processing file processed_chunk_11.csv....
Processing file processed_chunk_12.csv....
Processing file processed_chunk_13.csv....
Processing file processed_chunk_14.csv....
Processing file processed_chunk_15.csv....
Processing file processed_chunk_16.csv....
Processing file processed_chunk_17.csv....
Processing file processed_chunk_18.csv....
Processing file processed_chunk_19.csv....
Processing file processed_chunk_20.csv....
Processing file processed_chunk_21.csv....
Processing file processed_chunk_22.csv....
Processing file processed_chunk_23.csv....
Processing file proc

Step 2: Read in full data dump and filter for desired data (in this case recent (between 2020-2023) adult fiction

In [6]:
def read_csvs_in_directory(directory):
    dataframes = []
    for filename in os.listdir(directory):
        print(f'Processing file {filename}....')
        if filename.endswith(".csv"):
            csv_path = os.path.join(directory, filename)
            books_df = pd.read_csv(csv_path)

            dataframes.append(books_df)
    # return dataframes
    return pd.concat(dataframes, ignore_index=True)

In [None]:
read_csvs_in_directory (all_raw)

Processing file books_0.csv....
Processing file books_1.csv....
Processing file books_2.csv....
Processing file books_3.csv....
Processing file books_4.csv....
Processing file books_5.csv....
Processing file books_6.csv....
Processing file books_7.csv....
Processing file books_8.csv....
Processing file books_9.csv....
Processing file books_10.csv....
Processing file books_11.csv....
Processing file books_12.csv....
Processing file books_13.csv....
Processing file books_14.csv....
Processing file books_15.csv....
Processing file books_16.csv....
Processing file books_17.csv....
Processing file books_18.csv....
Processing file books_19.csv....
Processing file books_20.csv....
Processing file books_21.csv....
Processing file books_22.csv....
Processing file books_23.csv....
Processing file books_24.csv....
Processing file books_25.csv....
Processing file books_26.csv....
Processing file books_27.csv....
Processing file books_28.csv....
Processing file books_29.csv....
Processing file book

Unnamed: 0.1,Unnamed: 0,title,isbn_10,isbn_13,publish_date,key,subjects,languages,description.value,genres
0,4,Encyclopedia of designs for quilting,['0891458875'],,1996,/books/OL1000121M,['Quilting -- Patterns.'],[{'key': '/languages/eng'}],,
1,9,Peacemaking strategies in Northern Ireland,['0312163460'],,1997,/books/OL1000199M,"['Conflict management -- Northern Ireland.', '...",[{'key': '/languages/eng'}],Competing theories of conflict management can ...,
2,34,Musicology and performance,['0300068050'],,1997,/books/OL1000711M,"['Music -- History and criticism', 'Musicology...",[{'key': '/languages/eng'}],Arriving in the United States at age twenty-se...,
3,40,The return of the wolf to Yellowstone,['0805031014'],,1997,/books/OL1000771M,['Wolves -- Reintroduction -- Yellowstone Nati...,[{'key': '/languages/eng'}],,
4,48,The Mystery of the Secret Message,"['0807554294', '0807554308']",,1996,/books/OL1000890M,"['Brothers and sisters -- Fiction.', 'Orphans ...",[{'key': '/languages/eng'}],While helping their grandfather prepare for Gr...,
...,...,...,...,...,...,...,...,...,...,...
12745832,308028,"Isolation, characterization, and utilization o...",['3540616969'],,1997,/books/OL999641M,"['Developmental neurophysiology.', 'Neural ste...",[{'key': '/languages/eng'}],In trying to understand postnatal neurogenesis...,
12745833,308032,An investigation into the thought of Kōgyō D...,,,1998,/books/OL99967M,,[{'key': '/languages/eng'}],,
12745834,308038,Turn the cup around,['0385322925'],,1997,/books/OL999767M,"['Grandmothers -- Fiction.', 'Brothers and sis...",[{'key': '/languages/eng'}],Mysterious paintings in a cave near her grandm...,
12745835,308040,Abnormal illness behaviour,['0471965731'],,1997,/books/OL999781M,"['Somatoform disorders.', 'Hypochondriasis -- ...",[{'key': '/languages/eng'}],,


In [7]:
def get_image_id(row):
    isbn_10 = row['isbn_10']
    isbn_13 = row['isbn_13']
    image_id = None
    if isbn_10 is not np.nan:
        image_id = literal_eval(isbn_10)[0]
    elif isbn_13 is not np.nan:
        image_id = literal_eval(isbn_13)[0]

    return image_id

In [None]:
def get_image_url(image_id):
  if image_id:
    return f'https://covers.openlibrary.org/b/isbn/{image_id}.jpg'
  return None

In [8]:
def get_filtered_data():
  parent_directory = '/content/drive/MyDrive/book_data/all_raw'
  result_df = read_csvs_in_directory(parent_directory)
  fiction_df = result_df[result_df['subjects'].fillna('').str.contains('Fiction', case=False)]

  adult_fiction_df = fiction_df[~fiction_df['subjects'].str.contains('Juvenile', case=False)]

  recent_fiction_df = adult_fiction_df[adult_fiction_df['publish_date'].fillna('').str.contains('|'.join(['2020', '2021', '2022', '2023']), case=False)]

  # get image id which is isbn_13 or isbn_13 code then use to build urls

  recent_fiction_df['image_id'] = recent_fiction_df.apply(get_image_id, axis=1)
  recent_fiction_df['image_url'] = recent_fiction_df['image_id'].apply(get_image_url)

  recent_fiction_df = recent_fiction_df[recent_fiction_df['image_url'].notna()]

  recent_fiction_df.to_csv('/content/drive/MyDrive/book_data/recent_fiction.csv')

In [None]:
# Execute this cell
get_filtered_data()

Processing file books_0.csv....
Processing file books_1.csv....
Processing file books_2.csv....
Processing file books_3.csv....
Processing file books_4.csv....
Processing file books_5.csv....
Processing file books_6.csv....
Processing file books_7.csv....
Processing file books_8.csv....
Processing file books_9.csv....
Processing file books_10.csv....
Processing file books_11.csv....
Processing file books_12.csv....
Processing file books_13.csv....
Processing file books_14.csv....
Processing file books_15.csv....
Processing file books_16.csv....
Processing file books_17.csv....
Processing file books_18.csv....
Processing file books_19.csv....
Processing file books_20.csv....
Processing file books_21.csv....
Processing file books_22.csv....
Processing file books_23.csv....
Processing file books_24.csv....
Processing file books_25.csv....
Processing file books_26.csv....
Processing file books_27.csv....
Processing file books_28.csv....
Processing file books_29.csv....
Processing file book

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recent_fiction_df['image_id'] = recent_fiction_df.apply(get_image_id, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recent_fiction_df['image_url'] = recent_fiction_df['image_id'].apply(get_image_url)


Step 3: Batch through filtered dataset, pull down images, and save embeddings

In [9]:
def get_image(image_URL):
  if image_URL:
    try:
      response = requests.get(image_URL)
      image = Image.open(BytesIO(response.content)).convert("RGB")
      width, height = image.size
      if width == 1 and height == 1:
        return None
      return image
    except:
      print(f"Error: {image_URL}")
      return None
    return None

In [10]:
def get_embedding_batch(images):

  model_ID = "openai/clip-vit-base-patch32"
  device = "cuda" if torch.cuda.is_available() else "cpu"

  model = CLIPModel.from_pretrained(model_ID).to(device)
  processor = CLIPProcessor.from_pretrained(model_ID)
      # Get the tokenizer
  tokenizer = CLIPTokenizer.from_pretrained(model_ID)

  try:
    batch_tensor = torch.stack([processor(images=img, return_tensors="pt", padding=True)['pixel_values'][0] for img in images])
    embedding = model.get_image_features(batch_tensor)

    return embedding.cpu().detach().numpy()
  except:
      print(f'Error: image in batch does not exist or cannot be downloaded')
      return []
  return []

In [11]:
def batch_process_get_embeddings():
  filtered_df = pd.read_csv('/content/drive/MyDrive/book_data/recent_fiction.csv')
  directory_path = '/content/drive/MyDrive/book_data/batched_embeddings/'
  directory_files = os.listdir(directory_path)

  # get iterator starting point from file name to kick process off at file where it failed
  latest_ind = 0
  if len(directory_files) > 0:
    latest_file = sorted(directory_files)[len(directory_files) - 1]
    latest_ind = int(latest_file.split('_')[1])

  step_size = 100
  images = []
  selected_rows = None
  for start_row in range(latest_ind, 10000, step_size):
      print(f'Processing from row {start_row}')
      # Calculate the end row for each iteration
      end_row = min(start_row + step_size - 1, len(filtered_df) - 1)

      # Get the group of rows for the current iteration
      selected_rows = filtered_df.iloc[start_row:end_row + 1]


      image_embedding_df_cols = ['key', 'image']


      selected_rows["image"] = selected_rows[str("image_url")].apply(get_image)

      image_df = selected_rows[selected_rows['image'].notna()]
      image_df = image_df[image_embedding_df_cols]


      image_embeddings = []
      chunk_size = 5
      for i in range(0, len(image_df['image'].values), chunk_size):
        print(f'Processing images from {i}')
        chunk_df = image_df.iloc[i:i + chunk_size]

        chunk_embeddings = get_embedding_batch(chunk_df['image'].values)

        image_embeddings.extend(list(chunk_embeddings))



      image_df['image_embeddings'] = list(image_embeddings)


      merged_df = pd.merge(selected_rows, image_df, on='key', how='left')


      merged_df.to_csv(f'/content/drive/MyDrive/book_data/batched_embeddings/{start_row}_{start_row + step_size}_w_embeddings.csv')


In [None]:
batch_process_get_embeddings()

Processing from row 7800
Processing images from 0
Processing images from 5
Processing images from 10
Processing images from 15
Processing images from 20
Processing images from 25
Processing images from 30
Processing images from 35
Processing images from 40
Processing images from 45
Saved batch 7800 to 7900
Processing from row 7900
Processing images from 0
Processing images from 5
Processing images from 10
Processing images from 15
Processing images from 20
Processing images from 25
Processing images from 30
Processing images from 35
Processing images from 40
Saved batch 7900 to 8000
Processing from row 8000
Processing images from 0
Processing images from 5
Processing images from 10
Processing images from 15
Processing images from 20
Processing images from 25
Processing images from 30
Processing images from 35
Processing images from 40
Saved batch 8000 to 8100
Processing from row 8100
Processing images from 0
Processing images from 5
Processing images from 10
Processing images from 15
P

HTTP Error 429 thrown while requesting HEAD https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/config.json
Retrying in 1s [Retry 1/5].


Processing images from 0


HTTP Error 429 thrown while requesting HEAD https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/config.json
Retrying in 2s [Retry 2/5].


Processing images from 5
Processing images from 10
Processing images from 15
Processing images from 20
Processing images from 25
Processing images from 30
Processing images from 35
Processing images from 40
Saved batch 8400 to 8500
Processing from row 8500
Processing images from 0
Processing images from 5
Processing images from 10
Processing images from 15
Processing images from 20
Processing images from 25
Processing images from 30
Processing images from 35
Processing images from 40
Processing images from 45
Saved batch 8500 to 8600
Processing from row 8600
Processing images from 0
Processing images from 5
Processing images from 10
Processing images from 15
Processing images from 20
Processing images from 25
Processing images from 30
Processing images from 35
Saved batch 8600 to 8700
Processing from row 8700
Processing images from 0
Processing images from 5
Processing images from 10
Processing images from 15
Processing images from 20
Processing images from 25
Processing images from 30

In [12]:
def read_csvs_in_directory(directory):
    dataframes = []
    for filename in os.listdir(directory):
        print(f'Processing file {filename}....')
        if filename.endswith(".csv"):
            csv_path = os.path.join(directory, filename)
            books_df = pd.read_csv(csv_path)

            dataframes.append(books_df)
    # return dataframes
    return pd.concat(dataframes, ignore_index=True)

In [13]:
def pkl_embeddings():
  result_df = read_csvs_in_directory('/content/drive/MyDrive/book_data/batched_embeddings/')
  result_df['image_embeddings'] = result_df['image_embeddings'].str.strip('[]').str.split().apply(lambda x: np.array(x).astype(float)).to_numpy()
  image_df = result_df[result_df['image_embeddings'].notna()]
  image_df['image_embeddings'] = image_df['image_embeddings'].apply(lambda x: x.reshape(1, -1))

  image_df = image_df[image_df['image_embeddings'].notna()]

  image_df.to_pickle('/content/drive/MyDrive/book_data/image_embeddings.pkl');

In [None]:
 pkl_embeddings()

Processing file 0_100_w_embeddings.csv....
Processing file 100_200_w_embeddings.csv....
Processing file 200_300_w_embeddings.csv....
Processing file 300_400_w_embeddings.csv....
Processing file 400_500_w_embeddings.csv....
Processing file 500_600_w_embeddings.csv....
Processing file 600_700_w_embeddings.csv....
Processing file 700_800_w_embeddings.csv....
Processing file 800_900_w_embeddings.csv....
Processing file 900_1000_w_embeddings.csv....
Processing file 1000_1100_w_embeddings.csv....
Processing file 1100_1200_w_embeddings.csv....
Processing file 1200_1300_w_embeddings.csv....
Processing file 1300_1400_w_embeddings.csv....
Processing file 1400_1500_w_embeddings.csv....
Processing file 1500_1600_w_embeddings.csv....
Processing file 1600_1700_w_embeddings.csv....
Processing file 1700_1800_w_embeddings.csv....
Processing file 1800_1900_w_embeddings.csv....
Processing file 1900_2000_w_embeddings.csv....
Processing file 2000_2100_w_embeddings.csv....
Processing file 2100_2200_w_embedd

  return pd.concat(dataframes, ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  image_df['image_embeddings'] = image_df['image_embeddings'].apply(lambda x: x.reshape(1, -1))


In [15]:
import pandas as pd
file_path = '/content/drive/MyDrive/book_data/image_embeddings.pkl'  # Replace with the actual path
loaded_df = pd.read_pickle(file_path)

In [16]:
# Display the first few rows
print(loaded_df.head())

    # Get information about the DataFrame
print(loaded_df.info())

    # See the column names
print(loaded_df.columns)

    Unnamed: 0.2 Unnamed: 0.1 Unnamed: 0                    title isbn_10  \
0            0.0        17643      73504  Four Days of You and Me     NaN   
3            3.0        17663      73573          Jessica of Russ     NaN   
7            7.0        19325      76688           Tiamat's Wrath     NaN   
13          13.0        21446      79741             Paranorthern     NaN   
15          15.0        21513      79854           Shot at Normal     NaN   

              isbn_13 publish_date                 key  \
0   ['9781492684138']         2020  /books/OL28087551M   
3   ['9781892784476']         2020  /books/OL28105701M   
7   ['9780316332897']         2020  /books/OL28940416M   
13  ['9780358168997']         2021  /books/OL29845365M   
15  ['9780374380953']         2021  /books/OL29882268M   

                                       subjects                    languages  \
0                        ["Children's fiction"]  [{'key': '/languages/eng'}]   
3   ['Fiction, historical', 

In [17]:
query_book_index = 1  # Replace with the index of the book you want to query
query_embedding = loaded_df.iloc[query_book_index]['image_embeddings']

In [18]:
from scipy.spatial.distance import cosine
import numpy as np

# Reshape the query embedding to be 1-dimensional
query_embedding_1d = query_embedding.reshape(-1)

# Reshape the embeddings in the DataFrame to be 1-dimensional before calculating cosine similarity
# Calculate cosine distances between the query embedding and all other embeddings
similarities = loaded_df['image_embeddings'].apply(lambda embedding: 1 - cosine(query_embedding_1d, embedding.reshape(-1)))

# Add the similarities as a new column to your DataFrame
loaded_df['similarity'] = similarities

In [19]:
# Sort the DataFrame by similarity in descending order
# Exclude the query book itself from the results
sorted_similar_books = loaded_df.sort_values(by='similarity', ascending=False)

# Display the top N most similar books (excluding the query book at index 0)
# Let's show the top 10 similar books as an example
top_n = 10
print(f"Top {top_n} most similar books to the book at index {query_book_index}:")
# The most similar book will be the query book itself, so we slice from index 1 onwards
display(sorted_similar_books.iloc[1:top_n+1])

Top 10 most similar books to the book at index 1:


Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,isbn_10,isbn_13,publish_date,key,subjects,languages,description.value,genres,image_id,image_url,image_x,image_y,image_embeddings,similarity
6866,66.0,9079266,679955,Sit a Tall Horse,,['9781885210333'],2020,/books/OL28098717M,"['Fiction, general']",[{'key': '/languages/eng'}],,,9781885210333,https://covers.openlibrary.org/b/isbn/97818852...,<PIL.Image.Image image mode=RGB size=333x500 a...,<PIL.Image.Image image mode=RGB size=333x500 a...,"[[-0.510898113, 0.252963483, 0.356260955, 0.25...",0.703768
2693,93.0,3612914,402519,Grief Rituals,,['9781951658274'],2022,/books/OL42831369M,"['Fiction, horror', 'Fiction, short stories (s...",[{'key': '/languages/eng'}],,,9781951658274,https://covers.openlibrary.org/b/isbn/97819516...,<PIL.Image.Image image mode=RGB size=313x500 a...,<PIL.Image.Image image mode=RGB size=313x500 a...,"[[0.370317638, 0.32435286, -0.0879774615, 0.17...",0.682013
7415,15.0,9777389,654029,Home and the World,,['9788186685273'],2020,/books/OL29019413M,"['Fiction, historical', 'India, fiction']",[{'key': '/languages/eng'}],,,9788186685273,https://covers.openlibrary.org/b/isbn/97881866...,<PIL.Image.Image image mode=RGB size=329x500 a...,<PIL.Image.Image image mode=RGB size=329x500 a...,"[[-0.175874263, -0.0386425853, 0.315994769, 0....",0.67922
9617,,12663883,956191,Mullin,,['9781732144231'],2020,/books/OL28106675M,"['Biography', 'Fiction, historical']",[{'key': '/languages/eng'}],,,9781732144231,https://covers.openlibrary.org/b/isbn/97817321...,,,"[[0.0455101132, 0.228901029, -0.182094693, 0.3...",0.677788
3833,33.0,5055210,525263,Society,,['9781736599150'],2021,/books/OL35566844M,"['Fiction, visionary & metaphysical', 'Fiction...",[{'key': '/languages/eng'}],,,9781736599150,https://covers.openlibrary.org/b/isbn/97817365...,<PIL.Image.Image image mode=RGB size=266x400 a...,<PIL.Image.Image image mode=RGB size=266x400 a...,"[[0.251824319, -0.00143073406, 0.503302813, -0...",0.670004
3904,4.0,5150495,931162,In the Country Dark,,['9781940249186'],2020,/books/OL30241665M,"['Crime', 'Fiction, mystery & detective, gener...",[{'key': '/languages/eng'}],,,9781940249186,https://covers.openlibrary.org/b/isbn/97819402...,<PIL.Image.Image image mode=RGB size=313x500 a...,<PIL.Image.Image image mode=RGB size=313x500 a...,"[[0.340414703, -0.0206347294, 0.36949271, 0.47...",0.665638
2644,44.0,3561469,159678,Mad Patagonian Part One,,['9781955823005'],2021,/books/OL35141497M,"['Fiction, general']",[{'key': '/languages/eng'}],,,9781955823005,https://covers.openlibrary.org/b/isbn/97819558...,<PIL.Image.Image image mode=RGB size=333x500 a...,<PIL.Image.Image image mode=RGB size=333x500 a...,"[[-0.153149337, 0.291618496, 0.133568466, -0.2...",0.662792
1352,52.0,1809690,709372,Dragon's Night,,['9781943728060'],2020,/books/OL28104276M,"['Fiction, gay', 'Fiction, fantasy, general']",[{'key': '/languages/eng'}],,,9781943728060,https://covers.openlibrary.org/b/isbn/97819437...,<PIL.Image.Image image mode=RGB size=333x500 a...,<PIL.Image.Image image mode=RGB size=333x500 a...,"[[0.0803963393, 0.511007786, 0.206290483, 0.00...",0.660096
5604,4.0,7346890,288520,Sun Also Rises,,['9781950330997'],2022,/books/OL36761013M,"['Fiction, general', 'Fiction, historical, gen...",[{'key': '/languages/eng'}],,,9781950330997,https://covers.openlibrary.org/b/isbn/97819503...,<PIL.Image.Image image mode=RGB size=314x500 a...,<PIL.Image.Image image mode=RGB size=314x500 a...,"[[-0.258123308, 0.00550228357, -0.0715105534, ...",0.660061
1002,2.0,1321197,625854,Saints and Sinners,,['9781639772520'],2022,/books/OL39181021M,"['Fiction, westerns']",[{'key': '/languages/eng'}],,,9781639772520,https://covers.openlibrary.org/b/isbn/97816397...,<PIL.Image.Image image mode=RGB size=313x500 a...,<PIL.Image.Image image mode=RGB size=313x500 a...,"[[-0.093682684, 0.125784934, 0.10113921, 0.330...",0.659017


Add Search API

In [20]:
# Install faiss, fastapi and uvicorn
!pip install faiss-cpu fastapi uvicorn

# Now, the import should work
from fastapi import FastAPI, Request
from transformers import CLIPProcessor, CLIPModel
import faiss
import torch

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.3-py3-none-any.whl.metadata (6.5 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fastapi-0.115.12-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading uvicorn-0.34.3-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading starlette-0.46.2-py3-none-any.whl (72 

In [21]:
# Ensure embeddings are in a suitable numpy format for FAISS
# If your embeddings are already numpy arrays with consistent dimensions, you might not need the .tolist()
embeddings_np = np.vstack(loaded_df['image_embeddings'].tolist()).astype('float32')

# Get the dimension of the embeddings
d = embeddings_np.shape[1]

# Build a simple IndexFlatL2 index (you can choose a different index type based on your needs)
index = faiss.IndexFlatL2(d)

# Add the embeddings to the index
index.add(embeddings_np)

# Save the index to a file
faiss.write_index(index, "book_index.faiss")

# Create a mapping from FAISS index position to your book key or another identifier
# This allows you to retrieve book information after finding similar items in the index
# For simplicity, we'll use the DataFrame index as the book ID in this example
book_id_map = loaded_df['key'].tolist()

print("FAISS index built and saved as book_index.faiss")

FAISS index built and saved as book_index.faiss


In [22]:
# Ensure embeddings are in a suitable numpy format for FAISS
# If your embeddings are already numpy arrays with consistent dimensions, you might not need the .tolist()
embeddings_np = np.vstack(loaded_df['image_embeddings'].tolist()).astype('float32')

# Get the dimension of the embeddings
d = embeddings_np.shape[1]

# Build a simple IndexFlatL2 index (you can choose a different index type based on your needs)
index = faiss.IndexFlatL2(d)

# Add the embeddings to the index
index.add(embeddings_np)

# Save the index to a file
faiss.write_index(index, "book_index.faiss")

# Create a mapping from FAISS index position to your book key or another identifier
# This allows you to retrieve book information after finding similar items in the index
# For simplicity, we'll use the DataFrame index as the book ID in this example
book_id_map = loaded_df['key'].tolist()

print("FAISS index built and saved as book_index.faiss")

FAISS index built and saved as book_index.faiss


In [23]:
app = FastAPI()
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
index = faiss.read_index("book_index.faiss")  # Load the pre-built FAISS index
book_id_map = [...]  # Map from FAISS index position to your book IDs

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [24]:
@app.get("/api/search_books")
def search_books(q: str):
  pass

In [26]:
import faiss, pickle
import numpy as np
import pandas as pd # Import pandas

# 1) Load your embeddings and their corresponding book IDs from the pickled DataFrame.
with open('/content/drive/MyDrive/book_data/image_embeddings.pkl', 'rb') as f:
    loaded_df = pd.read_pickle(f) # Load the DataFrame

# Extract the embeddings column and convert to a numpy array
# Ensure the column 'image_embeddings' contains the actual numpy arrays for embeddings
# The previous step in the notebook already prepared this column.
# We need to stack the individual numpy arrays within the Series.
vectors  = np.vstack(loaded_df['image_embeddings'].values).astype('float32')

# Get the book identifiers (e.g., 'key' column) to map index results back to books
book_ids = loaded_df['key'].tolist() # Use the 'key' column for mapping

# 2) Build the index
index = faiss.IndexFlatL2(vectors.shape[1])
index.add(vectors)

# 3) Save to disk
faiss.write_index(index, 'book_index.faiss')
with open('book_id_map.pkl', 'wb') as f:
    pickle.dump(book_ids, f) # Save the list of book IDs

print("Saved book_index.faiss and book_id_map.pkl")

Saved book_index.faiss and book_id_map.pkl


In [28]:
import csv, pickle

key_title = {}
with open("/content/drive/MyDrive/book_data/recent_fiction.csv", newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        key = row["key"]         # e.g. "/books/OL123M"
        title = row["title"]     # e.g. "Moby Dick"
        key_title[key] = title

with open("key_title_map.pkl", "wb") as f:
    pickle.dump(key_title, f)