# Creating Image Database

I have downloaded a file with training data from [here](https://www.kaggle.com/datasets/paramaggarwal/fashion-product-images-small?select=images). This dataset has a number of images, with some additional tags for each image id. I think that in reality, if deployed, this system would need a machine learning algorithm that scrapes and manages clothes. I envisage a GPT that is tasked with separating clothes into different categories. Here is how the dataset is structured:

- id
- gender
- masterCategory
- subCategory
- articleType
- baseColour
- season
- usage

As well as this metadata, the database will contain an embedding vector for each image. My concern is that the images might be too low-resolution, but we will see.

In [1]:
# Add the src directory to the Python path
import sys
import os
from tqdm import tqdm

sys.path.append(os.path.abspath('../src'))

# Now you can import your models
from models import Image as ImageModel, Attribute, ItemAttribute, init_db

# Required Libraries
import clip
import torch
from PIL import Image as PILImage
import pandas as pd
import numpy as np
from sqlalchemy.orm import sessionmaker


In [2]:
# --- Database session ---
# init_db() is imported from the project's models module; its return value is
# used as the engine that the session factory is bound to.
engine = init_db()
Session = sessionmaker(bind=engine)
session = Session()

# --- CLIP model ---
# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# --- File locations (relative to the current working directory) ---
raw_dir = os.path.join("../", "data", "raw")
processed_dir = os.path.join("..", "data", "processed")

images_folder = os.path.join(raw_dir, "images")
metadata_file = os.path.join(raw_dir, "styles.csv")
embeddings_file = os.path.join(processed_dir, "embeddings.npy")
image_ids_file = os.path.join(processed_dir, "image_ids.npy")

# --- Metadata table ---
metadata = pd.read_csv(metadata_file)



In [3]:
# Check for Missing Columns
# These names are looked up verbatim in the CSV header and are later reused as
# attribute names when rows are written to the database. The dataset schema
# listed in this notebook's introduction names the last column 'usage'.
required_columns = ['gender', 'masterCategory', 'subCategory', 'articleType', 'baseColour', 'season', 'usage']
missing_columns = [col for col in required_columns if col not in metadata.columns]
if missing_columns:
    # Raise instead of calling exit(): exit() is a convenience the `site` module
    # adds for the interactive shell, whereas an exception reliably aborts the
    # cell with a traceback that names the missing columns.
    raise ValueError(f"Missing columns in metadata: {missing_columns}")

# Function to Generate Embedding for an Image
def generate_embedding(image_path):
    """Return an L2-normalised CLIP embedding for the image at *image_path*.

    The image is converted to RGB, passed through the module-level
    ``preprocess`` transform and encoded with the module-level ``model`` on
    ``device``.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        A 1-D float32 NumPy array with unit L2 norm, or ``None`` when the
        image cannot be read or encoded, or when the resulting vector has a
        zero or non-finite norm. Failures are reported with ``print`` and
        never raised, so a caller can simply skip the image.
    """
    try:
        # The context manager releases the file handle as soon as the pixel
        # data has been copied into the RGB image.
        with PILImage.open(image_path) as raw_image:
            rgb_image = raw_image.convert("RGB")

        batch = preprocess(rgb_image).unsqueeze(0).to(device)
        with torch.no_grad():
            features = model.encode_image(batch)

        # Cast to float32 before normalising so the division is not carried
        # out in reduced precision if the model emits float16 features.
        embedding = features.float().cpu().numpy().flatten()

        # Normalize the embedding; a zero or non-finite norm would otherwise
        # silently produce NaN/inf vectors rather than raising.
        norm = np.linalg.norm(embedding)
        if not np.isfinite(norm) or norm == 0:
            print(f"Error generating embedding for {image_path}: degenerate embedding norm ({norm})")
            return None
        return embedding / norm
    except Exception as e:
        # Deliberately broad: one unreadable image must not abort a long batch run.
        print(f"Error generating embedding for {image_path}: {e}")
        return None

# Function to Get or Create Attribute ID
def get_or_create_attribute(session, name, value):
    """Return the id of the ``Attribute`` row for (*name*, *value*), creating it if needed.

    Args:
        session: Session used for the lookup and, when no row matches, for the
            insert and commit.
        name: Attribute name to match.
        value: Attribute value to match. A float NaN is treated as ``None``.

    Returns:
        The ``id`` of the existing or newly committed ``Attribute`` row.
    """
    # A missing value read from a DataFrame row arrives as float NaN. NaN is
    # not equal to itself, so an equality filter on it is not expected to match
    # a previously stored row, which would create a fresh row on every call.
    # NOTE(review): presumably the backing store keeps such values as NULL;
    # confirm against the Attribute model.
    if isinstance(value, float) and value != value:
        value = None

    instance = session.query(Attribute).filter_by(name=name, value=value).first()
    if instance is None:
        instance = Attribute(name=name, value=value)
        session.add(instance)
        session.commit()
    return instance.id

In [4]:
# # Define the Dimension of the Embeddings
# d = 512  # Embedding dimension for CLIP ViT-B/32

# # Collect Embeddings and Metadata
# embeddings = []
# image_ids = []
# faiss_index = 0

# metadata = metadata.head(5000)

# # Wrap the loop with tqdm for progress tracking
# for idx, row in tqdm(metadata.iterrows(), total=metadata.shape[0], desc="Processing images"):
#     image_path = os.path.join(images_folder, str(row['id'])+".jpg")
#     image_path = os.path.normpath(image_path)  # Normalize the path to ensure consistency
#     if os.path.exists(image_path):
#         embedding = generate_embedding(image_path)
#         if embedding is not None:
#             embeddings.append(embedding)
#             image_ids.append(row['id'])
            
#             faiss_index += 1

In [5]:
# Define the Dimension of the Embeddings
d = 512  # Embedding dimension for CLIP ViT-B/32

# Accumulators filled by the loop below. faiss_index counts the embeddings
# stored so far, so the value written to each database row equals that
# embedding's position in `embeddings`.
embeddings = []
image_ids = []
faiss_index = 0

# tqdm wraps the row iterator to show progress over the whole metadata table.
progress = tqdm(metadata.iterrows(), total=metadata.shape[0], desc="Processing images")
for idx, row in progress:
    # Build a normalized path to "<id>.jpg" inside the images folder.
    image_path = os.path.normpath(os.path.join(images_folder, str(row['id'])+".jpg"))

    # Skip metadata rows that have no image file on disk.
    if not os.path.exists(image_path):
        continue

    # Skip images whose embedding could not be generated.
    embedding = generate_embedding(image_path)
    if embedding is None:
        continue

    embeddings.append(embedding)
    image_ids.append(row['id'])

    # Store the image row first; the attribute links below read image_instance.id.
    image_instance = ImageModel(
        image_path=image_path,
        base_color=row.get('baseColour'),
        season=row.get('season'),
        article_type=row.get('articleType'),
        faiss_index=faiss_index,
    )
    session.add(image_instance)
    session.commit()

    faiss_index += 1

    # Link the image to one Attribute row per required metadata column.
    for attribute in required_columns:
        attribute_id = get_or_create_attribute(session, attribute, row.get(attribute))
        session.add(ItemAttribute(image_id=image_instance.id, attribute_id=attribute_id))
    session.commit()


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
Processing images: 100%|█████████████████████████████████████████████████████████| 44446/44446 [27:32<00:00, 26.90it/s]


In [6]:
# Convert Embeddings to Numpy Array and Save to Disk
try:
    # np.save does not create missing parent directories, so make sure they
    # exist before writing; otherwise a long embedding run could be lost to a
    # missing output folder.
    for output_path in (embeddings_file, image_ids_file):
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    # Stack the per-image vectors into one float32 array with one row per image.
    embeddings = np.vstack(embeddings).astype('float32')
    np.save(embeddings_file, embeddings)
    np.save(image_ids_file, image_ids)
finally:
    # Close the Session even if stacking or saving fails.
    session.close()