In [None]:
import json
import pandas as pd
from tqdm.notebook import tqdm
pd.set_option('display.max_columns', None)

## Table of Contents

### 1. Building a VectorDB

### 2. Text search

### 3. Image search

### 4. Hybrid search

---

#### Creating a Local DB

In [None]:
# Load the two source tables from the working directory.
# NOTE(review): `attributes` is not referenced again in the visible cells — confirm it is still needed.
attributes = pd.read_csv("attribute_specific.csv")
df = pd.read_csv("clothes_final2.csv")

In [None]:
# Load the pre-computed upsert records; the file holds one JSON object per line.
upsert_path = "upsert_vectors_fashion_fine_tuned.json"

with open(upsert_path, 'r', encoding='utf-8') as file:
    # Skip blank lines (e.g. a trailing newline) so json.loads never receives an empty string.
    data_read_f = [json.loads(line) for line in file if line.strip()]

# Report the file that was actually read (the old message named a different file).
print(f"Successfully read {len(data_read_f)} fashion-fine-tuned CLIP embeddings from {upsert_path}")

In [None]:
# Vector ID = image ID + "_" + entity ID; it is the join key used against the upsert records' `id`.
df['vdb_id'] = df['ImageId'].astype(str) + "_" + df['entity_id'].astype(str)
# Drop the table's own `id` so the merged frame ends up with a single `id` column (from the upsert records).
# Rebinding with errors='ignore' instead of inplace=True keeps this cell re-runnable:
# a second run no longer raises KeyError because `id` is already gone.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# One row per upsert record read from the JSON-lines file.
upsert_df_f = pd.DataFrame(data_read_f)

In [None]:
# Preview the first two upsert records.
upsert_df_f.head(2)

In [None]:
# Join the clothing table with the upsert records on the vector ID.
# pd.merge defaults to an inner join, so rows without a match on either side are dropped.
d = pd.merge(df, upsert_df_f, left_on='vdb_id', right_on='id')

In [None]:
# d.to_csv("local_db.csv", index=False)

In [None]:
# Preview the merged table.
d.head(2)

In [None]:
# Pull the per-row metadata entries and category names out as arrays for the loop in the next cell.
metadata = d['metadata'].values
names = d['name'].values

In [None]:
metadata_new = list()

# Tag every metadata entry with its category name so queries can filter on "category".
# Entries are modified in place (item assignment), so `metadata_new` holds the same objects as `metadata`.
# The loop variable `m` is left in the kernel namespace and is displayed by the next cell.
for n,m in zip(names, metadata):
    m['category'] = n
    metadata_new.append(m)

d['metadata'] = metadata_new

In [None]:
# Inspect the last metadata entry updated by the loop above (`m` is the leftover loop variable).
m

In [None]:
# Preview the merged table again, now with "category" added to the metadata column.
d.head(2)

### Uploading to PineconeDB

- Convert the content to match the Pinecone upsert format.
    - Upsert each category according to the batch size.

In [None]:
## Upsert to pineconeDB!!
import os

from pinecone import Pinecone

# Read the API key from the PINECONE_API_KEY environment variable so the real secret
# never has to be saved in the notebook. The placeholder remains as a fallback, so
# replacing it by hand still works exactly as before.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY", "YOUR_PINECONE_API_KEY"))
# Check the number of indexes
# index_list = pc.list_indexes().indexes

# Handle to the index used by every query cell below; the last expression shows its stats.
index = pc.Index("fastcampus")
index.describe_index_stats()

- Max size for an upsert request is 2MB. Recommended upsert limit is 100 vectors per request.

In [None]:
# Stored separately for each category
# This is to save the index individually later

# upsert!!
def create_batches(lst, n):
    """Yield consecutive slices of `lst`, each holding at most `n` items.

    The final slice is shorter when `len(lst)` is not a multiple of `n`;
    an empty `lst` yields nothing.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

# Map each category name to its upsert records, already split into request-sized batches.
df_categories = {}

for cat in tqdm(d['name'].unique()):
    # Rows of this category, reduced to the four fields of an upsert record.
    part_df = d.loc[d['name'] == cat, ['id', 'values', 'sparse_values', 'metadata']]
    part_upserts = part_df.to_dict('records')
    # Upsert in units of 100
    df_categories[cat] = [*create_batches(part_upserts, 100)]

In [None]:
# List the category names that have batches prepared.
df_categories.keys()

### Upsert Format

```json
{"id" : "0838a48a7b0bfa789a5181ab0e8f4ee2_3040", # Image file name + entity ID
 "values" : [-0.08405803143978119, -0.7088879346847534, ...], # CLIP embeddings
 "sparse_values" : {
    "indices" : [1045, 1062, ...], # non-zero index
    "values" : [1.3038887977600098, 0.304147332906723, ...] # non-zero values
    },
"metadata" : {
    # Image file path
    "img_path": "imaterialist-fashion-2020-fgvc7/cropped_images/0838a48a7b0bfa789a5181ab0e8f4ee2_3040.jpg",
    "category": "coat"
} 
}

```

In [None]:
# for cat, batches in df_categories.items():
#     print(cat)
#     for batch in tqdm(batches):
#         index.upsert(vectors=batch)

- Save each category to a separate index

---

# 1. Text to image search

- Utilizing CLIP embedding
    - Text and images are represented together in one vector space.
    - Also, it is fine-tuned for the fashion dataset, making it more suitable for the current use case than a plain CLIP.
    - The fine-tuned data is also trained based on various attributes of clothing (refer to the data below).

![Fine-tune training data](https://media.springernature.com/full/springer-static/image/art%3A10.1038%2Fs41598-022-23052-9/MediaObjects/41598_2022_23052_Fig3_HTML.png?as=webp "Fine-tune training data")

(Source: Contrastive language and vision learning of general fashion concepts)

In [None]:
from PIL import Image
import os
import json
from tqdm import tqdm
import numpy as np
from image_utils import fetch_clip, draw_images
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer

In [None]:
from search_utils import gen_sparse_vector

In [None]:
from splade.splade.models.transformer_rep import Splade
from transformers import AutoTokenizer

# Checkpoint used for the sparse (SPLADE) side of the hybrid search.
splade_model_id = 'naver/splade-cocondenser-ensembledistil'

# splade = 'naver/splade-v3'
splade_model = Splade(splade_model_id, agg='max')
splade_model.to('cpu')  # move to GPU if possible
splade_model.eval()

# Tokenizer loaded from the same checkpoint ID as the model.
splade_tokenizer = AutoTokenizer.from_pretrained(splade_model_id)

In [None]:
# Load the fashion-CLIP checkpoint; `model`, `processor` and `tokenizer` are used by every search cell below.
model, processor, tokenizer = fetch_clip(model_name="patrickjohncyh/fashion-clip")

In [None]:
def get_single_text_embedding(text, model, tokenizer):
    """Embed `text` with `model` and return the features as nested Python lists.

    Args:
        text: Query text handed to `tokenizer`.
        model: Object exposing `get_text_features(**inputs)` (the CLIP model in this notebook).
        tokenizer: Callable producing PyTorch inputs (`return_tensors="pt"`).

    Returns:
        The text features converted with `.numpy().tolist()`. Callers in this
        notebook take element `[0]` as the embedding of the single query.
    """
    import torch  # local import: torch is not imported before this cell

    inputs = tokenizer(text, return_tensors = "pt", padding=True)
    # Inference only: disable autograd so no graph is recorded for the forward pass.
    with torch.no_grad():
        text_embeddings = model.get_text_features(**inputs)
    # convert the embeddings to numpy array
    embedding_as_np = text_embeddings.cpu().detach().numpy()
    return embedding_as_np.tolist()

In [None]:
input_text = "Green dress with blue dots, long sleeve"

# Keep the embedding under its own name: assigning it to `d` would overwrite the
# merged DataFrame `d` built earlier, after which the upsert cells cannot be re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)

# Dense-only search restricted to the "dress" category.
result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the single query's row
    top_k=5,
    filter={"category": {"$eq": "dress"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

In [None]:
input_text = "nike"

# vans, nike, adidas

# Keep the embedding under its own name: assigning it to `d` would overwrite the
# merged DataFrame `d` built earlier, after which the upsert cells cannot be re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)

# Dense-only search restricted to the "shoe" category.
result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the single query's row
    top_k=5,
    filter={"category": {"$eq": "shoe"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

In [None]:
input_text = "street fashion"

# Keep the embedding under its own name: assigning it to `d` would overwrite the
# merged DataFrame `d` built earlier, after which the upsert cells cannot be re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)

# Dense-only search restricted to the "top, t-shirt, sweatshirt" category.
result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the single query's row
    top_k=10,
    filter={"category": {"$eq": "top, t-shirt, sweatshirt"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

In [None]:
input_text = "Punk Fashion"

# Keep the embedding under its own name: assigning it to `d` would overwrite the
# merged DataFrame `d` built earlier, after which the upsert cells cannot be re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)

# Dense-only search restricted to the "top, t-shirt, sweatshirt" category.
result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the single query's row
    top_k=10,
    filter={"category": {"$eq": "top, t-shirt, sweatshirt"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

In [None]:
input_text = "Bohemian Fashion"

# Keep the embedding under its own name: assigning it to `d` would overwrite the
# merged DataFrame `d` built earlier, after which the upsert cells cannot be re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)

# Dense-only search restricted to the "top, t-shirt, sweatshirt" category.
result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the single query's row
    top_k=10,
    filter={"category": {"$eq": "top, t-shirt, sweatshirt"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

In [None]:
input_text = "flower patterns, short sleeve"

# Keep the embedding under its own name: assigning it to `d` would overwrite the
# merged DataFrame `d` built earlier, after which the upsert cells cannot be re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)

# Dense-only search restricted to the "top, t-shirt, sweatshirt" category.
result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the single query's row
    top_k=10,
    filter={"category": {"$eq": "top, t-shirt, sweatshirt"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

- Advantages
    - Categories such as famous brands, gender, clothing type, color, etc. can be specified as input without being manually defined.

- Limitations
    - Since it is a simple combination of attributes, it cannot recognize the characteristics of each part of the clothing.
        - e.g.) Although "blue dots" was specified, the top results were simply blue dresses.
    - Abstract terms such as "street fashion" or "bohemian fashion" describe combinations of various clothes rather than a single garment.
    (CLIP is trained on <clothing feature>-<clothing photo> pairs, so it does not match well against a whole fashion style such as "street fashion".)

- Overcoming measures
    - Search that gives more weight to the characteristics of clothing by utilizing sparse vectors.
    - If more abstract text is entered instead of clothing features, search the entire database.

---

# 2. Image to image search

- Utilizing CLIP embedding
    - Because text and images are represented together in one vector space, image-to-image similarity can be measured as well.
    - Also, it is fine-tuned for the fashion dataset, making it more suitable for the current use case than a plain CLIP.

In [None]:
from image_utils import extract_img_features

In [None]:
# Load the query image before previewing it: on a fresh kernel `image` is not
# defined until the following cell, so a bare `image` here raised NameError.
image = Image.open("test_images/test_image2.jpg")
image

In [None]:
# Image-to-image search: embed the query picture, then fetch the 5 nearest
# items in the "top, t-shirt, sweatshirt" category.
image = Image.open("test_images/test_image2.jpg")
img_emb = extract_img_features(image, processor, model).tolist()

result = index.query(
    vector=img_emb[0],
    top_k=5,
    filter={"category": {"$eq": "top, t-shirt, sweatshirt"}},
    include_metadata=True,
)

# Collect the stored file path of every match, in ranking order.
paths = []
for match in result.matches:
    paths.append(match['metadata']['img_path'])

draw_images(list(map(Image.open, paths)))

In [None]:
# Preview the image that the next cell searches with. A bare `image` here still
# pointed at the previous query picture (test_image2.jpg).
image = Image.open("test_images/test_image.png")
image

In [None]:
# Image-to-image search: embed the query picture, then fetch the 5 nearest
# items in the "shirt, blouse" category.
image = Image.open("test_images/test_image.png")
img_emb = extract_img_features(image, processor, model).tolist()

result = index.query(
    vector=img_emb[0],
    top_k=5,
    filter={"category": {"$eq": "shirt, blouse"}},
    include_metadata=True,
)

# Collect the stored file path of every match, in ranking order.
paths = []
for match in result.matches:
    paths.append(match['metadata']['img_path'])

draw_images(list(map(Image.open, paths)))

- Limitations
    - Since an image contains various elements such as the color of the clothes, people's poses, and light, it is not possible to select only the features of the clothes and conduct a search.
    - In other words, there is a high possibility of overlooking the details of the clothes.
- Overcoming measures
    - Extract the features of the clothes from the image in text format, and convert them to a dense or sparse vector for searching.

# 3. Hybrid search (Dense & sparse vector search)

- Considering the characteristics of each part by utilizing splade

In [None]:
from splade.splade.models.transformer_rep import Splade
from transformers import AutoTokenizer
import torch

# NOTE(review): this is the same checkpoint ID as `splade_model_id` in the text-search
# section, so `sparse_model` is a second instance of the model loaded there — confirm
# whether the duplicate load is intended or `splade_model` could be reused.
sparse_model_id = 'naver/splade-cocondenser-ensembledistil'

# splade = 'naver/splade-v3'
sparse_model = Splade(sparse_model_id, agg='max')
sparse_model.to('cpu')  # move to GPU if possible
sparse_model.eval()

# Rebinds `splade_tokenizer` (already defined earlier) to a tokenizer from the same checkpoint ID.
splade_tokenizer = AutoTokenizer.from_pretrained(sparse_model_id)

In [None]:
# v-neck
input_text = "orange party dress with long sleeve, v neck"

# Keep the embedding under its own name: assigning it to `d` would overwrite the
# merged DataFrame `d` built earlier, after which the upsert cells cannot be re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)
# sparse = gen_sparse_vector(input_text, splade_model, splade_tokenizer)

# Dense-only baseline for the same query the next cell runs with a sparse vector added.
result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the single query's row
    top_k=5,
    filter={"category": {"$eq": "dress"}},
    # sparse_vector=sparse,
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

In [None]:
# v-neck
input_text = "orange party dress with long sleeve, v neck"

# Keep the embedding under its own name: assigning it to `d` would overwrite the
# merged DataFrame `d` built earlier, after which the upsert cells cannot be re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)
sparse = gen_sparse_vector(input_text, splade_model, splade_tokenizer)

# Hybrid search: dense CLIP embedding plus the SPLADE sparse vector for the same text.
result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the single query's row
    top_k=5,
    filter={"category": {"$eq": "dress"}},
    sparse_vector=sparse,
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

In [None]:

# Redraw the images returned by the most recent query.
paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

# Only the last expression of a cell is rendered, so the ID list is built once and
# reused for the lookup instead of being evaluated as a discarded bare expression.
match_ids = [match['id'] for match in result.matches]

# Show the attribute columns of the matched items from the clothing table.
df.loc[df['vdb_id'].isin(match_ids), ['vdb_id', 'ImageId', 'AttributesNames', 'second_AttributesNames']]

In [None]:
# Vector IDs of the matches from the most recent query.
[i['id'] for i in result.matches]

Since there is no text field related to fashion style, it is difficult to expect a significant performance improvement even if a sparse vector is used.

In [None]:
input_text = "Punk Fashion"

# Keep the embedding under its own name: assigning it to `d` would overwrite the
# merged DataFrame `d` built earlier, after which the upsert cells cannot be re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)
# sparse = gen_sparse_vector(input_text, splade_model, splade_tokenizer)

# Dense-only baseline for the same query the next cell runs with a sparse vector added.
result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the single query's row
    top_k=10,
    # sparse_vector=sparse,
    filter={"category": {"$eq": "top, t-shirt, sweatshirt"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

In [None]:
input_text = "Punk Fashion"

# Keep the embedding under its own name: assigning it to `d` would overwrite the
# merged DataFrame `d` built earlier, after which the upsert cells cannot be re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)
sparse = gen_sparse_vector(input_text, splade_model, splade_tokenizer)

# Hybrid search: dense CLIP embedding plus the SPLADE sparse vector for the same text.
result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the single query's row
    top_k=10,
    sparse_vector=sparse,
    filter={"category": {"$eq": "top, t-shirt, sweatshirt"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

- Types of attributes that can be used
```python
list_of_attributes = ['main_category', 'silhouette', 'silhouette_fit', 'waistline', 'length',
       'collar_type', 'neckline_type', 'sleeve_type', 'pocket_type',
       'opening_type', 'non-textile material type', 'leather',
       'textile finishing, manufacturing techniques', 'textile pattern']
```
<br>

- Format of the document that can be expressed with attributes

```json
silhouette_name : symmetrical,
collar_type_name : shirt (collar),
opening_type_name : single breasted,
non-textile material type_name : no non-textile material,
textile finishing, manufacturing techniques_name : no special manufacturing technique,
textile pattern_name : plain (pattern)

```

In [None]:
image = Image.open("test_images/test_image4.png")

img_emb = extract_img_features(image, processor, model).tolist()

# Dense-only image search restricted to the "jacket" category.
result = index.query(
    # The other image-search cells pass `img_emb[0]` for the same extract_img_features call;
    # send that single embedding here too instead of the enclosing nested list.
    vector=img_emb[0],
    top_k=5,  # how many results to return
    filter={"category": {"$eq": "jacket"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

In [None]:
# Display the query image loaded in the previous cell.
image

In [None]:
image = Image.open("test_images/test_image4.png")

img_emb = extract_img_features(image, processor, model).tolist()

# Sparse vector built from a text hint that describes the garment in the query image.
sparse_vector = gen_sparse_vector("suede jacket", sparse_model, splade_tokenizer)

# Hybrid search: dense image embedding plus the sparse text vector, "jacket" category only.
result = index.query(
    # The other image-search cells pass `img_emb[0]` for the same extract_img_features call;
    # send that single embedding here too instead of the enclosing nested list.
    vector=img_emb[0],
    sparse_vector=sparse_vector,
    top_k=5,  # how many results to return
    filter={"category": {"$eq": "jacket"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])