<a href="https://colab.research.google.com/github/hungryjins/Fashion_rec/blob/main/01.%20build_text_and_image_db(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import json

import os
import openai
from tqdm.notebook import tqdm

from image_utils import crop_bbox

In [None]:
df = pd.read_csv("clothes_final.csv")

In [None]:
df.head(3)

In [None]:
df['bbox'][0]

In [None]:
def listify(string, encap_type="()"):
    """Parse a stringified sequence of integers such as ``"(1, 2, 3)"`` into a list.

    Args:
        string: Text like ``"(1, 2, 3)"`` or ``"[1,2,3]"``. A value that is
            already a sequence (list/tuple) is accepted as well and returned
            as a list of ints, so re-running a conversion cell does not fail.
        encap_type: Characters to strip from both ends of ``string``
            (``"()"`` by default, pass ``"[]"`` for bracketed text).

    Returns:
        A list of ``int``. Empty input such as ``"()"`` yields ``[]``.

    Raises:
        ValueError: If an element cannot be converted to ``int``.
    """
    # Already-parsed values pass through unchanged (apart from int coercion).
    if not isinstance(string, str):
        return [int(num) for num in string]

    inner = string.strip().strip(encap_type)
    # Split on bare commas so "1,2" and "1, 2" both work; int() tolerates the
    # surrounding whitespace, and empty tokens (from "()" or a trailing comma)
    # are skipped.
    return [int(num) for num in inner.split(',') if num.strip()]

In [None]:
# read_csv hands the bounding-box columns back as strings rather than lists,
# so each value is parsed into a list of ints.
for bbox_col in ('bbox', 'bbox_big'):
    df[bbox_col] = [listify(raw) for raw in df[bbox_col]]

In [None]:
df.head(2)

📌 Table of Contents – CLIP Embeddings
- Save cropped images of each product locally
- Generate CLIP embeddings
1. Exploratory Data Analysis of Bounding Boxes

2. Crop each entity based on its bounding box

3. Resize the cropped images by category and save them locally

4. Generate embeddings using CLIP

   - Use a fine-tuned CLIP model
   - Represent both text and images in a unified embedding space

---

## 1. Bounding box EDA
- What is the 'size' of the products in each image?
- Image size matters because the similarity between embeddings is affected by it.
- Therefore, it is important that images belonging to one category are all represented in the same size.

In [None]:
img = Image.open("imaterialist-fashion-2020-fgvc7/train/007e66e7c2864eb3c1ef95cd3ab52687.jpg")

In [None]:
img

In [None]:
df.loc[218]

In [None]:
cropped = crop_bbox(img, df['bbox'][218])
cropped

In [None]:
df.loc[223]

In [None]:
img = Image.open("imaterialist-fashion-2020-fgvc7/train/007e66e7c2864eb3c1ef95cd3ab52687.jpg")
cropped = crop_bbox(img, df['bbox'][223])
cropped

In [None]:
# For every supercategory: the item names it contains and the medians of the
# `area`, `width` and `height` columns.
for cat in df['supercategory'].unique():
    cat_rows = df.loc[df['supercategory'] == cat]
    print(cat)
    print(cat_rows['name'].unique())
    medians = {col: np.median(cat_rows[col]) for col in ('area', 'width', 'height')}
    print(f"Area : {medians['area']}, width : {medians['width']}, height : {medians['height']}")
    print("-" * 10)


Each category has its own image characteristics
- lower body is on average 410 horizontally and 540 vertically
- upper body is longer vertically than lower body
- wholebody is longer vertically than upper body
- waist is longer horizontally than vertically
- arms and hands have similar horizontal and vertical ratios and are generally small

## 2. Crop each entity based on the bounding box

In [None]:
# Target size for each supercategory, stored as [width, height];
# resize_img unpacks every entry in that order.
size = {"lowerbody":[420, 540],
        "upperbody":[500, 700],
        "wholebody":[480, 880],
        "legs and feet":[100, 150],
        "head":[150, 100],
        "others":[200, 350],
        "waist":[200, 100],
        "arms and hands":[75, 75],
        "neck":[120, 200]}

In [None]:
df.head(3)

In [None]:
img = Image.open("imaterialist-fashion-2020-fgvc7/train/00000663ed1ff0c4e0132b9b9ac53f6e.jpg")
cropped = crop_bbox(img, df['bbox_big'][0])
cropped

#### image resize

In [None]:
from PIL import Image, ImageFilter

def resize_img(image, standard_size, category):
    """Resize ``image`` to the fixed size registered for ``category``.

    Args:
        image: Image exposing ``size``, ``resize`` and ``filter``.
        standard_size: Mapping from category name to ``[width, height]``.
        category: Key to look up in ``standard_size``.

    Returns:
        The image resampled with LANCZOS to exactly the target size. When the
        source has fewer pixels than the target (it is being enlarged), an
        unsharp mask is applied to the result as well.
    """
    src_w, src_h = image.size
    target_w, target_h = standard_size[category]

    # The resampling step is the same in both directions.
    resized = image.resize((target_w, target_h), Image.Resampling.LANCZOS)

    is_upsizing = src_w * src_h < target_w * target_h
    if is_upsizing:
        # Sharpen the enlarged image.
        return resized.filter(ImageFilter.UnsharpMask(radius=2, percent=150, threshold=3))
    return resized

In [None]:
resize_img(cropped, size, df['supercategory'][0])

## 3. Resize and save cropped images locally according to each item

In [None]:
base_path = "imaterialist-fashion-2020-fgvc7/train"
cropped_path = "imaterialist-fashion-2020-fgvc7/cropped_images"

# Saving into a missing directory fails, so make sure the target exists.
os.makedirs(cropped_path, exist_ok=True)

# One frame per source image, concatenated once after the loop. Growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
entity_frames = []

# groupby yields each image's rows in a single pass (first-appearance order)
# instead of re-filtering the whole frame for every image.
for image_name, tmp in tqdm(df.groupby('ImageId', sort=False)):
    # Rows for this one image; the original row label is kept as `entity_id`
    # and is used in the output file name.
    tmp = tmp.reset_index().rename(columns={"index": "entity_id"})
    # The context manager closes the source file once all crops are written.
    with Image.open(os.path.join(base_path, image_name + ".jpg")) as image:
        # Crop every item in the image, resize it, and save it locally.
        for idx, row in tmp.iterrows():
            cropped_img = crop_bbox(image, row['bbox_big'])
            resized_img = resize_img(cropped_img, size, row['supercategory'])
            resized_img.save(os.path.join(cropped_path, image_name + "_" + str(row['entity_id']) + ".jpg"))

    entity_frames.append(tmp)

new_df = pd.concat(entity_frames, axis=0) if entity_frames else pd.DataFrame()

In [None]:
# Persist the entity table so it can be reloaded from "clothes_final2.csv".
# An existing file is kept as-is; delete it first to regenerate the table.
if not os.path.exists("clothes_final2.csv"):
    new_df.to_csv("clothes_final2.csv", index=False)

In [None]:
new_df = pd.read_csv("clothes_final2.csv")

# The box columns are read back as bracketed strings; parse them into lists again.
for bbox_col in ('bbox', 'bbox_big'):
    new_df[bbox_col] = [listify(raw, "[]") for raw in new_df[bbox_col]]

In [None]:
df.head(2)

In [None]:
new_df.head()

## 4. Embedding using CLIP

- fashion dataset used for pretraining the CLIP model
- The CLIP model uses <image>-<text> pairs as input data, representing both in a single embedding space.
- Therefore, a model fine-tuned using <fashion image>-<fashion caption> pairs is suitable for the current project purpose.
- The dot product will be used to rank embeddings by similarity.
```json
"FashionCLIP performs the dot product between the input caption embedding and each image vector embedding"

"The text used is a concatenation of the highlight (e.g., “stripes”, “long sleeves”, “Armani”) and short description (“80s styled t-shirt”)) available in the Farfetch dataset."
```

![Fine-tune training data](https://media.springernature.com/full/springer-static/image/art%3A10.1038%2Fs41598-022-23052-9/MediaObjects/41598_2022_23052_Fig3_HTML.png?as=webp "Fine-tune training data")

(Source: *Contrastive language and vision learning of general fashion concepts*)

- hugging face : https://huggingface.co/patrickjohncyh/fashion-clip
- paper : https://www.nature.com/articles/s41598-022-23052-9

#### F-CLIP VS CLIP

https://www.nature.com/articles/s41598-022-23052-9/tables/1



In [None]:
new_df.head()

In [None]:
from transformers import CLIPProcessor, CLIPModel

# Checkpoint name of the fashion fine-tuned CLIP model; the model and its
# processor are both loaded from this same name.
model_name = "patrickjohncyh/fashion-clip"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

In [None]:
# Load the file names of the cropped images
cropped_path = "imaterialist-fashion-2020-fgvc7/cropped_images"

# Only the top-level listing is needed, so take the first os.walk entry instead
# of walking and materialising the whole tree. A missing directory gives an
# empty list rather than an IndexError.
images = next(os.walk(cropped_path), (cropped_path, [], []))[2]

In [None]:
images[:3]

image embeddings from CLIP

In [None]:
from image_utils import extract_img_features

img_emb = extract_img_features(img, processor, model)

In [None]:
img_emb.shape

text embeddings from CLIP

In [None]:
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer

# `model_name`, `model` and `processor` come from the earlier model-loading
# cell. Reloading the same checkpoint here would only cost time and memory,
# so just the tokenizer is added.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def get_single_text_embedding(text, model, tokenizer):
    """Embed text with the model's text encoder and return plain Python lists.

    Args:
        text: A string or a list of strings; the tokenizer is called with
            ``padding=True``, so a batch of captions can be passed at once.
        model: Object exposing ``get_text_features(**inputs)``.
        tokenizer: Callable that turns ``text`` into the model's keyword inputs.

    Returns:
        The text features converted to a (nested) list via ``numpy().tolist()``.
    """
    import torch  # local import: torch is not imported at notebook level

    # truncation=True keeps over-long captions within the tokenizer's maximum
    # length instead of failing inside the model.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        text_embeddings = model.get_text_features(**inputs)
    # convert the embeddings to numpy array
    embedding_as_np = text_embeddings.cpu().detach().numpy()
    return embedding_as_np.tolist()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
img = Image.open("imaterialist-fashion-2020-fgvc7/train/3bccf2e618d8f5f51442037ad3c8d4fb.jpg")
img

fashion fine-tuned model

```json
"The text used is a concatenation of the highlight (e.g., “stripes”, “long sleeves”, “Armani”) and short description (“80s styled t-shirt”)) available in the Farfetch dataset."
```

In [None]:
# Embed the sample image and a handful of candidate captions.
img_emb = extract_img_features(img, processor, model)

sample_texts = [
    'tshirt',
    "formal suit and tie",
    'a woman',
    "a lion in a cage",
    "black top short sleeves",
    'black shirt with check patterns, topwear',
    'iphone',
]

sample_texts_emb = get_single_text_embedding(sample_texts, model, tokenizer)

# Cosine similarity is used for this check, although the dot product will be
# used in the future.
img_emb_np = img_emb.cpu().detach().numpy()
sims = cosine_similarity(img_emb_np, sample_texts_emb)
print("Similarity with image")
for t, s in zip(sample_texts, sims[0]):
    print(f"{t} : {s}")
    print()

In [None]:
img_emb.cpu().detach().numpy()[0].shape

In [None]:
np.array(s).shape

In [None]:
print('dot product')
# The image vector is identical for every caption, so convert it once.
img_vec = img_emb.cpu().detach().numpy()[0]
for text, s in zip(sample_texts, sample_texts_emb):
    sim = np.dot(img_vec, np.array(s))
    print(text, sim)

---

In [None]:
embeddings = {}

# The file is read line by line; every non-empty line is a JSON object that
# maps image names to embedding lists.
with open('img_embeddings_fashion_fine_tuned.json', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) would make json.loads fail.
            continue

        # Convert each line to a dictionary
        embedding_dict = json.loads(line)

        # Convert the list back to a NumPy array
        for img_name, emb_list in embedding_dict.items():
            embeddings[img_name] = np.array(emb_list)

In [None]:
len(embeddings)

In [None]:
type(embeddings)

In [None]:
# Print the first key only. The loop variable `v` stays bound to its value
# after the break and is inspected in the following cell.
for k,v in embeddings.items():
    print(k)
    break

In [None]:
v.shape

In [None]:
"FashionCLIP performs the dot product between the input caption embedding and each image vector embedding"

"The text used is a concatenation of the highlight (e.g., “stripes”, “long sleeves”, “Armani”) and short description (“80s styled t-shirt”)) available in the Farfetch dataset."

```
fashion fine-tuned model
```

```

```

* * *

```