In [None]:
import json
import pandas as pd
import os
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import warnings
import numpy as np
from PIL import Image
import torch
from ultralytics.utils.metrics import mask_iou
import itertools
import cv2
import matplotlib.image as mpimg

# Ignore warnings
warnings.filterwarnings('ignore')
# pandas dataframe display
pd.set_option('display.max_columns', None)

In [None]:
# Annotation table; later cells read its ImageId, EncodedPixels, Height, Width,
# ClassId and AttributesIds columns.
anno = pd.read_csv('imaterialist-fashion-2020-fgvc7/train.csv')
# Label metadata; only the 'categories' and 'attributes' entries are used below.
with open('imaterialist-fashion-2020-fgvc7/label_descriptions.json', 'r') as file:
    labels = json.load(file)

# Lookup tables used to turn class ids and attribute ids into readable names.
categories = pd.DataFrame(labels['categories'])
attributes = pd.DataFrame(labels['attributes'])

In [None]:
anno.shape

In [None]:
anno.head()

In [None]:
categories

In [None]:
attributes.head(15)

In [None]:
attributes.tail(15)

In [None]:
attributes.supercategory.unique()

In [None]:
attributes.name.unique()

In [None]:
attributes.shape

In [None]:
# Attach the category name and supercategory to every annotation by matching
# ClassId against categories.id. The left join keeps every annotation row and
# also adds the 'id' column from `categories`, which later cells read as the class id.
# NOTE: running this cell twice merges into the already-merged frame, which makes
# pandas add suffixed duplicate columns.
anno = pd.merge(anno, categories[['id', 'name', 'supercategory']], left_on='ClassId', right_on=['id'], how='left')

In [None]:
anno.head(2)

## Table of Contents

### 0. Data Exploration
### 1. Understanding Types and Characteristics of Clothing (category & attributes)
- Understanding the scope of the dataset
### 2. Defining Relationships between Main/Sub Categories
- Preprocessing for more complete data
### 3. Converting Data for Recommendation Service Use
- Data transformation considering service connectivity
### 4. Preprocessing for Image Cropping
- Data normalization for efficient search

---

## 1. Understanding Types and Characteristics of Clothing (category & attributes)
#### : Examine what attributes each class has

In [None]:
# Lookup table: attribute id -> attribute name.
id_to_name = pd.Series(attributes.name.values, index=attributes.id).to_dict()


def ids_to_names(ids):
    """Translate a comma-separated string of attribute ids into attribute names.

    Parameters
    ----------
    ids : str or missing value
        Comma-separated attribute ids, e.g. ``"115,136"``. NaN/None is accepted.

    Returns
    -------
    str or float
        The attribute names joined with ``", "``. Ids that are not present in
        ``id_to_name`` are rendered as ``'Unknown'``. ``np.nan`` is returned when
        ``ids`` is missing or contains no id at all.
    """
    if pd.isna(ids):
        return np.nan
    # Skip empty tokens (from "" or a stray leading/trailing/double comma) instead
    # of crashing in int('').
    tokens = [token.strip() for token in ids.split(',')]
    names = [id_to_name.get(int(token), 'Unknown') for token in tokens if token]
    if not names:
        return np.nan
    return ', '.join(names)


# Apply the function to the AttributesIds column
anno['AttributesNames'] = anno['AttributesIds'].apply(ids_to_names)

In [None]:
id_to_name[115]

In [None]:
anno.head()

In [None]:
# Work on a copy so that `anno` keeps its string-typed attribute columns.
anno_tmp = anno.copy()

# Convert attributes from string format to list format
anno_tmp['AttributesIds'] = anno_tmp['AttributesIds'].str.split(',')
# ids_to_names joins the names with ', ', so split on that exact separator;
# splitting on ',' alone leaves a leading space on every name after the first.
anno_tmp['AttributesNames'] = anno_tmp['AttributesNames'].str.split(', ')

In [None]:
# One row per (annotation, attribute id).
exploded_df = anno_tmp.explode('AttributesIds')

# Distinct (category name, attribute id) combinations. The index is reset before
# the missing-value filter, so surviving rows keep the labels assigned at that point.
unique_pairs = (
    exploded_df[['name', 'AttributesIds']]
    .drop_duplicates()
    .reset_index(drop=True)
)
has_attribute = unique_pairs['AttributesIds'].notna()
unique_pairs = unique_pairs.loc[has_attribute].astype({'AttributesIds': int})
unique_pairs = unique_pairs.rename(columns={"name": "class_name"})

In [None]:
exploded_df.head()

In [None]:
unique_pairs.head(20)

In [None]:
# Look up the attribute metadata for every attribute id (human readable text),
# then order the table by class name and attribute id.
pairs = (
    unique_pairs
    .merge(attributes, left_on='AttributesIds', right_on='id', how='left')
    .sort_values(by=['class_name', 'id'])
)

In [None]:
pairs.head()

In [None]:
pairs.to_csv("clothes_pairs.csv", index=False)

In [None]:
for c in pairs.class_name.unique():
    print(c)

- Attributes are not clearly distinguished by class name

In [None]:
pairs.loc[pairs['class_name']=='pants', 'name'].values

In [None]:
pairs.loc[pairs['class_name']=='jacket', 'name'].values

- trucker (jacket)

<img src="https://www.billyreid.com/cdn/shop/files/f23_203-482_moleskin-tupelo-trucker-jacket_olive_201_A_3000x.jpg" width="300" height="300">

- houndstooth (pattern)

<img src="https://www.thecuttingclass.com/wp-content/uploads/2011/03/593bfb3e455e5_houndstooth.jpg" width="500" height="300">


## 2. Defining Relationships between Main/Sub Categories
#### : Examine annotations

- Run-Length Encoding (RLE) format
    - e.g.) 6068157 7 6073371 20 6078584 34
    - The 2D image is flattened to 1D column by column; the pair `6068157 7` means that 7 consecutive pixels are annotated, starting at the 6068157th pixel (positions are counted from 1)
    - Choice for efficient data storage

In [None]:
anno.head(2)

In [None]:
anno.loc[0, 'EncodedPixels']

In [None]:
# All annotations of one example image, re-indexed from 0 so that later cells can
# address rows with tmp.loc[<position>, ...].
is_example_image = anno['ImageId'] == '2f18aaab685a98876504a0f32d4c1d8e'
tmp = anno.loc[is_example_image].reset_index(drop=True)

- Note that labels referring to parts of a garment, such as `garment parts` or `closures`, are annotated inside the area of other labels such as `shirt, blouse`

In [None]:
tmp

In [None]:
def create_single_mask(image, annoations, class_ids):
    """Paint every run-length-encoded annotation of an image into one combined mask.

    Parameters
    ----------
    image : array-like
        Image array; only ``image.shape[:2]`` (height, width) is used.
    annoations : iterable of str
        Run-length encodings, each a whitespace-separated sequence of
        ``start length`` pairs. Starts are 1-based positions in the image
        flattened column by column (Fortran order).
    class_ids : iterable of int
        Class id of each encoding, aligned with ``annoations``.

    Returns
    -------
    list of numpy.ndarray
        A single-element list holding a ``(height, width)`` float array. Pixels of
        a segment are set to ``255 - class_id * 4``; where segments overlap the
        later one wins, and pixels without annotation stay 0.

    Raises
    ------
    AssertionError
        If a run starts beyond the last pixel of the image.
    """
    height, width = image.shape[:2]
    # Paint on the flattened image; the final reshape uses Fortran order because
    # the encoded positions run down each column before moving to the next one.
    mask = np.zeros(height * width)

    for pixel_str, class_id in zip(annoations, class_ids):
        values = [int(token) for token in pixel_str.split()]
        pixel_starts = values[::2]
        run_lengths = values[1::2]
        if not pixel_starts:
            # Empty encoding: nothing to paint for this segment.
            continue
        # Starts are 1-based, so a run may legitimately start at position mask.size.
        assert max(pixel_starts) <= mask.shape[0]
        for pixel_start, run_length in zip(pixel_starts, run_lengths):
            begin = pixel_start - 1
            mask[begin:begin + run_length] = 255 - class_id * 4

    return [mask.reshape((height, width), order='F')]

In [None]:
def show_image_with_mask(image_path, mask):
    """Show an image next to the same image with ``mask`` drawn over it.

    image_path: path of the image file, opened with PIL.
    mask: array handed to ``plt.imshow`` and drawn on the right-hand panel with
        the 'jet' colormap at 50% opacity.
    """
    picture = Image.open(image_path)

    plt.figure(figsize=(15, 15))
    # (subplot position, title, whether the mask is overlaid)
    panels = (
        (1, 'Original Image', False),
        (2, 'Image with Mask', True),
    )
    for position, title, with_overlay in panels:
        plt.subplot(1, 2, position)
        plt.imshow(picture)
        if with_overlay:
            plt.imshow(mask, cmap='jet', alpha=0.5)
        plt.title(title)
        plt.axis('off')

    plt.show()

In [None]:
image = mpimg.imread('imaterialist-fashion-2020-fgvc7/train/2f18aaab685a98876504a0f32d4c1d8e.jpg')
masks = create_single_mask(image, tmp['EncodedPixels'], tmp['id'])

show_image_with_mask('imaterialist-fashion-2020-fgvc7/train/2f18aaab685a98876504a0f32d4c1d8e.jpg', masks[0])

## 3. Converting Data for Recommendation Service Use

### The unit of clothing recommendation service is complete clothing items such as jackets, pants

- Pockets, sleeves, collars, etc. should be considered together with upper body categories such as shirts and blouses
    - Annotations of sub-categories (pockets, sleeves, collars) are made to overlap with annotations of main categories
- Also, the same kind of part can belong to different garments: pants pockets and upper-body pockets can appear together in one image
- Therefore, through pair generation of sub and main categories, we should be able to consider the characteristics of the complete clothing

In [None]:
def create_separate_masks(annoations, class_ids, height, width):
    """Decode each run-length-encoded annotation into its own binary mask.

    Parameters
    ----------
    annoations : iterable of str
        Run-length encodings, each a whitespace-separated sequence of
        ``start length`` pairs. Starts are 1-based positions in the image
        flattened column by column (Fortran order).
    class_ids : iterable
        Aligned with ``annoations``. The values are not used, but one mask is
        produced per (annotation, class id) pair, so the shorter input decides
        how many masks are returned.
    height, width : int
        Image size in pixels.

    Returns
    -------
    list of numpy.ndarray
        One ``(height, width)`` float array per annotation, 1 inside the segment
        and 0 elsewhere. An empty encoding yields an all-zero mask.

    Raises
    ------
    AssertionError
        If a run starts beyond the last pixel of the image.
    """
    n_pixels = height * width
    masks = []

    for pixel_str, _class_id in zip(annoations, class_ids):
        mask = np.zeros(n_pixels)
        values = [int(token) for token in pixel_str.split()]
        pixel_starts = values[::2]
        run_lengths = values[1::2]
        if pixel_starts:
            # Starts are 1-based, so a run may legitimately start at position n_pixels.
            assert max(pixel_starts) <= mask.shape[0]
        for pixel_start, run_length in zip(pixel_starts, run_lengths):
            begin = pixel_start - 1
            mask[begin:begin + run_length] = 1
        # Encoded positions run down the columns, hence the Fortran-order reshape.
        masks.append(mask.reshape((height, width), order='F'))
    return masks

In [None]:
masks = create_separate_masks(tmp['EncodedPixels'], tmp['id'], tmp['Height'].values[0], tmp['Width'].values[0])

In [None]:
masks[2].shape

In [None]:
print("Suit characteristics: ")
print(tmp.loc[8, 'AttributesNames'])

In [None]:
show_image_with_mask('imaterialist-fashion-2020-fgvc7/train/2f18aaab685a98876504a0f32d4c1d8e.jpg', masks[8])

In [None]:
print("Jacket pocket characteristics: ")
print(tmp.loc[5, 'AttributesNames'])

In [None]:
show_image_with_mask('imaterialist-fashion-2020-fgvc7/train/2f18aaab685a98876504a0f32d4c1d8e.jpg', masks[5])

In [None]:
print("Pants characteristics: ")
print(tmp.loc[9, 'AttributesNames'])

In [None]:
show_image_with_mask('imaterialist-fashion-2020-fgvc7/train/2f18aaab685a98876504a0f32d4c1d8e.jpg', masks[9])

In [None]:
print("Pants pocket characteristics: ")
print(tmp.loc[10, 'AttributesNames'])

In [None]:
show_image_with_mask('imaterialist-fashion-2020-fgvc7/train/2f18aaab685a98876504a0f32d4c1d8e.jpg', masks[10])

In [None]:
print("Belt characteristics: ")
print(tmp.loc[11, 'AttributesNames'])

In [None]:
show_image_with_mask('imaterialist-fashion-2020-fgvc7/train/2f18aaab685a98876504a0f32d4c1d8e.jpg', masks[11])

In [None]:
print("Belt buckle characteristics: ")
print(tmp.loc[12, 'AttributesNames'])

In [None]:
show_image_with_mask('imaterialist-fashion-2020-fgvc7/train/2f18aaab685a98876504a0f32d4c1d8e.jpg', masks[12])

### Creating Annotation pairs
1. Convert annotation to binary mask format
2. Within one image, search for pairs of annotations in which more than 90% of the smaller annotation's pixels are also covered by the other annotation
3. Integrate based on the annotation with the larger range among the annotation pairs

In [None]:
def flatten_mask(mask):
    """Convert a mask array into a float32 torch tensor of shape (1, n_pixels).

    The pixels are read in row-major (C) order, which is what ``ndarray.flatten``
    does by default.
    """
    row_vector = mask.flatten().reshape(1, -1)
    return torch.tensor(row_vector, dtype=torch.float32)

def check_overlap(mask1, mask2, threshold=0.9):
    """
    Determine if the overlap between two masks covers more than `threshold` of the smaller mask.

    :param mask1: torch tensor of 0/1 values (for example the output of flatten_mask).
    :param mask2: torch tensor of 0/1 values with the same shape as ``mask1``.
    :param threshold: fraction of the smaller mask's pixels that must also be set
        in the other mask; the comparison is strict (``>``).
    :return: True if intersection / smaller_area > threshold, otherwise False.
        False is also returned when either mask is empty.
    """
    # Sum in float64: float32 cannot represent every integer above 2**24, which
    # large masks can exceed.
    area1 = mask1.sum(dtype=torch.float64).item()
    area2 = mask2.sum(dtype=torch.float64).item()

    smaller_area = min(area1, area2)
    if smaller_area == 0:
        # An empty mask cannot be covered by anything.
        return False

    # For 0/1 masks the element-wise product is 1 exactly where both masks are set,
    # so its sum is the intersection area (no round trip through IoU needed).
    intersection = (mask1 * mask2).sum(dtype=torch.float64).item()

    return intersection / smaller_area > threshold

In [None]:
image = mpimg.imread('imaterialist-fashion-2020-fgvc7/train/{}.jpg'.format(tmp.ImageId.unique()[0]))
masks = create_separate_masks(tmp['EncodedPixels'], tmp['id'], tmp['Height'].values[0], tmp['Width'].values[0])

combinations = list(itertools.combinations(range(len(masks)), 2))

pairs = list()

# Consider every combination of two annotations
for comb in combinations:
    first, second = comb
    mask1, mask2 = masks[first], masks[second]
    # Flatten both masks for the comparison and skip the pair unless the smaller
    # binary mask shares more than 90% of its pixels with the larger one
    if not check_overlap(flatten_mask(mask1), flatten_mask(mask2)):
        continue
    # The larger of the two masks becomes the representative (main) mask
    representative = first if mask1.sum() > mask2.sum() else second
    pairs.append([representative, comb])

In [None]:
pairs

In [None]:
print("Shirt")
tmp.loc[[0,1,2,3]]

In [None]:
print('Jacket')
tmp.loc[[4,5,6,7,8,16]]

In [None]:
print('Pants')
tmp.loc[[9, 10]]

- Add characteristics of `sub-items` such as sleeves and pockets to one column

In [None]:
attributes.tail(30)

In [None]:
tmp['second_AttributesIds'] = ''

In [None]:
tmp.head(1)

In [None]:
pairs

In [None]:
# Indices of the annotations chosen as the main (larger) item of at least one
# overlapping pair. Assigned here because nothing earlier in the notebook defines
# the name, so displaying it alone raises NameError on a fresh kernel.
main_pairs = list(set([i[0] for i in pairs]))
main_pairs

In [None]:
# Indices of the annotations that are the main (larger) item of at least one pair.
main_pairs = list(set([i[0] for i in pairs]))

for mp in main_pairs:
    # Every index pair in which annotation `mp` is the main item.
    # (Was assigned to the misspelled name `pairb`, leaving `pair` undefined below.)
    pair = [i[1] for i in pairs if i[0]==mp]
    print("Clothing related to clothing number {}: ".format(mp), pair)
    # All annotation indices that appear in those pairs, without duplicates.
    flat_pair = list(set([element for tuple_ in pair for element in tuple_]))
    # Everything except the main item itself is a sub-category of it.
    sub_category = [i for i in flat_pair if i!=mp]
    print("Sub-category of clothing number {}: ".format(mp), sub_category)
    # Pool the attribute ids of all sub-categories into one comma-separated string.
    sub_attributes = tmp.loc[sub_category, 'AttributesIds'].values
    sub_attributes = list(set(','.join(sub_attributes).split(',')))
    sub_attributes = ','.join(sub_attributes)
    print("Sub-characteristic id of clothing number {}: ".format(mp), sub_attributes)
    # Store the pooled ids on the main item's row.
    tmp.loc[mp, 'second_AttributesIds'] = sub_attributes

    print("-"*20)


In [None]:
pair

In [None]:
flat_pair, sub_category

In [None]:
sub_attributes

In [None]:
tmp

In [None]:
len(combinations)

In [None]:
tmp.shape

### Apply to the entire dataset

- Refer to `00.preprocess_annotations.py`
- Execution method: `python 00.preprocess_annotations.py` (takes approximately 10 hours)

In [None]:
def search_attribute_pairs(tmp_df, image_base_path='imaterialist-fashion-2020-fgvc7/train'):
    """Find annotation pairs of one image where one mask lies almost entirely inside the other.

    Parameters
    ----------
    tmp_df : pandas.DataFrame
        Annotations of a single image. The columns ``EncodedPixels``, ``id``,
        ``Height`` and ``Width`` are read; height and width are taken from the
        first row.
    image_base_path : str
        No longer used; kept so existing calls keep working. The image file used
        to be read from this directory, but its pixels were never needed because
        the masks are rebuilt from ``Height``/``Width`` alone.

    Returns
    -------
    list
        One ``[main_index, (i, j)]`` entry per overlapping pair, where ``i < j``
        are positions in ``tmp_df`` (after resetting the index) and
        ``main_index`` is whichever of the two has the larger mask (``j`` on a tie).
    """
    tmp = tmp_df.reset_index(drop=True)
    # Create binary masks
    masks = create_separate_masks(tmp['EncodedPixels'], tmp['id'], tmp['Height'].values[0], tmp['Width'].values[0])

    # Flatten and measure every mask once, rather than once per combination.
    flat_masks = [flatten_mask(mask) for mask in masks]
    areas = [mask.sum() for mask in masks]

    pairs = []
    # Consider all combinations
    for comb in itertools.combinations(range(len(masks)), 2):
        first, second = comb
        # Check if the smaller mask shares more than 90% of pixels with the larger mask
        if check_overlap(flat_masks[first], flat_masks[second]):
            # Select the larger mask as the representative mask
            main_index = first if areas[first] > areas[second] else second
            pairs.append([main_index, comb])
    return pairs

In [None]:
def merge_attribute_pairs(tmp_df, pairs):
    """Store the pooled attribute ids of each main item's sub-items on the main item's row.

    Parameters
    ----------
    tmp_df : pandas.DataFrame
        Annotations of a single image with an ``AttributesIds`` column of
        comma-separated id strings (missing values allowed).
    pairs : list
        ``[main_index, (i, j)]`` entries as returned by ``search_attribute_pairs``;
        indices are positions in ``tmp_df``.

    Returns
    -------
    pandas.DataFrame
        A re-indexed copy of ``tmp_df`` in which missing ``AttributesIds`` are
        replaced by ``''`` and, for every main index, ``second_AttributesIds``
        holds the distinct attribute ids of its sub-items as a comma-separated
        string in ascending numeric order (``''`` if the sub-items have none).
        ``tmp_df`` itself is not modified.
    """
    tmp = tmp_df.reset_index(drop=True).copy()
    # Convert to string value since some cases may not have attributes
    tmp.loc[tmp['AttributesIds'].isna(), 'AttributesIds'] = ''

    def _numeric_first(token):
        # Order ids numerically; anything non-numeric sorts after them, alphabetically.
        is_number = token.isdigit()
        return (not is_number, int(token) if is_number else 0, token)

    for mp in sorted({entry[0] for entry in pairs}):
        # Every other index paired with the main category == its sub-categories
        sub_category = sorted({
            index
            for main_index, members in pairs if main_index == mp
            for index in members if index != mp
        })
        # Merge attributes of sub-categories into one set of ids
        tokens = set()
        for attribute_str in tmp.loc[sub_category, 'AttributesIds']:
            tokens.update(token.strip() for token in attribute_str.split(','))
        # Sub-items without attributes contribute '' -- keep it out of the result.
        tokens.discard('')
        # Save as second attribute of the main category
        tmp.loc[mp, 'second_AttributesIds'] = ','.join(sorted(tokens, key=_numeric_first))

    return tmp

In [None]:
# # Plan: rebuild the complete annotation table
# new_anno = pd.DataFrame()

# # Iterate image by image
# for image in tqdm(anno['ImageId'].unique()):
#     # DataFrame holding the annotations of a single image
#     tmp_df = anno.loc[anno['ImageId']==image]
#     # Find the overlapping pairs
#     pairs = search_attribute_pairs(tmp_df)
#     if len(pairs)>0:
#         # Merge the attributes and update the frame
#         tmp_df = merge_attribute_pairs(tmp_df, pairs)
#     # Append to the new DataFrame
#     new_anno = pd.concat([new_anno, tmp_df])

### read

In [None]:
file_path = 'outputs.json'

data = []

# The file holds one JSON document per line; parse every line and collect the
# results in `data` for further processing.
with open(file_path, 'r') as file:
    data.extend(json.loads(line) for line in file)

In [None]:
pd.DataFrame(data[0])

In [None]:
pd.DataFrame(data[1])

In [None]:
# Turn every parsed record into its own DataFrame.
data = [pd.DataFrame(record) for record in data]

In [None]:
anno = pd.concat(data, axis=0)

In [None]:
anno.head()

In [None]:
anno.shape

## 4. Preprocessing for Image Cropping
#### : Generate bounding boxes and crop images for individual storage

- At this point, sub-categories such as sleeves and pockets are removed (their attributes are already stored in the second-attribute of the main category)

In [None]:
# Keep only annotations whose ClassId is below 27.
# NOTE(review): 27 is presumably the first id of the sub-category range (sleeves,
# pockets, ...) -- confirm against the `categories` table.
anno = anno.loc[anno['ClassId']<27]

In [None]:
anno.head()

In [None]:
anno.shape

1. mask
2. bbox

- 왜 bbox?
    - 주변 context까지 참고 할 수 있도록 이미지 crop

In [None]:
tmp = anno.loc[anno['ImageId']=='2f18aaab685a98876504a0f32d4c1d8e']

masks = create_separate_masks(tmp['EncodedPixels'], tmp['ClassId'], tmp['Height'].values[0], tmp['Width'].values[0])

show_image_with_mask('imaterialist-fashion-2020-fgvc7/train/2f18aaab685a98876504a0f32d4c1d8e.jpg', masks[2])

In [None]:
def find_bounding_box(mask):
    """
    Find the bounding box of non-zero pixels in a mask.

    :param mask: The binary mask (2-D array; axis 0 is y, axis 1 is x).
    :return: A tuple (x_min, y_min, x_max, y_max) of built-in ints representing the
        bounding box. The max coordinates are the last non-zero column/row, i.e. inclusive.
    :raises IndexError: If the mask contains no non-zero pixel.
    """
    rows = np.any(mask, axis=1)
    cols = np.any(mask, axis=0)
    # First and last row / column that contain at least one non-zero pixel.
    y_min, y_max = np.where(rows)[0][[0, -1]]
    x_min, x_max = np.where(cols)[0][[0, -1]]

    # Return built-in ints rather than NumPy scalars: with NumPy >= 2 a tuple of
    # NumPy integers is rendered as "(np.int64(2), ...)", which cannot be parsed
    # back into integers once the boxes have been written to CSV.
    return int(x_min), int(y_min), int(x_max), int(y_max)

- 약 30분 소요

In [None]:
bboxes = list()

# For every image, rebuild the per-annotation masks and collect their bounding boxes.
for i in tqdm(anno.ImageId.unique()):
    tmp = anno.loc[anno['ImageId']==i]
    masks = create_separate_masks(tmp['EncodedPixels'], tmp['ClassId'], tmp['Height'].values[0], tmp['Width'].values[0])
    bbox = [find_bounding_box(i) for i in masks]
    bboxes.extend(bbox)
    # The loop stops after the first image, so `bboxes` only covers that image;
    # remove the `break` to process the whole dataset.
    break

# Renumber the rows from 0 after the earlier filtering.
anno.reset_index(drop=True, inplace=True)

In [None]:
i

In [None]:
tmp

In [None]:
bbox

In [None]:
# The computation above takes a long time, so for the sake of time
# a data frame that was built beforehand is used instead.

def listify(string, encap_type="()"):
    """Parse a stringified integer sequence such as ``"(1, 2, 3)"`` into a list of ints.

    Parameters
    ----------
    string : str
        Text of the sequence, e.g. ``"(10, 20, 30, 40)"``.
    encap_type : str
        Characters to strip from both ends of ``string`` (``"()"`` for tuples,
        ``"[]"`` for lists).

    Returns
    -------
    list of int
        The parsed integers; an empty sequence such as ``"()"`` gives ``[]``.

    Raises
    ------
    ValueError
        If an element is not an integer literal.
    """
    inner = string.strip().strip(encap_type)
    # Split on the comma only and let int() absorb surrounding whitespace, so that
    # "(1,2)", "(1, 2)" and a one-element "(5,)" are all accepted.
    return [int(token) for token in inner.split(',') if token.strip()]

anno = pd.read_csv("clothes_final.csv")

# When the file is first read, pandas sees the values as strings rather than lists, so they must be converted
anno['bbox'] = [listify(i) for i in anno['bbox']]
anno['bbox_big'] = [listify(i) for i in anno['bbox_big']]

In [None]:
anno.head(2)

In [None]:
def enlarge_bounding_box(bbox, img_shape, scale=0.05):
    """Grow a bounding box by a fraction of its own size, clamped to the image.

    bbox: (x_min, y_min, x_max, y_max).
    img_shape: sequence whose first two entries are the image height and width.
    scale: fraction of the box width/height to add in total; half of it
        (floored) is added on each side.

    Returns the enlarged (x_min, y_min, x_max, y_max) as ints.
    """
    left, top, right, bottom = bbox
    img_height, img_width = img_shape[0], img_shape[1]

    # Padding per side: half of the total enlargement, floored.
    pad_x = ((right - left) * scale) // 2
    pad_y = ((bottom - top) * scale) // 2

    # Clamp so the enlarged box never leaves the image.
    new_left = max(0, left - pad_x)
    new_top = max(0, top - pad_y)
    new_right = min(img_width, right + pad_x)
    new_bottom = min(img_height, bottom + pad_y)

    return int(new_left), int(new_top), int(new_right), int(new_bottom)

In [None]:
bigger_bboxes = [enlarge_bounding_box(box, [h, w]) for box, h, w in zip(anno['bbox'], anno['Height'], anno['Width'])]

In [None]:
anno['bbox_big'] = bigger_bboxes

bbox가 알맞게 만들어졌는지 테스트

In [None]:
def draw_bounding_box(image, bbox, color=(0, 255, 0), thickness=20):
    """Draw ``bbox`` on an image and show the result with matplotlib.

    image: array converted with cv2.COLOR_BGR2RGB before drawing (e.g. the
        output of cv2.imread).
    bbox: (x_min, y_min, x_max, y_max) corner coordinates.
    color: colour tuple; it is reversed before being handed to cv2.rectangle.
    thickness: line thickness passed to cv2.rectangle.

    Nothing is returned; the figure is drawn on the current matplotlib axes.
    """
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    top_left = (bbox[0], bbox[1])
    bottom_right = (bbox[2], bbox[3])
    # Reverse the channel order of the colour to match the converted image.
    cv2.rectangle(rgb_image, top_left, bottom_right, color[::-1], thickness)

    plt.imshow(rgb_image)
    plt.axis('off')  # no axis ticks around the picture

In [None]:
anno.head(2)

In [None]:
tmp = anno.loc[anno['ImageId']=="00000663ed1ff0c4e0132b9b9ac53f6e"]

In [None]:
masks = create_separate_masks(tmp['EncodedPixels'], tmp['ClassId'], tmp['Height'].values[0], tmp['Width'].values[0])

In [None]:
show_image_with_mask('imaterialist-fashion-2020-fgvc7/train/00000663ed1ff0c4e0132b9b9ac53f6e.jpg', masks[0])

In [None]:
draw = draw_bounding_box(cv2.imread("imaterialist-fashion-2020-fgvc7/train/00000663ed1ff0c4e0132b9b9ac53f6e.jpg"), tmp['bbox'][0])

In [None]:
draw = draw_bounding_box(cv2.imread("imaterialist-fashion-2020-fgvc7/train/00000663ed1ff0c4e0132b9b9ac53f6e.jpg"), tmp['bbox_big'][0])

In [None]:
show_image_with_mask('imaterialist-fashion-2020-fgvc7/train/00000663ed1ff0c4e0132b9b9ac53f6e.jpg', masks[1])

In [None]:
draw = draw_bounding_box(cv2.imread("imaterialist-fashion-2020-fgvc7/train/00000663ed1ff0c4e0132b9b9ac53f6e.jpg"), tmp['bbox_big'][1])

In [None]:
def calculate_bbox_metrics(bbox):
    """Return the width, height and area of an (x_min, y_min, x_max, y_max) box.

    The result is a dict with the keys 'width', 'height' and 'area', where the
    area is width * height.
    """
    left, top, right, bottom = bbox
    metrics = {'width': right - left, 'height': bottom - top}
    metrics['area'] = metrics['width'] * metrics['height']
    return metrics

- bounding box 관련 정보 추가

In [None]:
anno.head(2)

In [None]:
bbox_metrics = [calculate_bbox_metrics(i) for i in anno['bbox_big']]

In [None]:
anno = pd.concat([anno, pd.DataFrame(bbox_metrics)], axis=1)

- attribute과 classId 이름 다시 붙이기

In [None]:
# category
anno = pd.merge(anno, categories[['id', 'name', 'supercategory']], left_on='ClassId', right_on=['id'], how='left')

In [None]:
# Preprocess attribute-id strings
def clean_attributes(attr_str):
    """Normalise a comma-separated attribute-id string.

    Empty tokens (from leading, trailing or doubled commas) and whitespace around
    tokens are removed.

    Parameters
    ----------
    attr_str : str, float or None
        The raw string, or a missing value (None or a float such as NaN).

    Returns
    -------
    str or float
        The cleaned string, e.g. ``",1,,2"`` -> ``"1,2"``; ``np.nan`` when the
        input is missing or no id is left after cleaning.
    """
    # Missing values arrive as float NaN (pandas) or as None (e.g. JSON null).
    if attr_str is None or isinstance(attr_str, float):
        return np.nan
    tokens = [token.strip() for token in attr_str.split(',')]
    cleaned = ','.join(token for token in tokens if token)
    return cleaned if cleaned else np.nan
    

# Treat empty attribute strings as missing values.
anno.loc[anno['AttributesIds']=='', 'AttributesIds'] = np.nan

# Missing values are filled with NaN here; empty strings are converted on the next line.
anno['second_AttributesIds'] = anno['second_AttributesIds'].fillna(np.nan)
anno.loc[anno['second_AttributesIds']=='', 'second_AttributesIds'] = np.nan

In [None]:
anno['AttributesNames'] = [ids_to_names(i) for i in anno['AttributesIds']]

In [None]:
anno['second_AttributesIds'] = [clean_attributes(i) for i in anno['second_AttributesIds']]
anno['second_AttributesNames'] = [ids_to_names(i) for i in anno['second_AttributesIds']]

In [None]:
anno.head()

In [None]:
anno.head(2)

In [None]:
anno.to_csv("clothes_final.csv", index=False)

## 목차

### 0. 데이터 탐색
### 1. 옷의 종류와 특징 파악 (category & attributes)
- 데이터 셋의 범위 파악
### 2. 상위 / 하위 카테고리의 관계 정의
- 보다 온전한 데이터를 위한 전처리 작업
### 3. 추천 서비스에 활용할 단위의 데이터로 변환
- 서비스와의 연계성을 고려한 데이터 변환
### 4. 이미지 cropping을 위한 전처리
- 효율적인 search를 위한 데이터 정규화