In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the dataset through kagglehub and keep whatever dataset_download returns
# (presumably the local dataset directory - confirm against the kagglehub docs).
paramaggarwal_fashion_product_images_small_path = kagglehub.dataset_download('paramaggarwal/fashion-product-images-small')

# Printed once the download call has returned.
print('Data source import complete.')


In [None]:
import glob
import cv2
import os

def img_paths(folder):
    """
    Return the paths of all readable images directly inside ``folder``.

    Every regular file in ``folder`` is decoded with ``cv2.imread``; a path is
    kept only when decoding succeeds (``imread`` returns a non-None array).
    Sub-directories are skipped without being handed to OpenCV, and the result
    is sorted so that positional lookups such as ``paths[0]`` are reproducible
    across runs (``glob`` gives no ordering guarantee).

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        Sorted list of path strings that OpenCV could decode; empty if none.
    """
    candidates = sorted(glob.glob(os.path.join(folder, "*")))
    return [
        path
        for path in candidates
        if os.path.isfile(path) and cv2.imread(path) is not None
    ]


In [None]:
import cv2
import numpy as np

def mask_and_crop_by_background(path, output_size=(256, 256)):
    """
    Crop an image to the bounding box of its largest non-background region.

    Pixels whose HSV value falls inside the white/light-grey range
    ([0, 0, 200] .. [180, 50, 255]) are treated as background.

    Args:
        path: Path of the image to load.
        output_size: Intended (w, h) after resizing. Accepted for interface
            compatibility but not applied: the crop is returned at its
            native size.

    Returns:
        The cropped region of the bilateral-filtered BGR image, or None when
        the file cannot be read, no foreground contour is found, or the crop
        is empty.
    """
    source = cv2.imread(path)
    if source is None:
        return None

    # Edge-preserving denoise; both the mask and the final crop use this image.
    smoothed = cv2.bilateralFilter(source, 9, 75, 75)
    hsv_pixels = cv2.cvtColor(smoothed, cv2.COLOR_BGR2HSV)

    # Background = low saturation, high value; invert so foreground is white.
    bg_low = np.array([0, 0, 200])
    bg_high = np.array([180, 50, 255])
    foreground = cv2.bitwise_not(cv2.inRange(hsv_pixels, bg_low, bg_high))

    # Opening removes small specks, closing then fills small holes.
    rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (7, 7))
    cleaned = cv2.morphologyEx(foreground, cv2.MORPH_OPEN, rect_kernel)
    cleaned = cv2.morphologyEx(cleaned, cv2.MORPH_CLOSE, rect_kernel)

    blobs, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not blobs:
        return None

    # Keep only the contour with the largest area.
    left, top, width, height = cv2.boundingRect(max(blobs, key=cv2.contourArea))
    cropped = smoothed[top:top + height, left:left + width]

    return cropped if cropped.size else None


In [None]:
# Locate the image folder. Prefer the directory reported by the kagglehub
# download cell; fall back to the Kaggle mount point when that variable is not
# defined (cell deleted or skipped) or has no images/ sub-folder.
KAGGLE_IMAGES_DIR = '/kaggle/input/fashion-product-images-small/images'
try:
    images_dir = os.path.join(paramaggarwal_fashion_product_images_small_path, 'images')
except NameError:
    images_dir = KAGGLE_IMAGES_DIR
if not os.path.isdir(images_dir):
    images_dir = KAGGLE_IMAGES_DIR

paths = img_paths(images_dir)
# Fail here with a clear message instead of an IndexError in a later cell.
assert paths, f"No readable images found in {images_dir}"
print(paths[:5])

In [None]:
import matplotlib.pyplot as plt
import cv2

# Preview the first indexed image. OpenCV decodes to BGR while matplotlib
# expects RGB, so the channels are converted before plotting.
check01 = paths[0]
img = cv2.cvtColor(cv2.imread(check01), cv2.COLOR_BGR2RGB)

plt.imshow(img)
plt.axis('off')


In [None]:
def color_hist(img, h_bins=16, s_bins=16):
    """
    Compute a flattened, L1-normalised Hue/Saturation histogram of a BGR image.

    The Value channel is left out so that the descriptor is less sensitive to
    brightness changes.

    Args:
        img: BGR image array, or None.
        h_bins: Number of Hue bins (Hue range 0..180).
        s_bins: Number of Saturation bins (Saturation range 0..256).

    Returns:
        1-D array with h_bins * s_bins entries, or None when ``img`` is None.
    """
    if img is None:
        return None

    hue_sat_hist = cv2.calcHist(
        [cv2.cvtColor(img, cv2.COLOR_BGR2HSV)],
        channels=[0, 1],
        mask=None,
        histSize=[h_bins, s_bins],
        ranges=[0, 180, 0, 256],
    )

    # L1 normalisation (in place): bins sum to 1, i.e. a probability distribution.
    cv2.normalize(hue_sat_hist, hue_sat_hist, alpha=1.0, norm_type=cv2.NORM_L1)

    return hue_sat_hist.flatten()

# Smoke test: run crop + histogram on the sample image and print the feature
# vector's shape (h_bins * s_bins entries, i.e. 256 with the defaults).
# Raises AttributeError if the crop step returns None for this image.
print(color_hist(mask_and_crop_by_background(check01)).shape)

# FAISS-style retrieval: K-Means clustering + search within the nearest clusters

In [None]:
# Crops, histograms and source paths are kept index-aligned: entry i of each
# list describes the same image.
valid_imgs, valid_hists, valid_paths = [], [], []

for path in paths:
    img = mask_and_crop_by_background(path)
    if img is not None:
        # Only images for which cropping succeeded contribute a feature vector.
        hist = color_hist(img)
        valid_imgs.append(img)
        valid_hists.append(hist)
        valid_paths.append(path)

# One flattened H-S histogram per row: shape (N, 256) with the default bins.
features = np.array(valid_hists)

print(features.shape)


In [None]:
from sklearn.cluster import KMeans

K = 10  # number of clusters; an example value, tune as needed

kmeans = KMeans(n_clusters=K, init="k-means++", n_init=10, max_iter=300, random_state=42)

# fit() followed by reading labels_ is what fit_predict() returns.
kmeans.fit(features)
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

print(centroids.shape)

In [None]:
from collections import defaultdict

# Group the indexed images by their K-Means label: cluster id -> list of
# records holding the source path, the histogram and the cropped image.
cluster_data = defaultdict(list)

for idx, label in enumerate(labels):
    cluster_data[label].append(
        {"path": valid_paths[idx], "hist": features[idx], "img": valid_imgs[idx]}
    )

In [None]:
# Spot-check one stored crop: item 8 of cluster 2, converted from BGR to RGB for matplotlib.
# NOTE: the indices are hard-coded; this raises IndexError if cluster 2 holds fewer than 9 items.
plt.imshow(cv2.cvtColor(cluster_data[2][8]['img'], cv2.COLOR_BGR2RGB))

In [None]:
def chi2_distance(h1, h2, eps=1e-10):
    """
    Chi-square distance between two histograms.

    Computes sum((h1 - h2)^2 / (h1 + h2 + eps)); ``eps`` keeps the denominator
    non-zero for bins that are empty in both histograms.

    Args:
        h1: First histogram (array).
        h2: Second histogram, same shape as ``h1``.
        eps: Small constant added to every denominator.

    Returns:
        The distance as a scalar; 0 for identical histograms.
    """
    diff = h1 - h2
    total = h1 + h2 + eps
    return (np.square(diff) / total).sum()

In [None]:
def search_image(query_path, centroids, cluster_data, m_centroids=3, top_k=5):
    """
    Retrieve the indexed images most similar to a query image.

    The search is two-stage: the query histogram is first compared with every
    cluster centroid (Euclidean distance) and only the ``m_centroids`` closest
    clusters are kept; the members of those clusters are then ranked by
    chi-square distance to the query.

    Args:
        query_path: Path of the query image.
        centroids: Sequence of centroid vectors; position i is cluster id i.
        cluster_data: Mapping cluster id -> list of dicts with the keys
            "path", "hist" and "img".
        m_centroids: Number of nearest clusters to search.
        top_k: Number of results to return.

    Returns:
        List of at most ``top_k`` tuples ``(distance, path, img)`` sorted by
        ascending distance; an empty list when the query image cannot be
        preprocessed.
    """
    # --- preprocess query ---
    query_img = mask_and_crop_by_background(query_path)
    if query_img is None:
        return []

    query_hist = color_hist(query_img)

    # --- STEP 1: rank clusters by distance from the query to their centroid ---
    centroid_scores = sorted(
        (
            (np.linalg.norm(query_hist - centroid), cid)
            for cid, centroid in enumerate(centroids)
        ),
        key=lambda pair: pair[0],
    )
    selected_clusters = [cid for _, cid in centroid_scores[:m_centroids]]

    # --- STEP 2: score every member of the selected clusters ---
    # .get() instead of cluster_data[cid]: indexing a defaultdict with an id
    # that has no members would insert an empty entry, so a read-only query
    # would mutate the index (and a plain dict would raise KeyError).
    results = [
        (chi2_distance(query_hist, item["hist"]), item["path"], item["img"])
        for cid in selected_clusters
        for item in cluster_data.get(cid, ())
    ]

    # --- STEP 3: rank by distance and keep the best top_k ---
    results.sort(key=lambda hit: hit[0])

    return results[:top_k]


In [None]:
import cv2
import matplotlib.pyplot as plt

# Assumes paths, centroids and cluster_data were built by the cells above.

# Resolve the query image next to the indexed images, so this cell follows
# whichever directory `paths` was loaded from.
query_path = os.path.join(os.path.dirname(paths[0]), '10009.jpg')

# Fail early with a readable message instead of an opaque cv2 error later on.
query_img = cv2.imread(query_path)
if query_img is None:
    raise FileNotFoundError(f"Could not read query image: {query_path}")

# Run the search for the query image.
results = search_image(
    query_path=query_path,
    centroids=centroids,
    cluster_data=cluster_data,
    m_centroids=5,
    top_k=5
)

# Show the query image (converted to RGB so the colours render correctly).
query_img_rgb = cv2.cvtColor(query_img, cv2.COLOR_BGR2RGB)
plt.imshow(query_img_rgb)
plt.axis('off')
plt.title(f"Query Image: {query_path}")
plt.show()

# Show the search results using the cropped images returned by search_image.
for score, path, img in results:
    print(score, path)
    result_img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.imshow(result_img_rgb)
    plt.axis('off')
    plt.show()

# Alternatively, reload each result from disk to show the full, uncropped image.
for score, path, img in results:
    print(score, path)
    img_from_path = cv2.imread(path)
    img_from_path_rgb = cv2.cvtColor(img_from_path, cv2.COLOR_BGR2RGB)
    plt.imshow(img_from_path_rgb)
    plt.axis('off')
    plt.show()
