In [None]:
import pandas as pd
import torch
from torchvision.transforms import ToTensor
import cv2
import torch
from sklearn.cluster import DBSCAN
from tensorflow.keras.utils import load_img
import numpy as np
import matplotlib.pyplot as plt
import os
from torchvision.transforms import ToTensor, Compose, Resize
from math import ceil
from tensorflow.keras.applications import MobileNet
from PIL import Image, UnidentifiedImageError
import tensorflow as tf
from tensorflow.keras.applications.mobilenet import preprocess_input
from tensorflow.keras.layers import Flatten


In [None]:
# ImageNet-pretrained MobileNet used as a feature extractor: the classification
# head is dropped and global average pooling is applied to the last feature map.
model = MobileNet(
    include_top=False,
    pooling="avg",
    weights="imagenet",
)

In [None]:
# Report the installed TensorFlow version and the GPUs it can see.
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {tf.config.list_physical_devices('GPU')}")

In [None]:
# Load the dataset CSV from the Kaggle input directory.
car_data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/cardata-dir/CarDataset.csv',
)

In [None]:
# Overwrite the CSV header with two fixed labels, assigned by column position.
# NOTE(review): presumably 'Dir' is the image location and 'Category' its
# label -- confirm against the CSV.
col_name = [
    'Dir',
    'Category',
]
car_data.columns = col_name

In [None]:
def display_cluster_images(cluster_id, image_paths, brand , n_cols=5,
                           base_dir='/kaggle/input/ml-data/Data'):
    """Plot every image of one cluster in a grid of subplots.

    Args:
        cluster_id: Identifier printed in the heading above the grid.
        image_paths: File names, relative to ``<base_dir>/<brand>/``.
        brand: Sub-directory of ``base_dir`` that contains the files.
        n_cols: Number of columns in the grid.
        base_dir: Root directory of the image dataset.

    Returns:
        None. The figure is rendered with ``plt.show()``. An image that cannot
        be loaded is reported on stdout and its grid cell is left blank.
    """
    print(f"\nCluster {cluster_id}:")
    image_paths = list(image_paths)
    n_images = len(image_paths)
    if n_images == 0:
        # A grid with zero rows cannot be created, so there is nothing to draw.
        return
    n_rows = ceil(n_images / n_cols)

    # squeeze=False always yields a 2-D array of Axes (even for a 1x1 grid),
    # so flatten() is safe for every grid shape.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, image_name in zip(axes, image_paths):
        image_path = os.path.join(base_dir, brand, image_name)
        try:
            img = load_img(image_path)
            ax.imshow(img)
            ax.set_title(os.path.basename(image_path), fontsize=8)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole grid.
            print(f"Error loading image {image_path}: {e}")
        # Hide the frame and ticks whether or not the image loaded.
        ax.axis("off")

    # Hide the unused cells of the last row.
    for ax in axes[n_images:]:
        ax.axis("off")

    plt.tight_layout()
    plt.show()
    # Release the figure; this function is called once per cluster in a loop.
    plt.close(fig)

In [None]:
# --- Configuration -----------------------------------------------------------
images = car_data['Dir']  # not read again in this cell
cars_brand_set = ('Honda', 'Hyundai', 'KIA', 'Mazda', 'Mitsubishi', 'Suzuki', 'Toyota', 'VinFast', 'Others')
features = []
images_dir = '/kaggle/input/ml-data/Data'
batch_size = 8  # not read again in this cell
# torchvision pipeline; not read again in this cell (Pillow + Keras preprocessing is used below).
transform = Compose([
    Resize((224,224)),
    ToTensor()
])

all_results = []
flatten = Flatten()  # not read again in this cell

# --- Per-brand feature extraction and clustering -------------------------------
# For every brand directory: embed each readable image with `model`, cluster the
# embeddings with DBSCAN, and record every image that landed in a cluster as a
# potential duplicate.
for brand in cars_brand_set:
    brand_features = []
    valid_paths = []
    file_dir = os.path.join(images_dir, brand)
    print(f'Processing brand: {brand}')
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order, and therefore DBSCAN's cluster numbering, reproducible across runs.
    for file in sorted(os.listdir(file_dir)):
        img_dir = os.path.join(file_dir, file)
        try:
            # The context manager closes the file handle; convert() forces the
            # pixel data to be decoded while the file is still open.
            with Image.open(img_dir) as raw_img:
                rgb_img = raw_img.convert("RGB").resize((224, 224))
        except UnidentifiedImageError:
            print(f"Cannot identify image file: {img_dir}")
            continue
        except OSError as err:
            # Sub-directories, truncated files, permission problems, ...
            print(f"Cannot read image file: {img_dir} ({err})")
            continue

        img_array = preprocess_input(np.array(rgb_img))  # MobileNet input preprocessing
        img_array = np.expand_dims(img_array, axis=0)    # add the batch dimension

        feature = model(img_array)
        # Store a flat NumPy vector rather than a tensor, so the list stacks
        # into one 2-D matrix below.
        brand_features.append(np.asarray(feature).reshape(-1))
        valid_paths.append(file)

    if not brand_features:
        # DBSCAN cannot be fitted on an empty sample set.
        print(f"No readable images found for brand: {brand}")
        continue

    features = np.stack(brand_features)
    print(features.shape)
    dbscan = DBSCAN(eps=5, min_samples=2, metric='euclidean')  # eps may need tuning
    clusters = dbscan.fit_predict(features)

    cluster_df = pd.DataFrame({
        "ImagePath": valid_paths,
        "ClusterID": clusters
    })
    # Label -1 is DBSCAN's noise label, i.e. images without a near neighbour.
    duplicate_clusters = cluster_df[cluster_df["ClusterID"] != -1].groupby("ClusterID")

    for cluster_id, group in duplicate_clusters:
        cluster_images = group["ImagePath"].tolist()
        print(f"\nCluster {cluster_id} (Directory: {brand}):")
        print("\n".join(cluster_images))
        display_cluster_images(cluster_id, cluster_images, brand)

        # Record one result row per image in the cluster.
        all_results.extend(
            {
                "Directory": brand,
                "ClusterID": cluster_id,
                "ImagePath": img_path
            }
            for img_path in cluster_images
        )

In [None]:
import pandas as pd  # duplicates the import in the first cell; kept so the existing import is not removed

# Build the result table from the rows collected in `all_results`.
# The columns are listed explicitly so that the CSV still gets a header row
# when no duplicate cluster was found and `all_results` is empty.
result_df = pd.DataFrame(all_results, columns=["Directory", "ClusterID", "ImagePath"])

# Output location inside the Kaggle working directory.
output_csv = "/kaggle/working/DuplicateDetectionResults.csv"

# Write the table to CSV without the index column.
result_df.to_csv(output_csv, index=False)

print(f"\nDuplicate detection results saved to {output_csv}")