In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from pathlib import Path
from collections import Counter
import hashlib

In [2]:
# Plot appearance: seaborn white-grid theme plus a wide default figure size
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

In [22]:
# Root folder of the image dataset (one sub-folder per class).
# Set the DATASET_PATH environment variable to point the notebook at another
# copy of the data without editing this cell; when it is unset or empty the
# original location is used.
DATASET_PATH = os.environ.get("DATASET_PATH") or "D:/KULIAH/SEMESTER 7/dataset-resized"

In [4]:
# 1. LOAD DATASET

# Class names are the sub-directories of the dataset root, in sorted order
classes = sorted(
    filter(
        lambda name: os.path.isdir(os.path.join(DATASET_PATH, name)),
        os.listdir(DATASET_PATH),
    )
)

In [5]:
# Load every image path together with its class label and file extension.
# os.listdir() returns entries in arbitrary order, so the file names are
# sorted to give the same row order on every run and on every machine.
image_paths = []
labels = []
extensions = []

for class_name in classes:
    class_path = os.path.join(DATASET_PATH, class_name)
    images = sorted(
        f for f in os.listdir(class_path)
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
        # ignore directories that merely carry an image-like name
        and os.path.isfile(os.path.join(class_path, f))
    )

    for img in images:
        image_paths.append(os.path.join(class_path, img))
        labels.append(class_name)
        # extension is lower-cased so '.JPG' and '.jpg' are counted together
        extensions.append(os.path.splitext(img)[1].lower())

In [6]:
# Build the DataFrame: one row per image, one column per collected list
df = pd.DataFrame(
    dict(image_path=image_paths, label=labels, extension=extensions)
)

In [7]:
# Tally the images per file extension
ext_counts = df['extension'].value_counts()
total_images = len(df)

# Summary of what was loaded
print("Dataset berhasil dimuat!")
print(f"   Total gambar: {total_images}")
print(f"   Jumlah kelas: {len(classes)}")
print(f"   Nama kelas: {classes}")
print("\n   Jenis file gambar:")

# One line per extension: its count and its share of the whole dataset
for ext, count in ext_counts.items():
    percentage = (count / total_images) * 100
    print(f"      {ext:6} : {count:4} gambar ({percentage:.1f}%)")

Dataset berhasil dimuat!
   Total gambar: 2527
   Jumlah kelas: 6
   Nama kelas: ['cardboard', 'glass', 'metal', 'paper', 'plastic', 'trash']

   Jenis file gambar:
      .jpg   : 2527 gambar (100.0%)


In [8]:
# Gather every file name under the dataset root, regardless of its folder
all_files = [
    file
    for root, dirs, files in os.walk(DATASET_PATH)
    for file in files
]

# Count duplicates: a name that occurs more than once is a name-level duplicate
name_counts = Counter(all_files)
duplicates = [item for item, count in name_counts.items() if count > 1]

print("Jumlah file duplikat berdasarkan nama:", len(duplicates))
print("Daftar file duplikat:", duplicates)

Jumlah file duplikat berdasarkan nama: 0
Daftar file duplikat: []


In [18]:
# hashes: content digest -> first file path seen with that digest
# duplicates: (duplicate path, earlier path with the same digest) pairs
hashes, duplicates = {}, []

def hash_file(path, chunk_size=1 << 20):
    """Return the MD5 hex digest of the file at ``path``.

    The file is read in ``chunk_size``-byte pieces, so memory use stays
    bounded no matter how large the file is. The digest is identical to
    hashing the whole content in one call.

    Args:
        path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration (default 1 MiB).
            Must be a positive integer.

    Returns:
        The MD5 digest of the file content as a hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        # read(0) would return b"" immediately and silently hash nothing
        raise ValueError("chunk_size must be a positive integer")

    digest = hashlib.md5()
    with open(path, "rb") as f:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Walk the dataset and fingerprint every file. A digest that was already
# recorded means the current file has the same bytes as an earlier one.
for root, dirs, files in os.walk(DATASET_PATH):
    for file in files:
        file_path = os.path.join(root, file)
        file_hash = hash_file(file_path)

        first_seen = hashes.get(file_hash)
        if first_seen is None:
            # new content: remember where it was first found
            hashes[file_hash] = file_path
        else:
            # repeated content: record (this file, the earlier file)
            duplicates.append((file_path, first_seen))

print("Jumlah gambar duplikat berdasarkan konten:", len(duplicates))
for dup in duplicates:
    print("Duplikat:", dup)

Jumlah gambar duplikat berdasarkan konten: 3
Duplikat: ('D:/KULIAH/SEMESTER 7/dataset-resized\\metal\\metal91.jpg', 'D:/KULIAH/SEMESTER 7/dataset-resized\\glass\\glass115.jpg')
Duplikat: ('D:/KULIAH/SEMESTER 7/dataset-resized\\plastic\\plastic152.jpg', 'D:/KULIAH/SEMESTER 7/dataset-resized\\glass\\glass176.jpg')
Duplikat: ('D:/KULIAH/SEMESTER 7/dataset-resized\\plastic\\plastic332.jpg', 'D:/KULIAH/SEMESTER 7/dataset-resized\\glass\\glass389.jpg')


In [19]:
# Files judged to be the WRONG copy of a duplicate pair (they will be deleted).
# Paths are built from DATASET_PATH so they follow the configured dataset root
# instead of repeating the absolute location.
wrong_files = [
    os.path.join(DATASET_PATH, "metal", "metal91.jpg"),
    os.path.join(DATASET_PATH, "glass", "glass176.jpg"),
    os.path.join(DATASET_PATH, "plastic", "plastic332.jpg"),
]

# NOTE(review): this permanently deletes files from the dataset folder, and
# `df` was built before this cell runs, so it still lists the deleted files.
# Re-run the loading cells before using `df` again.
for file_path in wrong_files:
    try:
        os.remove(file_path)
    except FileNotFoundError:
        # already removed by an earlier run, so the cell stays safe to re-run
        print(f"File not found: {file_path}")
    else:
        print(f"Deleted wrong duplicate: {file_path}")

Deleted wrong duplicate: D:/KULIAH/SEMESTER 7/dataset-resized/metal/metal91.jpg
Deleted wrong duplicate: D:/KULIAH/SEMESTER 7/dataset-resized/glass/glass176.jpg
Deleted wrong duplicate: D:/KULIAH/SEMESTER 7/dataset-resized/plastic/plastic332.jpg
