In [7]:
import os
import tensorflow as tf

DATASET_PATH = r"C:\Users\User\Downloads\TrashBox-main (1)\TrashBox-main\TrashBox_train_set"

# Only files whose lower-cased extension is listed here are decode-tested.
VALID_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".gif"}


def _check_image_file(path):
    """Return a failure reason for `path`, or None when the file decodes cleanly.

    The checks run in order: the file size must be readable and non-zero,
    then TensorFlow must be able to read and decode the file as an image.
    """
    try:
        if os.path.getsize(path) == 0:
            return "0-byte file"
    except OSError:
        return "os error reading size"

    try:
        decoded = tf.io.decode_image(
            tf.io.read_file(path), channels=3, expand_animations=False
        )
        tf.shape(decoded)  # force execution
    except Exception as exc:
        return str(exc)

    return None


# (path, reason) pairs for every file that failed one of the checks.
bad_files = []

for dirpath, _dirnames, filenames in os.walk(DATASET_PATH):
    for filename in filenames:
        extension = os.path.splitext(filename)[1].lower()
        if extension not in VALID_EXTS:
            continue  # skip non-image extensions

        candidate = os.path.join(dirpath, filename)
        reason = _check_image_file(candidate)
        if reason is not None:
            bad_files.append((candidate, reason))

print(f"Total bad files found: {len(bad_files)}")
separator = "-" * 60
for bad_path, reason in bad_files[:50]:  # only the first 50 are listed
    print("BAD:", bad_path)
    print("ERR:", reason)
    print(separator)

# Optional: automatically delete the bad files (uncomment to use)
# for fp, _ in bad_files:
#     try:
#         os.remove(fp)
#     except:
#         pass
# print("Deleted bad files.")


Total bad files found: 315
BAD: C:\Users\User\Downloads\TrashBox-main (1)\TrashBox-main\TrashBox_train_set\cardboard\cardboard 1075.jpg
ERR: {{function_node __wrapped__DecodeImage_device_/job:localhost/replica:0/task:0/device:CPU:0}} Unknown image file format. One of JPEG, PNG, GIF, BMP required. [Op:DecodeImage] name: 
------------------------------------------------------------
BAD: C:\Users\User\Downloads\TrashBox-main (1)\TrashBox-main\TrashBox_train_set\cardboard\cardboard 1115.jpg
ERR: {{function_node __wrapped__DecodeImage_device_/job:localhost/replica:0/task:0/device:CPU:0}} Unknown image file format. One of JPEG, PNG, GIF, BMP required. [Op:DecodeImage] name: 
------------------------------------------------------------
BAD: C:\Users\User\Downloads\TrashBox-main (1)\TrashBox-main\TrashBox_train_set\cardboard\cardboard 1167.jpg
ERR: {{function_node __wrapped__DecodeImage_device_/job:localhost/replica:0/task:0/device:CPU:0}} Unknown image file format. One of JPEG, PNG, GIF, BMP 

In [9]:
import tensorflow as tf
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras import layers

DATASET_PATH = r"C:\Users\User\Downloads\TrashBox-main (1)\TrashBox-main\TrashBox_train_set"

IMAGE_SIZE = (224, 224)
BATCH_SIZE = 32
SEED = 42

# Both calls share the same seed and validation_split so the two subsets
# come from the same shuffled file list and do not overlap.
train_ds = image_dataset_from_directory(
    DATASET_PATH,
    validation_split=0.2,
    subset="training",
    seed=SEED,
    image_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    label_mode="int"
)

val_ds = image_dataset_from_directory(
    DATASET_PATH,
    validation_split=0.2,
    subset="validation",
    seed=SEED,
    image_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    label_mode="int"
)

# class_names must be read here: the transformed datasets below no longer carry it.
print("Detected Classes:", train_ds.class_names)

# ✅ IMPORTANT: skip corrupted/undecodable images instead of crashing.
# Dataset.ignore_errors() replaces the deprecated tf.data.experimental.ignore_errors().
# NOTE(review): the datasets are already batched here, so an undecodable image
# most likely discards the whole batch it falls into, not just that one image.
train_ds = train_ds.ignore_errors()
val_ds   = val_ds.ignore_errors()

# Normalize to [0, 1]
normalization_layer = layers.Rescaling(1./255)

# Augmentation (train only).
# These layers run on the raw [0, 255] pixels, BEFORE rescaling: RandomBrightness
# defaults to value_range=(0, 255), so applying it to [0, 1] inputs would shift
# pixels by up to +/-51 and saturate the images.
data_augmentation = tf.keras.Sequential([
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(0.1),
    layers.RandomZoom(0.1),
    layers.RandomContrast(0.2),
    layers.RandomBrightness(0.2),
])

# Train: cache the decoded batches, reshuffle the batch order every epoch, then
# augment + normalize. cache() must come BEFORE the random augmentation,
# otherwise the first epoch's augmented images are replayed in every later epoch.
train_ds = (
    train_ds
    .cache()
    .shuffle(1000)
    .map(lambda x, y: (normalization_layer(data_augmentation(x, training=True)), y),
         num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation: deterministic, so the normalized batches themselves can be cached.
val_ds = (
    val_ds
    .map(lambda x, y: (normalization_layer(x), y), num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

# Sanity check
for images, labels in train_ds.take(1):
    print("Image batch shape:", images.shape)
    print("Label batch shape:", labels.shape)

print("✅ Preprocessing completed (corrupt files skipped).")


Found 14282 files belonging to 7 classes.
Using 11426 files for training.
Found 14282 files belonging to 7 classes.
Using 2856 files for validation.
Detected Classes: ['cardboard', 'e-waste', 'glass', 'medical', 'metal', 'paper', 'plastic']
Instructions for updating:
Use `tf.data.Dataset.ignore_errors` instead.
Image batch shape: (32, 224, 224, 3)
Label batch shape: (32,)
✅ Preprocessing completed (corrupt files skipped).


In [15]:
import os
from PIL import Image
import random

# ===============================
# PATHS
# ===============================
SOURCE_DIR = r"C:\Users\User\Downloads\TrashBox-main (1)\TrashBox-main\TrashBox_train_set"
OUTPUT_DIR = r"C:\Users\User\Downloads\TrashBox_preprocessed"

IMAGE_SIZE = (224, 224)
TRAIN_RATIO = 0.7
VAL_RATIO   = 0.15
TEST_RATIO  = 0.15
SEED = 42

VALID_EXTS = (".jpg", ".jpeg", ".png", ".bmp", ".gif")
random.seed(SEED)

# The test split simply receives whatever is left after train and val,
# so the three ratios are only meaningful if they add up to 1.
assert abs(TRAIN_RATIO + VAL_RATIO + TEST_RATIO - 1.0) < 1e-9, \
    "TRAIN_RATIO + VAL_RATIO + TEST_RATIO must equal 1"

# ===============================
# CLASSES
# ===============================
# os.listdir() returns entries in arbitrary order; sorting makes the seeded
# shuffle below produce the same split on every run and every machine.
classes = sorted(d for d in os.listdir(SOURCE_DIR)
                 if os.path.isdir(os.path.join(SOURCE_DIR, d)))

# ===============================
# CREATE OUTPUT DIRS
# ===============================
for split in ["train", "val", "test"]:
    for cls in classes:
        split_cls_dir = os.path.join(OUTPUT_DIR, split, cls)
        os.makedirs(split_cls_dir, exist_ok=True)

        # Remove files left over from a previous run. Output names are
        # "<class>_<index>.jpg", so a re-run only overwrites some of them and
        # the leftovers would silently end up in the wrong split.
        for stale_name in os.listdir(split_cls_dir):
            stale_path = os.path.join(split_cls_dir, stale_name)
            if os.path.isfile(stale_path):
                os.remove(stale_path)

# ===============================
# COLLECT IMAGE PATHS
# ===============================
# Each entry is (class_name, absolute_path_to_source_image).
all_images = []

for cls in classes:
    cls_dir = os.path.join(SOURCE_DIR, cls)
    for f in sorted(os.listdir(cls_dir)):  # sorted for a reproducible shuffle
        if f.lower().endswith(VALID_EXTS):
            all_images.append((cls, os.path.join(cls_dir, f)))

print(f"Total images found: {len(all_images)}")

# ===============================
# SHUFFLE & SPLIT
# ===============================
random.shuffle(all_images)

n_total = len(all_images)
n_train = int(n_total * TRAIN_RATIO)
n_val   = int(n_total * VAL_RATIO)

# Consecutive slices of the shuffled list; test takes the remainder.
train_images = all_images[:n_train]
val_images   = all_images[n_train:n_train + n_val]
test_images  = all_images[n_train + n_val:]

# ===============================
# SAVE FUNCTION
# ===============================
def save_images(image_list, split_name):
    """Convert, resize and write every image of one split as a JPEG.

    Args:
        image_list: iterable of (class_name, source_path) pairs.
        split_name: name of the split sub-folder under OUTPUT_DIR
            (the callers in this notebook pass "train", "val" or "test").

    Each image is converted to RGB, resized to IMAGE_SIZE and saved to
    OUTPUT_DIR/<split_name>/<class_name>/<class_name>_<index>.jpg.
    Images that cannot be opened or written are skipped, and each skipped
    file is reported together with the error that caused it.
    """
    saved = 0
    failures = []  # (source_path, exception) for every skipped image

    for idx, (cls, img_path) in enumerate(image_list):
        save_path = os.path.join(
            OUTPUT_DIR, split_name, cls, f"{cls}_{idx}.jpg"
        )

        try:
            with Image.open(img_path) as img:
                resized = img.convert("RGB").resize(IMAGE_SIZE)
                resized.save(save_path, format="JPEG", quality=95)
            saved += 1

        except Exception as exc:
            # Best effort: keep going, but remember what failed and why.
            failures.append((img_path, exc))

            # A failed save can leave a truncated JPEG behind; remove it so it
            # cannot end up in the dataset.
            try:
                if os.path.exists(save_path):
                    os.remove(save_path)
            except OSError:
                pass  # nothing more we can do for this file

    print(f"{split_name.upper()} → saved: {saved}, skipped: {len(failures)}")
    for failed_path, exc in failures:
        print(f"  SKIPPED: {failed_path} ({type(exc).__name__}: {exc})")

# ===============================
# SAVE DATASETS
# ===============================
# Write the three splits in the fixed order train -> val -> test.
for split_name, split_images in (
    ("train", train_images),
    ("val", val_images),
    ("test", test_images),
):
    save_images(split_images, split_name)

print("✅ Preprocessed dataset saved as Train / Val / Test.")


Total images found: 14282
TRAIN → saved: 9995, skipped: 2
VAL → saved: 2142, skipped: 0
TEST → saved: 2142, skipped: 1
✅ Preprocessed dataset saved as Train / Val / Test.


In [None]:
MODEL TRAINING (RESNET50)

In [None]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input

from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.preprocessing import label_binarize

# ===============================
# 0) PATHS (EDIT IF NEEDED)
# ===============================
DATA_ROOT = r"C:\Users\User\Downloads\TrashBox_preprocessed"

# One sub-folder per split inside DATA_ROOT.
TRAIN_DIR, VAL_DIR, TEST_DIR = (
    os.path.join(DATA_ROOT, split_folder) for split_folder in ("train", "val", "test")
)

# All artefacts of this run (checkpoints, logs, plots, reports) go below OUT_DIR.
OUT_DIR = os.path.join(DATA_ROOT, "_training_outputs_resnet50")
CKPT_DIR = os.path.join(OUT_DIR, "checkpoints")
for output_folder in (OUT_DIR, CKPT_DIR):
    os.makedirs(output_folder, exist_ok=True)

# ===============================
# 1) CONFIG
# ===============================
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
SEED = 42

# Two-stage schedule:
#   stage 1 trains only the classifier head while the backbone is frozen,
#   stage 2 fine-tunes the last layers of ResNet50 with a much smaller learning rate.
EPOCHS_STAGE1 = 8
EPOCHS_STAGE2 = 10

LR_STAGE1 = 1e-3
LR_STAGE2 = 1e-5

# ===============================
# 2) LOAD DATASETS (TRAIN / VAL / TEST)
# ===============================
# Arguments shared by the three loaders.
loader_options = dict(
    seed=SEED,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    label_mode="int",
)

train_ds = tf.keras.utils.image_dataset_from_directory(TRAIN_DIR, **loader_options)
val_ds = tf.keras.utils.image_dataset_from_directory(VAL_DIR, **loader_options)
# The test set is loaded unshuffled so the evaluation order stays consistent.
test_ds = tf.keras.utils.image_dataset_from_directory(
    TEST_DIR, shuffle=False, **loader_options
)

class_names = train_ds.class_names
num_classes = len(class_names)

print("\nDetected classes:", class_names)
print("Num classes:", num_classes)

# Persist the index -> label mapping so later scan/upload inference can reuse it.
label_map_path = os.path.join(OUT_DIR, "class_names.json")
with open(label_map_path, "w", encoding="utf-8") as label_file:
    label_file.write(json.dumps({"class_names": class_names}, indent=2))
print(f"Saved class_names to: {label_map_path}")

# ===============================
# 3) PREPROCESSING + AUGMENTATION
#    - ResNet50 expects preprocess_input
#    - Augmentation only on training set
# ===============================
# Random transforms, listed in the order in which they are applied.
augmentation_steps = [
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(0.1),
    layers.RandomZoom(0.1),
    layers.RandomContrast(0.2),
    layers.RandomBrightness(0.2),
]
data_augmentation = tf.keras.Sequential(augmentation_steps, name="augment")

def preprocess_train(images, labels):
    """Prepare one training batch: cast to float32, augment, then apply ResNet50 preprocessing.

    Labels are returned unchanged.
    """
    augmented = data_augmentation(tf.cast(images, tf.float32), training=True)
    return preprocess_input(augmented), labels  # ResNet50 preprocessing

def preprocess_eval(images, labels):
    """Prepare one evaluation batch: cast to float32 and apply ResNet50 preprocessing.

    No augmentation is applied; labels are returned unchanged.
    """
    return preprocess_input(tf.cast(images, tf.float32)), labels

AUTOTUNE = tf.data.AUTOTUNE

# No extra .shuffle() on the training set: image_dataset_from_directory already
# reshuffles the files every epoch (shuffle=True is its default). Shuffling again
# here would act on whole preprocessed batches and buffer up to 1000 of them
# (each 32 x 224 x 224 x 3 float32, about 19 MB) in RAM before the epoch starts.
train_ds = train_ds.map(preprocess_train, num_parallel_calls=AUTOTUNE).prefetch(AUTOTUNE)
val_ds   = val_ds.map(preprocess_eval,  num_parallel_calls=AUTOTUNE).prefetch(AUTOTUNE)
test_ds  = test_ds.map(preprocess_eval, num_parallel_calls=AUTOTUNE).prefetch(AUTOTUNE)

# ===============================
# 4) BUILD RESNET50 MODEL
# ===============================
# Input: RGB images of IMG_SIZE.
inputs = layers.Input(shape=(*IMG_SIZE, 3))

# ImageNet-pretrained backbone without its classification top, wired to `inputs`.
base_model = ResNet50(include_top=False, weights="imagenet", input_tensor=inputs)
base_model.trainable = False  # Stage 1: freeze backbone

# Classification head: global average pooling -> dropout -> softmax over the classes.
pooled_features = layers.GlobalAveragePooling2D()(base_model.output)
regularized_features = layers.Dropout(0.3)(pooled_features)
outputs = layers.Dense(num_classes, activation="softmax")(regularized_features)

model = models.Model(inputs=inputs, outputs=outputs)

model.summary()

# ===============================
# 5) CALLBACKS (progress + save each epoch)
# ===============================
# Saves a model file every epoch (and includes val_accuracy in filename)
# Output locations used by the callbacks.
checkpoint_path = os.path.join(CKPT_DIR, "epoch_{epoch:02d}_valacc_{val_accuracy:.4f}.keras")
csv_log_path = os.path.join(OUT_DIR, "training_log.csv")

# Full model saved after EVERY epoch (save_best_only=False).
ckpt_cb = ModelCheckpoint(
    filepath=checkpoint_path, monitor="val_accuracy",
    save_best_only=False, save_weights_only=False, verbose=1,
)

# append=True: rows are added to an existing log file instead of replacing it.
csv_logger = CSVLogger(csv_log_path, append=True)

# Stop after 6 epochs without val_loss improvement and roll back to the best weights.
early_stop = EarlyStopping(monitor="val_loss", patience=6, restore_best_weights=True)

# Multiply the learning rate by 0.3 after 3 stagnant epochs, never going below 1e-7.
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss", factor=0.3, patience=3, min_lr=1e-7, verbose=1,
)

callbacks = [ckpt_cb, csv_logger, early_stop, reduce_lr]

# ===============================
# 6) STAGE 1 TRAINING (HEAD ONLY)
# ===============================
# Labels are integer class ids (label_mode="int"), hence the sparse loss.
stage1_optimizer = tf.keras.optimizers.Adam(learning_rate=LR_STAGE1)
stage1_loss = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer=stage1_optimizer, loss=stage1_loss, metrics=["accuracy"])

print("\n===== STAGE 1: Training classifier head (base frozen) =====")
stage1_fit_options = dict(
    validation_data=val_ds,
    epochs=EPOCHS_STAGE1,
    verbose=1,  # shows epoch progress
    callbacks=callbacks,
)
history1 = model.fit(train_ds, **stage1_fit_options)

# ===============================
# 7) STAGE 2 FINE-TUNING (UNFREEZE LAST BLOCKS)
# ===============================
print("\n===== STAGE 2: Fine-tuning (unfreeze part of ResNet50) =====")

base_model.trainable = True

# Unfreeze only the last ~30 layers (tune this if needed)
for layer in base_model.layers[:-30]:
    layer.trainable = False

# Keep BatchNormalization layers frozen even inside the unfrozen tail. A frozen
# BN layer runs in inference mode, so its ImageNet statistics are not overwritten
# by small-batch updates while the surrounding layers are fine-tuned.
for layer in base_model.layers[-30:]:
    if isinstance(layer, layers.BatchNormalization):
        layer.trainable = False

# Changing `trainable` only takes effect after re-compiling.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=LR_STAGE2),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"]
)

# Continue the epoch counter where stage 1 actually stopped (it may have ended
# early). Otherwise stage 2 restarts at epoch 1 and its checkpoint file names
# and CSV log rows collide with those of stage 1.
stage1_epochs_run = len(history1.epoch)

history2 = model.fit(
    train_ds,
    validation_data=val_ds,
    initial_epoch=stage1_epochs_run,
    epochs=stage1_epochs_run + EPOCHS_STAGE2,  # still EPOCHS_STAGE2 fine-tuning epochs
    verbose=1,
    callbacks=callbacks
)

# Combine histories for plotting
def combine_histories(h1, h2):
    """Concatenate the per-epoch metric lists of two History-like objects.

    Only the metrics present in ``h1.history`` are kept; for each of them the
    values from ``h2.history`` (or nothing, if ``h2`` lacks that metric) are
    appended after those of ``h1``. Neither input is modified.

    Returns:
        dict mapping metric name -> combined list of values.
    """
    return {
        metric: values + h2.history.get(metric, [])
        for metric, values in h1.history.items()
    }

# One continuous metric history covering stage 1 followed by stage 2.
full_hist = combine_histories(history1, history2)

# ===============================
# 8) SAVE FINAL MODEL
# ===============================
# Written next to the other run artefacts in OUT_DIR.
final_model_path = os.path.join(
    OUT_DIR,
    "trashbox_resnet50_final.keras",
)
model.save(final_model_path)
print(f"\n✅ Final model saved to: {final_model_path}")

# ===============================
# 9) PLOT TRAIN vs VAL GRAPHS
# ===============================
def plot_training_curves(hist, out_dir):
    """Save train-vs-validation accuracy and loss curves as PNG files.

    Args:
        hist: mapping with the keys "accuracy", "val_accuracy", "loss" and
            "val_loss", each holding one value per epoch.
        out_dir: folder that receives "train_val_accuracy.png" and
            "train_val_loss.png".
    """
    # One entry per figure, drawn in this order.
    figure_specs = (
        {
            "train_key": "accuracy", "val_key": "val_accuracy",
            "train_label": "Train Acc", "val_label": "Val Acc",
            "title": "Training vs Validation Accuracy",
            "ylabel": "Accuracy",
            "filename": "train_val_accuracy.png",
        },
        {
            "train_key": "loss", "val_key": "val_loss",
            "train_label": "Train Loss", "val_label": "Val Loss",
            "title": "Training vs Validation Loss",
            "ylabel": "Loss",
            "filename": "train_val_loss.png",
        },
    )

    written = []
    for spec in figure_specs:
        plt.figure()
        plt.plot(hist[spec["train_key"]], label=spec["train_label"])
        plt.plot(hist[spec["val_key"]], label=spec["val_label"])
        plt.title(spec["title"])
        plt.xlabel("Epoch")
        plt.ylabel(spec["ylabel"])
        plt.legend()
        target = os.path.join(out_dir, spec["filename"])
        plt.savefig(target, dpi=200, bbox_inches="tight")
        plt.close()  # release the figure once it is on disk
        written.append(target)

    acc_path, loss_path = written
    print(f"Saved plots:\n- {acc_path}\n- {loss_path}")

plot_training_curves(full_hist, OUT_DIR)

# ===============================
# 10) EVALUATE ON TEST SET
# ===============================
# evaluate() returns the loss followed by the compiled metrics (accuracy only).
test_metrics = model.evaluate(test_ds, verbose=1)
test_loss, test_acc = test_metrics
print(f"\n✅ Test Accuracy: {test_acc:.4f}")
print(f"✅ Test Loss: {test_loss:.4f}")

# ===============================
# 11) CONFUSION MATRIX + PRECISION/RECALL/F1
# ===============================
# Collect predictions
# Per-batch results, concatenated below into flat arrays.
y_true = []
y_prob = []

for batch_images, batch_labels in test_ds:
    # predict_on_batch() runs a single forward pass on the given batch.
    # Calling model.predict() inside a Python loop rebuilds its internal data
    # pipeline on every call, which is slow and discouraged by Keras.
    y_prob.append(model.predict_on_batch(batch_images))
    y_true.append(batch_labels.numpy())

y_true = np.concatenate(y_true)     # true class ids, one per test image
y_prob = np.concatenate(y_prob)     # predicted class probabilities, one row per test image
y_pred = np.argmax(y_prob, axis=1)  # most probable class id per image

# Confusion matrix
# Passing `labels` pins the matrix to num_classes x num_classes even if some
# class never occurs in y_true / y_pred; the annotation loop below indexes
# cm[i, j] for every class pair and would otherwise fail with an IndexError.
cm = confusion_matrix(y_true, y_pred, labels=list(range(num_classes)))

plt.figure(figsize=(8, 6))
# "Blues" maps large counts to dark cells, which is what the white-on-large /
# black-on-small text colouring below assumes (the default colormap is the
# opposite: large counts are bright yellow).
plt.imshow(cm, interpolation="nearest", cmap="Blues")
plt.title("Confusion Matrix (Test Set)")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.colorbar()

tick_marks = np.arange(num_classes)
plt.xticks(tick_marks, class_names, rotation=45, ha="right")
plt.yticks(tick_marks, class_names)

# Add counts on cells (optional)
thresh = cm.max() * 0.6  # cells above this count get white text
for i in range(num_classes):
    for j in range(num_classes):
        plt.text(j, i, str(cm[i, j]),
                 ha="center", va="center",
                 color="white" if cm[i, j] > thresh else "black")

cm_path = os.path.join(OUT_DIR, "confusion_matrix.png")
plt.savefig(cm_path, dpi=200, bbox_inches="tight")
plt.close()
print(f"Saved confusion matrix: {cm_path}")

# Precision/Recall/F1 report
# `labels` ties every entry of class_names to its class id. Without it,
# scikit-learn derives the label set from the data and raises if a class is
# missing, because the number of target_names no longer matches.
# zero_division=0 reports 0.0 (instead of warning) for a class that has no
# predicted or no true samples.
report = classification_report(
    y_true,
    y_pred,
    labels=list(range(num_classes)),
    target_names=class_names,
    digits=4,
    zero_division=0,
)
report_path = os.path.join(OUT_DIR, "classification_report.txt")
with open(report_path, "w", encoding="utf-8") as f:
    f.write(report)

print("\n✅ Classification Report (saved):", report_path)
print(report)

# ===============================
# 12) ROC CURVES (One-vs-Rest)
# ===============================
# Binarize labels for ROC
# One indicator column per class id, as needed for one-vs-rest ROC curves.
all_class_ids = list(range(num_classes))
y_true_bin = label_binarize(y_true, classes=all_class_ids)

plt.figure(figsize=(9, 7))
for class_id in all_class_ids:
    # Class `class_id` against all others, scored by its predicted probability.
    fpr, tpr, _thresholds = roc_curve(y_true_bin[:, class_id], y_prob[:, class_id])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"{class_names[class_id]} (AUC={roc_auc:.3f})")

# Dashed diagonal as the visual baseline.
plt.plot([0, 1], [0, 1], linestyle="--")
plt.title("ROC Curves (Test Set, One-vs-Rest)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right", fontsize=8)

roc_path = os.path.join(OUT_DIR, "roc_curves.png")
plt.savefig(roc_path, dpi=200, bbox_inches="tight")
plt.close()
print(f"Saved ROC curves: {roc_path}")

print("\n✅ ALL DONE. Outputs saved in:", OUT_DIR)

# ===============================
# 13) INFERENCE HELPER (Scan/Upload)
#     Use this later in your app/backend:
# ===============================
def predict_single_image(image_path):
    """Classify one image file with the trained model.

    Args:
        image_path: path of the image to classify.

    Returns:
        (class_name, confidence): the predicted label taken from
        ``class_names`` and the model's probability for it as a float.
    """
    # load_img() resizes with nearest-neighbour by default. The training images
    # were resized offline with PIL's default filter (bicubic for RGB images in
    # current Pillow versions), so the same filter is requested here to avoid a
    # train/inference mismatch.
    img = tf.keras.utils.load_img(
        image_path, target_size=IMG_SIZE, interpolation="bicubic"
    )
    x = tf.keras.utils.img_to_array(img)
    x = np.expand_dims(x, axis=0).astype(np.float32)  # add the batch dimension
    x = preprocess_input(x)  # same ResNet50 preprocessing as during training

    probs = model.predict(x, verbose=0)[0]
    idx = int(np.argmax(probs))
    return class_names[idx], float(probs[idx])

# Example usage (uncomment to test):
# pred_class, conf = predict_single_image(r"C:\path\to\some_waste.jpg")
# print("Pred:", pred_class, "Conf:", conf)


Found 11423 files belonging to 7 classes.
Found 4675 files belonging to 7 classes.
Found 2142 files belonging to 7 classes.

Detected classes: ['cardboard', 'e-waste', 'glass', 'medical', 'metal', 'paper', 'plastic']
Num classes: 7
Saved class_names to: C:\Users\User\Downloads\TrashBox_preprocessed\_training_outputs_resnet50\class_names.json
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 0us/step



===== STAGE 1: Training classifier head (base frozen) =====
Epoch 1/8
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 823ms/step - accuracy: 0.5559 - loss: 1.3104
Epoch 1: saving model to C:\Users\User\Downloads\TrashBox_preprocessed\_training_outputs_resnet50\checkpoints\epoch_01_valacc_0.8361.keras
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m429s[0m 1s/step - accuracy: 0.5562 - loss: 1.3094 - val_accuracy: 0.8361 - val_loss: 0.4885 - learning_rate: 0.0010
Epoch 2/8
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 824ms/step - accuracy: 0.7752 - loss: 0.6586
Epoch 2: saving model to C:\Users\User\Downloads\TrashBox_preprocessed\_training_outputs_resnet50\checkpoints\epoch_02_valacc_0.8558.keras
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m419s[0m 1s/step - accuracy: 0.7752 - loss: 0.6586 - val_accuracy: 0.8558 - val_loss: 0.4295 - learning_rate: 0.0010
Epoch 3/8
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [