In [None]:
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset, random_split

from datasets import load_dataset

In [9]:
# Load the TrashNet dataset through the `datasets` library using its
# repository identifier. Printing the result in the next cell shows a
# DatasetDict with a single 'train' split holding 'image' and 'label' columns.
# NOTE(review): the first call presumably downloads and caches the data, so it
# needs network access — confirm for offline runs.
ds = load_dataset("garythung/trashnet")

### Inspecting the Dataset

This section prints the overall dataset structure and previews the first five samples from the training subset.  
It helps verify that the dataset has been loaded correctly and provides an overview of the data fields, including images and their corresponding labels.


In [4]:
# The DatasetDict exposes a single split, 'train'; keep a handle on it for
# the preprocessing cells that follow.
train_ds = ds['train']

# Overall structure first (splits, feature names, row counts) ...
print(ds)
# ... then a slice of the first five rows, returned as a dict of columns.
print(train_ds[:5])

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 5054
    })
})
{'image': [<PIL.Image.Image image mode=RGB size=3024x4032 at 0x18892879ED0>, <PIL.Image.Image image mode=RGB size=3024x4032 at 0x1889348A6D0>, <PIL.Image.Image image mode=RGB size=4032x3024 at 0x188949F3C50>, <PIL.Image.Image image mode=RGB size=3024x4032 at 0x18894C13BD0>, <PIL.Image.Image image mode=RGB size=3024x4032 at 0x1889343FA10>], 'label': [0, 0, 0, 0, 0]}


### Converting Images to Numpy Arrays

This section preprocesses the images from the training dataset and converts them into numerical arrays suitable for machine learning models.

Each image is:
1. Resized to a fixed dimension of 128×128 pixels to ensure uniform input size.
2. Flattened into a one-dimensional vector of 49,152 values (128 × 128 pixels × 3 colour channels), because scikit-learn estimators such as Random Forest expect a 2-D `samples × features` matrix.
3. Stored along with its label in separate NumPy arrays.

A progress bar (`tqdm`) is used to visualize the preprocessing progress.  
After processing, the resulting feature matrix `X` and label array `y` are displayed with their corresponding shapes.


In [None]:
# Target (width, height) handed to PIL's resize.
# Smaller dimensions reduce computational cost and memory usage.
IMAGE_SIZE = (128, 128)

# Every image is normalised to RGB below, so each flattened vector has exactly
# width * height * 3 entries. Without that guarantee a single grayscale/RGBA
# image would yield a different length and the feature matrix would be ragged.
n_features = IMAGE_SIZE[0] * IMAGE_SIZE[1] * 3

# Preallocate the feature matrix (samples × features) as uint8, the dtype
# NumPy produces for 8-bit RGB images. Filling it row by row avoids holding
# a Python list of arrays *and* its stacked copy in memory at the same time.
X = np.empty((len(train_ds), n_features), dtype=np.uint8)
y = []

# Iterate through each sample in the training dataset with a progress bar
for row, example in enumerate(tqdm(train_ds, desc="Processing images")):
    img = example['image']
    # Only convert when needed: images that are already RGB are left as-is,
    # so the pixel values for RGB inputs are unchanged by this guard.
    if img.mode != "RGB":
        img = img.convert("RGB")
    # Resize to the fixed dimensions, then flatten (H, W, 3) into a 1D vector
    img = img.resize(IMAGE_SIZE)
    X[row] = np.asarray(img).reshape(-1)
    # Keep the label aligned with the row just written
    y.append(example['label'])

# Convert the label list to a NumPy array for numerical computation
y = np.array(y)

# Display the shape of the feature matrix (samples × features)
# and the label vector (samples)
print("Feature matrix shape:", X.shape)
print("Labels shape:", y.shape)


Processing images: 100%|██████████| 5054/5054 [09:24<00:00,  8.95it/s]


Feature matrix shape: (5054, 49152)
Labels shape: (5054,)


### Splitting Data into Training and Testing Sets

This step divides the dataset into training and testing subsets using an 80/20 ratio.  
The training set is used to train the model, while the testing set is used to evaluate its performance on unseen data.  
A fixed `random_state` ensures the split is reproducible.


In [None]:
# Hold out part of the data for evaluation:
#   - test_size=0.2    -> 20% of the samples go to the test set, 80% to training
#   - random_state=42  -> fixed seed so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Training and Evaluating a Random Forest Classifier

This section applies a Random Forest classifier to the processed dataset.

1. **Model creation:** Initializes a Random Forest with 100 decision trees for ensemble learning.  
2. **Training:** Fits the model on the training data (`X_train`, `y_train`).  
3. **Prediction:** Uses the trained model to predict labels for the test set.  
4. **Evaluation:** Assesses performance using accuracy, a classification report (precision, recall, F1-score), and a confusion matrix that tabulates true versus predicted labels so misclassifications between classes can be read off directly.


In [None]:
# Build and train the Random Forest in one expression (`fit` returns the
# fitted estimator itself, so `rf` ends up bound to the trained model).
#   n_estimators=100 -> number of decision trees in the ensemble
#   random_state=42  -> consistent results across runs
#   n_jobs=-1        -> use all available CPU cores
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Predicted class labels for the held-out samples
y_pred = rf.predict(X_test)

# Accuracy -> proportion of correct predictions
print("Accuracy:", accuracy_score(y_test, y_pred))

# The remaining two reports share the same call shape, so print them in turn:
#   classification report -> precision, recall and F1-score for each class
#   confusion matrix      -> counts of true vs. predicted labels
for heading, metric in (
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred))


Accuracy: 0.6913946587537092

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.73      0.77       176
           1       0.61      0.66      0.63       216
           2       0.59      0.51      0.55       164
           3       0.72      0.85      0.78       220
           4       0.70      0.75      0.72       179
           5       0.92      0.39      0.55        56

    accuracy                           0.69      1011
   macro avg       0.72      0.65      0.67      1011
weighted avg       0.70      0.69      0.69      1011


Confusion Matrix:
 [[129   4  15  21   7   0]
 [  3 142  25  16  29   1]
 [  7  43  84  17  12   1]
 [  8   6   9 187  10   0]
 [  6  20   5  13 135   0]
 [  6  17   4   6   1  22]]
