# **Week 1: Data Exploration and Preprocessing**
## Objective
The goal of this notebook is to perform the initial steps of our project:
1.  **Explore** the raw datasets to understand their structure and class distribution.
2.  **Analyze** the image counts to identify any class imbalance.
3.  **Define a preprocessing strategy** to merge and structure the data into our four target categories.
4.  **Execute** the preprocessing script to create a clean, organized dataset for model training.

**Target Categories:**
- `Organic_Compostable`
- `Recyclables`
- `E-waste_Hazardous`
- `Landfill_General`

# **1: Install & Setup**

In [1]:
import os
import zipfile
import shutil
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from google.colab import files

# 1. Upload Kaggle API Token
print("Upload your kaggle.json file now:")
files.upload()

# 2. Setup Kaggle Config
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# 3. Download Dataset (mostafaabla/garbage-classification)
print("Downloading dataset...")
!kaggle datasets download -d mostafaabla/garbage-classification
print("Unzipping...")
with zipfile.ZipFile("garbage-classification.zip", 'r') as zip_ref:
    zip_ref.extractall("temp_data")

print("✅ Data Downloaded.")

Upload your kaggle.json file now:


Saving kaggle.json to kaggle.json
Downloading dataset...
Dataset URL: https://www.kaggle.com/datasets/mostafaabla/garbage-classification
License(s): ODbL-1.0
Downloading garbage-classification.zip to /content
 57% 136M/239M [00:00<00:00, 1.42GB/s]
100% 239M/239M [00:00<00:00, 658MB/s] 
Unzipping...
✅ Data Downloaded.


# **2: Organize Data**

In [2]:
# Define Paths
BASE_DIR = 'final_dataset'
RAW_DIR = 'temp_data/garbage_classification' # Standard extraction path for this dataset

# Create Target Directories
categories = ['Recyclables', 'Organic_Compostable', 'E-waste_Hazardous', 'Landfill_General']
if os.path.exists(BASE_DIR):
    shutil.rmtree(BASE_DIR) # Clean start
os.makedirs(BASE_DIR)

for cat in categories:
    os.makedirs(os.path.join(BASE_DIR, cat))

# MAPPING DICTIONARY (Kaggle Folder Name -> Your App Category)
mapping = {
    'paper': 'Recyclables',
    'cardboard': 'Recyclables',
    'plastic': 'Recyclables',
    'metal': 'Recyclables',
    'glass': 'Recyclables',
    'brown-glass': 'Recyclables',
    'white-glass': 'Recyclables',
    'green-glass': 'Recyclables',
    'biological': 'Organic_Compostable',
    'battery': 'E-waste_Hazardous',
    'trash': 'Landfill_General',
    'shoes': 'Landfill_General',
    'clothes': 'Landfill_General'
}


def resolve_destination(dest_folder, filename, source_class):
    """Return a path inside ``dest_folder`` for ``filename`` that is not yet taken.

    Several source classes are merged into one target category, so two source
    folders may contain a file with the same name. The original name is kept
    when it is free; otherwise the source class (and, if still needed, a
    counter) is prefixed so no previously copied image is overwritten.

    Args:
        dest_folder: Target category directory.
        filename: Base name of the file being copied.
        source_class: Name of the source class folder, used as the prefix.

    Returns:
        A path under ``dest_folder`` that does not exist at the time of the call.
    """
    candidate = os.path.join(dest_folder, filename)
    if not os.path.exists(candidate):
        return candidate
    candidate = os.path.join(dest_folder, f"{source_class}_{filename}")
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(dest_folder, f"{source_class}_{counter}_{filename}")
        counter += 1
    return candidate


print("Reorganizing data...")
total_moved = 0
# The dataset usually extracts into a subfolder, let's find it
# Walk through temp_data to find where the images are.
# The third tuple element is named `_filenames` (not `files`) so the
# `google.colab.files` module imported in the setup cell is not rebound.
for root, dirs, _filenames in os.walk("temp_data"):
    for dir_name in dirs:
        if dir_name not in mapping:
            continue
        source_folder = os.path.join(root, dir_name)
        dest_folder = os.path.join(BASE_DIR, mapping[dir_name])

        # Copy files (the extracted originals are left in place)
        for filename in os.listdir(source_folder):
            source_path = os.path.join(source_folder, filename)
            if not os.path.isfile(source_path):
                # shutil.copy cannot copy a directory; nested folders are
                # visited by os.walk on their own if they are mapped classes.
                continue
            shutil.copy(source_path, resolve_destination(dest_folder, filename, dir_name))
            total_moved += 1

if total_moved == 0:
    print("⚠️ No class folders were found under temp_data; run the download/unzip cell first.")
else:
    print(f"✅ Sorted {total_moved} images into {len(categories)} categories.")

Reorganizing data...
✅ Sorted 15515 images into 4 categories.


# **3: Train the Model**
> This trains for 5 epochs; enough for a demo.

In [3]:
# Reproducibility: seed the Python, NumPy and TensorFlow RNGs so shuffling,
# augmentation, dropout and head initialisation are repeatable across runs.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

IMG_SIZE = (224, 224)   # matches the MobileNetV2 input_shape used below
BATCH_SIZE = 32

# Setup Data Generators
# NOTE(review): MobileNetV2's ImageNet weights are normally paired with
# mobilenet_v2.preprocess_input (pixels scaled to [-1, 1]); this notebook uses
# [0, 1] scaling instead. Left unchanged because the inference app must apply
# the same preprocessing as training - confirm before switching.
datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,
    rotation_range=20,
    horizontal_flip=True
)

# Validation images must not be augmented, otherwise val_accuracy/val_loss are
# measured on randomly rotated/flipped inputs. The same validation_split value
# is used so that subset='validation' selects the files that subset='training'
# leaves out.
val_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2
)

train_gen = datagen.flow_from_directory(
    BASE_DIR,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='training',
    seed=SEED
)

val_gen = val_datagen.flow_from_directory(
    BASE_DIR,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='validation',
    shuffle=False  # fixed order; evaluation does not need shuffling
)

# Build Model
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=IMG_SIZE + (3,))
base_model.trainable = False # Freeze base

x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dropout(0.2)(x)
# Size the output layer from the classes actually found on disk instead of a
# hard-coded 4, so the head always matches the generator's one-hot labels.
predictions = Dense(len(train_gen.class_indices), activation='softmax')(x)

model = Model(inputs=base_model.input, outputs=predictions)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train
print("Starting training (Grab a coffee, this takes ~10 mins)...")
history = model.fit(train_gen, epochs=5, validation_data=val_gen)

# Save Model
model.save('waste_classifier.h5')
print("✅ Model Saved as waste_classifier.h5")

# IMPORTANT: Print Class Order for app.py
# The list order is the order of the keys in class_indices; the app needs it to
# turn a predicted index back into a label.
print("\n⚠️ COPY THIS LIST FOR YOUR APP.PY:")
print(list(train_gen.class_indices.keys()))

Found 12413 images belonging to 4 classes.
Found 3102 images belonging to 4 classes.
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Starting training (Grab a coffee, this takes ~10 mins)...


  self._warn_if_super_not_called()


Epoch 1/5
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m860s[0m 2s/step - accuracy: 0.8535 - loss: 0.4031 - val_accuracy: 0.9278 - val_loss: 0.2041
Epoch 2/5
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m859s[0m 2s/step - accuracy: 0.9546 - loss: 0.1274 - val_accuracy: 0.9368 - val_loss: 0.1791
Epoch 3/5
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m846s[0m 2s/step - accuracy: 0.9604 - loss: 0.1117 - val_accuracy: 0.9410 - val_loss: 0.1697
Epoch 4/5
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m838s[0m 2s/step - accuracy: 0.9623 - loss: 0.1069 - val_accuracy: 0.9562 - val_loss: 0.1311
Epoch 5/5
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m853s[0m 2s/step - accuracy: 0.9658 - loss: 0.0967 - val_accuracy: 0.9429 - val_loss: 0.1766




✅ Model Saved as waste_classifier.h5

⚠️ COPY THIS LIST FOR YOUR APP.PY:
['E-waste_Hazardous', 'Landfill_General', 'Organic_Compostable', 'Recyclables']


In [4]:
# Send the saved model file to the user's machine as a browser download.
# The import is repeated in this cell so the call works even if the name
# `files` has been rebound to something else earlier in the session.
from google.colab import files
files.download('waste_classifier.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>