
# CNN-based Multi-Class Flower Classification

### Definition of the Problem
The goal of this project is to develop a Convolutional Neural Network (CNN) model that can accurately classify images of flowers into one of five categories: Sunflower, Orchid, Lotus, Lily, and Tulip. Accurate classification of flower species has applications in agriculture, botanical research, education, and more.

### Data Acquisition

The dataset used in this project consists of images of five different flower species. It was obtained from Kaggle, where it was uploaded by Kausthub Kannan (owner) and edited/updated by DesolationOfSmaug (collaborator) (https://www.kaggle.com/datasets/kausthubkannan/5-flower-types-classification-dataset/data). The images are organized into folders named after each flower class.

To prepare the data for modeling, the dataset will be split into training and testing sets with an 80-20 ratio. This ensures that the model can be trained on a substantial portion of the data while still being evaluated on unseen images. A copy of the dataset that has already been split into training and testing sets is available here: https://drive.google.com/drive/folders/1jiiJCfZIb_o87rKZtW6WBA99bzzuI1Cq?usp=sharing



***Note!*** 

This model was built and trained in Google Colab. You may encounter minor errors if you try to run it on other platforms; these will mostly be problems relating to the directory of the dataset. All the necessary information has been provided, so you only need to change the dataset paths in the code for the model to run.


### Loading the Libraries

In [None]:
# Import the required Libraries
import os
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns


### Loading the Dataset

##### Splitting the Dataset into Training and Testing Sets

In [None]:
# Define dataset path and output directory
dataset_dir = '/content/drive/MyDrive/flower_images'  # Original dataset path in Google Drive
output_dir = '/content/drive/MyDrive/flower_data_split'  # Output directory in MyDrive

# Define paths for training and testing directories
train_dir = os.path.join(output_dir, 'train')
test_dir = os.path.join(output_dir, 'test')

# Check if the output directory already exists to avoid re-splitting
if not os.path.exists(output_dir):
    import splitfolders  # Ensure you have splitfolders installed (pip install split-folders)
    # Split the dataset into training (80%) and testing (20%)
    splitfolders.ratio(
        dataset_dir,  # Path to the original dataset
        output=output_dir,  # Output directory
        seed=42,  # For reproducibility
        ratio=(0.8, 0.2),  # 80% training, 20% testing
    )

# With a two-element ratio, splitfolders names the second split 'val' rather
# than 'test'. Rename it so that test_dir (used by every later cell) exists.
# An already existing 'test' folder is never overwritten.
val_dir = os.path.join(output_dir, 'val')
if os.path.isdir(val_dir) and not os.path.exists(test_dir):
    os.rename(val_dir, test_dir)

print(f"Training data directory: {train_dir}")
print(f"Testing data directory: {test_dir}")

### Data Exploration and Analysis

##### Counting the Number of Images per Class
We will analyze the distribution of images across the different classes in both the training and testing datasets.

In [None]:
def count_pictures(folder_path):
    """Return how many entries in ``folder_path`` have an image file extension.

    The match is case-insensitive and is made on the names returned by
    ``os.listdir`` only; sub-folders are not descended into.
    """
    recognised_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.bmp')
    return sum(
        1
        for entry in os.listdir(folder_path)
        if entry.lower().endswith(recognised_suffixes)
    )

# Define the main subdirectories
main_subdirs = ['train', 'test']

# Report the number of image files (by extension) in every class folder
for main_subdir in main_subdirs:
    main_subdir_path = os.path.join(output_dir, main_subdir)
    # Keep only real sub-folders; sorting makes the report order deterministic
    subfolders = sorted(
        f for f in os.listdir(main_subdir_path)
        if os.path.isdir(os.path.join(main_subdir_path, f))
    )
    for subfolder in subfolders:
        folder_path = os.path.join(main_subdir_path, subfolder)
        num_pictures = count_pictures(folder_path)
        print(f"Number of pictures in '{folder_path}': {num_pictures}")

print("\n")
output_dir = '/content/drive/MyDrive/flower_data_split'

train_dir = os.path.join(output_dir, 'train')
test_dir = os.path.join(output_dir, 'test')

# Report the raw number of entries per class folder. Entries that are not
# directories (stray files next to the class folders) are skipped, because
# calling os.listdir on them would raise NotADirectoryError.
print("Train - Classes in the dataset:")
for folder in sorted(os.listdir(train_dir)):
    class_path = os.path.join(train_dir, folder)
    if not os.path.isdir(class_path):
        continue
    print(f"- {folder}: {len(os.listdir(class_path))} images")

print("\n")
print("Test - Classes in the dataset:")
for folder in sorted(os.listdir(test_dir)):
    class_path = os.path.join(test_dir, folder)
    if not os.path.isdir(class_path):
        continue
    print(f"- {folder}: {len(os.listdir(class_path))} images")

##### Visualizing the Class Distribution

In [None]:
# Collect class counts (number of entries in each class folder).
# Only directories are considered, so a stray file beside the class folders
# cannot trigger NotADirectoryError; sorting fixes the bar order.
train_counts = {
    folder: len(os.listdir(os.path.join(train_dir, folder)))
    for folder in sorted(os.listdir(train_dir))
    if os.path.isdir(os.path.join(train_dir, folder))
}
test_counts = {
    folder: len(os.listdir(os.path.join(test_dir, folder)))
    for folder in sorted(os.listdir(test_dir))
    if os.path.isdir(os.path.join(test_dir, folder))
}

# Create DataFrames
train_df = pd.DataFrame(list(train_counts.items()), columns=['Class', 'Count'])
test_df = pd.DataFrame(list(test_counts.items()), columns=['Class', 'Count'])

# Plotting: one bar chart per split, side by side
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
sns.barplot(x='Class', y='Count', data=train_df)
plt.title('Training Set Class Distribution')
plt.xticks(rotation=45)

plt.subplot(1, 2, 2)
sns.barplot(x='Class', y='Count', data=test_df)
plt.title('Testing Set Class Distribution')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

### Data Preparation 
##### Data Augmentation and Preprocessing
To enhance the model's ability to generalize, we will apply data augmentation techniques such as rotation, width and height shifts, shear, zoom, and horizontal flipping.

In [None]:
# Data augmentation and preprocessing

# Options that are identical for the training and the testing flow
common_flow_options = dict(
    target_size=(150, 150),  # Resizing images
    batch_size=32,
    class_mode='categorical',
)

# Training images are rescaled and randomly transformed
train_datagen = ImageDataGenerator(
    rescale=1./255,  # Normalize pixel values
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)

# Test images are only rescaled
test_datagen = ImageDataGenerator(rescale=1./255)

# Creating the data generators: training batches are shuffled, test batches
# are not (shuffle=False)
train_generator = train_datagen.flow_from_directory(
    train_dir, shuffle=True, **common_flow_options
)
test_generator = test_datagen.flow_from_directory(
    test_dir, shuffle=False, **common_flow_options
)


##### Visualizing Sample Augmented Images

In [None]:
# Draw one batch from the training generator and show its first nine images
# on a 3x3 grid
data_batch, labels_batch = next(train_generator)
plt.figure(figsize=(10, 10))
for grid_position in range(1, 10):
    plt.subplot(3, 3, grid_position)
    plt.imshow(data_batch[grid_position - 1])
    plt.axis('off')
plt.show()

### Build CNN Model

We will construct a CNN model with convolutional layers followed by pooling layers, and a fully connected layer at the end.

In [None]:
# Build the CNN Model layer by layer
model = Sequential()

# First convolution + pooling stage; takes 150x150 RGB input
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)))
model.add(MaxPooling2D(2, 2))

# Second convolution + pooling stage
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(2, 2))

# Classifier head: flatten, dense layer with dropout, softmax output with one
# unit per class found by the training generator
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(train_generator.num_classes, activation='softmax'))  # Output layer

model.summary()

### Compiling the Model

We will compile the model using the Adam optimizer and categorical cross-entropy loss function, suitable for multi-class classification.

In [None]:
# Compile the model: Adam optimizer, categorical cross-entropy loss, and
# accuracy as the reported metric
optimizer = Adam(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

### Training the Model

We will train the model for up to 500 epochs with early stopping and learning rate reduction callbacks to prevent overfitting and optimize training time.

In [None]:
# Callbacks, both driven by the validation loss
# Halve the learning rate after 3 epochs without improvement (floor 1e-6)
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1
)
# Stop after 10 epochs without improvement and restore the best weights
early_stopper = EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True, verbose=1
)
callbacks = [lr_scheduler, early_stopper]

# Train the model
# NOTE(review): the test generator is used as validation data, so the test
# set influences early stopping and the learning-rate schedule; a separate
# validation split would give an unbiased final test score.
history = model.fit(
    train_generator,
    validation_data=test_generator,
    epochs=500,
    callbacks=callbacks,
)

***Note:*** Training for 500 epochs may take a significant amount of time and could lead to overfitting. The EarlyStopping callback will halt training when the model stops improving on the validation set.

### Evaluating the Model
#### Plotting Training and Validation Metrics

In [None]:
# Plot Training and Validation Metrics recorded by model.fit
recorded = history.history
epochs_range = range(len(recorded['accuracy']))

plt.figure(figsize=(12, 6))

# Left panel: accuracy curves
accuracy_axes = plt.subplot(1, 2, 1)
accuracy_axes.plot(epochs_range, recorded['accuracy'], label='Training Accuracy')
accuracy_axes.plot(epochs_range, recorded['val_accuracy'], label='Validation Accuracy')
accuracy_axes.legend(loc='lower right')
accuracy_axes.set_title('Training and Validation Accuracy')

# Right panel: loss curves
loss_axes = plt.subplot(1, 2, 2)
loss_axes.plot(epochs_range, recorded['loss'], label='Training Loss')
loss_axes.plot(epochs_range, recorded['val_loss'], label='Validation Loss')
loss_axes.legend(loc='upper right')
loss_axes.set_title('Training and Validation Loss')

plt.show()


##### Saving the Model

In [None]:
# Save the trained model to Google Drive as an .h5 file
model_path = '/content/drive/MyDrive/flower_classification_model.h5'
model.save(model_path)

##### Evaluating on the Test Set

In [None]:
# Evaluate the Model on the test generator and report both metrics
evaluation = model.evaluate(test_generator)
loss, accuracy = evaluation
print(f'Test Loss: {loss}', f'Test Accuracy: {accuracy}', sep='\n')

### Generating Classification Report and Confusion Matrix
To further assess the model's performance, we will generate a classification report and a confusion matrix.

In [None]:
# Generate Classification Report and Confusion Matrix
# Class names in the order stored in the generator's class_indices mapping
class_names = list(test_generator.class_indices.keys())

# Rewind the generator before predicting over the whole test set
test_generator.reset()
Y_pred = model.predict(test_generator)
y_pred = Y_pred.argmax(axis=1)  # most probable class index per image
y_true = test_generator.classes

print('Classification Report')
report = classification_report(y_true, y_pred, target_names=class_names)
print(report)

conf_matrix = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

##### Prediction on a single image

In [None]:
# Predict on a single image
def load_and_preprocess_image(image_path, target_size=(150, 150)):
    """Load an image file and turn it into a single-image batch for the model.

    Args:
        image_path: Path of the image file to load.
        target_size: ``(height, width)`` the image is resized to on load.
            Defaults to ``(150, 150)``, the size the model in this notebook
            is built for.

    Returns:
        A float array with a leading batch axis of length 1 and pixel values
        divided by 255.
    """
    img = tf.keras.preprocessing.image.load_img(image_path, target_size=target_size)
    img_array = tf.keras.preprocessing.image.img_to_array(img)
    # The model expects a batch, so add a leading axis of length 1
    img_array = np.expand_dims(img_array, axis=0)
    # Same 1/255 scaling the data generators apply during training/testing
    img_array /= 255.0
    return img_array

def make_prediction(image_path):
    """Return the class label the global ``model`` predicts for one image.

    The image is loaded with ``load_and_preprocess_image`` and the winning
    class index is translated back to its label through
    ``test_generator.class_indices``.
    """
    batch = load_and_preprocess_image(image_path)
    probabilities = model.predict(batch)
    winning_index = np.argmax(probabilities, axis=1)[0]
    # class_indices maps label -> index; invert it for the reverse lookup
    label_by_index = dict(
        (index, label) for label, index in test_generator.class_indices.items()
    )
    return label_by_index[winning_index]

# Classify one sample image and report the result.
# NOTE(review): this is a relative path; confirm it resolves from the
# notebook's working directory.
image_path = 'Drive/sample_flower.jpg'
predicted_label = make_prediction(image_path)
message = f'The predicted class for the image is: {predicted_label}'
print(message)