## Download & Extract Data

In [3]:
import zipfile

In [2]:
!wget https://github.com/SVizor42/ML_Zoomcamp/releases/download/bee-wasp-data/data.zip

--2023-11-20 08:18:11--  https://github.com/SVizor42/ML_Zoomcamp/releases/download/bee-wasp-data/data.zip
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/405934815/e6c56cb7-dce1-463f-865b-01e913c38485?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20231120%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20231120T131812Z&X-Amz-Expires=300&X-Amz-Signature=440651f5b2be760c2dd1f764af22696bebc2de160e17e820a3d38677091f8c16&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=405934815&response-content-disposition=attachment%3B%20filename%3Ddata.zip&response-content-type=application%2Foctet-stream [following]
--2023-11-20 08:18:12--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/405934815/e6c56cb7-dce1-463f-865b-01e913c38485?X-Amz-Algor

In [8]:
local_zip = 'data.zip'
zip_ref = zipfile.ZipFile(local_zip, 'r')
zip_ref.extractall('data')
zip_ref.close()

## EDA

In [4]:
import os

base_dir = 'data/data'

train_dir = os.path.join(base_dir, 'train')
test_dir = os.path.join(base_dir, 'test')

train_bees_dir = os.path.join(train_dir, 'bee')
train_wasps_dir = os.path.join(train_dir, 'wasp')
test_bees_dir = os.path.join(test_dir, 'bee')
test_wasps_dir = os.path.join(test_dir, 'wasp')

In [5]:
# Checking the number of images in each directory
print('Number of training bee images:', len(os.listdir(train_bees_dir)))
print('Number of training wasp images:', len(os.listdir(train_wasps_dir)))
print('Number of test bee images:', len(os.listdir(test_bees_dir)))
print('Number of test wasp images:', len(os.listdir(test_wasps_dir)))

Number of training bee images: 1976
Number of training wasp images: 1701
Number of test bee images: 493
Number of test wasp images: 425


## Convolutional Neural Network Architecture

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.optimizers import SGD

2023-11-20 12:10:25.969625: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [17]:
# Building the CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compiling the model with SGD optimizer
sgd_optimizer = SGD(learning_rate=0.002, momentum=0.8)

model.compile(optimizer=sgd_optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_1 (Conv2D)           (None, 148, 148, 32)      896       
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 74, 74, 32)        0         
 g2D)                                                            
                                                                 
 flatten_1 (Flatten)         (None, 175232)            0         
                                                                 
 dense_2 (Dense)             (None, 64)                11214912  
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 11215873 (42.79 MB)
Trainable params: 11215873 (42.79 MB)
Non-trainable params: 0 (0.00 Byte)
____________

## Data Preprocessing

In [8]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [9]:
train_datagen = ImageDataGenerator(
    rescale=1./255,
    # rotation_range=40,
    # width_shift_range=0.2,
    # shear_range=0.2,
    # horizontal_flip=True,
    # fill_mode='nearest'
)

In [10]:
test_datagen = ImageDataGenerator(rescale=1./255)

In [19]:
# Flow training images in batches using train_datagen generator

img_size = (150, 150)
batch_size = 20

train_generator = train_datagen.flow_from_directory(
    train_dir,  
    target_size=img_size,  
    batch_size=batch_size,
    class_mode='binary',
    shuffle=True
)

Found 3677 images belonging to 2 classes.


In [20]:
# Flow validation images in batches using test_datagen generator
test_generator = test_datagen.flow_from_directory(
    test_dir,
    target_size=img_size,
    batch_size=batch_size,
    class_mode='binary',
    shuffle=True
)

Found 918 images belonging to 2 classes.


## Train Model

In [21]:
import scipy

In [22]:

# Training the model
history = model.fit(
    train_generator,
    # steps_per_epoch=train_generator.samples // batch_size,
    epochs=10,
    validation_data=test_generator,
    # validation_steps=test_generator.samples // batch_size
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Questions & Exploration

In [23]:
import numpy as np

# Extracting training accuracy from the history object
training_accuracy = [0.6557, 0.7223, 0.7425, 0.7705, 0.7841, 0.8055, 0.8126, 0.8306, 0.8493, 0.8725]

# Calculating the median
median_training_accuracy = np.median(training_accuracy)

print("Median of training accuracy:", median_training_accuracy)


Median of training accuracy: 0.7948


In [29]:
# Extracting training loss from the history object
training_loss = [0.6146, 0.5567, 0.5286, 0.4924, 0.4696, 0.4441, 0.4305, 0.3939, 0.3584, 0.3282]

# Calculating the standard deviation
std_dev_training_loss = np.std(training_loss)

print("Standard deviation of training loss:", std_dev_training_loss)


Standard deviation of training loss: 0.08500417636798795


## Now with Data Augmentation

In [30]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Updated data generator for training with augmentations
train_datagen_augmented = ImageDataGenerator(
    rescale=1./255,
    rotation_range=50,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Flow training images in batches using train_datagen_augmented generator
train_generator_augmented = train_datagen_augmented.flow_from_directory(
    train_dir,  
    target_size=img_size,  
    batch_size=batch_size,
    class_mode='binary',
    shuffle=True
)


Found 3677 images belonging to 2 classes.


In [31]:
# Continue training the model with augmented data
history_continued = model.fit(
    train_generator_augmented,
    epochs=10,  # or any desired number of additional epochs
    validation_data=test_generator,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [32]:
# Test loss values for each epoch
test_loss_values = [0.4646, 0.4763, 0.4987, 0.5274, 0.4627, 0.4587, 0.4577, 0.5067, 0.4427, 0.4992]

# Calculating the mean
mean_test_loss = sum(test_loss_values) / len(test_loss_values)

print("Mean of test loss for all epochs:", mean_test_loss)


Mean of test loss for all epochs: 0.47946999999999995
