# Dog Breed Prediction 

### Exploratory Data Analysis

In [None]:
from PIL import Image
from matplotlib import image
from matplotlib import pyplot


# Directory that holds the image files; the trailing slash lets file names be appended directly.
images_dir = "./data/images/"

In [None]:
def get_image_info(image_file):
    """Print basic metadata for one image file and display the image.

    The format, mode and size are read through PIL; the pixel array's dtype
    and shape are read through matplotlib, which is also used to show the
    image.

    Args:
        image_file: Path of the image to inspect. It is converted to a
            string before being handed to PIL and matplotlib.
    """
    path = f'{image_file}'

    # Open through a context manager so the file handle PIL holds is released
    # as soon as the metadata has been printed (the original never closed it).
    with Image.open(path) as im:
        print("Image Format: ", im.format)
        print("Image Mode: ", im.mode)
        print("Image Size: ", im.size)

    # matplotlib decodes the file into a NumPy array for inspection/plotting.
    data = image.imread(path)

    print("datatype: ", data.dtype)
    print("shape: ", data.shape)

    pyplot.imshow(data)
    pyplot.show()

In [None]:
# Inspect one sample image: print its metadata and display it.
get_image_info(f'{images_dir}affenpinscher-7.jpg')

In [None]:
# Inspect a second sample image (note that the file name contains spaces).
get_image_info(f'{images_dir}American Staffordshire terrier-1.jpg')

In [None]:
# Inspect a third sample image.
get_image_info(f'{images_dir}Newfoundland-148.jpg')

In [None]:
import os
import pandas as pd

def gather_image_info(directory):
    """Collect per-file metadata for the images found in ``directory``.

    Only directory entries whose name ends (case-insensitively) with one of
    the known image extensions are inspected. A file that PIL fails to open
    is reported on stdout and left out of the result.

    Args:
        directory: Folder whose entries are listed with ``os.listdir``.

    Returns:
        A pandas DataFrame built from one record per readable image, each
        record holding 'Filename', 'File Type', 'Shape' and 'Image Mode'.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')
    records = []

    for filename in os.listdir(directory):
        # Guard clause: skip anything that does not look like an image file.
        if not filename.lower().endswith(image_suffixes):
            continue

        full_path = os.path.join(directory, filename)
        try:
            with Image.open(full_path) as picture:
                records.append({
                    'Filename': filename,
                    'File Type': picture.format,
                    'Shape': picture.size,
                    'Image Mode': picture.mode,
                })
        except IOError:
            print(f"Cannot open {filename}")

    return pd.DataFrame(records)


# Build the metadata table for every image file in the data directory.
image_info_df = gather_image_info(images_dir)


In [None]:
# Preview the first rows of the metadata table.
print(image_info_df.head())

In [None]:
# Summary statistics of the metadata table (shown as the cell's output).
image_info_df.describe()

In [None]:
# Keep only the rows whose format, as detected by PIL when the file was opened, is not JPEG.
# The check uses the 'File Type' column rather than the file extension.
not_jpeg_images = image_info_df[image_info_df['File Type'] != 'JPEG']
print(not_jpeg_images)

In [None]:
# Inspect the file that is later excluded from loading (see the note below this cell).
get_image_info(f'{images_dir}Shetland sheepdog-23.jpg')

Note that, as shown above, the image "Shetland sheepdog-23.jpg" differs from the rest of the images: despite its '.jpg' extension, it is actually a PNG image with 4 color channels. Since it is the only image like this, we will remove it from the data set.

Other than that, everything looks good. All the remaining images are JPEG images with 3 color channels (RGB). One issue is that the images come in different sizes, so we will have to resize all of them.

## Preprocessing and Processing Data

In [None]:
# Target geometry for every image: 224 x 224 pixels with 3 color channels,
# i.e. arrays of shape (224, 224, 3).
img_width = 224
img_height = 224
channels = 3

# Number of scalar values in one image once it is flattened.
image_arr_size = channels * img_height * img_width

In [None]:
import os
import numpy as np
from PIL import Image

def load_images_and_labels(directory, exclude_file='Shetland sheepdog-23.jpg',
                           target_size=(224, 224)):
    """Load every ``.jpg`` file in ``directory`` as an RGB array with its label.

    File names are expected to look like ``<breed>-<number>.jpg``. The label
    is everything before the *last* hyphen, so breed names that themselves
    contain hyphens (e.g. ``flat-coated retriever-3.jpg``) are kept intact
    instead of being cut at their first hyphen.

    Args:
        directory: Folder whose entries are listed with ``os.listdir``.
        exclude_file: Exact file name to skip.
        target_size: ``(width, height)`` every image is resized to.
            Defaults to ``(224, 224)``, the size previously hard-coded here.

    Returns:
        A tuple ``(images, labels)`` of NumPy arrays with one entry per
        loaded file, in the order ``os.listdir`` returned them.
    """
    images = []
    labels = []

    for filename in os.listdir(directory):
        # Guard clause: only '.jpg' files, minus the explicitly excluded one.
        if not filename.lower().endswith('.jpg') or filename == exclude_file:
            continue

        file_path = os.path.join(directory, filename)

        with Image.open(file_path) as img:
            # Force 3 channels and a common size, then materialize the pixels
            # while the underlying file is still open.
            pixels = np.array(img.convert('RGB').resize(target_size))

        images.append(pixels)
        # Split on the last hyphen only: it separates the breed name from the
        # running number, whereas earlier hyphens belong to the breed name.
        labels.append(filename.rsplit('-', 1)[0])

    return np.array(images), np.array(labels)

# Load the whole data set into memory with the default exclusion applied.
images, labels = load_images_and_labels(images_dir)

In [None]:
# Sanity-check what was loaded: number of images, per-image shape,
# the label array and how many distinct labels it contains.
print("Loaded", images.shape[0], "images.")
print("Images shape:", images.shape[1:])
print("Labels:", labels)
print(f"There are {len(np.unique(labels))} unique labels")

In [None]:
import pandas as pd
# Wrap the labels in a DataFrame to use pandas' summary helpers on them.
labels_pd = pd.DataFrame(labels)
labels_pd.info()
# As the last expression of the cell, the describe() result is shown as the cell output.
labels_pd.describe()

In [None]:
print(images[1][1][1]) # image at index 1 (the second image), pixel at [1][1]: prints its RGB values

Normalizing data

In [None]:
def normalize(image_array):
    """Return the pixel data scaled by 1/255 as a float32 array.

    Args:
        image_array: Array (or sequence of equally shaped arrays) of pixel
            values. Presumably 8-bit values in 0..255, which this maps to
            0..1 -- the function itself does not check the range.

    Returns:
        A new float32 NumPy array with the same shape as the input, with
        every value divided by 255. The input is left untouched.
    """
    # np.array always copies, so the in-place division below cannot modify the
    # caller's data. Converting the whole batch at once avoids the per-image
    # Python loop and the extra list of intermediate copies, which matters
    # when the full image set is held in memory.
    scaled = np.array(image_array, dtype=np.float32)
    scaled /= 255.0
    return scaled

In [None]:
# Replace the raw pixel data with its normalized float32 version.
images = normalize(images)

In [None]:
print(images[1][1][1]) # same pixel as inspected before normalization; its values are now divided by 255

### Splitting Data for testing and training

*Note: gonna try using train_test_split because I like it, may have memory issues but we'll see*

In [None]:
from sklearn.model_selection import train_test_split

# Features are the normalized image arrays; targets are the breed-name labels.
X = images
y = labels

print(y)
# Hold out 25% of the samples for testing. Stratifying on the labels keeps the class
# proportions the same in both sets, and the fixed random_state makes the split
# repeatable for the same input order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=1)

In [None]:
# Confirm the sizes of the training and test partitions (features, then labels).
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

## At this point we have:

- Loaded the data, visualized it, checked for characteristics such as image size, format, etc.
- Put the data into Python data types (NumPy arrays) as the variables `images` and `labels`, which hold all the data.
- Resized the images to (224, 224, 3)
- Normalized the data
- Split the data into testing and training sets

## Model design, training and evaluation

#### Callback function

In [None]:
# From class notes: 
from tensorflow.keras.callbacks import LambdaCallback
 # Define a callback function to print weights and biases at the end of each epoch
def print_weights_and_biases(epoch, logs=None):
    """Print the parameters of every layer of the global ``model``.

    Written to be used as the ``on_epoch_end`` hook of a ``LambdaCallback``,
    which calls it with the epoch index and a logs dictionary.

    Args:
        epoch: Epoch index handed over by the callback; only used in the
            printed header.
        logs: Metrics dictionary handed over by the callback; unused.
    """
    print(f"\nWeights and Biases at the end of Epoch {epoch}:")
    # NOTE(review): this reads a module-level name `model`, which must exist
    # by the time the callback fires -- confirm it refers to the network that
    # is actually being trained.
    for layer in model.layers:
        print(f"Layer: {layer.name}")
        params = layer.get_weights()

        # Layers such as Flatten hold no parameters; unpacking their empty
        # list into (weights, biases) would raise ValueError.
        if not params:
            print("(no weights)")
            continue

        if len(params) == 2:
            weights, biases = params
            print(f"Weights:\n{weights}")
            print(f"Biases:\n{biases}")
            continue

        # Any other count (e.g. a layer without a bias, or one with extra
        # state) is printed generically instead of crashing.
        for index, array in enumerate(params):
            print(f"Parameter {index}:\n{array}")

# Create a LambdaCallback that calls print_weights_and_biases at the end of every epoch.
# NOTE(review): the callback only takes effect when it is passed to `fit(..., callbacks=[...])`.
print_weights_callback = LambdaCallback(on_epoch_end=print_weights_and_biases)

## Artificial Neural Network

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten

# Fully connected baseline network: a Flatten layer followed by four hidden
# Dense layers and a 120-unit softmax output.
# (An earlier version used 128x128x3 inputs; the size is now 224x224x3.)
image_size = (224, 224, 3)
image_arr_size = img_width * img_height * channels


Ann = Sequential() # initialize model

Ann.add(Flatten(input_shape=image_size)) # input layer: flattens each (224, 224, 3) image into one vector

# NOTE(review): input_shape on this Dense layer looks redundant because the
# Flatten layer above already fixes the input size -- confirm and consider dropping it.
Ann.add(Dense(512, activation='relu', input_shape=(image_arr_size,)))
Ann.add(Dense(256, activation='relu'))
Ann.add(Dense(128, activation='relu'))
Ann.add(Dense(64, activation='relu'))

# output layer: one unit per class (120 units)
Ann.add(Dense(120, activation='softmax')) # using softmax because it's a multi-class classification problem

In [None]:
# Print the layer-by-layer summary of the network defined above.
Ann.summary()

In [None]:
from tensorflow.keras.models import load_model

# Reuse a previously trained model when one is on disk; otherwise train and save it.
model_path = "./models/Ann.keras"

# The labels are breed-name strings, which Keras cannot train on directly.
# Map them to integer class ids: class_names[i] is the breed for id i (sorted
# order), and y_train_encoded holds the id of every training sample. Encode any
# other label array with the same mapping, e.g. np.searchsorted(class_names, y_test).
# NOTE(review): the network's output layer has 120 units, so this assumes at
# most 120 distinct breeds -- confirm against the unique-label count printed earlier.
class_names, y_train_encoded = np.unique(y_train, return_inverse=True)

if os.path.isfile(model_path):
    Ann = load_model(model_path)

else:  # this takes like an hour to run
    # Create the output folder *before* training so a missing directory cannot
    # make the final save fail after the long run.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    # Softmax over mutually exclusive classes with integer targets calls for
    # sparse categorical cross-entropy; binary cross-entropy is meant for
    # independent yes/no outputs and does not fit this setup.
    Ann.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    Ann.fit(X_train, y_train_encoded, validation_split=0.2, epochs=20, batch_size=64, verbose=2)
    Ann.save(model_path)