In [None]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

from cnn_utils import get_labels_and_sub_images, display_result_metrics

In [None]:
# Read the two pickled inputs from the shared data directory.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so these
# paths must only ever point at trusted, project-generated files.
data_path = "../Data"

with open(f"{data_path}/water_bb_data.pkl", "rb") as bb_file, \
        open(f"{data_path}/water_images.pkl", "rb") as img_file:
    # Bounding box data
    bb_data = pickle.load(bb_file)
    # Image data
    img_data = pickle.load(img_file)

In [None]:
# Split the images into sub-images and separate them, together with their
# labels, into the water (label 1) and no-water (label 0) groups.
# The return order (label-1 images, label-1 labels, label-0 images, label-0 labels)
# is taken from the unpacking below; the exact meaning of `threshold=0` is
# defined in cnn_utils -- TODO confirm against that helper.
label_1_img, label_1, label_0_img, label_0 = get_labels_and_sub_images(img_data, bb_data, threshold=0)

In [None]:
# Augment the water class with two flipped copies of every water sub-image.
# NOTE(review): a flip only permutes pixel positions, so the per-channel mean
# computed later by calculate_average_rgb is identical for the flipped copies;
# they end up as exact duplicate feature rows. Because this happens before the
# train/test split, copies of one sub-image can also fall on both sides of it.
# NOTE(review): this cell extends label_1_img / label_1 in place, so re-running
# it without re-running the cell above augments the already augmented lists.
transformed_images = [
    flipped
    for sub_image in label_1_img
    # np.fliplr mirrors left-to-right, np.flipud flips top-to-bottom
    for flipped in (np.fliplr(sub_image), np.flipud(sub_image))
]

# Every flipped copy keeps the water label
transformed_labels = [1] * len(transformed_images)

# Append the new samples to the water-class lists
label_1_img += transformed_images
label_1 += transformed_labels

In [None]:
# Stack both classes into single arrays, water samples first and no-water
# samples second; images and labels use the same order so they stay aligned.
total_images = np.array(label_1_img + label_0_img)
total_labels = np.array(label_1 + label_0)

In [None]:
# Hold out 20% of all samples as the test set (fixed seed for repeatability)
training_input, testing_input, training_label, testing_label = train_test_split(
    total_images, total_labels, test_size=.2, random_state=5
)

# Take 20% of what is left as the validation set, giving roughly a
# 64% / 16% / 20% train / validation / test partition overall
(
    training_input,
    validation_input,
    training_label,
    validation_label,
) = train_test_split(training_input, training_label, test_size=.2, random_state=5)

In [None]:
def calculate_average_rgb(images: np.ndarray) -> np.ndarray:
    """Calculates the average red, green, and blue values for each sub-image.

    Each sub-image is flattened to a list of pixels and averaged per channel.
    The pixel count is inferred from the data, so sub-images of any
    height x width are accepted as long as they have 3 channels.

    Args:
        images (np.ndarray): Sequence of sub-images, each with a total size
            that is a multiple of 3 (e.g. shape (height, width, 3))

    Returns:
        np.ndarray: Array of shape (number of sub-images, 3) holding the
            average value of each of the 3 channels for each sub-image

    Raises:
        ValueError: If a sub-image cannot be reshaped into rows of 3 values
    """
    average_rgb_images = [
        # -1 lets NumPy infer the number of pixels instead of assuming 150x150
        np.asarray(img).reshape(-1, 3).mean(axis=0)
        for img in images
    ]

    # Keep the (n, 3) feature layout even when there are no sub-images
    if not average_rgb_images:
        return np.empty((0, 3))

    return np.array(average_rgb_images)

In [None]:
# Replace every split's sub-images with their 3-value average colour features
training_input, validation_input, testing_input = (
    calculate_average_rgb(split)
    for split in (training_input, validation_input, testing_input)
)

In [None]:
# Build the forest with a fixed seed so repeated runs grow the same trees
random_forest = RandomForestClassifier(random_state=5)

# Train on the average-colour features of the training split
random_forest.fit(X=training_input, y=training_label)

In [None]:
def evaluate_random_forest(classifier, input:np.array, actual_labels:np.array) -> None:
    """Runs a fitted classifier on one data split and displays its metrics

    Args:
        classifier (sklearn classifier): Fitted model exposing a ``predict`` method
        input (np.array): Feature rows the classifier makes predictions for
        actual_labels (np.array): True label for each row of ``input``
    """
    # For scikit-learn classifiers ``predict`` returns class labels rather than
    # probabilities, so for 0/1 labels the cut-off below leaves them unchanged
    raw_predictions = classifier.predict(input)

    # Binarise: values strictly above .5 become 1, everything else becomes 0
    predicted_labels = [int(value > .5) for value in raw_predictions]

    # Pass true and predicted labels to the shared reporting helper
    display_result_metrics(actual_labels, predicted_labels)

In [None]:
# Training: metrics on the same split the forest was fitted on
evaluate_random_forest(random_forest, training_input, training_label)

In [None]:
# Validation: metrics on the split carved out of the training data before fitting
evaluate_random_forest(random_forest, validation_input, validation_label)

In [None]:
# Testing: metrics on the split held out by the first train/test split
evaluate_random_forest(random_forest, testing_input, testing_label)

In [None]:
# Persist the fitted forest as a pickle inside the water_classifier model folder
model_name = "water_classifier"
model_path = f"../Models/water_classifier/{model_name}"

with open(model_path, "wb") as model_file:
    pickle.dump(random_forest, model_file)