# Worm detector using a random forest classifier

## Load libraries

In [1]:
import json
import os
from datetime import datetime, timezone

import joblib
import numpy as np
from PIL import Image
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

## Path to data

In [2]:
# Folders holding the training and validation images.
path_to_train_dataset = 'worm_categories/train'
path_to_test_dataset = 'worm_categories/val'

# Folders used as input to the two grid-search runs.
data_for_grid_search = 'worm_categories/full_dataset'
more_data_for_grid_search = 'sorted_data'

# Folder where trained models and label mappings are written.
path_to_models = 'models'

# Size every image is resized to before it is flattened.
preprocess_image_size = (128, 128)

## Pre-process images

In [3]:
def preprocess_image(image_path, size=preprocess_image_size):
    """Load an image, resize it, convert it to greyscale and flatten it.

    Args:
        image_path: Path of the image file to load.
        size: Target size passed to ``Image.resize``. Defaults to the
            module-level ``preprocess_image_size``.

    Returns:
        A 1-D NumPy array holding the pixels of the resized 8-bit
        greyscale ('L' mode) image.
    """
    # Open inside a context manager so the underlying file handle is always
    # released; otherwise handles can pile up when many files are loaded.
    with Image.open(image_path) as image:
        # resize() returns a new, fully loaded image that stays valid after
        # the source file has been closed.
        resized = image.resize(size)

    if resized.mode != 'L':
        resized = resized.convert('L')

    return np.array(resized).flatten()


## Function to load data

In [4]:
# Define a function to load images and convert them into flattened arrays.
# It also returns the mapping between label names and label indices for later predictions.

def load_images(folder, extensions=('.tif',)):
    """Load all images below ``folder`` as flattened arrays with integer labels.

    ``folder`` is scanned for sub-directories; each sub-directory name is
    treated as a class label and the matching files inside it as that
    class's samples. Entries of ``folder`` that are not directories are
    ignored.

    Args:
        folder: Root directory containing one sub-directory per label.
        extensions: File-name suffix, or iterable of suffixes, a file must
            end with to be loaded. Matching is case-sensitive. Defaults to
            ``('.tif',)``.

    Returns:
        A tuple ``(images, labels, label_to_idx)`` where ``images`` is an
        array with one flattened image per row, ``labels`` is the array of
        integer label indices in the same order, and ``label_to_idx`` maps
        each label name to its index.
    """
    # str.endswith accepts a str or a tuple, so normalise other iterables.
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(extensions)

    images = []
    labels = []
    label_to_idx = {}  # label name -> integer index

    # Sorted so that a label always receives the same index across runs.
    for label in sorted(os.listdir(folder)):
        label_folder = os.path.join(folder, label)
        if not os.path.isdir(label_folder):
            continue

        # Indices are handed out in order of first appearance: 0, 1, 2, ...
        label_idx = label_to_idx.setdefault(label, len(label_to_idx))

        # os.listdir returns entries in arbitrary order; sort the files too so
        # the sample order (and therefore any CV split) is reproducible.
        for file in sorted(os.listdir(label_folder)):
            if not file.endswith(suffixes):
                continue
            file_path = os.path.join(label_folder, file)
            images.append(preprocess_image(file_path, size=preprocess_image_size))
            labels.append(label_idx)

    return np.array(images), np.array(labels), label_to_idx


In [5]:

# Load the training and validation sets together with their label mappings
X_train, y_train, train_label_to_idx = load_images(path_to_train_dataset)
X_test, y_test, test_label_to_idx = load_images(path_to_test_dataset)

# Label indices are assigned per folder, so both folders must contain the
# same class sub-folders; otherwise the integer labels are not comparable
# and any accuracy computed on them would be silently wrong.
if train_label_to_idx != test_label_to_idx:
    raise ValueError(
        "Train and test label mappings differ: "
        f"{train_label_to_idx} vs {test_label_to_idx}"
    )


## Simple Random Forest Classifier

In [6]:
# Fit a 100-tree random forest with a fixed seed on the training set
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Classify the held-out images and compare against their true labels
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.75


## Random forest classifier with grid search and k-fold cross-validation

In [7]:
def random_forest_grid_search(data, param_grid=None, cv=5, random_state=42, n_jobs=None):
    """Grid-search random forest hyper-parameters with k-fold cross-validation.

    Args:
        data: Root folder of the dataset, passed to ``load_images``.
        param_grid: Mapping of ``RandomForestClassifier`` parameter names to
            the lists of values to try. ``None`` selects the default grid
            defined below.
        cv: Cross-validation setting forwarded to ``GridSearchCV``
            (default: 5 folds).
        random_state: Seed for the forest so that repeated searches on the
            same data give the same result. Pass ``None`` for unseeded runs.
        n_jobs: Parallelism setting forwarded to ``GridSearchCV``.

    Returns:
        A tuple ``(CV_rf, label_to_idx)``: the fitted ``GridSearchCV`` object
        and the label-name-to-index mapping of the loaded dataset.
    """
    X, y, label_to_idx = load_images(data)

    # Build the default grid per call so callers never share a mutable default.
    if param_grid is None:
        param_grid = {
            'n_estimators': [10, 50, 100, 200],  # Number of trees in the forest
            'max_features': ['sqrt', 'log2'],  # Number of features to consider at every split
            'max_depth': [4, 6, 8, 12],  # Maximum number of levels in tree
            'criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
        }

    # Seed the estimator: without it every run trains different forests, so
    # the selected parameters and the best score are not reproducible.
    rf = RandomForestClassifier(random_state=random_state)

    CV_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, n_jobs=n_jobs)
    CV_rf.fit(X, y)

    return CV_rf, label_to_idx

In [8]:
# Run the grid search on the full dataset; keep the fitted search object and
# the label-name-to-index mapping for the cells below.
CV_rf, label_to_idx = random_forest_grid_search(data_for_grid_search)

In [9]:
# Show the parameter combination selected by the grid search
print("Best Parameters:", CV_rf.best_params_)

Best Parameters: {'criterion': 'entropy', 'max_depth': 12, 'max_features': 'sqrt', 'n_estimators': 100}


In [10]:
# Show the cross-validated score achieved by the best parameter combination
print("Best Score", CV_rf.best_score_)

Best Score 0.75


In [11]:
best_model = CV_rf.best_estimator_
# Make sure the output folder exists; neither joblib.dump nor open() creates it
os.makedirs(path_to_models, exist_ok=True)
# Grab a timezone-aware UTC timestamp (datetime.utcnow() is deprecated since
# Python 3.12); the formatted string is the same as before
now = datetime.now(timezone.utc).strftime('%Y-%m-%d_%H%M')
# Save the model
joblib.dump(best_model, f'{path_to_models}/best_random_forest_model_{now}.pkl')
# Save the label mapping (not timestamped, so each run overwrites the last)
with open(f'{path_to_models}/random_forest_labels_to_idx.json', 'w') as f:
    json.dump(label_to_idx, f)

## With new data

In [12]:
# Repeat the grid search on the second dataset; this rebinds CV_rf and
# label_to_idx from the earlier run.
CV_rf, label_to_idx = random_forest_grid_search(more_data_for_grid_search)

In [13]:
# Show the parameter combination selected on the second dataset
print("Best Parameters:", CV_rf.best_params_)

Best Parameters: {'criterion': 'entropy', 'max_depth': 12, 'max_features': 'sqrt', 'n_estimators': 100}


In [14]:
# Show the best cross-validated score on the second dataset, to 4 decimals
print(f"Best Score: {CV_rf.best_score_:0.4f}")

Best Score: 0.9624


In [15]:
best_model = CV_rf.best_estimator_
# Make sure the output folder exists; neither joblib.dump nor open() creates it
os.makedirs(path_to_models, exist_ok=True)
# Grab a timezone-aware UTC timestamp (datetime.utcnow() is deprecated since
# Python 3.12); the formatted string is the same as before
now = datetime.now(timezone.utc).strftime('%Y-%m-%d_%H%M')
# Save the model
joblib.dump(best_model, f'{path_to_models}/best_random_forest_model_more_data_{now}.pkl')
# Save the label mapping (not timestamped, so each run overwrites the last)
with open(f'{path_to_models}/random_forest_labels_to_idx_more_data.json', 'w') as f:
    json.dump(label_to_idx, f)