# Worm detector using a random forest classifier

## Load libraries

In [19]:
import os
import numpy as np
from PIL import Image
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib
from datetime import datetime

## Paths to data and models

In [23]:
# Dataset locations, given relative to the current working directory.
path_to_train_dataset = "worm_categories/train"  # fits the simple model
path_to_test_dataset = "worm_categories/val"  # scores the simple model
data_for_grid_search = "worm_categories/full_dataset"  # input of the grid search
# Directory the tuned model is saved into.
path_to_models = "models"

## Function to load data

In [3]:
# Define a function to load images and convert them into flattened arrays
def load_images(folder, size=(128, 128), mode=None):
    """Load a folder-per-class image dataset as flattened pixel arrays.

    ``folder`` is expected to contain one sub-directory per class; the
    sub-directory name becomes the label of every image file inside it.
    Entries of ``folder`` that are not directories are ignored, and so is
    anything inside a class directory that is not a regular file (for
    example a nested ``.ipynb_checkpoints`` directory).

    Args:
        folder: Path of the dataset root directory.
        size: ``(width, height)`` every image is resized to before it is
            flattened.
        mode: Optional Pillow mode (e.g. ``"RGB"`` or ``"L"``) every image
            is converted to. ``None`` keeps each image's own mode, in which
            case all images must already share one mode for the rows to
            have the same length.

    Returns:
        A tuple ``(images, labels)`` of NumPy arrays: one flattened pixel
        vector per image and the matching class names. Rows are ordered by
        sorted class name, then by sorted file name.
    """
    images = []
    labels = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order stable from one run to the next.
    for label in sorted(os.listdir(folder)):
        label_folder = os.path.join(folder, label)
        if not os.path.isdir(label_folder):
            continue
        for file in sorted(os.listdir(label_folder)):
            file_path = os.path.join(label_folder, file)
            if not os.path.isfile(file_path):
                # Nested directories cannot be opened as images.
                continue
            # Pillow opens files lazily; the context manager guarantees the
            # underlying file is released once the pixels have been copied,
            # including when an error is raised while processing the image.
            with Image.open(file_path) as opened:
                picture = opened if mode is None else opened.convert(mode)
                resized = picture.resize(size)
                images.append(np.array(resized).flatten())
            labels.append(label)
    return np.array(images), np.array(labels)

In [4]:

# Read the training and validation folders into (features, labels) arrays.
X_train, y_train = load_images(folder=path_to_train_dataset)
X_test, y_test = load_images(folder=path_to_test_dataset)


## Simple Random Forest Classifier

In [5]:
# Baseline model: a 100-tree forest with a fixed seed, fitted on the training split.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Label the held-out images and report the fraction that was labelled correctly.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6875


## Random forest classifier with grid search and k-fold cross-validation

In [11]:

# Load the dataset the hyper-parameter search is run on.
X, y = load_images(data_for_grid_search)

# Candidate values; every combination is evaluated (4 * 2 * 4 * 2 = 64).
param_grid = {
    'n_estimators': [10, 50, 100, 200],  # Number of trees in the forest
    'max_features': ['sqrt', 'log2'],  # Number of features to consider at every split
    'max_depth': [4, 6, 8, 12],  # Maximum number of levels in tree
    'criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
}

# Seed the forest, as the baseline model does: without a fixed random_state the
# scores of the parameter combinations also vary with random tree construction,
# and re-running the search on the same data can pick different "best" parameters.
rf = RandomForestClassifier(random_state=42)

# Exhaustive search over param_grid, scored with 5-fold cross-validation.
CV_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)

In [12]:
# Run the search: every combination in param_grid is fitted and scored on each
# cross-validation fold of (X, y), so this cell can take a while.
CV_rf.fit(X, y)

In [13]:
# Show the parameter combination that scored best in cross-validation.
print("Best Parameters:", CV_rf.best_params_)

Best Parameters: {'criterion': 'gini', 'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 100}


In [14]:
# best_estimator_ is the estimator GridSearchCV refitted with the best
# parameters (available because refit is left at its default of True).
best_model = CV_rf.best_estimator_

In [24]:
from datetime import timezone

# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs(path_to_models, exist_ok=True)

# Timestamp the file name in UTC at minute resolution so that runs started in
# different minutes do not overwrite each other. datetime.utcnow() is deprecated
# since Python 3.12; an aware "now" formats to the same string.
now = datetime.now(timezone.utc).strftime('%Y-%m-%d_%H%M')
joblib.dump(best_model, f'{path_to_models}/best_random_forest_model_{now}.pkl')

['models/best_random_forest_model_2023-11-15_1612.pkl']

In [None]:
# (empty cell)