In [1]:
import cv2
import numpy as np
from pathlib import Path
import joblib
from tqdm import tqdm
import time
# scikit-learn imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction import image
from sklearn.base import BaseEstimator, TransformerMixin
from skimage.feature import hog


In [2]:
class ImageFeatureExtractor(BaseEstimator, TransformerMixin):
    """Custom transformer for image feature extraction using scikit-learn pipeline"""
    def __init__(self, target_size=(64, 64)):
        self.target_size = target_size
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        features_list = []
        
        for img in tqdm(X, desc="Extracting features"):
            # Ensure correct size
            if img.shape[:2] != self.target_size:
                img = cv2.resize(img, self.target_size)
            
            # Convert to grayscale if needed
            if len(img.shape) > 2:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            
            # Extract features using scikit-learn's image features
            hog_feat = hog(img, orientations=8, pixels_per_cell=(16, 16),
                               cells_per_block=(1, 1), visualize=False)
            
            # Basic statistical features
            stat_features = [
                np.mean(img),
                np.std(img),
                np.percentile(img, 25),
                np.percentile(img, 75)
            ]
            
            # Combine all features
            combined_features = np.concatenate([hog_feat, stat_features])
            features_list.append(combined_features)
            
        return np.array(features_list)

In [3]:
class OliveFlyDetector:
    def __init__(self, n_trees=100, max_depth=10):
        """Initialize detector with scikit-learn pipeline"""
        self.pipeline = Pipeline([
            ('feature_extractor', ImageFeatureExtractor()),
            ('scaler', StandardScaler()),
            ('classifier', RandomForestClassifier(
                n_estimators=n_trees,
                max_depth=max_depth,
                n_jobs=1,
                random_state=42
            ))
        ])
        
    def train(self, X, y):
        """Train the model using scikit-learn pipeline"""
        # Split data for validation
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        
        print("Training model...")
        start_time = time.time()
        # Fixed: Changed fit_transform to fit
        self.pipeline.fit(X_train, y_train)
        training_time = time.time() - start_time
        
        # Print validation results
        y_pred = self.pipeline.predict(X_val)
        print(f"\nTraining completed in {training_time:.2f} seconds")
        print("\nModel Performance:")
        print(classification_report(y_val, y_pred))
        
    def predict_batch(self, images, image_paths):
        """Predict multiple images with progress bar"""
        results = []
        
        for img, path in tqdm(zip(images, image_paths), total=len(images), desc="Processing images"):
            prediction, probability = self.predict(img)
            results.append({
                'path': path,
                'prediction': 'Olive Fly' if prediction else 'Other Insect',
                'confidence': probability[1] if prediction else probability[0]
            })
            
        return results
    
    def predict(self, image):
        """Predict single image"""
        X = np.array([image])
        prediction = self.pipeline.predict(X)[0]
        probability = self.pipeline.predict_proba(X)[0]
        return prediction, probability
    
    def save_model(self, filepath):
        """Save scikit-learn pipeline"""
        joblib.dump(self.pipeline, filepath)
    
    @classmethod
    def load_model(cls, filepath):
        """Load scikit-learn pipeline"""
        instance = cls()
        instance.pipeline = joblib.load(filepath)
        return instance

In [4]:
def load_dataset(data_folder):
    """Load training data with progress bar"""
    images = []
    labels = []
    paths = []
    
    data_path = Path(data_folder)
    
    # Count total files for progress bar
    total_files = len(list((data_path / 'olive_flies').glob('*.jpg'))) + \
                  len(list((data_path / 'other_insects').glob('*.jpg')))
    
    with tqdm(total=total_files, desc="Loading dataset") as pbar:
        # Load positive examples
        for img_path in (data_path / 'olive_flies').glob('*.jpg'):
            img = cv2.imread(str(img_path))
            if img is not None:
                images.append(img)
                labels.append(1)
                paths.append(str(img_path))
            pbar.update(1)
        
        # Load negative examples
        for img_path in (data_path / 'other_insects').glob('*.jpg'):
            img = cv2.imread(str(img_path))
            if img is not None:
                images.append(img)
                labels.append(0)
                paths.append(str(img_path))
            pbar.update(1)
    
    return np.array(images), np.array(labels), paths

def predict_images(test_folder, model_path):
    """Predict all images in a folder"""
    detector = OliveFlyDetector.load_model(model_path)
    
    # Load test images
    images = []
    image_paths = []
    test_path = Path(test_folder)
    
    print(f"\nLoading test images from {test_folder}...")
    for img_path in tqdm(list(test_path.glob('*.jpg')), desc="Loading test images"):
        img = cv2.imread(str(img_path))
        if img is not None:
            images.append(img)
            image_paths.append(str(img_path))
    
    if not images:
        print("No images found in test folder!")
        return
    
    # Process all images
    results = detector.predict_batch(images, image_paths)
    
    # Print results
    print("\nResults:")
    print("-" * 80)
    print(f"{'Image':<50} | {'Prediction':<15} | {'Confidence':<10}")
    print("-" * 80)
    for result in results:
        img_name = Path(result['path']).name
        print(f"{img_name:<50} | {result['prediction']:<15} | {result['confidence']:.2f}")

def main():
    """Main execution"""
    MODEL_PATH = 'olive_fly_model.joblib'
    DATA_FOLDER = 'training_data'
    TEST_FOLDER = 'test_images'  # Folder with images to classify
    
    if not Path(MODEL_PATH).exists():
        print("Training new model...")
        
        # Load dataset
        images, labels, _ = load_dataset(DATA_FOLDER)
        if len(images) == 0:
            print(f"No training images found in {DATA_FOLDER}")
            print("Please create folders:")
            print(f"  {DATA_FOLDER}/olive_flies/")
            print(f"  {DATA_FOLDER}/other_insects/")
            return
        
        # Train detector
        detector = OliveFlyDetector()
        detector.train(images, labels)
        detector.save_model(MODEL_PATH)
        print(f"Model saved to {MODEL_PATH}")
    
    # Predict test images
    predict_images(TEST_FOLDER, MODEL_PATH)

if __name__ == "__main__":
    main()

Training new model...


Loading dataset: 100%|███████████████████████████████████████████████████████████| 2336/2336 [00:00<00:00, 2535.96it/s]


Training model...


Extracting features: 100%|████████████████████████████████████████████████████████| 1868/1868 [00:02<00:00, 779.54it/s]
Extracting features: 100%|██████████████████████████████████████████████████████████| 468/468 [00:00<00:00, 701.31it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Training completed in 4.42 seconds

Model Performance:
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       400
           1       0.00      0.00      0.00        68

    accuracy                           0.85       468
   macro avg       0.43      0.50      0.46       468
weighted avg       0.73      0.85      0.79       468

Model saved to olive_fly_model.joblib

Loading test images from test_images...


Loading test images: 100%|█████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 45.17it/s]
Processing images:   0%|                                                                        | 0/14 [00:00<?, ?it/s]
Extracting features: 100%|██████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 332.04it/s]

Extracting features: 100%|██████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 335.30it/s]

Extracting features: 100%|██████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 199.99it/s]

Extracting features: 100%|██████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 499.98it/s]
Processing images:  14%|█████████▏                                                      | 2/14 [00:00<00:01, 10.09it/s]
Extracting features: 100%|██████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 478.26it/s]

Extracting features: 100%|██████████


Results:
--------------------------------------------------------------------------------
Image                                              | Prediction      | Confidence
--------------------------------------------------------------------------------
castellar_2_1 156 referencia.JPG                   | Other Insect    | 0.72
castellar_2_1 157 referencia.JPG                   | Other Insect    | 0.70
castellar_2_1 160 referencia.JPG                   | Other Insect    | 0.61
castellar_2_1 162 referencia.JPG                   | Other Insect    | 0.76
castellar_2_1 169 referencia.JPG                   | Other Insect    | 0.85
castellar_2_1 184 referencia.JPG                   | Other Insect    | 0.71
castellar_2_1 191 referencia.JPG                   | Other Insect    | 0.66
castellar_2_1 192 referencia.JPG                   | Other Insect    | 0.63
castellar_2_1 194 referencia.JPG                   | Other Insect    | 0.71
castellar_2_1 196 referencia.JPG                   | Other Ins