In [26]:
! pip install opencv-python
! pip install scikit-image

import cv2
import numpy as np
import pandas as pd
from skimage.feature import hog
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report




In [27]:
# Load the image index. This notebook reads two of its columns:
# "image" (a file path handed to cv2.imread) and "label" (the class name).
df = pd.read_csv("./docs/fruit_data.csv")

# Convert labels to numerical (0 = Healthy, 1 = Rotten)
LABEL_TO_ID = {"Healthy": 0, "Rotten": 1}
label_ids = df["label"].map(LABEL_TO_ID)

# Series.map turns every value that is missing from the mapping into NaN.
# Fail here, naming the offending values, instead of letting NaN labels
# travel on to the classifier.
unmapped = label_ids.isna()
if unmapped.any():
    bad_values = sorted(df.loc[unmapped, "label"].astype(str).unique())
    raise ValueError(
        f"Unexpected values in the 'label' column (expected one of "
        f"{list(LABEL_TO_ID)}): {bad_values}"
    )
df["label"] = label_ids.astype("int64")

# Work on a fixed-size random subset; random_state makes the draw repeatable.
SAMPLE_SIZE = 1000
df_sampled = df.sample(n=SAMPLE_SIZE, random_state=42).reset_index(drop=True)

In [28]:
# Function to load and preprocess images
def load_and_preprocess_images(image_paths, img_size=(64, 64), return_mask=False):
    """Read images as grayscale and resize them to a common size.

    Parameters
    ----------
    image_paths : iterable of str
        File paths, each passed to ``cv2.imread``.
    img_size : tuple of int, default (64, 64)
        Target size passed to ``cv2.resize``.
    return_mask : bool, default False
        When True, also return a boolean array with one entry per input path
        telling whether that path was loaded. Use it to drop the labels of
        skipped images so features and labels stay aligned.

    Returns
    -------
    numpy.ndarray
        The loaded, resized images stacked with ``np.array``. Paths for which
        ``cv2.imread`` returned ``None`` are skipped, so this may hold fewer
        entries than ``image_paths``.
    (numpy.ndarray, numpy.ndarray)
        Only when ``return_mask`` is True: the images and the boolean mask.

    Warns
    -----
    RuntimeWarning
        If at least one path could not be read.
    """
    import warnings

    images = []
    loaded = []   # one flag per input path, in input order
    skipped = []  # paths for which cv2.imread returned None
    for path in image_paths:
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)  # Convert to grayscale
        if img is None:
            loaded.append(False)
            skipped.append(path)
            continue
        images.append(cv2.resize(img, img_size))  # Resize image
        loaded.append(True)

    if skipped:
        # Skipping silently would let callers pair the remaining images with
        # the wrong labels without noticing.
        warnings.warn(
            f"{len(skipped)} of {len(loaded)} images could not be read and were "
            f"skipped (first: {skipped[0]!r}); the returned array has fewer "
            f"entries than image_paths.",
            RuntimeWarning,
            stacklevel=2,
        )

    stacked = np.array(images)
    if return_mask:
        return stacked, np.array(loaded, dtype=bool)
    return stacked

# Read and resize every sampled image; the file paths come from the
# "image" column of the sampled DataFrame.
X_images = load_and_preprocess_images(image_paths=df_sampled["image"])

# Extract HOG features
def extract_hog_features(images):
    """Compute one HOG descriptor per image and stack them into an array.

    Every image is passed to ``hog`` with the same fixed settings:
    8x8-pixel cells, 2x2-cell blocks, 9 orientations and 'L2-Hys' block
    normalisation. The descriptors are returned via ``np.array`` in input order.
    """
    hog_settings = dict(
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        orientations=9,
        block_norm='L2-Hys',
    )
    return np.array([hog(image, **hog_settings) for image in images])

X_features = extract_hog_features(X_images)
y = df_sampled["label"].values  # Labels

# The image loader skips files that cv2.imread cannot read. If that happened,
# X_features has fewer rows than y, and every later positional slice would pair
# features with the wrong labels. Stop here instead of training on shifted labels.
if len(X_features) != len(y):
    raise ValueError(
        f"Feature/label mismatch: {len(X_features)} feature rows but {len(y)} labels. "
        "Some images in df_sampled['image'] could not be loaded; remove or fix "
        "those rows before extracting features."
    )

In [29]:
# Split into training, dev and testing sets (80% / 10% / 10%).
# df.sample already returned the rows in random order, so contiguous slices
# are random subsets. The boundaries are derived from the data size so the
# proportions hold if SAMPLE_SIZE changes; for 1000 samples they are 800 / 900.
n_samples = len(y)
train_end = n_samples * 8 // 10
dev_end = n_samples * 9 // 10

X_train, y_train = X_features[:train_end], y[:train_end]
X_dev, y_dev = X_features[train_end:dev_end], y[train_end:dev_end]
X_test, y_test = X_features[dev_end:], y[dev_end:]

# Base estimator; its hyper-parameters are chosen by the search below.
svm = SVC()

# Search space: 2 kernels x 3 values of C x 2 gamma settings = 12 candidates.
param_grid = dict(
    kernel=['linear', 'rbf'],
    C=[0.1, 1, 10],
    gamma=['scale', 'auto'],
)

# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation inside the training set only, using all CPU cores.
# fit() returns the fitted search object itself.
grid_search = GridSearchCV(
    svm,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# Keep the estimator built from the best-scoring parameter combination.
best_svm = grid_search.best_estimator_

# Evaluate on dev set
y_dev_pred = best_svm.predict(X_dev)
dev_accuracy = accuracy_score(y_dev, y_dev_pred)
print(f"Dev set accuracy: {dev_accuracy * 100:.2f}%")
print("\nDev Set Classification Report:")
# labels=[0, 1] pins the report rows to the two encoded classes, so the two
# target_names always match even if this split happens to contain (or the
# model predicts) only one class.
print(
    classification_report(
        y_dev,
        y_dev_pred,
        labels=[0, 1],
        target_names=["Healthy", "Rotten"],
    )
)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Dev set accuracy: 67.00%

Dev Set Classification Report:
              precision    recall  f1-score   support

     Healthy       0.60      0.75      0.67        44
      Rotten       0.76      0.61      0.67        56

    accuracy                           0.67       100
   macro avg       0.68      0.68      0.67       100
weighted avg       0.69      0.67      0.67       100



In [30]:
# Make predictions and evaluate on test set
y_test_pred = best_svm.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test set accuracy: {test_accuracy * 100:.2f}%")
print("\nTest Set Classification Report:")
# labels=[0, 1] pins the report rows to the two encoded classes, so the two
# target_names always match even if this split happens to contain (or the
# model predicts) only one class.
print(
    classification_report(
        y_test,
        y_test_pred,
        labels=[0, 1],
        target_names=["Healthy", "Rotten"],
    )
)

Test set accuracy: 69.00%

Test Set Classification Report:
              precision    recall  f1-score   support

     Healthy       0.76      0.56      0.64        50
      Rotten       0.65      0.82      0.73        50

    accuracy                           0.69       100
   macro avg       0.70      0.69      0.68       100
weighted avg       0.70      0.69      0.68       100

