In [1]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from skimage.feature import hog, local_binary_pattern
from collections import Counter
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report

# Set dataset path
dataset_path = r"D:\ML\Assignment 1\tomato\train"

# Get class names
classes = os.listdir(dataset_path)
print(f"Classes found: {classes}")

# Dictionary to store image counts
image_counts = {}
all_images = []
all_labels = []
color_hist_features = []
hog_features = []
lbp_features = []

def preprocess_image(img_path):
    """Preprocess the image - resize, grayscale."""
    img = cv2.imread(img_path)
    img_resized = cv2.resize(img, (128, 128))
    img_gray = cv2.cvtColor(img_resized, cv2.COLOR_BGR2GRAY)
    return img_resized, img_gray

def extract_color_histogram(img, bins=(16, 16, 16)):
    """Extract color histogram from the image."""
    hist = cv2.calcHist([img], [0, 1, 2], None, bins, [0, 256, 0, 256, 0, 256])
    cv2.normalize(hist, hist)
    return hist.flatten()

def extract_hog_features(img_gray):
    """Extract HOG features from grayscale image."""
    features = hog(img_gray, pixels_per_cell=(8, 8), cells_per_block=(3, 3), 
                   visualize=False, block_norm='L2-Hys')
    return np.array(features)

def extract_lbp_features(img_gray, radius=3, n_points=24):
    """Extract Local Binary Pattern (LBP) features from grayscale image."""
    lbp = local_binary_pattern(img_gray, n_points, radius, method='uniform')
    hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, n_points + 3), range=(0, n_points + 2))
    hist = hist.astype("float")
    hist /= (hist.sum() + 1e-6)
    return hist

# Read images and apply preprocessing
total_images = 0
for cls in classes:
    class_path = os.path.join(dataset_path, cls)
    images = os.listdir(class_path)
    image_counts[cls] = len(images)
    total_images += len(images)
    
    for img_name in images[:100]:  # Increased sample size per class
        img_path = os.path.join(class_path, img_name)
        img_resized, img_gray = preprocess_image(img_path)
        
        all_images.append(img_resized)
        all_labels.append(cls)
        
        color_hist_features.append(extract_color_histogram(img_resized))
        hog_features.append(extract_hog_features(img_gray))
        lbp_features.append(extract_lbp_features(img_gray))

print(f"Total number of images found: {total_images}")




Classes found: ['Tomato___Bacterial_spot', 'Tomato___Early_blight', 'Tomato___healthy', 'Tomato___Late_blight', 'Tomato___Leaf_Mold', 'Tomato___Septoria_leaf_spot', 'Tomato___Spider_mites Two-spotted_spider_mite', 'Tomato___Target_Spot', 'Tomato___Tomato_mosaic_virus', 'Tomato___Tomato_Yellow_Leaf_Curl_Virus']
Total number of images found: 10000


In [2]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from skimage.feature import hog, local_binary_pattern
from collections import Counter
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Set dataset path
dataset_path = r"D:\ML\Assignment 1\tomato\train"

# Get class names
classes = os.listdir(dataset_path)
print(f"Classes found: {classes}")

# Feature containers
image_counts = {}
all_images = []
all_labels = []
color_hist_features = []
hog_features = []
lbp_features = []

# Functions for preprocessing and feature extraction
def preprocess_image(img_path):
    img = cv2.imread(img_path)
    img_resized = cv2.resize(img, (128, 128))
    img_gray = cv2.cvtColor(img_resized, cv2.COLOR_BGR2GRAY)
    return img_resized, img_gray

def extract_color_histogram(img, bins=(16, 16, 16)):
    hist = cv2.calcHist([img], [0, 1, 2], None, bins, [0, 256, 0, 256, 0, 256])
    cv2.normalize(hist, hist)
    return hist.flatten()

def extract_hog_features(img_gray):
    return hog(img_gray, pixels_per_cell=(8, 8), cells_per_block=(3, 3), visualize=False, block_norm='L2-Hys')

def extract_lbp_features(img_gray, radius=3, n_points=24):
    lbp = local_binary_pattern(img_gray, n_points, radius, method='uniform')
    hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, n_points + 3), range=(0, n_points + 2))
    hist = hist.astype("float")
    hist /= (hist.sum() + 1e-6)
    return hist

# Load images and extract features
total_images = 0
for cls in classes:
    class_path = os.path.join(dataset_path, cls)
    images = os.listdir(class_path)
    image_counts[cls] = len(images)
    total_images += len(images)
    
    for img_name in images[:100]:  # Limit per class
        img_path = os.path.join(class_path, img_name)
        img_resized, img_gray = preprocess_image(img_path)

        all_images.append(img_resized)
        all_labels.append(cls)

        color_hist_features.append(extract_color_histogram(img_resized))
        hog_features.append(extract_hog_features(img_gray))
        lbp_features.append(extract_lbp_features(img_gray))

print(f"Total number of images found: {total_images}")

# Combine features
X = np.hstack((hog_features, color_hist_features, lbp_features))
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Encode labels
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(all_labels)

# Apply LDA
lda = LDA(n_components=None)
X_lda = lda.fit_transform(X, Y)

# Check label distribution
print(f"Label distribution: {Counter(Y)}")

# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X_lda, Y, test_size=0.2, random_state=42, stratify=Y)

### === SVM ===
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}
svm_model = SVC()
grid_search_svm = GridSearchCV(svm_model, param_grid_svm, cv=5, n_jobs=-1, verbose=1)
grid_search_svm.fit(X_train, Y_train)
best_svm_model = SVC(**grid_search_svm.best_params_)
best_svm_model.fit(X_train, Y_train)

### === Decision Tree ===
param_grid_dt = {
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}
dt_model = DecisionTreeClassifier(random_state=42)
grid_search_dt = GridSearchCV(dt_model, param_grid_dt, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search_dt.fit(X_train, Y_train)
best_dt_model = DecisionTreeClassifier(**grid_search_dt.best_params_, random_state=42)
best_dt_model.fit(X_train, Y_train)

### === Random Forest ===
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf_model = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(rf_model, param_grid_rf, cv=5, n_jobs=-1, verbose=1)
grid_search_rf.fit(X_train, Y_train)
best_rf_model = RandomForestClassifier(**grid_search_rf.best_params_, random_state=42)
best_rf_model.fit(X_train, Y_train)

### === Ensemble Model ===
ensemble_model = VotingClassifier(
    estimators=[
        ('svm', best_svm_model),
        ('dt', best_dt_model),
        ('rf', best_rf_model)
    ],
    voting='hard'  # change to 'soft' if all models support predict_proba
)

ensemble_model.fit(X_train, Y_train)
Y_pred_ensemble = ensemble_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(Y_test, Y_pred_ensemble)
print(f"Ensemble Model Accuracy: {accuracy * 100:.2f}%")
print("Ensemble Classification Report:")
print(classification_report(Y_test, Y_pred_ensemble, target_names=label_encoder.classes_))

print("✅ Ensemble Model Training and Evaluation Completed!")


Classes found: ['Tomato___Bacterial_spot', 'Tomato___Early_blight', 'Tomato___healthy', 'Tomato___Late_blight', 'Tomato___Leaf_Mold', 'Tomato___Septoria_leaf_spot', 'Tomato___Spider_mites Two-spotted_spider_mite', 'Tomato___Target_Spot', 'Tomato___Tomato_mosaic_virus', 'Tomato___Tomato_Yellow_Leaf_Curl_Virus']
Total number of images found: 10000
Label distribution: Counter({0: 100, 1: 100, 9: 100, 2: 100, 3: 100, 4: 100, 5: 100, 6: 100, 8: 100, 7: 100})
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 5 folds for each of 90 candidates, totalling 450 fits
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Ensemble Model Accuracy: 94.50%
Ensemble Classification Report:
                                               precision    recall  f1-score   support

                      Tomato___Bacterial_spot       0.91      1.00      0.95        20
                        Tomato___Early_blight       0.95      0.90      0.92        20
                         Tomat

In [3]:
import joblib

# Save the ensemble model
joblib.dump(ensemble_model, 'ensemble_model.pkl')

# Save the label encoder
joblib.dump(label_encoder, 'label_encoder.pkl')

print("Model and Label Encoder saved successfully!")


Model and Label Encoder saved successfully!


In [3]:
import joblib
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Load your trained models (make sure these are trained on THIS machine)
best_svm_model = joblib.load("best_svm_model.pkl")
best_dt = joblib.load("best_dt_model.pkl")
best_rf_model = joblib.load("best_rf_model.pkl")
label_encoder = joblib.load("label_encoder.pkl")

# Create ensemble
ensemble_model = VotingClassifier(estimators=[
    ('svm', best_svm_model),
    ('dt', best_dt),
    ('rf', best_rf_model)
], voting='hard')

# Train ensemble model again if needed (skip if already trained)
# ensemble_model.fit(X_train, Y_train)

# Save models with your local environment
joblib.dump(ensemble_model, "ensemble_model.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")


FileNotFoundError: [Errno 2] No such file or directory: 'best_svm_model.pkl'

In [3]:
import os
import cv2
import numpy as np
from skimage.feature import hog, local_binary_pattern
from collections import Counter
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Set dataset path
dataset_path = r"D:\ML\Assignment 1\tomato\train"

# Get class names
classes = os.listdir(dataset_path)
print(f"Classes found: {classes}")

# Storage
all_images = []
all_labels = []
color_hist_features = []
hog_features = []
lbp_features = []

def preprocess_image(img_path):
    img = cv2.imread(img_path)
    img_resized = cv2.resize(img, (128, 128))
    img_gray = cv2.cvtColor(img_resized, cv2.COLOR_BGR2GRAY)
    return img_resized, img_gray

def extract_color_histogram(img, bins=(16, 16, 16)):
    hist = cv2.calcHist([img], [0, 1, 2], None, bins, [0, 256]*3)
    cv2.normalize(hist, hist)
    return hist.flatten()

def extract_hog_features(img_gray):
    features = hog(img_gray, pixels_per_cell=(8, 8), cells_per_block=(3, 3), 
                   visualize=False, block_norm='L2-Hys')
    return np.array(features)

def extract_lbp_features(img_gray, radius=3, n_points=24):
    lbp = local_binary_pattern(img_gray, n_points, radius, method='uniform')
    hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, n_points + 3), range=(0, n_points + 2))
    hist = hist.astype("float")
    hist /= (hist.sum() + 1e-6)
    return hist

# Read and process images
for cls in classes:
    class_path = os.path.join(dataset_path, cls)
    images = os.listdir(class_path)[:1000]  # Sample 100 images per class
    
    for img_name in images:
        img_path = os.path.join(class_path, img_name)
        img_resized, img_gray = preprocess_image(img_path)
        
        all_images.append(img_resized)
        all_labels.append(cls)
        
        color_hist_features.append(extract_color_histogram(img_resized))
        hog_features.append(extract_hog_features(img_gray))
        lbp_features.append(extract_lbp_features(img_gray))

# Combine features
X = np.hstack((hog_features, color_hist_features, lbp_features))
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Encode labels
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(all_labels)

# Apply LDA
lda = LDA(n_components=None)
X_lda = lda.fit_transform(X_scaled, Y)

# Train/Test Split
X_train, X_test, Y_train, Y_test = train_test_split(X_lda, Y, test_size=0.2, random_state=42, stratify=Y)

# ------------------------- SVM -------------------------
svm_params = {
    'C': [0.1, 1],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}
svm_model = SVC()
grid_svm = GridSearchCV(svm_model, svm_params, cv=5, n_jobs=-1, verbose=1)
grid_svm.fit(X_train, Y_train)
best_svm_model = grid_svm.best_estimator_

# ------------------------- Decision Tree -------------------------
dt_params = {
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'criterion': ['gini', 'entropy']
}
dt_model = DecisionTreeClassifier(random_state=42)
grid_dt = GridSearchCV(dt_model, dt_params, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_dt.fit(X_train, Y_train)
best_dt = grid_dt.best_estimator_

# ------------------------- Random Forest -------------------------
rf_params = {
    'n_estimators': [100],
    'max_depth': [None, 10],
    'min_samples_split': [2],
    'min_samples_leaf': [1, 2]
}
rf_model = RandomForestClassifier(random_state=42)
grid_rf = GridSearchCV(rf_model, rf_params, cv=5, n_jobs=-1, verbose=1)
grid_rf.fit(X_train, Y_train)
best_rf_model = grid_rf.best_estimator_

# ------------------------- Ensemble -------------------------
ensemble = VotingClassifier(
    estimators=[
        ('svm', best_svm_model),
        ('dt', best_dt),
        ('rf', best_rf_model)
    ],
    voting='hard'
)
ensemble.fit(X_train, Y_train)

# Evaluation
Y_pred = ensemble.predict(X_test)
print(f"Ensemble Accuracy: {accuracy_score(Y_test, Y_pred) * 100:.2f}%")
print("Classification Report:")
print(classification_report(Y_test, Y_pred, target_names=label_encoder.classes_))

# Save all required components
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(lda, 'lda.pkl')
joblib.dump(ensemble, 'ensemble_model.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

print("✅ All models and components saved successfully!")

Classes found: ['Tomato___Bacterial_spot', 'Tomato___Early_blight', 'Tomato___healthy', 'Tomato___Late_blight', 'Tomato___Leaf_Mold', 'Tomato___Septoria_leaf_spot', 'Tomato___Spider_mites Two-spotted_spider_mite', 'Tomato___Target_Spot', 'Tomato___Tomato_mosaic_virus', 'Tomato___Tomato_Yellow_Leaf_Curl_Virus']
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Ensemble Accuracy: 99.95%
Classification Report:
                                               precision    recall  f1-score   support

                      Tomato___Bacterial_spot       1.00      1.00      1.00       200
                        Tomato___Early_blight       1.00      1.00      1.00       200
                         Tomato___Late_blight       1.00      1.00      1.00       200
                           Tomato___Leaf_Mold       1.00      1.00      1.00       200
                  Tom

In [1]:
import sklearn
print(sklearn.__version__)


1.6.1


In [2]:
!pip install scikit-learn==1.6.1 --upgrade


Collecting scikit-learn==1.6.1
  Using cached scikit_learn-1.6.1-cp310-cp310-win_amd64.whl (11.1 MB)
Collecting joblib>=1.2.0
  Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: joblib, scikit-learn
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.1
    Uninstalling joblib-1.1.1:
      Successfully uninstalled joblib-1.1.1
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.1
    Uninstalling scikit-learn-1.2.1:
      Successfully uninstalled scikit-learn-1.2.1


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\krish\\anaconda3\\Lib\\site-packages\\~klearn\\utils\\murmurhash.cp310-win_amd64.pyd'
Consider using the `--user` option or check the permissions.

