In [41]:
import numpy as np
import pandas as pd
import lightgbm
import xgboost as xgb

In [3]:
# Get all the names of the subfolders
import os

# Root folder of the dataset; the names of its immediate sub-directories are collected below.
main_path = '/Users/karlboma/Documents/Python Stuff/Side Projects/Tea Disease Classification Project/tea sickness dataset'

# Keep directory entries only, so plain files sitting next to the class folders are ignored.
sub_folders = [
    entry
    for entry in os.listdir(main_path)
    if os.path.isdir(os.path.join(main_path, entry))
]

print(sub_folders)

['white spot', 'Anthracnose', 'healthy', 'gray light', 'bird eye spot', 'algal leaf', 'brown blight', 'red leaf spot']


In [5]:
# Convert every image into a NumPy array and append it to the data and target lists
from PIL import Image
# Load every image under each class folder as a flat pixel vector.
# `data` collects the feature vectors, `target` the matching folder name (class label).
IMAGE_SIZE = (100, 100)  # same target (width, height) that image_to_array resizes to

data = []
target = []

for folder in sub_folders:
    image_path = os.path.join(main_path, folder)
    for image in os.listdir(image_path):
        # Image.open is lazy and keeps the file open; the context manager closes it
        # deterministically once the pixels have been copied into the array.
        with Image.open(os.path.join(image_path, image)) as img:
            # Normalise only when needed, so every vector ends up with the same length
            # and uses the same mode and size as image_to_array at prediction time.
            # Images that are already 100x100 RGB pass through untouched.
            if img.mode != 'RGB':
                img = img.convert('RGB')
            if img.size != IMAGE_SIZE:
                img = img.resize(IMAGE_SIZE)
            img_array = np.array(img).flatten()
        data.append(img_array)
        target.append(folder)

In [7]:
# Stack the list of flattened images into one array (one entry per image).
# NOTE(review): this is only a regular 2-D matrix if every image produced the same number of values.
data = np.array(data) # Feature array

In [9]:
# Turn the list of folder names (one per image, same order as `data`) into an array.
target = np.array(target) # Target array

In [11]:
# Convert the string labels into numerical labels
labels_dict = {'white spot': 0,
               'Anthracnose': 1,
               'healthy': 2,
               'gray light': 3,
               'bird eye spot': 4,
               'algal leaf': 5,
               'brown blight': 6,
               'red leaf spot': 7}

# Look each label up explicitly: a folder name missing from labels_dict (or re-running this
# cell on already-converted labels) raises KeyError instead of silently becoming None,
# which is what dict.get would return.
target = np.array([labels_dict[label] for label in target])

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

# Train a LightGBM (Light Gradient-Boosting Machine) Classifier

In [15]:
# The notebook imports the package with a plain `import lightgbm`, so the alias `lgb`
# is never bound; refer to the module by the name that is actually imported.
model = lightgbm.LGBMClassifier(boosting_type='gbdt', objective='multiclass', random_state=42)
model.fit(x_train, y_train)

<IPython.core.display.Javascript object>

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.197228 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3591549
[LightGBM] [Info] Number of data points in the train set: 708, number of used features: 30000
[LightGBM] [Info] Start training from score -1.880313
[LightGBM] [Info] Start training from score -2.119793
[LightGBM] [Info] Start training from score -2.611200
[LightGBM] [Info] Start training from score -2.131627
[LightGBM] [Info] Start training from score -2.244956
[LightGBM] [Info] Start training from score -2.051585
[LightGBM] [Info] Start training from score -2.029845
[LightGBM] [Info] Start training from score -1.774952


In [17]:
# Predict class ids for the held-out split with the LightGBM classifier fitted in the previous cell.
y_pred = model.predict(x_test)

In [19]:
# Display the true labels of the test split (integer ids as defined in labels_dict).
y_test

array([0, 3, 6, 6, 7, 6, 3, 2, 0, 7, 1, 0, 4, 4, 1, 7, 4, 5, 2, 5, 0, 3,
       5, 5, 0, 1, 4, 4, 2, 0, 0, 6, 6, 0, 0, 0, 7, 1, 2, 2, 2, 0, 7, 7,
       2, 5, 5, 5, 2, 3, 0, 3, 5, 5, 2, 1, 5, 4, 7, 2, 4, 6, 4, 5, 0, 7,
       4, 3, 3, 0, 3, 1, 6, 1, 0, 3, 2, 6, 4, 0, 2, 3, 1, 4, 5, 2, 0, 3,
       4, 6, 1, 5, 6, 7, 2, 2, 4, 7, 6, 5, 5, 7, 7, 4, 4, 1, 0, 7, 4, 4,
       3, 0, 3, 1, 4, 0, 3, 6, 5, 5, 2, 6, 7, 4, 4, 6, 5, 1, 0, 6, 4, 6,
       2, 7, 3, 6, 0, 0, 3, 5, 4, 1, 0, 0, 7, 4, 0, 2, 7, 2, 6, 2, 7, 0,
       0, 1, 4, 2, 7, 0, 7, 5, 7, 4, 5, 0, 0, 0, 6, 6, 1, 7, 0, 7, 5, 0,
       2])

In [21]:
from sklearn.metrics import accuracy_score

In [23]:
# Accuracy of the LightGBM predictions against the true test labels.
score = accuracy_score(y_test, y_pred)

In [25]:
# Show the LightGBM test accuracy computed in the previous cell.
score

0.8022598870056498

# Train a Linear Kernel SVM

In [27]:
from sklearn.svm import LinearSVC

In [29]:
# Seed the solver so repeated runs give the same fit, in line with the seeded
# train/test split and LightGBM model earlier in the notebook.
# Note: this rebinds `model`, replacing the LightGBM classifier.
model = LinearSVC(random_state=42)
model.fit(x_train, y_train)



In [33]:
# Predict class ids for the test split; `model` is the LinearSVC fitted in the previous cell.
svc_pred = model.predict(x_test)

In [35]:
# Accuracy of the LinearSVC predictions; this overwrites the LightGBM value stored in `score`.
score = accuracy_score(y_test, svc_pred)

In [37]:
# Show the LinearSVC test accuracy computed in the previous cell.
score

0.711864406779661

# Train an XGBoost Classifier

In [45]:
# 'multiclass' is LightGBM's objective name; XGBoost spells its multi-class
# objectives 'multi:softprob' / 'multi:softmax', so use a name XGBoost defines.
# Note: this rebinds `model`, replacing the LinearSVC classifier.
model = xgb.XGBClassifier(objective='multi:softprob',
                          n_estimators=100,
                          learning_rate=0.1,
                          max_depth=3)

model.fit(x_train, y_train)

In [49]:
# Predict class ids for the test split; `model` is the XGBoost classifier fitted in the previous cell.
xgb_pred = model.predict(x_test)

In [53]:
# Print the XGBoost accuracy on the held-out test split.
print(accuracy_score(y_test, xgb_pred))

0.807909604519774


In [57]:
# KFold is not imported anywhere else in the notebook, so import it where it is used.
from sklearn.model_selection import KFold

# Shuffled k-fold splitter; the fixed seed keeps the fold assignment reproducible.
k = 5  # Number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=42)

<IPython.core.display.Javascript object>

In [59]:
# cross_val_score is not imported anywhere else in the notebook, so import it where it is used.
from sklearn.model_selection import cross_val_score

# Cross-validate on the full dataset with the `kf` splitter. `model` is whichever estimator
# was assigned last (the XGBoost classifier when the cells run top to bottom).
scores = cross_val_score(model, data, target, cv=kf, scoring='accuracy') # Perform 5-fold cross validation

# Print results
print(f"Accuracy scores for each fold: {scores}")
print(f"Mean accuracy: {scores.mean():.4f}")
print(f"Standard deviation: {scores.std():.4f}")

<IPython.core.display.Javascript object>

Accuracy scores for each fold: [0.8079096  0.79661017 0.83615819 0.76836158 0.80225989]
Mean accuracy: 0.8023
Standard deviation: 0.0217


In [73]:
import pickle
# Destination of the pickled model. `model` is whichever estimator was assigned last
# (the XGBoost classifier when the cells run top to bottom).
file_name = "xgb_reg.pkl"

# save
# The context manager flushes and closes the file as soon as the dump finishes,
# instead of leaving the handle open until it is garbage-collected.
with open(file_name, "wb") as model_file:
    pickle.dump(model, model_file)

In [75]:
# Reload the pickled model from `file_name`, closing the file once it has been read.
# NOTE: unpickling can execute arbitrary code; only load pickle files you created or trust.
with open(file_name, "rb") as model_file:
    xgb_model_loaded = pickle.load(model_file)

In [141]:
def image_to_array(path: str, image_name: str, size: tuple = (100, 100)):
    """Load one image and return it as a single-row feature matrix.

    Args:
        path: Directory that contains the image.
        image_name: File name of the image inside ``path``.
        size: ``(width, height)`` the image is resized to. Defaults to ``(100, 100)``.

    Returns:
        A NumPy array of shape ``(1, width * height * 3)`` holding the flattened
        RGB pixel values, ready to be passed to a model's ``predict``.
    """
    # Image.open is lazy and keeps the file open; the context manager closes it
    # once the pixel data has been copied out.
    with Image.open(os.path.join(path, image_name)) as img:
        # Convert before resizing: Pillow always uses nearest-neighbour resampling for
        # palette ("P") and bilevel ("1") images, so resampling in RGB is smoother.
        rgb_img = img if img.mode == 'RGB' else img.convert('RGB')
        resized = rgb_img.resize(size)
        return np.array(resized).flatten().reshape(1, -1)

In [143]:
# Build the single-row feature matrix for one screenshot on the local Desktop.
# NOTE(review): absolute, machine-specific path — this cell will not run elsewhere as written.
arr = image_to_array(path = '/Users/karlboma/Desktop', image_name = 'Screenshot 2025-01-24 at 7.48.22 PM.png')

In [181]:
# Take the feature vector at index 9 of the dataset.
# NOTE(review): this rebinds the builtin `input` for the rest of the session and the value
# is not used by the visible cells — consider a different name or removing the cell.
input = data[9]

In [191]:
# Predict the class id for the screenshot features with the reloaded model.
# Ids map back to class names through labels_dict (for example, 1 is 'Anthracnose').
xgb_model_loaded.predict(arr)

array([1])

1