In [None]:
import warnings;warnings.filterwarnings("ignore") # to remove "warning" output
import os
from tqdm import tqdm
from glob import glob
from pathlib import Path
# Data Analyst & Linear Algebra
import pandas as pd
import numpy as np 
# Data Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
# Image processing
from PIL import Image
import imageio
import skimage
import skimage.io
import skimage.transform
from keras.preprocessing import image
# ML
from sklearn.model_selection import train_test_split 
from sklearn.svm import SVC # Support Vector Machine
from sklearn.tree import DecisionTreeClassifier # Decision Tree
from sklearn.ensemble import RandomForestClassifier # Random Forest
from sklearn.neighbors import KNeighborsClassifier # K-Nearest Neighbour
from sklearn.linear_model import LogisticRegression # Logistic Regression 
# Deep Learning
from keras.utils.np_utils import to_categorical # convert to one-hot-encoding
from keras.models import Sequential # Create Layer
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D # Layer by Layer
from keras.optimizers import RMSprop
from keras.preprocessing.image import ImageDataGenerator # Image preprocessing
from keras.callbacks import ReduceLROnPlateau
import tensorflow as tf
# Metrics Evaluation
from sklearn.metrics import * # import all metrics evaluation function

In [None]:
# Install split-folders (imported in the next cell as `splitfolders`), used to split a
# class-per-folder dataset into train/val folders, together with tqdm for progress bars.
!pip install split-folders tqdm

In [None]:
# Split the class-per-folder dataset into a 70% / 30% pair of folders under "output".
# The fixed seed keeps the split identical between runs.
import splitfolders

splitfolders.ratio(
    "/content/drive/MyDrive/data_img",
    output="output",
    seed=1337,
    ratio=(.7, 0.3),
)

In [None]:
# Dataset root on Google Drive: `dir` has no trailing slash, `path` has one.
# NOTE: `dir` shadows the Python builtin; the name is kept because later cells read it.
dir = "/content/drive/MyDrive/data_img"
path = dir + "/"

In [None]:
# Walk the whole dataset tree. Each full file path is built but neither stored nor shown,
# so this cell only verifies that the tree can be traversed.
for folder, _, names in os.walk(path):
    for name in names:
        os.path.join(folder, name)

In [None]:
# Collect the paths of all (non-hidden) entries in each class folder, then report the counts.
Blight = glob(f"{dir}/Blight/*")
Common_Rust = glob(f"{dir}/Common_Rust/*")
Gray_Leaf_Spot = glob(f"{dir}/Gray_Leaf_Spot/*")
healthy = glob(f"{dir}/Healthy/*")
print(f"Images with Blight, Common_Rust and Gray Leaf Spot are \n\t   {len(Blight)}  \t{len(Common_Rust)}\t\t\t{len(Gray_Leaf_Spot)}")
print(f"Healthy Images are \n   {len(healthy)}")

In [None]:
# Convert every image file into an np.array and build, per class, a label list of the
# same length as its file list.
def _load_images(paths):
    """Read each image file in `paths` into a numpy array.

    Every file is opened in a `with` block so its handle is released as soon as the
    pixels have been copied, instead of relying on garbage collection.
    """
    arrays = []
    for p in tqdm(paths):
        with Image.open(p) as im:
            arrays.append(np.array(im))
    return arrays


blight_array = _load_images(Blight)
rust_array = _load_images(Common_Rust)
Gray_array = _load_images(Gray_Leaf_Spot)
Health_array = _load_images(healthy)

# Labels are constant per class, so no progress bar is needed to build them.
blight_label = ["Blight"] * len(Blight)
rust_label = ["Common_Rust"] * len(Common_Rust)
Gray_label = ["Gray_Leaf_Spot"] * len(Gray_Leaf_Spot)
Health_label = ["Healthy"] * len(healthy)

In [None]:
# One row per image: the pixel array goes in column "Path" (despite the name, it holds
# arrays, not file paths) and the class name in column "Label". Row order is
# Blight, Common_Rust, Gray_Leaf_Spot, Healthy.
all_arrays = blight_array + rust_array + Gray_array + Health_array
all_labels = blight_label + rust_label + Gray_label + Health_label
df = pd.DataFrame({"Path": all_arrays, "Label": all_labels})

In [None]:
# Replace each class name in df["Label"] by its integer code.
label_to_code = {'Blight': 2, 'Common_Rust': 3, 'Gray_Leaf_Spot': 4, 'Healthy': 1}
df.Label = df.Label.replace(label_to_code).astype(int)

In [None]:
# File name of every image, per class.
# The names are derived from the very same path lists (Blight, Common_Rust, ...) that the
# image arrays were loaded from, so they are guaranteed to line up with those arrays
# one-to-one. A separate os.listdir() call is not guaranteed to return the same order as
# glob() and also returns dot-files that glob("*") skips, which would misalign or
# length-mismatch the "FileName" column.
blight_path = [os.path.basename(p) for p in Blight]
rust_path = [os.path.basename(p) for p in Common_Rust]
Gray_path = [os.path.basename(p) for p in Gray_Leaf_Spot]
Health_path = [os.path.basename(p) for p in healthy]

In [None]:
# Attach the per-class file names (same class order as the rows of df) and store the
# label codes as strings.
all_file_names = blight_path + rust_path + Gray_path + Health_path
df["FileName"] = all_file_names
df.Label = df.Label.astype(str)

In [None]:
# IMG_SIZE is the side length (both x and y) images are resized to for the CNN.
# `data` is df without the pixel-array column "Path".
IMG_SIZE = 256
data = df.drop(columns=["Path"])

In [None]:
# Augmenting generator for training images and a rescale-only generator for validation.
# `ImageDataGenerator` is used through its direct import rather than via the `image`
# module alias: later cells rebind the name `image` to an image array, which would make
# `image.ImageDataGenerator` fail when this cell is re-run.
# NOTE: `validation_split` only takes effect when a `subset=` is passed to flow_from_*.
train_datagen = ImageDataGenerator(rotation_range = 180,
                                   width_shift_range = 0.1,
                                   height_shift_range = 0.1,
                                   brightness_range = [0.1,1.1],
                                   horizontal_flip = True,
                                   vertical_flip = True,
                                   rescale = 1./255,
                                   zoom_range = 0.5,
                                   validation_split = 0.2)
val_datagen = ImageDataGenerator(rescale=1./255,
                                 validation_split = 0.2)

In [None]:
# Training batches come from the "train" split produced by splitfolders.
train_generator = train_datagen.flow_from_directory(directory = "/content/output/train",
                                                   target_size = (IMG_SIZE,IMG_SIZE),
                                                   color_mode = "rgb",
                                                   class_mode = "categorical",
                                                   batch_size = 64,
                                                   shuffle = True,)

# Validation batches must come from the held-out "val" split. Reading the "train" folder
# here would validate on the training images and make val_loss / val_accuracy (and the
# early stopping driven by them) meaningless.
# Shuffling is unnecessary for evaluation, so batches are served in a fixed order.
val_generator = val_datagen.flow_from_directory(directory = "/content/output/val",
                                                   target_size = (IMG_SIZE,IMG_SIZE),
                                                   color_mode = "rgb",
                                                   class_mode = "categorical",
                                                   batch_size = 64,
                                                   shuffle = False)

In [None]:
# CNN classifier: five stages of Conv2D(3x3, ReLU, same padding) -> MaxPooling2D(2x2) ->
# Dropout(0.25) with 64, 128, 128, 128 and 256 filters, followed by Flatten, a 1024-unit
# ReLU dense layer and a 4-way softmax output.
stage_filters = [64, 128, 128, 128, 256]
cnn_layers = []
for stage_index, n_filters in enumerate(stage_filters):
    conv_options = {'activation': 'relu', 'padding': 'Same'}
    if stage_index == 0:
        # only the first layer declares the input shape (IMG_SIZE x IMG_SIZE, 3 channels)
        conv_options['input_shape'] = [IMG_SIZE, IMG_SIZE, 3]
    cnn_layers.append(tf.keras.layers.Conv2D(n_filters, (3, 3), **conv_options))
    cnn_layers.append(tf.keras.layers.MaxPooling2D(2, 2))
    cnn_layers.append(tf.keras.layers.Dropout(0.25))
cnn_layers.append(tf.keras.layers.Flatten())
cnn_layers.append(tf.keras.layers.Dense(1024, activation = 'relu'))
cnn_layers.append(tf.keras.layers.Dense(4, activation = 'softmax'))
model = tf.keras.models.Sequential(cnn_layers)

In [None]:
# Compile the CNN with Adam and categorical cross-entropy (the generators yield one-hot
# labels via class_mode="categorical").
# `learning_rate` is the supported argument name; the old `lr` alias is deprecated and
# rejected by recent Keras releases.
from tensorflow.keras.optimizers import RMSprop,Adam
optimizer = Adam(learning_rate = 0.001)
model.compile(loss = 'categorical_crossentropy',
              optimizer = optimizer,
              metrics = ['accuracy'])

In [None]:
# Training schedule: number of epochs passed to the fit call, and the batch size.
# NOTE: the generators hard-code their own batch_size of 64.
epochs, batch_size = 10, 64

In [None]:
# Print the layer-by-layer summary of the CNN.
model.summary()

In [None]:
# Stop training once val_loss has not improved by at least 0.001 for 5 consecutive epochs,
# and roll the model back to the weights of its best epoch.
early_stop_options = dict(
    monitor = 'val_loss',
    mode = 'min',
    min_delta = 0.001,
    patience = 5,
    restore_best_weights = True,
    verbose = 1,
)
early_stop = tf.keras.callbacks.EarlyStopping(**early_stop_options)

In [None]:
# Train the CNN. `Model.fit` accepts generators directly (`fit_generator` is deprecated and
# removed from recent Keras releases), and `callbacks` is documented as a list.
history = model.fit(train_generator,
                    epochs = epochs,
                    validation_data = val_generator,
                    callbacks = [early_stop])

In [None]:
# Persist the trained CNN to Google Drive in HDF5 format.
model.save("/content/drive/MyDrive/LeNet_10.h5")

In [None]:
# Training vs. validation accuracy per epoch, taken from the fit history.
import keras
from matplotlib import pyplot as plt
fig, ax = plt.subplots(figsize=(20, 10))
for curve in ('accuracy', 'val_accuracy'):
    ax.plot(history.history[curve])
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.grid(True)
ax.legend(['train', 'val'], loc='upper left');

## Simple Machine Learning

In [None]:
import h5py
import numpy as np
import os
import glob
import cv2
import warnings
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import mahotas
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from random import randint

In [None]:
# NOTE(review): the previous cell already does `import mahotas`; on a fresh kernel this
# install has to run before that import succeeds.
!pip install mahotas

In [None]:
# Number of histogram bins per HSV channel (read by fd_histogram)
bins = 8

# Hu Moments
def fd_hu_moments(image):
    """Return the Hu moments of `image` as a flat array.

    `image` is converted from BGR to grayscale before the moments are computed.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    raw_moments = cv2.moments(gray)
    return cv2.HuMoments(raw_moments).flatten()
# Haralick Texture
def fd_haralick(image):
    """Return the Haralick texture features of `image`, averaged over axis 0.

    `image` is converted from BGR to grayscale first; mahotas returns one feature row
    per direction, and the rows are averaged into a single vector.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    per_direction = mahotas.features.haralick(grayscale)
    return per_direction.mean(axis=0)
# Color Histogram
def fd_histogram(image, mask=None):
    """Return the normalized, flattened 3-D HSV color histogram of `image`.

    Parameters
    ----------
    image : BGR image array (as returned by cv2.imread).
    mask : optional mask forwarded to cv2.calcHist; only pixels where the mask is
        non-zero are counted. The default (None) counts every pixel. Previously this
        argument was accepted but silently ignored.

    Returns
    -------
    1-D array with bins**3 entries (`bins` is the module-level bin count).
    """
    # convert the image to HSV color-space
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    # NOTE: for 8-bit images OpenCV's hue channel only spans 0..179, so with a 0..256
    # range the upper hue bins stay empty. The ranges are left unchanged so the features
    # stay comparable with ones that were already extracted.
    hist = cv2.calcHist([hsv], [0, 1, 2], mask, [bins, bins, bins], [0, 256, 0, 256, 0, 256])
    # normalize the histogram in place
    cv2.normalize(hist, hist)
    return hist.flatten()

In [None]:
# The class names are the sub-folder names of the training split, in sorted order.
train_path = "/content/output/train"
train_labels = sorted(os.listdir(train_path))
print(train_labels)

# Accumulators for the per-image feature vectors and their class names.
global_features, labels = [], []

In [None]:
# Extract one global feature vector (color histogram + Haralick texture + Hu moments)
# for every image of the training split.

# Start from empty accumulators so re-running this cell does not append duplicates.
global_features = []
labels          = []

# loop over the training data sub-folders (iterating the list directly lets tqdm show a total)
for training_name in tqdm(train_labels):

    # Folder of the current class. A dedicated name is used so that the dataset root
    # stored in `dir` (and the builtin of that name) is not overwritten.
    class_dir = os.path.join(train_path, training_name)

    # get the current training label
    current_label = training_name

    # loop over the images in each sub-folder (sorted for a reproducible row order)
    for x in sorted(os.listdir(class_dir)):

        # get the image file name
        file = os.path.join(class_dir, x)

        # Read the image. cv2.imread returns None instead of raising for files it cannot
        # decode, so those are reported and skipped. The local name `img` avoids
        # rebinding `image`, which earlier cells use as the keras image module.
        img = cv2.imread(file)
        if img is None:
            print("[WARNING] skipped unreadable file: {}".format(file))
            continue

        # resize it to a fixed size
        img = cv2.resize(img, (500, 500))

        # Global Feature extraction
        fv_hu_moments = fd_hu_moments(img)
        fv_haralick   = fd_haralick(img)
        fv_histogram  = fd_histogram(img)

        # Concatenate / Combine Global Feature
        global_feature = np.hstack([fv_histogram, fv_haralick, fv_hu_moments])

        # update the list of labels and feature vectors
        labels.append(current_label)
        global_features.append(global_feature)

    print("[STATUS] processed folder: {}".format(current_label))

print("[STATUS] completed Global Feature Extraction...")

In [None]:
# Encode the labels, scale the features to [0, 1] and persist both to HDF5 files.

# get the overall feature vector size
print("[STATUS] feature vector size {}".format(np.array(global_features).shape))

# get the overall training label size
print("[STATUS] training Labels {}".format(np.array(labels).shape))

# encode the target labels (LabelEncoder assigns codes in sorted class-name order)
targetNames = np.unique(labels)
le          = LabelEncoder()
target      = le.fit_transform(labels)
print("[STATUS] training labels encoded...")

# scale features in the range (0-1)
# NOTE(review): the scaler is fitted on all samples before the later train/test split,
# so test-fold statistics leak into the scaling — confirm this is acceptable.
scaler            = MinMaxScaler(feature_range=(0, 1))
rescaled_features = scaler.fit_transform(global_features)
print("[STATUS] feature vector normalized...")

print("[STATUS] target labels: {}".format(target))
print("[STATUS] target labels shape: {}".format(target.shape))

# Output file names. They are defined here because this cell runs before the
# "tunable-parameters" cell; the values are the same as the ones set there.
h5_data   = 'data.h5'
h5_labels = 'labels.h5'

# save the feature vector and labels using HDF5; `with` closes the files even on error
with h5py.File(h5_data, 'w') as h5f_data:
    h5f_data.create_dataset('dataset_1', data=np.array(rescaled_features))

with h5py.File(h5_labels, 'w') as h5f_label:
    h5f_label.create_dataset('dataset_1', data=np.array(target))

print("[STATUS] end of training..")

In [None]:
# Tunable parameters for the classical-ML comparison.
seed       = 9            # random_state shared by the models and the split
num_trees  = 100          # n_estimators of the random forest
test_size  = 0.10         # fraction held out by train_test_split
scoring    = "accuracy"   # metric passed to cross_val_score
train_path, test_path = "/content/output/train", "/content/output/val"
h5_data, h5_labels    = 'data.h5', 'labels.h5'

In [None]:
# Build the candidate models and reload the saved feature vectors / labels.

# get the training labels, sorted
train_labels = os.listdir(train_path)
train_labels.sort()

# make sure the test folder exists (exist_ok avoids a check-then-create race)
os.makedirs(test_path, exist_ok=True)

# create all the machine learning models as (display name, estimator) pairs
models = [
    ('Logistic Regression', LogisticRegression(random_state=seed)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Decision Tree', DecisionTreeClassifier(random_state=seed)),
    ('Random Forest', RandomForestClassifier(n_estimators=num_trees, random_state=seed)),
    ('Support Vector Machine', SVC(random_state=seed)),
]

# variables to hold the results and names
results = []
names   = []

# import the feature vector and trained labels; the datasets are copied into memory
# inside the `with` blocks, so the files are closed even if reading fails
with h5py.File(h5_data, 'r') as h5f_data:
    global_features = np.array(h5f_data['dataset_1'])

with h5py.File(h5_labels, 'r') as h5f_label:
    global_labels = np.array(h5f_label['dataset_1'])

# verify the shape of the feature vector and labels
print("[STATUS] features shape: {}".format(global_features.shape))
print("[STATUS] labels shape: {}".format(global_labels.shape))

print("[STATUS] training started...")

In [None]:
# Hold out `test_size` of the samples; `seed` makes the split reproducible.
split_parts = train_test_split(np.array(global_features),
                               np.array(global_labels),
                               test_size=test_size,
                               random_state=seed)
trainDataGlobal, testDataGlobal, trainLabelsGlobal, testLabelsGlobal = split_parts

print("[STATUS] splitted train and test data...")
for caption, part in (("Train data  ", trainDataGlobal),
                      ("Test data   ", testDataGlobal),
                      ("Train labels", trainLabelsGlobal),
                      ("Test labels ", testLabelsGlobal)):
    print("{}: {}".format(caption, part.shape))

In [None]:
# 10-fold cross validation of every candidate model on the training portion.

# Start from empty lists so re-running this cell does not append duplicate entries
# (which would give the boxplot more boxes than tick labels).
results = []
names   = []

# `clf` (not `model`) is used as the loop variable so the Keras CNN bound to `model`
# is not overwritten.
for name, clf in models:
    # scikit-learn raises a ValueError when random_state is set while shuffle is False,
    # so shuffling is enabled; the seed keeps the folds reproducible.
    kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = cross_val_score(clf, trainDataGlobal, trainLabelsGlobal, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f" % (name, cv_results.mean())
    print(msg)

In [None]:
# One box per model, showing the spread of its cross-validation scores.
fig, ax = plt.subplots(figsize=(25, 10))
fig.suptitle('Machine Learning algorithm comparison')
ax.boxplot(results)
ax.set_xticklabels(names);

In [None]:
# Conclusion: among the five models compared above (LeNet5 is not included), Random Forest has the highest accuracy on this dataset.

## Create Test DataFrame => Predict

In [None]:
# Reload the CNN from the HDF5 file on Google Drive (the same path the training section saved to).
from keras.models import load_model
model = load_model('/content/drive/MyDrive/LeNet_10.h5')

In [None]:
def pred_LeNet(path):
  """Predict the class index of the image stored at `path` with the loaded CNN.

  The image is prepared the same way the training generators prepared their input:
  RGB channel order, 256x256 pixels, pixel values scaled to [0, 1].

  Parameters
  ----------
  path : path of the image file.

  Returns
  -------
  numpy array of shape (1,) holding the index of the most probable class.

  Raises
  ------
  ValueError if the file cannot be read as an image.
  """
  img = cv2.imread(path)
  if img is None:
    # cv2.imread signals failure by returning None rather than raising
    raise ValueError("could not read image: {}".format(path))
  # OpenCV loads BGR, while the network was trained on RGB input (color_mode="rgb")
  img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
  # Resize on an explicit shape check. A blind reshape(-1,256,256,3) would silently turn
  # e.g. a 512x256 image into a scrambled batch of two instead of failing.
  if img.shape[:2] != (256, 256):
    img = cv2.resize(img, dsize=(256, 256), interpolation=cv2.INTER_CUBIC)
  # add the batch axis and apply the same 1/255 rescale used by the training generators
  batch = img.astype("float32")[np.newaxis, ...] / 255.0
  results = model.predict(batch)
  results = np.argmax(results,axis = 1)
  return results

In [None]:
# Accumulators: full path of every dataset file and the class folder it sits in.
path_file = []
label_file = []

In [None]:
# Record, for every file in every class folder of the dataset root, its full path and
# the name of the class folder.
dataset_root = "/content/drive/MyDrive/data_img"
for class_name in os.listdir(dataset_root):
  for file_name in os.listdir(f"{dataset_root}/{class_name}"):
    path_file.append(f"{dataset_root}/{class_name}/{file_name}")
    label_file.append(class_name)

In [None]:
# One row per dataset file: full path ("Filename") and class folder name ("Label File").
data_ready = pd.DataFrame(
    {
        "Filename": path_file,
        "Label File": label_file,
    }
)

In [None]:
# Per-image accumulators for the three summary features.
f1 = []
f2 = []
f3 = []

In [None]:
# Compute three scalar features per image: the mean of its Hu moments, of its Haralick
# texture vector and of its HSV color histogram.

# Start from empty lists so re-running this cell does not append duplicates.
f1, f2, f3 = [], [], []

for x in tqdm(data_ready.Filename):
  # Read the image. cv2.imread returns None for unreadable files; fail with the offending
  # path because every row of data_ready needs a feature value. The local name `img`
  # avoids rebinding `image`, which earlier cells use as the keras image module.
  img = cv2.imread(x)
  if img is None:
    raise ValueError("could not read image: {}".format(x))
  # resize it to a fixed size
  img = cv2.resize(img, (500, 500))

  # Global Feature extraction — each extractor runs once per image (the Haralick
  # computation in particular is costly, so the vectors are reused below)
  fv_hu_moments = fd_hu_moments(img)
  fv_haralick   = fd_haralick(img)
  fv_histogram  = fd_histogram(img)

  f1.append(fv_hu_moments.mean())
  f2.append(fv_haralick.mean())
  f3.append(fv_histogram.mean())

In [None]:
# Attach the three summary features and encode the class names as integers.
data_ready["Feature 1"] = f1
data_ready["Feature 2"] = f2
data_ready["Feature 3"] = f3

# Class codes follow the alphabetical order of the class folder names. This is the
# same indexing Keras' flow_from_directory and scikit-learn's LabelEncoder produce, so
# the predictions of the classical models and of the CNN share one encoding in the
# final table (previously Common_Rust and Gray_Leaf_Spot were swapped relative to the CNN).
label_codes = {'Blight':0, 'Common_Rust':1, 'Gray_Leaf_Spot':2, 'Healthy':3}
data_ready["Label File"] = data_ready["Label File"].replace(label_codes).astype(int)

In [None]:
# 70 / 30 split of the summary features.
# random_state makes the split (and every score computed from it) reproducible;
# stratify keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(data_ready.drop(["Filename","Label File"], axis=1).values,
                                                    data_ready["Label File"].values,
                                                    test_size=0.3,
                                                    shuffle=True,
                                                    random_state=9,
                                                    stratify=data_ready["Label File"].values)

In [None]:
# Sanity check: shapes of the train / test features and labels.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
# Fit the five classical models on the training split.
# Only the settings that differ from the estimator defaults (plus the seed) are passed.
# The previous calls spelled out every default copied from an estimator repr, including
# parameters that scikit-learn has since removed (`min_impurity_split`, `presort`,
# `max_features='auto'`), which makes construction fail on current versions.
Logreg = LogisticRegression(C=float(10**5), max_iter=1000, random_state=9).fit(X_train, y_train)
Knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
DCT = DecisionTreeClassifier(random_state=9).fit(X_train, y_train)
RFC = RandomForestClassifier(n_estimators=100, random_state=9).fit(X_train, y_train)
SVM = SVC(C=10**2, kernel='rbf', gamma='scale', random_state=9).fit(X_train, y_train)

In [None]:
# Evaluate each fitted model on the held-out test split

In [None]:
# Logistic Regression on the test split: confusion matrix, accuracy, per-class report.
logreg_pred = Logreg.predict(X_test)
results = confusion_matrix(y_test, logreg_pred)
print('Confusion Matrix :\n', results, sep='\n')
print('\nAccuracy Score is', accuracy_score(y_test, logreg_pred))
print('\nClassification Report : \n', classification_report(y_test, logreg_pred), sep='\n')

In [None]:
# K-Nearest Neighbours on the test split: confusion matrix, accuracy, per-class report.
Knn_pred = Knn.predict(X_test)
results = confusion_matrix(y_test, Knn_pred)
print('Confusion Matrix :\n', results, sep='\n')
print('\nAccuracy Score is', accuracy_score(y_test, Knn_pred))
print('\nClassification Report : \n', classification_report(y_test, Knn_pred), sep='\n')

In [None]:
# Decision Tree on the test split: confusion matrix, accuracy, per-class report.
DCT_pred = DCT.predict(X_test)
results = confusion_matrix(y_test, DCT_pred)
print('Confusion Matrix :\n', results, sep='\n')
print('\nAccuracy Score is', accuracy_score(y_test, DCT_pred))
print('\nClassification Report : \n', classification_report(y_test, DCT_pred), sep='\n')

In [None]:
# Random Forest on the test split: confusion matrix, accuracy, per-class report.
RFC_pred = RFC.predict(X_test)
results = confusion_matrix(y_test, RFC_pred)
print('Confusion Matrix :\n', results, sep='\n')
print('\nAccuracy Score is', accuracy_score(y_test, RFC_pred))
print('\nClassification Report : \n', classification_report(y_test, RFC_pred), sep='\n')

In [None]:
# Support Vector Machine on the test split: confusion matrix, accuracy, per-class report.
SVM_pred = SVM.predict(X_test)
results = confusion_matrix(y_test, SVM_pred)
print('Confusion Matrix :\n', results, sep='\n')
print('\nAccuracy Score is', accuracy_score(y_test, SVM_pred))
print('\nClassification Report : \n', classification_report(y_test, SVM_pred), sep='\n')

In [None]:
# Predict every row of data_ready with each classical model.
# The feature columns are selected by name, not by dropping the non-feature columns:
# once the prediction columns have been added to data_ready, a drop-based selection would
# feed them to the models when this cell is re-run. `.values` passes a plain array, the
# same type the models were fitted on.
feature_columns = ["Feature 1", "Feature 2", "Feature 3"]
all_features = data_ready[feature_columns].values
log = Logreg.predict(all_features)
knn = Knn.predict(all_features)
rfc = RFC.predict(all_features)
dct = DCT.predict(all_features)
svm = SVM.predict(all_features)

In [None]:
# Add one prediction column per classical model, in this order.
predictions_by_model = {
    "Logistic Regression": log,
    "K-Nearest Neighbour": knn,
    "Random Forest": rfc,
    "Decision Tree": dct,
    "Support Vector Machine": svm,
}
for column_name, predicted in predictions_by_model.items():
    data_ready[column_name] = predicted

In [None]:
# CNN prediction for every file. pred_LeNet returns a one-element array per image; the
# scalar class index is extracted so the column holds plain ints like the other model
# columns (and is written to the CSV as `2`, not `[2]`).
cnn = [int(pred_LeNet(x)[0]) for x in data_ready["Filename"]]

In [None]:
# Store the CNN predictions next to the classical-model predictions.
data_ready["L-Net5"] = cnn

In [None]:
# Preview the first rows of the combined table.
data_ready.head()

In [None]:
# Keep only the prediction columns and add each row's file name.
submit = data_ready.drop(["Filename","Label File", "Feature 1", "Feature 2", "Feature 3"], axis=1)
# The file name is taken from the same row's path in data_ready. The rows of `df` were
# built in a different order (and possibly with a different length), so copying
# df.FileName would attach file names to the wrong predictions.
submit["FIlename"] = data_ready["Filename"].map(os.path.basename)

In [None]:
# Preview the first rows of the submission table.
submit.head()

In [None]:
# Write the submission table to Submission.csv in the working directory, without the index column.
submit.to_csv("Submission.csv", index=False)