### Import Necessary Libraries

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import pandas as pd
import pandas as pd 
import os 
from skimage.transform import resize 
from skimage.io import imread 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn import svm 
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
import glob
import shutil
import os
from tqdm import tqdm

### Data Preprocessing

The dataset is: https://www.kaggle.com/c/plant-pathology-2020-fgvc7/data

The problem statement is to classify images using classical machine learning techniques like SVM or Random Forest without using any transfer learning.

In [2]:
# Read the csv file
# Each row of train.csv pairs an image_id with 0/1 flags in the
# healthy / multiple_diseases / rust / scab columns.
df=pd.read_csv('train.csv')

In [3]:
# Preview the first rows of the label table
df.head()

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
0,Train_0,0,0,0,1
1,Train_1,0,1,0,0
2,Train_2,1,0,0,0
3,Train_3,0,0,1,0
4,Train_4,1,0,0,0


In [4]:
# Preview the last rows of the label table
df.tail()

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
1816,Train_1816,0,0,0,1
1817,Train_1817,1,0,0,0
1818,Train_1818,1,0,0,0
1819,Train_1819,0,0,1,0
1820,Train_1820,0,0,0,1


In [5]:
# Folder the source .jpg images are read from
src_dir = 'images/data'

# One destination folder per class, listed in the order
# healthy, multiple_diseases, rust, scab
dst_dir = [
    f'images/labelled_data/{label_name}'
    for label_name in ('healthy', 'multiple_diseases', 'rust', 'scab')
]

In [6]:
# Class names. The order matters: position i is paired with dst_dir[i] when images
# are copied, and the position of a name is used as its integer label when loading.
category=['healthy', 'multiple_diseases', 'rust', 'scab']

In [10]:
# Copy each image into the folder of its class, using the labels in train.csv.
# NOTE: `df` must still be the train.csv label table here (it is rebound to the
# pixel DataFrame later in the notebook).

# Look labels up by image_id rather than by row position, so the result does not
# depend on the row order of the csv or on the length of the src_dir path.
labels_by_id = df.set_index('image_id')[category]

# shutil.copy treats a non-existent destination as a file name, so make sure the
# class folders exist before copying into them.
for folder in dst_dir:
    os.makedirs(folder, exist_ok=True)

for jpgfile in glob.iglob(os.path.join(src_dir, "*.jpg")):
    # "images/data/Train_12.jpg" -> "Train_12"
    image_id = os.path.splitext(os.path.basename(jpgfile))[0]
    if image_id not in labels_by_id.index:
        # Not listed in train.csv (e.g. a test image): nothing to label.
        continue
    row = labels_by_id.loc[image_id]
    for label, folder in zip(category, dst_dir):
        if row[label] == 1:
            shutil.copy(jpgfile, folder)

### Data Preprocessing

In [7]:
# Resize every labelled image to (150, 150) with 3 color channels, flatten it into a
# 1D feature vector, and record the index of its category as the label.

flat_data_arr=[]
target_arr=[]
datadir='images/labelled_data/'
for label, name in enumerate(category):

    print(f'loading... category : {name}')
    path=os.path.join(datadir, name)
    # os.listdir returns entries in arbitrary order; sorting keeps the row order
    # (and therefore the seeded train/test split later on) reproducible.
    for img in tqdm(sorted(os.listdir(path))):
        img_array=imread(os.path.join(path,img))
        img_resized=resize(img_array, (150, 150, 3))
        flat_data_arr.append(img_resized.flatten())
        # The label is the position of the class name in `category`
        target_arr.append(label)
    print(f'loaded category:{name} successfully')
flat_data=np.array(flat_data_arr)
target=np.array(target_arr)

loading... category : healthy


  0%|          | 0/516 [00:00<?, ?it/s]

100%|██████████| 516/516 [04:05<00:00,  2.10it/s]


loaded category:healthy successfully
loading... category : multiple_diseases


100%|██████████| 91/91 [00:41<00:00,  2.20it/s]


loaded category:multiple_diseases successfully
loading... category : rust


100%|██████████| 622/622 [05:31<00:00,  1.88it/s]


loaded category:rust successfully
loading... category : scab


100%|██████████| 592/592 [05:26<00:00,  1.82it/s]


loaded category:scab successfully


In [8]:
# Integer class label per image (the index of its class name in `category`)
target

array([0, 0, 0, ..., 3, 3, 3])

In [9]:
# Number of labels collected by the loading loop
len(target)

1821

In [10]:
# Shape of the label array
target.shape

(1821,)

In [11]:
# One row per image; each row is a flattened (150, 150, 3) image, i.e. 67500 values
flat_data.shape

(1821, 67500)

In [12]:
# Save the numpy arrays to disk so the slow image-loading step does not have to be
# repeated (the files are read back as 'flatdata.npy' and 'target.npy')
np.save('flatdata', flat_data)
np.save('target', target)

In [13]:
# Reload the cached feature matrix and labels from disk
flat_data=np.load('flatdata.npy')
target=np.load('target.npy')

In [14]:
# Convert the flat data and the target to a Pandas Dataframe
# NOTE: this rebinds `df`, which until now held the train.csv label table. Cells that
# need those labels (the image-copy loop) must run before this one, or re-read train.csv.
df=pd.DataFrame(flat_data)  
# The label goes in as the last column, after the pixel columns
df['Target']=target 
df.shape

(1821, 67501)

In [15]:
# Separate the class labels (the 'Target' column) from the pixel features
# (every other column)
y = df['Target']
x = df.drop(columns='Target')

In [16]:
# Hold out 20% of the samples for testing. stratify=y asks for the same class
# proportions in both splits; random_state fixes the shuffle so the split is repeatable.
x_train, x_test, y_train,y_test=train_test_split(x, y, test_size=0.20, random_state=69, stratify=y) 

GridSearchCV tries every combination of the supplied values for `C`, `gamma` and `kernel` and keeps the combination that scores best under cross-validation, which makes it useful for hyperparameter optimisation of the SVM. However, the dataset has 1821 pictures, each with (150 * 150 * 3) + 1 = 67501 values, for a total of 1821 * 67501 = 122919321 values, which is very large. Running all of this through GridSearchCV would be highly time- and energy-consuming.

So instead we use only the `rbf` kernel, which is commonly reported to work well for image classification, and leave `gamma` at its default value. The only parameter varied is `C`, which is tested over [0.01, 0.1, 1, 10, 100]; the best accuracy is obtained with `C=1`.

In [19]:
# # Defining the parameters grid for GridSearchCV 
# param_grid={'C': [0.1, 1, 10, 100], 
# 			'gamma':[0.0001, 0.001, 0.01, 0.1, 1], 
# 			'kernel':['rbf']} 

# # Creating a support vector classifier 
# svc=svm.SVC(probability=True) 

# # Creating a model using GridSearchCV with the parameters grid 
# model=GridSearchCV(svc, param_grid)

In [17]:
# Values of C that were tried with the rbf kernel; the number after --> is the
# accuracy (%) noted for that setting (presumably on the same test split - confirm).
# model2=svm.SVC(kernel='rbf', C=0.01)      -->34
# model2=svm.SVC(kernel='rbf', C=10)        -->44
# model4=svm.SVC(kernel='rbf', C=100)       -->44.9
# C=1 has the highest recorded score, so it is the model used below.
# gamma is not passed, so the library default applies.
model=svm.SVC(kernel='rbf', C=1)           #-->49.3

In [18]:
# Fit the SVM on the training split only; the test split is kept for evaluation
model.fit(x_train, y_train)

In [19]:
# Predict a class label for every sample in the held-out test split
y_pred=model.predict(x_test) 

In [20]:
# Check the accuracy on the held-out test split.
# accuracy_score takes (y_true, y_pred). Accuracy is symmetric, so the value is the
# same as before, but the documented order avoids mistakes if this call is later
# swapped for an asymmetric metric such as precision or recall.
accuracy=accuracy_score(y_test, y_pred)
print(f"The model is {accuracy*100}% accurate")

The model is 49.31506849315068% accurate


As we can see, the accuracy is only about 49%, which is very low for a model like this. This is the reason why SVMs are rarely used on their own for image processing nowadays. Today people use neural-network-based architectures such as CNNs, or transfer learning with pretrained networks such as ResNet, AlexNet or VGG16/19. This is because a neural network is better suited to image processing, using Conv2D, Pooling and Dense layers. After this processing, the classification can be done by using an SVM as the last layer of the neural network, as follows:

Without the SVM layer the output layer would be somewhat like this: 

`cnn.add(Dense(units=1, activation='sigmoid'))`



Incorporating the SVM layer, the output layer would be: 

1. For Binary Classification
    
`cnn.add(Dense(1, kernel_regularizer=l2(0.01), activation='linear'))`
`cnn.compile(optimizer='adam', loss='hinge', metrics=['accuracy'])`

2. For MultiClass Classification

`cnn.add(Dense(4, kernel_regularizer=l2(0.01), activation='softmax'))`
`cnn.compile(optimizer='adam', loss='squared_hinge', metrics=['accuracy'])`


