## Importing the necessary libraries

In [None]:
import os
import mlflow
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from skimage import color
from skimage.transform import resize
from skimage.io import imread
from sklearn import svm
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.decomposition import PCA

In [None]:
# Select the MLflow experiment used by this notebook
mlflow.set_experiment(experiment_name='SVM')

## Loading the data
After cleaning the data and applying data augmentation, we load the image data into a pandas DataFrame

In [None]:
# Class folder names; a sample's numeric label is its position in this list
# (NG -> 0, OK -> 1).
categories = ['NG', 'OK']
# Collects one flattened (1-D) pixel vector per image.
flat_images = []
# Collects the category index of the i-th image.
target_array = []
# Directory that contains the NG and OK folders. It can be overridden with the
# CLASIFICA_IMAGENES_DATA environment variable so the notebook is not tied to
# one machine; the original location remains the default.
data_path = os.environ.get(
    'CLASIFICA_IMAGENES_DATA',
    'C:/Users/abrah/Documents/Repos/RPatrones/ClasificaImagenes/data/Limpieza01/Final',
)

In [None]:
# Walk every category folder and collect (flattened image, label) pairs.
for label, category in enumerate(categories):
    print(f'Loading... category: {category}')
    path = os.path.join(data_path, category)

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and every positional lookup further down) reproducible.
    for img in sorted(os.listdir(path)):
        img_array = imread(os.path.join(path, img))
        # NOTE(review): the target shape forces a single channel; if the inputs
        # are colour images, confirm this resize is the intended grayscale
        # conversion.
        img_resized = resize(img_array, (256, 256, 1))
        flat_images.append(img_resized.flatten())
        # The label is the category's position in `categories`.
        target_array.append(label)

In [None]:
# One row per image, one column per pixel value, plus a trailing 'labels' column.
pixel_matrix = np.asarray(flat_images)
label_vector = np.asarray(target_array)
df = pd.DataFrame(pixel_matrix).assign(labels=label_vector)
# Both counts are printed so a mismatch between images and labels is visible.
print(len(flat_images))
print(len(target_array))

In [None]:
# Show the first rows and the column labels of the assembled frame.
preview = df.head()
print(preview)
print(f'Columns of the data frame: {df.columns}')

## Principal Component Analysis (PCA)
"PCA is a linear dimensionality reduction technique (algorithm) that transform a set of correlated variables (p) into a smaller k ($k < p$) number of uncorrelated variables called **principal components** while keeping as much of the variability in the original data as possible."
### Why do we use it?
We use PCA for **image compression**, a technique that minimizes the size in bytes of an image while keeping as much of the image quality as possible. Reducing the number of components in an image will help us train a little faster.
#### Source:
Rukshan Pramoditha: https://towardsdatascience.com/image-compression-using-principal-component-analysis-pca-253f26740a9f


In [None]:
# Feature-only frame: drop() returns a new object, so df keeps its 'labels' column.
data = df.drop(columns='labels')

In [None]:
# Enable MLflow autologging for scikit-learn before any estimator below is fitted.
mlflow.sklearn.autolog()

In [None]:
# Number of principal components to keep; also recorded in MLflow (as a string).
n_components = 171
mlflow.log_param(key='PCA_n_components', value=str(n_components))

In [None]:
# scikit-learn may select a randomized SVD solver for inputs of this size, in
# which case the projection changes between runs unless a seed is given. The
# seed is the same value the train/test split uses.
pca = PCA(n_components=n_components, random_state=1234)

In [None]:
# Project the images onto the principal components, then map them back to
# pixel space to see what the compression preserves.
brake_pca_reduced = pca.fit_transform(data)
brake_pca_recovered = pca.inverse_transform(brake_pca_reduced)
# Use a single index for both the picture and its label so the title describes
# the image that is actually displayed.
preview_idx = 1
image_pca_1000 = brake_pca_recovered[preview_idx, :].reshape([256, 256])
plt.imshow(image_pca_1000, cmap='gray_r')
plt.title(f'Compressed image, label = {target_array[preview_idx]}')

In [None]:
# Wrap the PCA outputs in DataFrames: the reduced features and the
# reconstructed pixel vectors.
data_reduced, d_recov = (
    pd.DataFrame(matrix)
    for matrix in (brake_pca_reduced, brake_pca_recovered)
)

## Building the SVM model
Create the SVM model and use GridSearchCV to find the best model

In [None]:
# Hyper-parameter grid explored by the grid search.
params = dict(
    C=[0.1, 1, 10, 100],
    gamma=[0.0001, 0.001, 0.1, 1],
    kernel=['linear', 'rbf'],
)

In [None]:
# The search grid (`params`) tunes 'kernel' and 'gamma'. LinearSVC exposes
# neither parameter, so the grid search would fail with an invalid-parameter
# error; svm.SVC accepts C, gamma and kernel.
classifier = svm.SVC()
# 5-fold cross-validated search over every combination in `params`.
model = GridSearchCV(classifier, params, cv=5, verbose=1)

#### Splitting the data

In [None]:
# Keep the labels as a single-column DataFrame.
targets = df.loc[:, ['labels']]
print(targets.shape)
# The reduced features live in data_reduced; hold out 20% of the samples for
# testing, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    data_reduced,
    targets,
    test_size=0.2,
    random_state=1234,
    stratify=targets,
)

#### Fitting the model

In [None]:
# Flatten the single-column label frame to a 1-D array before fitting.
model.fit(X_train, y_train.to_numpy().ravel())

#### Making predictions

In [None]:
# Predict on the held-out split, then print predictions followed by the true labels.
y_pred = model.predict(X_test)
for message in (
    f'The predicted data is \n{y_pred}',
    f'The actual data is \n{y_test}',
):
    print(message)

In [None]:
# Inspect one sample: its PCA representation, its reconstruction, and the
# label the model predicts for it.
sample_idx = 50
image_test = brake_pca_reduced[sample_idx]
print(len(image_test))
img_to_show = brake_pca_recovered[sample_idx]
print(len(img_to_show))
actual_label = target_array[sample_idx]
# Back to a 2-D picture for display.
img_to_show = img_to_show.reshape(256, 256)
predicted = model.predict(image_test.reshape(1, -1))
plt.imshow(img_to_show, cmap='gray_r')
plt.title(f'Actual label = {actual_label}| predicted label = {predicted}')

#### Best Parameters

In [None]:
# Report the winning hyper-parameter combination and its cross-validated score.
best_params = model.best_params_
best_score = model.best_score_
print(f'Best parameters found by GridSearch:\n{best_params}')
print(f'Best score found by GridSearch:\n{best_score}')

#### Computing classification metrics

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Confusion matrix, per-class report and overall accuracy on the test split.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)
report = classification_report(y_test, y_pred)
print(report)
accuracy = accuracy_score(y_test, y_pred)
print(f'The accuraccy is {accuracy}')

In [None]:
# Classify a single image read straight from disk.
ipath = 'C:/Users/abrah/Documents/Repos/RPatrones/ClasificaImagenes/data/NG_trimm/img_39_2022_05_06_151613_120.jpg'
image_test = imread(ipath)
# Resize the image that was just read. The previous code resized `img_array`,
# the last image left over from the loading loop, so the prediction never
# looked at this file.
img_t_resized = resize(image_test, (256, 256, 1))
test = img_t_resized.flatten()
pd_test = pd.DataFrame(np.array(test).reshape(1, -1))
print(pd_test)
# Project with the PCA that was already fitted on the loaded images.
final = pca.transform(pd_test)
# Number of PCA features handed to the classifier.
print(final.shape[1])
# NOTE(review): the true class is inferred from the NG_trimm folder name in
# `ipath` (previously an unrelated entry, target_array[50], was shown) — confirm.
actual_label = categories.index('NG')
plt.imshow(image_test, cmap='gray_r')
plt.title(f'Actual label = {actual_label} | predicted label = {model.predict(final.reshape(1, -1))}')