# Image Preprocessing #

<p> This notebook defines a few functions for preprocessing the images and visualizing the transformations, and then fits several classification models to the data. The preprocessing functions resize the images to (120,120,3) and apply a Gaussian blur. </p>

In [1]:
import os

import cv2
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Dataset folders, one per class. Both are relative paths, so they are
# resolved against the notebook's current working directory.
par_image_path = 'cell_images/cell_images/cell_images/Parasitized'
uni_image_path = 'cell_images/cell_images/cell_images/Uninfected'

# function to list the image files of a folder
def loadImages(path):
    """Return the sorted paths of the PNG files located directly in ``path``.

    Despite its name, this function does not read any pixel data: it only
    builds the list of file paths that ``processing`` later reads.

    Parameters
    ----------
    path : str
        Directory to scan (not recursive).

    Returns
    -------
    list of str
        ``os.path.join(path, name)`` for every entry whose name ends in
        ``.png``, sorted alphabetically. The extension check ignores case so
        that files named ``*.PNG`` are not silently skipped.

    Raises
    ------
    FileNotFoundError
        Propagated from ``os.listdir`` when ``path`` does not exist.
    """
    return sorted(
        os.path.join(path, name)
        for name in os.listdir(path)
        if name.lower().endswith('.png')
    )

In [2]:
# Display one image
def display_one(a, title1 = "Original"):
    """Render image ``a`` under the heading ``title1`` with axis ticks hidden."""
    plt.imshow(a)
    plt.title(title1)
    # Empty tick lists remove the pixel-coordinate ticks on both axes.
    plt.xticks([])
    plt.yticks([])
    plt.show()
    
# Display two images
def display(a, b, title1 = "Original", title2 = "Edited"):
    """Render images ``a`` and ``b`` side by side, each with its own title.

    ``a`` is drawn in the left panel with ``title1`` and ``b`` in the right
    panel with ``title2``; axis ticks are hidden on both.
    """
    # 121 / 122 select the left and right cells of a 1x2 subplot grid.
    panels = ((121, a, title1), (122, b, title2))
    for position, picture, caption in panels:
        plt.subplot(position)
        plt.imshow(picture)
        plt.title(caption)
        plt.xticks([])
        plt.yticks([])
    plt.show()
    
# Preprocessing
def _to_display_order(image):
    """Reorder an OpenCV image's channels (BGR/BGRA) to what matplotlib expects (RGB/RGBA)."""
    if image.ndim == 3 and image.shape[2] == 3:
        return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    if image.ndim == 3 and image.shape[2] == 4:
        return cv2.cvtColor(image, cv2.COLOR_BGRA2RGBA)
    # Single-channel images have no channel order to fix.
    return image


def processing(data, size=(120, 120), blur_kernel=(5, 5)):
    """Read, resize and blur a collection of image files.

    Besides returning the processed images, the function prints the shape of
    the first original image and of one resized image, and shows that sample
    image before and after blurring.

    Parameters
    ----------
    data : iterable of str
        Paths of the image files to read.
    size : tuple of int, optional
        Target ``(width, height)`` passed to ``cv2.resize``.
    blur_kernel : tuple of int, optional
        Kernel size passed to ``cv2.GaussianBlur``.

    Returns
    -------
    list
        The resized and blurred images, in the same order as ``data``, with
        the channel order produced by ``cv2.imread``. An empty input yields
        an empty list.

    Raises
    ------
    ValueError
        If a file cannot be read as an image.
    """
    data = list(data)
    if not data:
        return []

    # loading image
    # Form list of images that can be operated on and displayed
    img = []
    for image_path in data:
        loaded = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
        if loaded is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise ValueError('Could not read image: {}'.format(image_path))
        img.append(loaded)
    print('Original size',img[0].shape)

    # Resize every image to the requested dimensions
    res_img = [cv2.resize(picture, size, interpolation=cv2.INTER_LINEAR)
               for picture in img]

    # The second image is used as the visual sample when there is one;
    # fall back to the first so a single-image input does not fail.
    sample_idx = 1 if len(res_img) > 1 else 0

    # Checking the size
    print("RESIZED", res_img[sample_idx].shape)

    # Visualizing one of the images in the array
    # (channels are reordered for display only; the data itself is untouched)
    original = res_img[sample_idx]
    display_one(_to_display_order(original))

    # Remove noise
    # Gaussian
    no_noise = [cv2.GaussianBlur(picture, blur_kernel, 0) for picture in res_img]

    image = no_noise[sample_idx]
    display(_to_display_order(original), _to_display_order(image), 'Original', 'Blurred')
    return no_noise

In [3]:
# Load and preprocess each class separately, then concatenate the results with
# the parasitized images first. The labelling cell further down assigns labels
# by position, so this ordering must be kept.
par_imgs = loadImages(par_image_path)
processed_imgs_par = processing(par_imgs)
uni_imgs = loadImages(uni_image_path)
processed_imgs_uni = processing(uni_imgs)
processed_imgs = processed_imgs_par + processed_imgs_uni

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'cell_images/cell_images/cell_images/Parasitized'

In [None]:
# This function turns an image into a flat feature vector
def feature_vectorizer(image, size=(30,30)):
    """Shrink ``image`` to ``size`` and return its pixel values as a 1-D array.

    ``size`` is forwarded unchanged to ``cv2.resize``; the resized array is
    then flattened so that every pixel value becomes one feature.
    """
    shrunk = cv2.resize(image, size)
    return shrunk.flatten()

In [None]:
# Convert images to feature vectors
rawImages = np.array([feature_vectorizer(img) for img in processed_imgs])

# processed_imgs holds the parasitized images followed by the uninfected ones,
# so the class boundary is taken from the data rather than hard-coded.
n_parasitized = len(processed_imgs_par)
n_uninfected = len(processed_imgs) - n_parasitized
# 1 = Parasitized, 0 = Uninfected
labels = np.array([1] * n_parasitized + [0] * n_uninfected)

# Standardise every feature column and collect the result, together with the
# labels, in one DataFrame.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before any
# train/test split or cross-validation fold is made.
scaler = StandardScaler()
raw_scaled = scaler.fit_transform(rawImages)
images_df = pd.DataFrame(raw_scaled)
images_df['label'] = labels
images_df.head()

In [None]:
def cross_val(dataset, model, n_partitions):
    """Estimate a model's accuracy with k-fold cross validation.

    The rows of ``dataset`` are divided, in their current order, into
    ``n_partitions`` contiguous folds. Each fold is used once as the test set
    while the remaining rows form the training set. Rows are not shuffled
    here; shuffle ``dataset`` beforehand if it is sorted by class.

    @param dataset = the full dataset as a pandas DataFrame, with the class
                     labels stored in a column named 'label'
    @param model = the model to evaluate; it must provide ``fit(X, y)`` and
                   ``score(X, y)`` and is refitted in place for every fold
    @param n_partitions = the number of folds to use for validation
    @return the mean of the per-fold scores
    @raise ValueError if ``n_partitions`` is smaller than 2 or larger than the
                      number of rows in ``dataset``
    """
    n_rows = len(dataset)
    if n_partitions < 2:
        raise ValueError("n_partitions must be at least 2 so that a training set remains")
    if n_partitions > n_rows:
        raise ValueError("n_partitions cannot exceed the number of rows in dataset")

    # Split row positions instead of the DataFrame itself: calling
    # np.array_split directly on a DataFrame is deprecated in recent pandas.
    fold_positions = np.array_split(np.arange(n_rows), n_partitions)
    accuracies = []

    # For each fold, compute the accuracy of the model using it as the test set
    for test_positions in fold_positions:
        is_test = np.zeros(n_rows, dtype=bool)
        is_test[test_positions] = True

        # Every row outside the current fold is used for training
        test_set = dataset.iloc[is_test]
        train_set = dataset.iloc[~is_test]

        # Separate features and labels
        X_train = train_set.drop(columns=['label'])
        y_train = train_set['label']
        X_test = test_set.drop(columns=['label'])
        y_test = test_set['label']

        # Fit the model to the train_set and score it on the held-out fold
        model.fit(X_train, y_train)
        accuracies.append(model.score(X_test, y_test))

    return np.mean(accuracies)

# K-NN Model #

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# images_df is ordered by class (all parasitized rows, then all uninfected
# rows) and cross_val builds its folds from contiguous row slices. Shuffle the
# rows first so that each fold contains a mix of both classes; the fixed
# random_state keeps the result reproducible.
knn = KNeighborsClassifier(n_neighbors=1)
images_shuffled = images_df.sample(frac=1, random_state=1)
score = cross_val(images_shuffled, knn, 10)
score

<h1> SVM Model </h1>

In [None]:
# Train a support vector classifier on an 80/20 train/test split.
# random_state fixes the split so that it can be reproduced.
# NOTE(review): the split uses rawImages (unscaled pixel values), not the
# standardised features stored in images_df - confirm this is intended.
from sklearn import svm
X_train, X_test, y_train, y_test = train_test_split(rawImages, labels, train_size = 0.8, random_state = 1)
# SVC with scikit-learn's default hyperparameters
clf = svm.SVC()
clf.fit(X_train, y_train)
# Predictions are kept for the confusion matrix / classification report below
svm_y_pred = clf.predict(X_test)

# Accuracy on the held-out 20%
svm_acc = clf.score(X_test, y_test)
svm_acc

# Decision Tree Model #

In [None]:
# Train a decision tree on the same 80/20 split as the SVM: the arguments and
# random_state passed to train_test_split are identical, so the partition is too.
from sklearn import tree
X_train, X_test, y_train, y_test = train_test_split(rawImages, labels, train_size = 0.8, random_state = 1)
# random_state makes the fitted tree reproducible between runs
dt = tree.DecisionTreeClassifier(criterion='gini', splitter='best', random_state=1)
dt.fit(X_train, y_train)

# Accuracy on the held-out 20%; predictions are kept for the evaluation cells
dt_acc = dt.score(X_test, y_test)
dt_y_pred = dt.predict(X_test)
dt_acc

# Performance Evaluation #

In [None]:
# Decision tree: confusion matrix and per-class precision / recall / F1 on the
# held-out test split.
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, dt_y_pred))
print(classification_report(y_test, dt_y_pred))

In [None]:
# SVM: same report as for the decision tree. y_test comes from the most
# recently executed split cell; both split cells use identical arguments.
print(confusion_matrix(y_test, svm_y_pred))
print(classification_report(y_test, svm_y_pred))

In [None]:
# Computing the test error rate of the KNN model for a range of K values
# (the plot itself is drawn in the next cell)
error = []

# Calculating error for K values from 1 to 19 (the upper bound of range is exclusive).
# X_train / X_test are the globals left by the most recently executed split cell.
for i in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    # Fraction of test samples whose prediction differs from the true label
    error.append(np.mean(pred_i != y_test))

In [None]:
# Plot the error rates collected in `error` against K. The x range must match
# the range of K values used in the loop that filled `error`.
plt.figure(figsize=(12, 6))
plt.plot(range(1, 20), error, color='red', linestyle='dashed', marker='o',
         markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')