In [None]:
%pylab inline
import imageio
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.linear_model import LogisticRegression
from pathlib import Path
from sklearn.model_selection import train_test_split

import pandas as pd

def rgb2gray(rgb):
    """Reduce the last axis of `rgb` to a single value per pixel.

    Only the first three entries along the last axis are used; they are
    combined as a weighted sum with weights 0.2989, 0.5870 and 0.1140
    (in that order). Any further entries on that axis are ignored.
    """
    channelWeights = [0.2989, 0.5870, 0.1140]
    colourChannels = rgb[..., :3]
    return np.dot(colourChannels, channelWeights)

def plotGrayImage(data):
    """Display `data` with matplotlib's 'gray' colormap on a fixed 0-255 colour scale."""
    grayMap = plt.get_cmap('gray')
    plt.imshow(data, vmin=0, vmax=255, cmap=grayMap)
    plt.show()

def cropImage(im, size):
    """Crop a `size` x `size` window from the centre of a 2-D image.

    Parameters
    ----------
    im : 2-D array
        Image to crop; its shape must unpack into exactly (rows, cols).
    size : int
        Side length of the square window to extract.

    Returns
    -------
    2-D array
        The central window of `im`. It is exactly `size` x `size` whenever
        the image is at least that large along both axes; along an axis that
        is shorter than `size` the whole axis is returned.
    """
    size = int(size)
    rows, cols = im.shape
    # Each axis gets its own start offset, so non-square images are cropped
    # around their real centre. Floor division keeps the rule identical for
    # odd and even lengths; the clamp stops a negative start from wrapping
    # around to the far edge when the image is smaller than the window.
    rowStart = max((rows - size) // 2, 0)
    colStart = max((cols - size) // 2, 0)
    # end = start + size, so odd sizes are not returned one pixel short.
    return im[rowStart:rowStart + size, colStart:colStart + size]

def addBorderZeroPaddToImage(im, extraBorderWidth):
    """Surround `im` with a border of zeros.

    `extraBorderWidth` is handed to `np.pad` as the pad width, so the border
    is added on every side of the array.
    """
    padded = np.pad(im, pad_width=extraBorderWidth, mode='constant', constant_values=0)
    return padded

def resizeImage(im, size):
    """Normalise `im` towards the requested `size`.

    The image is first enlarged with a zero border `size` wide
    (addBorderZeroPaddToImage) and the result is then centre-cropped with
    cropImage using the same `size`.
    """
    paddedImage = addBorderZeroPaddToImage(im, size)
    return cropImage(paddedImage, size)

In [None]:
def _loadGrayImages(directory, numSamples):
    """Read up to `numSamples` PNG files from `directory` as grayscale arrays.

    Returns a pair `(paths, images)` of equal-length lists: the path of each
    file that was read and the corresponding output of `rgb2gray`.
    """
    # Keep only regular ".png" files *before* truncating, so other directory
    # entries do not use up the sample budget. os.listdir returns entries in
    # arbitrary order; sorting makes the selection reproducible between runs.
    pngNames = sorted(
        name for name in os.listdir(directory)
        if name.endswith(".png") and os.path.isfile(os.path.join(directory, name))
    )[0:numSamples]
    paths = [directory + "/" + name for name in pngNames]
    images = [rgb2gray(imageio.imread(path)) for path in paths]
    return paths, images


def getData(numSamples=100):
    """Load the cell images into a single DataFrame.

    Images are read from `cell_images/Parasitized` and
    `cell_images/Uninfected` under the current working directory, at most
    `numSamples` from each, and converted with `rgb2gray`.

    Returns a DataFrame with one row per image and the columns:

    * pathfile: path of the image file
    * data: the grayscale image array
    * status: 1.0 for images from the Parasitized folder, 0.0 for Uninfected
    """
    cwd = Path.cwd()

    parasitizedPath = str(cwd) + "/cell_images/Parasitized"
    uninfectedPath = str(cwd) + "/cell_images/Uninfected"

    parasitedPaths, paraFiles = _loadGrayImages(parasitizedPath, numSamples)
    uninfectedPaths, uninFiles = _loadGrayImages(uninfectedPath, numSamples)

    status = np.concatenate((np.ones(len(parasitedPaths)), np.zeros(len(uninfectedPaths))))
    pathfile = parasitedPaths + uninfectedPaths

    # The images may differ in shape, so they cannot be stacked with
    # np.concatenate. Store one array per cell in an object array, filled
    # element by element so NumPy never tries to merge them into one block.
    images = paraFiles + uninFiles
    data = np.empty(len(images), dtype=object)
    for index, image in enumerate(images):
        data[index] = image

    return pd.DataFrame({"pathfile": pathfile, "data": data, "status": status})

#### Obtaining data from files

In [None]:
# Load up to 500 images per class into one DataFrame and preview the first rows.
df = getData(numSamples=500)
df.head(n=2)

In [None]:
#### Show one image as an example:
plotGrayImage(df.loc[0, "data"])

### Since I have noticed that the images have different dimensions, I need to make them the same size.
But first I am going to find the biggest one and make the rest the same size by padding their edges with zeros. 


In [None]:
# Largest extent (height or width) found among all loaded images.
maxSize = np.max([image.shape for image in df["data"]])
maxSize

In [None]:
# Walk one image through the resize steps: original, zero-padded, then centre-cropped.
t = df["data"][0]
result = addBorderZeroPaddToImage(t, maxSize)
r = cropImage(result, maxSize)
for stage in (t, result, r):
    plotGrayImage(stage)

### Train-test-validation split

In [None]:
# Hold out 10% of the samples as the test set; the seed is fixed for reproducibility.
df_train_validation, df_test = train_test_split(
    df,
    test_size=0.1,
    random_state=0,
)

### Adjusting size of database images

In [None]:
# Resize every train/validation image with maxSize and flatten it into one row of pixel features.
vectorImages = pd.DataFrame(
    [resizeImage(image, maxSize).ravel() for image in df_train_validation["data"]]
)


### Train-validation split

In [None]:
# Split the pixel vectors and their labels 75/25 with a fixed seed.
(
    vectorImages_train,
    vectorImages_test,
    labels_train,
    labels_test,
) = train_test_split(
    vectorImages,
    df_train_validation["status"],
    test_size=0.25,
    random_state=0,
)

In [None]:
# Baseline: logistic regression trained directly on the flattened pixel vectors.
model = LogisticRegression().fit(vectorImages_train, labels_train)
model

In [None]:
# Score the pixel-vector model on the 25% split that was not used for fitting.
model.score(X=vectorImages_test, y=labels_test)

Since the accuracy is barely better than a coin-flip estimator, I have to change the strategy to improve the model. 

The current model does such a horrible job because the observable pattern (the one that has to be enclosed or segmented
by the classifier) is in different dimensions in each sample, so the optimization algorithm does not know where to go
in order to truly fit the model. 

In [None]:
def processToHistVectorImages(x, maxSize, bins=40, valueRange=None):
    """Turn an image into a vector of intensity-histogram counts.

    The image is passed through `resizeImage(x, maxSize)`, flattened, and
    binned with `np.histogram`. The count of the first bin is dropped and the
    remaining `bins - 1` counts are returned as a list.

    Parameters
    ----------
    x : image array accepted by `resizeImage`
    maxSize : size forwarded to `resizeImage`
    bins : number of histogram bins (default 40, the original fixed value)
    valueRange : optional `(low, high)` pair forwarded to `np.histogram` as
        `range`. The default `None` keeps the original behaviour, where the
        bin edges span the minimum and maximum of each individual image.

    NOTE(review): with `valueRange=None` the bin edges differ from image to
    image, so the same position in two returned vectors can cover different
    intensity intervals. Passing a fixed range such as `(0, 255)` would make
    the features comparable across images - confirm the intended value scale
    before switching.
    """
    vector = resizeImage(x, maxSize).ravel()
    counts, _edges = np.histogram(vector, bins=bins, range=valueRange)
    return list(counts)[1:]

# One row of histogram counts per train/validation image.
histVectorImages = pd.DataFrame(
    [processToHistVectorImages(image, maxSize) for image in df_train_validation["data"]]
)

In [None]:
# Attach the histogram feature vector of every held-out test image as a new column.
df_test["histVectors"] = [
    processToHistVectorImages(image, maxSize) for image in df_test["data"]
]

In [None]:
# Same 75/25 split and seed as for the pixel vectors, now on the histogram features.
(
    histVectorImages_train,
    histVectorImages_test,
    labels_train,
    labels_test,
) = train_test_split(
    histVectorImages,
    df_train_validation["status"],
    test_size=0.25,
    random_state=0,
)

In [None]:
# Same classifier as before, trained on the histogram features instead of raw pixels.
histModel = LogisticRegression().fit(histVectorImages_train, labels_train)
histModel

In [None]:
# Score the histogram model on the 25% split that was not used for fitting.
histModel.score(X=histVectorImages_test, y=labels_test)

In [None]:
# for index in range(len(df_test)):
#     predict = histModel.predict(np.array(df_test["histVectors"].iloc[index]).reshape(1,-1))
#     print("True value: " , df_test["status"].iloc[index])
#     print("  Predicted: ", predict)
#     plotGrayImage(df_test["data"].iloc[index])
#     print("-------------")