# Banana Freshness and Counting Tasks

We will solve two simple tasks: 

1. estimating the freshness of a banana;
2. counting the number of bananas in an image.

We solve them by analysing the image pixel values with machine learning algorithms.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
from PIL import Image

# 1. Extract Image

In [None]:
import zipfile

# Unpack every member of the dataset archive into the current directory.
with zipfile.ZipFile("data.zip", mode="r") as archive:
    archive.extractall(path=".")

# 2. Load Images

In [None]:
# Open one sample image and report its Python type and size.
image_file = './Counting/1-1-F.png'

example = Image.open(image_file)
print("the image is stored as a %s object" % (type(example)) )
print("The size of the image is", example.size)

# Display the sample image.
plt.imshow(example)
plt.show()

## 2.1 read in freshness data

In [None]:
# list all the files in the 'Freshness' folder
folder = './Freshness/'
images = os.listdir(folder)
for image_filename in images:
    print(image_filename) # image_filename is a string (the bare file name, without the folder)

In [None]:
# store all data in a dictionary called 'freshness_data'
# the key will be the image filename.
# the value will be a list of two values: the image and its label (float)

## NOTE: the label of the image is its filename without the extension (.png)
## e.g. 0.15.png means the image label is 0.15

freshness_data = {}
for image_filename in images:
    if image_filename.endswith('.png'):
        full_path = os.path.join(folder, image_filename)
        img = Image.open(full_path)
        # Use splitext to drop the extension. str.rstrip('.png') does not remove
        # a suffix: it strips ANY trailing '.', 'p', 'n' or 'g' characters, so it
        # can eat part of the name itself (e.g. 'nan.png' -> 'na').
        label = float(os.path.splitext(image_filename)[0])
        freshness_data[image_filename] = [img, label]

In [None]:
# plot the images by label

## first, gather every [image, label] entry and order the entries from the
## highest label (freshness) down to the lowest
data_list = sorted(
    freshness_data.values(),
    key=lambda entry: entry[1],
    reverse=True,
)

In [None]:
## second, show each image in sorted order with its label as the figure title.
for image, label in data_list:
    plt.imshow(image)
    plt.title('Freshness %.2f' % label)
    plt.show()

## 2.2 read in banana counting images

In [None]:
# list all the files in the 'Counting' folder
# NOTE: this rebinds [folder] and [images], which previously referred to './Freshness/'
folder = './Counting/'
images = os.listdir(folder)
for image_filename in images:
    print(image_filename) # image_filename is a string (the bare file name, without the folder)

In [None]:
# store all data in a dictionary called 'counting_data'
# the key will be the image filename.
# the value will be a list of three values: the image and its two labels
# the two labels are: 'the number of bananas in the image' (integer) and
# 'whether the bananas are fresh or rotten'
# NOTE(review): the second label was described as a bool, but the example below
# stores the letter 'F' -- confirm which representation is intended.

## NOTE: the labels of an image are the first and the last letters of its filename
## (the last letter before the '.png' extension, as in the example).
## example: counting_data { '1-1-F.png' : [ image_object, 1, 'F' ], .... }

counting_data = {}
for image_filename in images:
    if image_filename.endswith('.png'):
        pass # TODO

In [None]:
# plot images by the number of bananas

## first, regroup images according to their labels
## label_group should be a dictionary with three keys: 1, 2, 3
## e.g. the value of key X is a list, storing the images with X bananas

label_group = {}
for image_filename, data in counting_data.items():
    pass # TODO

In [None]:
## second, plot the images of each label
## (label_group must already contain the keys 1, 2 and 3 for the lookup below)
for label in [1, 2, 3]:
    img_list = label_group[label]
    # TODO

# 3. resize image to a uniform size

In [None]:
# Show the sample image and its size, then a copy resized to 224 x 224 for comparison.
plt.imshow(example)
plt.show()
print("the original size is", example.size)
# the resized result is stored under a new name; [example] keeps referring to the original
example_small = example.resize([224, 224])
plt.imshow(example_small)
plt.show()
print("the new size is", example_small.size)

In [None]:
# resize all freshness data
# Each entry is replaced under its existing key, so the dictionary keeps the
# same set of keys while it is being iterated.
for image_filename, (image, label) in freshness_data.items():
    image = image.resize([224, 224])
    freshness_data[image_filename] = [image, label]

In [None]:
# resize all counting data
# (the freshness images are resized to [224, 224] in the cell above)
for image_filename, data in counting_data.items():
    pass # TODO

# Part 2

In [None]:
# segment banana pixels from the background

## for simplification, convert the image into gray scale
## compute a foreground/background threshold as the mean value of all gray-scale pixels
## given the threshold, compute a binary mask having 1 for foreground pixels (banana) and 0 otherwise
## output a dictionary [seg_data] as { '0.3.png' : {'img_gray':[numpy.array],'mask_fg':[numpy.array]}, .... }

def segment(data,verbose=False):
    """Build the per-image segmentation dictionary (scaffold, still to be filled in).

    data    -- dictionary whose keys identify the images.
    verbose -- when true, print each key and show [img_gray] and [mask_fg].

    Returns a dictionary with the same keys as [data]. While the placeholders
    below are still None, every value in it is None, and verbose mode hands
    those None placeholders to plt.imshow.
    """
    seg_data = {}
    for k in data:
        img_gray = None   # placeholder: gray-scale version of the image

        thres_bg = None   # placeholder: foreground/background threshold
        mask_fg = None    # placeholder: binary foreground mask

        seg_data[k] = None  # placeholder: {'img_gray': ..., 'mask_fg': ...}

        if not verbose:
            continue

        print('----- {} ------'.format(k))
        # show the gray image first, then the mask, both with the gray colormap
        for picture in (img_gray, mask_fg):
            plt.imshow(picture,cmap=plt.get_cmap('gray'))
            plt.show()

    return seg_data

In [None]:
# segment the images in [freshness_data] and [counting_data]
## set verbose=True to visualize the result (each key is printed and its images are shown)

seg_freshness = segment(freshness_data,verbose=False)
seg_counting = segment(counting_data,verbose=False)

## Freshness prediction (Regression)

### Feature extraction

In [None]:
# compute [skin_color] for each image
# collect the [freshness] value for each image

skin_color = []
freshness = []

for k in seg_freshness.keys():
    ## [avg_color] is the average value of all foreground pixels based on [mask_fg]
    ## [r] is the target freshness value stored in [freshness_data]
    avg_color = None    # TODO
    r = None            # TODO
    # TODO: store [avg_color], [r] into [skin_color], [freshness] lists respectively

# construct numpy.array from the above lists with appropriate dimensions:
# [skin_color] gets an extra trailing axis via np.newaxis, [freshness] does not
skin_color = np.array(skin_color)[:,np.newaxis]
freshness = np.array(freshness)

# plot data distribution ([skin_color] on the x axis, [freshness] on the y axis)
plt.scatter(skin_color,freshness)
plt.title('Freshness prediction')
plt.xlabel("yellowness")
plt.ylabel("freshness")
plt.show()

### Linear Regression

In [None]:
# plot predictions vs ground truth for model evaluation
def plot_pred(x,y,pred_x, pred_y, title = ''):
    """Draw the data as a scatter plot with the prediction curve on top.

    x, y           -- data points, drawn with plt.scatter.
    pred_x, pred_y -- predicted curve, drawn as a red line with plt.plot.
    title          -- figure title (empty by default).
    """
    # data first, prediction line second
    plt.scatter(x, y)
    plt.plot(pred_x, pred_y, c='red')

    # axis labels and title, then display the figure
    plt.xlabel("yellowness")
    plt.ylabel("freshness")
    plt.title(title)
    plt.show()

In [None]:
# fit a linear Regression model

from sklearn.linear_model import LinearRegression

reg = None # TODO
# 500 evenly spaced inputs between the smallest and largest [skin_color], as a column
pred_skin_color = np.linspace(skin_color.min(), skin_color.max(), 500)[:,np.newaxis]
pred_freshness = reg.predict(pred_skin_color)

plot_pred(skin_color,freshness,pred_skin_color,pred_freshness,'linear regression')

### Quadratic Regression

In [None]:
# expand feature vector by adding higher order terms

## create a new numpy array consisting of [x, x^2, ..., x^degree]

def feature_transform(x,degree):
    """Return the polynomial expansion [x, x^2, ..., x^degree] of x.

    x      -- array-like of samples. A 1-D input (or a scalar) is treated as
              one feature and turned into a column first; a 2-D input is
              expected to have shape (n_samples, n_features).
    degree -- highest power to include; must be an integer >= 1.

    Returns a float array in which the powers are stacked along the last
    axis, so an (n_samples, 1) input yields an (n_samples, degree) result.

    Raises ValueError if degree is smaller than 1.
    """
    if degree < 1:
        raise ValueError("degree must be >= 1, got {}".format(degree))

    # Work in floating point so that high powers of integer input do not overflow.
    x = np.atleast_1d(np.asarray(x, dtype=float))
    if x.ndim == 1:
        x = x[:, np.newaxis]

    powers = [x ** p for p in range(1, degree + 1)]
    return np.concatenate(powers, axis=-1)

In [None]:
# fit a linear Regression model on the transformed feature with degree 2

degree = 2

reg = None # TODO
# 500 evenly spaced inputs between the smallest and largest [skin_color], as a column
pred_skin_color = np.linspace(skin_color.min(), skin_color.max(), 500)[:,np.newaxis]
# the prediction inputs go through the same feature_transform before predict
pred_freshness = reg.predict(feature_transform(pred_skin_color,degree))

plot_pred(skin_color,freshness,pred_skin_color,pred_freshness,'quadratic regression')

### Higher Polynomial Regression

In [None]:
# fit a linear Regression model on the transformed feature with degree 10

degree = 10

reg = None # TODO
# 500 evenly spaced inputs between the smallest and largest [skin_color], as a column
pred_skin_color = np.linspace(skin_color.min(), skin_color.max(), 500)[:,np.newaxis]
# the prediction inputs go through the same feature_transform before predict
pred_freshness = reg.predict(feature_transform(pred_skin_color,degree))

plot_pred(skin_color,freshness,pred_skin_color,pred_freshness, title = 'High-Order Polynomial Regression')

## Banana Count and Quality (Clustering)

In [None]:
# compute [skin_color] and [fg_area] for each image
# collect [count] (number of bananas) and [quality] (Fresh or Rotten) for each image
# NOTE: [skin_color] is rebound here, replacing the array built for the freshness task

skin_color = []
fg_area = []
count = []
quality = []

for k in seg_counting.keys():
    ## [avg_color] is the average value of all foreground pixels based on [mask_fg]
    ## [area] is the number of foreground pixels based on [mask_fg]
    ## [c] and [q] are banana count and quality stored in [counting_data]
    avg_color = None    # TODO
    area = None         # TODO
    c = None            # TODO
    q = None            # TODO

    # TODO: store [avg_color], [area], [c], [q] into [skin_color], [fg_area], [count], [quality] lists respectively

# construct numpy.array from the above lists with appropriate dimensions
# (both get an extra trailing axis via np.newaxis)
skin_color = np.array(skin_color)[:,np.newaxis]
fg_area = np.array(fg_area)[:,np.newaxis]

# plot data distribution
plt.scatter(skin_color,fg_area)
# label every point with "<count>_<quality>"
for i, (c,q) in enumerate(zip(count,quality)):
    plt.annotate("{}_{}".format(c,q), (skin_color[i], fg_area[i]),size=15, c='red')

plt.title('Banana Count and Quality')
plt.xlabel("yellowness")
plt.ylabel("foreground area")
plt.show()

### KMeans

In [None]:
# Normalize input features to a common scale (mean-centred, unit range)

## compute range of x (max(x)-min(x))
## compute mean of x
## normalize data as (x-mean_x)/range_x

def normalize(x):
    """Return (x - mean_x) / range_x, computed separately for each feature.

    x -- array-like. For a 2-D input of shape (n_samples, n_features) the mean
         and range are taken over the samples (axis 0), so features on very
         different scales are normalized independently. A 1-D input is
         normalized as a whole.

    Returns a float array of the same shape. Each normalized feature has mean 0
    and max - min equal to 1, so its values lie within [-1, 1] (not [0, 1]).
    A constant feature (range 0) is mapped to zeros instead of dividing by
    zero, and an empty input is returned unchanged.
    """
    x = np.atleast_1d(np.asarray(x, dtype=float))
    if x.size == 0:
        return x

    range_x = x.max(axis=0) - x.min(axis=0)
    mean_x = x.mean(axis=0)

    # Avoid 0/0 for constant features: x - mean_x is already 0 there.
    safe_range = np.where(range_x == 0, 1.0, range_x)
    return (x - mean_x) / safe_range

In [None]:
from sklearn.cluster import KMeans

random_state = 170  # NOTE(review): presumably the seed to pass to KMeans for reproducible clusters -- confirm
X = None        # TODO: compute input features by concatenating [skin_color] and [fg_area] along the last dimension
y_pred = None   # TODO: fit a KMeans with 4 clusters

# plot the predicted clusters (points coloured by [y_pred])
plt.scatter(X[:, 0], X[:, 1], c=y_pred)

# label every point with "<count>_<quality>"
for i, (c,q) in enumerate(zip(count,quality)):
    plt.annotate("{}_{}".format(c,q), (X[i, 0], X[i, 1]),size=15, c='red')

plt.title('Banana Count and Quality')
plt.xlabel("yellowness")
plt.ylabel("foreground area")
plt.show()