In [1]:
# python general
import numpy as np
import random
import math
from PIL import Image, ImageFilter


In [2]:
import os
import numpy as np
import math
from PIL import Image, ImageFilter
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
import pickle
import sys
import scipy.stats

import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt

import box




In [3]:
import utilities
import importlib

In [4]:

class Clustering:
    """Cluster the pixels of a difference image with DBSCAN, persist and plot the result."""

    def __init__(self):
        print("----Clustering----")

    @staticmethod
    def _countClusters(labels):
        """Return the number of distinct labels in `labels`, not counting the noise label -1."""
        distinct = set(labels)
        distinct.discard(-1)
        return len(distinct)

    def loadData(self, image_diff, rgb_all):
        """Load the difference image through utilities.loadImageAndProcess.

        image_diff: path of the difference image.
        rgb_all: sequence whose element at index 2 is forwarded as the `rgb` argument.

        Returns a 3-tuple (None, None, pixels_diff); the first two slots are
        always None in this implementation.
        """
        # load. binary and transposed
        pixels_1, pixels_2 = None, None
        pixels_diff = utilities.loadImageAndProcess(image_diff, change_to_binary=True, transpose = True, rgb=rgb_all[2])
        return pixels_1, pixels_2, pixels_diff

    def DBScanClustering(self, points, eps=1.7, min_samples=7):
        """Run DBSCAN on `points` (a 2d numpy array).

        Returns (labels, core_samples_mask, n_clusters) where `labels` are the
        per-point DBSCAN labels (-1 marks noise), `core_samples_mask` is a
        boolean array that is True for core samples, and `n_clusters` is the
        number of clusters excluding noise.
        """
        db = DBSCAN(eps=eps, min_samples=min_samples).fit(points)
        labels = db.labels_
        core_samples_mask = np.zeros_like(labels, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True

        # Number of clusters in labels, ignoring noise if present.
        n_clusters = self._countClusters(labels)
        print("n_clusters = ", n_clusters)
        return labels, core_samples_mask, n_clusters


    def showClusterPlot(self, labels, X, core_samples_mask, shap, save_path_prefix=None, show_plot=True):
        """Plot every non-noise cluster; core samples get large markers, the rest small ones.

        labels: per-point cluster labels (-1 = noise, which is not drawn).
        X: point coordinates, indexed as X[:, 0] (x) and X[:, 1] (y).
        core_samples_mask: boolean mask, True for core samples.
        shap: two-element shape; ticks are placed every 10 units up to shap[0] / shap[1].
        save_path_prefix: when not None the figure is saved to save_path_prefix + "_plot.jpg".
        show_plot: when True plt.show() is called.
        """
        # Coerce to arrays so the element-wise comparison and boolean indexing
        # below also work when plain sequences are passed in.
        labels = np.asarray(labels)
        X = np.asarray(X)
        core_samples_mask = np.asarray(core_samples_mask, dtype=bool)

        unique_labels = set(labels)
        # One colour per label (noise included) so the colours of real clusters
        # stay the same as before even though noise itself is skipped.
        colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
        for k, col in zip(unique_labels, colors):
            if k == -1:
                continue # ignore outliers
            class_member_mask = (labels == k)

            xy = X[class_member_mask & core_samples_mask]
            plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),markeredgecolor='k', markersize=14)

            xy = X[class_member_mask & ~core_samples_mask]
            plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=6)

        x_ticks = range(0,shap[0],10)
        y_ticks = range(0,shap[1],10)
        plt.xticks(x_ticks, x_ticks )
        plt.yticks( y_ticks, y_ticks )

        if save_path_prefix is not None:
            plt.savefig(save_path_prefix+"_plot.jpg")
        if show_plot:
            plt.show()
        # Reset the current figure so the next call starts from an empty canvas.
        plt.gcf().clear()


    def solveClustering(self, image_diff, save_path_prefix, show_plot, eps, min_samples, compute_or_load="compute"):
        """Cluster the pixels of `image_diff`, or reload a previous result, then plot it.

        image_diff: path of the difference image.
        save_path_prefix: prefix of the "_vals.obj" pickle and "_plot.jpg" figure.
        show_plot: forwarded to showClusterPlot.
        eps, min_samples: DBSCAN parameters (used only when computing).
        compute_or_load: "compute" runs DBSCAN and writes the pickle; any other
            value reads labels, points and core mask back from the pickle.

        Returns the number of clusters (noise excluded) in both modes.
        """
        # load data
        rgb_all = [True, True, True]
        _, _, pixels_diff = self.loadData(image_diff, rgb_all)
        print("------------------------------")

        # change to 2d points list
        points_diff = utilities.getPixelCoordinates(pixels_diff, flip_y=True)

        vals_path = save_path_prefix+"_vals.obj"
        if compute_or_load=="compute":
            # perform clustering. high density will give high precision but may miss a few instances.
            # Low density requirement may lead to noise
            labels, core_samples_mask, num_clusters = self.DBScanClustering(points_diff, eps=eps, min_samples=min_samples)

            # dump for later
            print("Saving clustering data to ", vals_path)
            with open(vals_path, "wb") as vals_file:
                pickle.dump([labels,points_diff,core_samples_mask,pixels_diff.shape], vals_file)
        else:
            print("Loading clustering data from ", vals_path)
            # NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
            with open(vals_path, "rb") as vals_file:
                labels,points_diff,core_samples_mask,_ = pickle.load(vals_file)
            # The count is not stored in the pickle, so derive it from the labels.
            num_clusters = self._countClusters(labels)

        # plot
        self.showClusterPlot(labels, points_diff, core_samples_mask, pixels_diff.shape, save_path_prefix=save_path_prefix, show_plot=show_plot)

        return num_clusters


In [5]:
# --- configuration -----------------------------------------------------------
clustering_diff = Clustering()
eps = 20 #params.eps # 1.7, 
min_samples= 9 #params.min_samples #7

data_path =  "../../data/"
img_path = data_path + "resized_images/"
cluster_data_path = data_path + "cluster_images/" # clustering pickles and plots are written here
box_data_path = data_path + "boxed_images/" # images with rectangles drawn on them are written here
clustering_info = "params_eps_" + str(eps) + "min_samples_" + str(min_samples) + "_"

# --- collect image ids ------------------------------------------------------
# An image id is the file name of a difference image without its "_diff.jpg"
# suffix. Matching the suffix exactly (instead of "contains .jpg") keeps
# unrelated files from turning into bogus ids.
diff_suffix = "_diff.jpg"
all_fnames = [f[:-len(diff_suffix)] for f in os.listdir(img_path) if f.endswith(diff_suffix)]
print("all_fnames cnt = ", len(all_fnames))

# Create the output directories if they do not exist yet so saving cannot fail on a fresh checkout.
for out_dir in (cluster_data_path, box_data_path):
    os.makedirs(out_dir, exist_ok=True)

# --- accumulators filled by solve_image -------------------------------------
num_clusters_all = []            # cluster count of every processed image, in processing order
num_cluster_cluster_to_cnt = {}  # image id -> cluster count
all_rectangles = {}              # image id -> rectangle coordinates

def solve_image(img_id):
    """Cluster one difference image, compute its boxes and save the boxed images.

    img_id: image id; the files img_id + ".png", img_id + "_2.png" and
        img_id + "_diff.jpg" are read from img_path.

    Side effects: appends the cluster count to num_clusters_all, stores it in
    num_cluster_cluster_to_cnt[img_id], stores the rectangles in
    all_rectangles[img_id] and calls utilities.drawRectanglesOnImage for each
    of the three images with a save path under box_data_path.
    """
    diff_image_path = img_path + img_id + '_diff.jpg'
    prefix = cluster_data_path + clustering_info + img_id

    # Cluster without showing the plot and remember the count.
    cluster_count = clustering_diff.solveClustering(diff_image_path, prefix, False, eps, min_samples)
    num_clusters_all.append(cluster_count)

    # Rectangles are computed by box.findBoxPoints from the same path prefix.
    boxes = box.findBoxPoints(prefix, ignore_noise=True)
    all_rectangles[img_id] = boxes
    num_cluster_cluster_to_cnt[img_id] = cluster_count
    print("rectangles_coordinates = ", boxes)

    # (input suffix, output suffix): the two .png inputs are saved as .jpg.
    suffix_pairs = (
        (".png", ".jpg"),
        ("_2.png", "_2.jpg"),
        ("_diff.jpg", "_diff.jpg"),
    )
    for src_suffix, dst_suffix in suffix_pairs:
        utilities.drawRectanglesOnImage(
            image_path=img_path + img_id + src_suffix,
            rectangles_coordinates=boxes,
            save_path=box_data_path + clustering_info + img_id + dst_suffix,
            show_image=False,
        )
        

# Image id used for the single-image demonstration.
# Set to None to process every id in all_fnames instead.
demo_img_id = '42'

if demo_img_id is None:
    img_ids = all_fnames
else:
    # As before, the demonstration only runs when the listing is non-empty.
    img_ids = [demo_img_id] if all_fnames else []

for i in img_ids:
    img = str(i)
    solve_image(img)
    print("===================")

# Uncomment the following lines to store info when running for all images
#print(scipy.stats.describe(num_clusters_all) )
#pickle.dump(num_cluster_cluster_to_cnt, open(data_path + "clustering_data/" + clustering_info+'_num_cluster_cluster_to_cnt.pickle','wb'))
#pickle.dump(all_rectangles, open(cluster_data_path + clustering_info + "_allrectangles.obj", "wb") )



----Clustering----
all_fnames cnt =  13077
image_name,rgb =  ../../data/resized_images/42_diff.jpg True
------------------------------
n_clusters =  3
Saving clustering data to  ../../data/cluster_images/params_eps_20min_samples_9_42_vals.obj
rectangles_coordinates =  [[176, 196, 75, 125], [155, 164, 182, 201], [39, 83, 47, 119]]
Saving to  ../../data/boxed_images/params_eps_20min_samples_9_42.jpg
Saving to  ../../data/boxed_images/params_eps_20min_samples_9_42_2.jpg
Saving to  ../../data/boxed_images/params_eps_20min_samples_9_42_diff.jpg
