In [1]:
# Test: split the pixels of a small Barack image into two sub-clusters based on color.
# Note: the medium-sized Barack image uses too much memory and breaks the run.

import numpy as np
from detan.detan import AssignmentAnnealing, assignment_iteration
from skimage import io, color
# from sklearn.metrics import pairwise_distances
from scipy.spatial import distance_matrix

# Load the RGB input image from disk (read through skimage's 'matplotlib' plugin).
# Alternative input, kept for reference:
#   rgb = io.imread("barack-2.jpg", plugin='matplotlib')
rgb = io.imread("barack-3.jpg", plugin='matplotlib')

# Convert the image from RGB to the LAB color space.
img = color.rgb2lab(rgb)

# Height and width are the first two axes of the LAB array.
img_h, img_w = img.shape[:2]

# Total number of pixels; reported as a label line followed by the count.
N = img_h * img_w
print("pixels #", N, sep="\n")

# One row per pixel, one column per LAB channel.
flat_img = img.reshape((N, 3))

# Goal: take a group of pixels and sort them into two sub-clusters based on color.

# Euclidean (p=2) distance between every pair of rows of flat_img. The result
# has one row and one column per pixel, so it grows quadratically with N.
# Earlier variant, left for reference:
#   pairwise_distances(flat_img, img, metric="euclidean")
# Both names are bound to the same matrix.
distances = dist_mat = distance_matrix(flat_img, flat_img, p=2)
print("distant matrix:", dist_mat, sep="\n")

# Number of groups (sub-clusters) the pixels are split into.
groups = 2

# The initial assignment expectations should be random, must sum to 1 across
# each row, and should contain no identical entries within a row.
# A seeded generator is used so that a Restart & Run All reproduces the same
# starting point (and therefore the same clustering and group labels).
rng = np.random.default_rng(seed=0)

# Values are drawn uniformly from [0.45, 0.55): close to an even split, but
# with enough jitter that the entries of a row differ.
initial_assignments = 0.5 + 0.1 * (rng.random((N, groups)) - 0.5)

# Normalise every row to sum to 1. keepdims=True keeps the sums as an (N, 1)
# column so the division broadcasts across the group axis.
row_sum = initial_assignments.sum(axis=1, keepdims=True)
initial_assignments = initial_assignments / row_sum

# This is the state of our annealing.
# NOTE(review): 0.73 is passed positionally as the third argument; its meaning
# (presumably a temperature/cooling parameter) should be confirmed against detan.
annealer = AssignmentAnnealing(assignment_iteration(distances), initial_assignments, 0.73)

# Convergence threshold used to decide when to lower the temperature, plus the
# previous assignments that each new iterate is compared against.
tolerance = 1e-6
old_assignments = initial_assignments

# For simplicity the number of temperature steps is fixed and arbitrary.
# (Author's note: change "temp" to variance.)
# 10 steps works fine; 20 breaks the code with NaNs.
for _ in range(10):
    # Each pass over the annealer object yields the next assignment expectations.
    for new_assignments in annealer:
        # Largest absolute change of any single assignment since the last iterate.
        largest_change = np.max(np.abs(new_assignments - old_assignments))
        if largest_change < tolerance:
            # Nothing moved by more than the tolerance: done at this temperature.
            break
        old_assignments = new_assignments
    # Move on to the next temperature.
    annealer.cool()

# The raw (soft) assignment expectation values, one row per pixel and one
# column per group.
print("Raw assignment expectations:")
print(annealer.assignments)

print()

# The "ideal" (hard) clustering results: a boolean mask of the same shape that
# marks, for each pixel, the group with the largest expectation.
# A fixed tiny threshold (previously `> 1e-50`) can flag a pixel in several
# groups at once whenever the smaller expectations have not underflowed to 0;
# comparing against the row maximum yields one group per pixel (barring exact ties).
print("Ideal clustering results:")
final_assignments = np.asarray(annealer.assignments)
print(final_assignments == final_assignments.max(axis=1, keepdims=True))

pixels #
3700
distant matrix:
[[ 0.          0.36353092  1.30998208 ... 74.28180156 73.21147831
  73.21147831]
 [ 0.36353092  0.          1.0083307  ... 73.91845294 72.84822351
  72.84822351]
 [ 1.30998208  1.0083307   0.         ... 73.12912668 72.05403744
  72.05403744]
 ...
 [74.28180156 73.91845294 73.12912668 ...  0.          1.23516018
   1.23516018]
 [73.21147831 72.84822351 72.05403744 ...  1.23516018  0.
   0.        ]
 [73.21147831 72.84822351 72.05403744 ...  1.23516018  0.
   0.        ]]
Raw assignment expectations:
[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [0. 1.]
 [0. 1.]
 [0. 1.]]

Ideal clustering results:
[[ True False]
 [ True False]
 [ True False]
 ...
 [False  True]
 [False  True]
 [False  True]]


In [3]:
# Data processing 
import pandas as pd
import numpy as np
# # Visualization
# import matplotlib.pyplot as plt
# import seaborn as sns
# Dataset
from sklearn import datasets
# Dimensionality reduction
# from sklearn.decomposition import PCA
# from sklearn.manifold import TSNE
# Modeling
# from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
# from sklearn.mixture import GaussianMixture
# from sklearn.cluster import DBSCAN

In [4]:
# Load the iris dataset from sklearn.datasets.
iris = datasets.load_iris()
# Show which keys the loaded object exposes; as the last expression of the
# cell, the result is displayed as the cell output.
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [5]:
# Print the feature names, target names and target values, one line per entry.
for label, key in (
    ('The feature names are:', 'feature_names'),
    ('The target names are:', 'target_names'),
    ('The target values are:', 'target'),
):
    print(label, iris[key])

The feature names are: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
The target names are: ['setosa' 'versicolor' 'virginica']
The target values are: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [6]:
# Build a dataframe with one column per feature, then append the class labels
# as a 'target' column.
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)
# Print a summary of the columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.4 KB


In [7]:
# Count how many rows fall into each target category; as the last expression
# of the cell, the resulting counts are displayed as the cell output.
df['target'].value_counts()

0    50
1    50
2    50
Name: target, dtype: int64

In [8]:
# Feature matrix for the clustering model: exactly the iris feature columns.
# Selecting the features by name (instead of "every column except 'target'")
# keeps this cell idempotent: df later gains a 'y_hc' prediction column, and
# re-running this cell must not feed those predictions back in as a feature.
# The names are sorted alphabetically to keep the column order that
# df.columns.difference(['target']) produced, so results are unchanged.
X = df[sorted(iris.feature_names)]

In [11]:
# Hierarchical (agglomerative) clustering model with three clusters.
hc = AgglomerativeClustering(n_clusters=3)
# Fit on the features, predict a cluster label per row, and store the labels
# both in y_hc and as a 'y_hc' column of the dataframe.
df['y_hc'] = y_hc = hc.fit_predict(X)
# Show how many rows landed in each cluster.
df['y_hc'].value_counts()

0    64
1    50
2    36
Name: y_hc, dtype: int64