In [1]:
import numpy as np
import sys


class Iris:

    def __init__(self):
        self.m = 150
        self.n = 4
        self.k = 3

    def getK(self):
        return self.k

    def getData(self):

        try:
            data = np.genfromtxt('iris.csv', delimiter=',')
            print("Data source available.")
        except IOError:
            print("Missing dataset! Run:")
            print(
                "wget http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data")
            exit(0)
        print("Shape of original data : " + str(len(data)) +
              ", " + str(len(data[0])))
        return data

    def normalizeData(self, data):
        print("Normalizing the data to to range [0,1] now...")
        low, high = np.amin(data, axis=0), np.amax(data, axis=0)
        for j in range(self.n):
            minimum, maximum = low[j], high[j]
            for i in range(self.m):
                data[i][j] = (data[i][j] - minimum)/(maximum - minimum)
        print("Shape of the normalized data : " +
              str(len(data)) + ", " + str(len(data[0])))
        return data

    def getDissimilarityMatrix(self, data):
        print("Preparing dissimilarity matrix...")
        mat = np.zeros(shape=(self.m, self.m))
        for i in range(self.m):
            for j in range(self.m):
                # mat[i, j]= np.linalg.norm(data[i] - data[j]) --> this is slow
                mat[i, j] = ((data[i][0] - data[j][0])**2 + (data[i][1] - data[j][1]
                                                             )**2 + (data[i][2] - data[j][2])**2 + (data[i][3] - data[j][3])**2)**0.5
        print("Shape of the Dissimilarity Matrix : " +
              str(len(mat)) + ", " + str(len(mat[0])))
        return mat

    def cluster(self, mat):
        print("Calculating average dissimilarity values for each object...")
        avg = np.mean(mat, axis=0)
        print("Primary clustering...")
        clusters = []
        for i in range(self.m):
            cluster = []
            for j in range(self.m):
                if(mat[i][j] < avg[i]):
                    cluster.append(j)
            clusters.append(cluster)
        print(str(len(clusters)) + " primary clusters formed.")
        return clusters

    def removeSubsetClusters(self, clusters):
        print("Removing clusters which are a subset of other clusters...")
        P = len(clusters)
        i = 0
        while i < P:
            j = 0
            while j < P:
                if i != j:
                    if (set(clusters[i]).issubset(set(clusters[j]))):
                        clusters.remove(clusters[i])
                        P -= 1
                        i -= 1
                        break
                j += 1
            i += 1
        return clusters

    def getSimilarityMatrix(self, clusters):
        print("Creating similarity matrix...")
        p = len(clusters)
        sim = np.zeros(shape=(p, p))
        for i in range(p):
            for j in range(p):
                sim[i, j] = len(list(set(clusters[i]) & set(clusters[j]))) / \
                    len(list(set(clusters[i]) | set(clusters[j])))
        print("Shape of the Similarity Matrix : " +
              str(len(sim)) + ", " + str(len(sim[0])))
        return sim

    def mergeMaxSimilarityClusters(self, sim, clusters):
        print("Merging clusters having maximum similarity...")
        print(print(str(len(clusters)) + " clusters found."))
        val = 0
        idx_k = -1
        idx_l = -1
        p = len(clusters)
        for i in range(p):
            for j in range(p):
                if(i != j):
                    if(sim[i, j] >= val):
                        val = sim[i, j]
                        idx_k = i
                        idx_l = j

        clusters[idx_k] = list(set(clusters[idx_k]) | set(clusters[idx_l]))
        clusters.remove(clusters[idx_l])

        print(str(val) + " is the maximum similarity value and clusters " +
              str(idx_k) + ", " + str(idx_l) + " are the most similar.")
        print(str(len(clusters)) + " clusters left after this iteration.")
        return clusters



model = Iris()
data = Iris.getData(model)
data = Iris.normalizeData(model, data)
mat = Iris.getDissimilarityMatrix(model, data)
clusters = Iris.cluster(model, mat)
cluster_original = clusters
print("---------------------------------------------------------------")
while(len(clusters) != Iris.getK(model)):
    clusters = Iris.removeSubsetClusters(model, clusters)
    sim = Iris.getSimilarityMatrix(model, clusters)
    clusters = Iris.mergeMaxSimilarityClusters(model, sim, clusters)
    print("---------------------------------------------------------------")
print(str(len(clusters[0])) + ", " + str(len(clusters[1])) + ", " +
        str(len(clusters[2])) + " are the sizes of the three final clusters.\n")
print("Cluster 1 : \n")
print(clusters[0])
print("\n")
print("Cluster 2 : \n")
print(clusters[1])
print("\n")
print("Cluster 3 : \n")
print(clusters[2])
print("\n")



Data source available.
Shape of original data : 150, 4
Normalizing the data to to range [0,1] now...
Shape of the normalized data : 150, 4
Preparing dissimilarity matrix...
Shape of the Dissimilarity Matrix : 150, 150
Calculating average dissimilarity values for each object...
Primary clustering...
150 primary clusters formed.
---------------------------------------------------------------
Removing clusters which are a subset of other clusters...
Creating similarity matrix...
Shape of the Similarity Matrix : 14, 14
Merging clusters having maximum similarity...
14 clusters found.
None
0.9789473684210527 is the maximum similarity value and clusters 13, 10 are the most similar.
13 clusters left after this iteration.
---------------------------------------------------------------
Removing clusters which are a subset of other clusters...
Creating similarity matrix...
Shape of the Similarity Matrix : 13, 13
Merging clusters having maximum similarity...
13 clusters found.
None
0.9684210526315

In [17]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
def notkMeansClustering(clusters):
    mean = []
    for i in range(3):
        sum = 0
        for j in range(len(clusters[i])):
            sum += clusters[i][j]
        mean.append(sum/len(clusters[i]))

    locations = [[] for _ in range(150)]
    for i in range(150):
        for j in range(3):
            for k in range(len(clusters[j])):
                if (clusters[j][k] == i):
                    locations[i].append(j)
                    break

    dummy = [[] for _ in range(3)]
    for i in range(150):
        obj = i
        max_similarity = 0 # find the maximum similarity across all the values of locations[i] for current obj
        max_similarity_idx = -1 # find the cluster to which the object belongs the most
        for j in range(len(locations[i])):
            location = locations[i][j] # location is either 0 or 1 or 2
            for k in range(len(clusters[location])):
                # find the current object in clusters[location]
                if (clusters[location][k] == obj):
                    # then compare the similarity of clusters[location][k] with mean of clusters[location]
                    num1 = clusters[location][k]
                    num2 = mean[location]

                    # Experimenting with possible similarity functions

                    #similarity = min(num1, num2)/ max(num1, num2) # 51, 31, 68
                    #similarity = abs(np.log(min(num1, num2))/np.log(max(num1, num2))) # 50, 32, 68
                    #similarity = abs((np.log(num1))**2-(np.log(num2))**2)**0.5 # 43, 46, 61
                    #similarity = ((np.log(num1))**2-(np.log(num2))**2)**0.5 # 34, 27, 89
                    #similarity = abs(((np.log(num1))**2-(np.log(num2))**2)**0.5) # 34, 27, 89
                    similarity = abs(np.log(num1)-np.log(num2)) # 44, 46, 60
                    #similarity = (np.log(num1)-np.log(num2))**2 # 44, 46, 60
                    #similarity = abs(np.log(num1)/np.log(num2)) # 77, 38, 35
                    #similarity = abs(num1-num2) # 41, 45, 64
                    #similarity = (num1 - num2)**2 # 41, 45, 64
                    #similarity = np.dot(num1, num2)/(np.linalg.norm(num1)*np.linalg.norm(num2)) # 77, 38, 35

                    # get the maximum similarity value for that object across all values in locations[i]
                    # the object will get appended to dummy[max_similarity_idx]
                    if (similarity >= max_similarity):
                        max_similarity = similarity
                        max_similarity_idx = location
                    break
        dummy[max_similarity_idx].append(i) # append the current object to the cluster with which it is most similar

    clusters = dummy
    return clusters

clusters = [
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 53, 55, 57, 59, 60, 62, 64, 67, 69, 71, 79, 80, 81, 82, 84, 85, 87, 88, 89, 90, 92, 93, 94, 95, 96, 98, 99, 106],
    [1, 2, 3, 6, 7, 8, 9, 11, 12, 13, 20, 23, 24, 25, 26, 29, 30, 31, 34, 35, 38, 41, 42, 43, 45, 47, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101, 103, 106, 111, 113, 119, 121, 123, 126, 127, 133, 134, 138, 142, 146, 149],
    [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]
]

clusters = notkMeansClustering(clusters)
print(str(len(clusters[0])) + ", " + str(len(clusters[1])) + ", " + str(len(clusters[2])) + " are the sizes of the three final clusters.\n")
print("Cluster 1 : \n")
print(clusters[0])
print("\n")
print("Cluster 2 : \n")
print(clusters[1])
print("\n")
print("Cluster 3 : \n")
print(clusters[2])
print("\n")

def plot(clusters):
    for i in range(3):
        cluster = clusters[i]
        


44, 46, 60 are the sizes of the three final clusters.

Cluster 1 : 

[0, 4, 5, 10, 14, 15, 16, 17, 18, 19, 21, 22, 27, 28, 32, 33, 36, 37, 39, 40, 44, 46, 48, 67, 69, 71, 79, 80, 81, 82, 84, 85, 87, 88, 89, 90, 92, 93, 94, 95, 96, 98, 99, 106]


Cluster 2 : 

[1, 2, 3, 6, 7, 8, 9, 11, 12, 13, 20, 23, 24, 25, 26, 29, 30, 31, 34, 35, 38, 41, 42, 43, 45, 47, 49, 83, 86, 91, 97, 101, 103, 111, 113, 119, 121, 123, 126, 127, 133, 134, 138, 142, 146, 149]


Cluster 3 : 

[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 68, 70, 72, 73, 74, 75, 76, 77, 78, 100, 102, 104, 105, 107, 108, 109, 110, 112, 114, 115, 116, 117, 118, 120, 122, 124, 125, 128, 129, 130, 131, 132, 135, 136, 137, 139, 140, 141, 143, 144, 145, 147, 148]




In [3]:
def removeSubsetClusters(clusters):
    print("Removing clusters which are a subset of other clusters...")
    P = len(clusters)
    i = 0
    while i < P:
        j = 0
        while j < P:
            if i != j:
                if (set(clusters[i]).issubset(set(clusters[j]))):
                    clusters.remove(clusters[i])
                    P -= 1
                    i -= 1
                    break
            j += 1
        i += 1
    return clusters

clusters = [
			[1,2,3,4,5],
			[1],
			[1,2,4],
			[1,2],
			[4,5],
			[4]
		]
p = removeSubsetClusters(clusters)
print(p)  

Removing clusters which are a subset of other clusters...
[[1, 2, 3, 4, 5]]


In [1]:
import numpy as np

data = np.genfromtxt('iris.csv', delimiter=',')
m = 150
n = 4
k = 3

low = np.amin(data, axis=0)
high = np.amax(data, axis=0)
for j in range(n):
    minimum = low[j]
    maximum = high[j]
    for i in range(m):
        data[i][j] = (data[i][j] - minimum)/(maximum - minimum)

mat = np.zeros(shape=(m, m))
for i in range(m):
    for j in range(m):
        mat[i, j] = ((data[i][0] - data[j][0])**2 + (data[i][1] - data[j][1]
                                                        )**2 + (data[i][2] - data[j][2])**2 + (data[i][3] - data[j][3])**2)
        mat[i, j] = mat[i, j]**0.5

avg = np.zeros(shape=(m))
for i in range(m):
    for j in range(m):
        avg[i] += mat[i][j]
    avg[i] = avg[i] / m

clusters = []
for i in range(m):
    cluster = []
    for j in range(m):
        if(mat[i][j] < avg[j]):
            cluster.append(i)
    clusters.append(cluster)
clusters = np.asarray(clusters)
clusters = np.unique(clusters)
print(str(len(clusters)) + " primary clusters formed.\n")

150 primary clusters formed.



In [2]:
idx = []
count = 0
x = len(clusters)
for i in range(x):
    for j in range(x):
        if(i != j):
            if(set(clusters[j]).issubset(set(clusters[i]))):
                count+=1
                idx.append(j)
idx = np.unique(idx)
idx = np.asarray(idx)
print(idx)
print(len(idx))
print(count)
#if(idx.size > 0):
#    clusters_copy = np.delete(clusters, idx, 0)
#print(str(len(clusters_copy)) + " clusters left after reduction.")

[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149]
150
5504


In [None]:
import numpy as np
class Iris:

    def __init__(self):
        self.m = 150
        self.n = 4
        self.k = 3
    
    def getK(self):
        return self.k

    def getData():

        try:
            data = np.genfromtxt('iris.csv', delimiter=',')
            print("Data prepared.\n") 
            #print("Original Data : \n")
            #print(data)
            #print("\n")
        except IOError:
            print("Missing dataset! Run:")
            print(
                "wget http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data")
            data = -1
        #print(data)
        #print("\n")
        print("Shape of the data : " + str(len(data)) + "\n")
        return data

    def normalizeData(self, data):
        print("Normalizing the data now...\n")
        low = np.amin(data, axis=0)
        high = np.amax(data, axis=0)
        for j in range(self.n):
            minimum = low[j]
            maximum = high[j]
            for i in range(self.m):
                data[i][j] = (data[i][j] - minimum)/(maximum - minimum)
        #print("Normalized Data : \n")
        #print(data)
        #print("\n")
        print("Shape of the normalized data : " + str(len(data)) + "\n")
        return data

    def getDissimilarityMatrix(self, data):
        print("Preparing dissimilarity matrix...\n")
        mat = np.zeros(shape=(self.m, self.m))
        for i in range(self.m):
            for j in range(self.m):
                mat[i, j] = ((data[i][0] - data[j][0])**2 + (data[i][1] - data[j][1]
                                                             )**2 + (data[i][2] - data[j][2])**2 + (data[i][3] - data[j][3])**2)
                mat[i, j] = mat[i, j]**0.5
        #print("Dissimilarity Matrix : \n")
        #print(mat)
        #print("\n")
        print("Shape of the Dissimilarity Matrix : " + str(len(mat)) + "\n")
        return mat

    def cluster(self, mat):
        print("Calculating average dissimilarity values for each object...\n")
        avg = np.zeros(shape=(self.m))
        for i in range(self.m):
            for j in range(self.m):
                avg[i] += mat[i][j]
            avg[i] = avg[i] / self.m

        print("Primary clustering...\n")
        clusters = []
        for i in range(self.m):
            cluster = []
            for j in range(self.m):
                if(mat[i][j] < avg[j]):
                    cluster.append(j)
            clusters.append(cluster)
        clusters = np.asarray(clusters)
        print(str(len(clusters)) + " primary clusters formed.\n")
        return clusters
    '''
    def removeSubsetClusters(self, clusters):
        print("Removing clusters which are a subset of other clusters...\n")
        idx = []
        x = len(clusters)
        for i in range(x):
            for j in range(x):
                if(i != j):
                    if(len(np.setdiff1d(clusters[i], clusters[j])) == 0):
                        for k in range(len(clusters[j])):
                            clusters[j][k] = 0
                        idx.append(j)
        idx = np.unique(idx)
        clusters = np.delete(clusters, idx, 0)
        print(str(len(clusters)) + " clusters left after reduction.\n")
        return clusters
    '''
    def removeSubsetClusters(self, clusters):
        print("Removing clusters which are a subset of other clusters...")
        idx = []
        x = len(clusters)
        for i in range(x):
            for j in range(x):
                if(i != j):
                    a = clusters[i]
                    b = clusters[j]
                    b = set(b)
                    #a = a.tolist()
                    ismember = b.issubset(a)
                    if(ismember):
                        b_id = np.where(ismember)[0]
                        idx.append(b_id)
        idx = np.unique(idx)
        if(idx.size > 0):
            clusters = np.delete(clusters, idx, 0)
        print(str(len(clusters)) + " clusters left after reduction.")
        return clusters

    def getSimilarityMatrix(clusters):
        print("Creating similarity matrix...\n")
        p = len(clusters)
        sim = np.zeros(shape=(p, p))
        for i in range(p):
            for j in range(p):
                sim[i, j] = len(np.intersect1d(clusters[i], clusters[j])
                                ) / len(np.union1d(clusters[i], clusters[j]))
        print("Similarity Matrix :\n")
        print(sim)
        print("\n")
        print("Shape of the Similarity Matrix : " + str(len(sim)) + "\n")
        return sim

    def mergeMaxSimilarityClusters(sim, clusters):
        print("Merging clusters having maximum similarity...\n")
        print(print(str(len(clusters)) + " clusters found.\n"))
        val = 0
        idx_k = 0
        idx_l = 0
        p = len(clusters)
        for i in range(p):
            for j in range(p):
                if(i != j):
                    if(sim[i, j] > val):
                        val = sim[i, j]
                        idx_k = i
                        idx_l = j

        print(str(val) + " is the maximum similarity value and clusters " +
              str(idx_k) + ", " + str(idx_l) + " are the most similar.")
        merged_clusters = np.union1d(clusters[idx_k], clusters[idx_l])
        ids = [idx_k, idx_l]
        clusters = np.delete(clusters, ids, 0)
        dummy = [[] for _ in range(len(clusters))]
        for i in range(len(clusters)):
            for j in range(len(clusters[i])):
                dummy[i].append(clusters[i][j])
        dummy.append(merged_clusters)
        clusters = dummy
        print(str(len(clusters)) + " clusters left now.\n")
        return clusters

model = Iris()
data = Iris.getData()
data = Iris.normalizeData(model,data)
mat = Iris.getDissimilarityMatrix(model, data)
clusters = Iris.cluster(model, mat)
cluster_original = clusters

In [None]:
while(len(clusters)!=Iris.getK(model)):
    clusters = Iris.removeSubsetClusters(model, clusters)
    sim = Iris.getSimilarityMatrix(clusters)
    clusters = Iris.mergeMaxSimilarityClusters(sim, clusters)
print(str(len(clusters[0])) + ", " + str(len(clusters[1])) +", " + str(len(clusters[2])) + " are the sizes of the three clusters.\n")

In [None]:
import numpy as np
a = [[1,2,3],[4,5,6],[7,8,9],[1,2,3]]
a = np.asarray(a)
b = [1,2,3]
b = set(b)
ismember = [b.issubset(row) for row in a.tolist()]
idx = np.where(ismember)[0]
idx = np.asarray(idx)
idx[0]

In [None]:
i = 5
j = 57
#print(str(np.setdiff1d(clusters[i], clusters[j])))
a = set(clusters[i])
b = set(clusters[j])
print(a)
print(b)
print(b.issubset(a))