In [1]:
import numpy as np
import pandas as pd
import sys
import math
from scipy.spatial import distance

# Read a dataset of numerical values into a pandas DataFrame.
def read_dataset(path, delimiter=',', header='infer'):
    """Load a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    path : str or file-like
        Location of the data, passed straight to ``pandas.read_csv``.
    delimiter : str, default ','
        Field separator, forwarded as ``sep``.
    header : int, list of int, None or 'infer', default 'infer'
        Forwarded to ``pandas.read_csv``. The default keeps the previous
        behaviour (first row becomes the column names). Pass ``None`` for
        files without a header row so the first record is kept as data.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_csv(path, sep=delimiter, header=header)

def euclidean_distance(point_one, point_two):
    """Return the Euclidean distance between two 1-D points.

    Thin wrapper around ``scipy.spatial.distance.euclidean``.
    """
    separation = distance.euclidean(point_one, point_two)
    return separation

def compute_nearest_neighbors_distance(dataset, sample):
    """Assign every row of ``dataset`` to its closest row of ``sample``.

    Closeness is the Euclidean distance over all columns, so both frames
    must be numeric and have the same number of columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows to be assigned.
    sample : pandas.DataFrame
        Candidate representatives.

    Returns
    -------
    list
        ``[objects, NN]``. ``objects`` is ``[0, 1, ..., len(dataset) - 1]``
        and ``NN[p]`` is the *position within* ``sample`` of the row
        nearest to dataset row ``p``. Ties go to the earliest sample row.
    """
    rows = len(dataset)
    objects = list(range(rows))
    if rows == 0 or len(sample) == 0:
        # Nothing to compare against: keep the historical default index 0.
        return [objects, [0] * rows]

    # One vectorised call builds the full (rows x len(sample)) matrix.
    # No candidate is skipped: a dataset position and a sample position
    # live in different index spaces, so comparing them to exclude "self"
    # only barred arbitrary representatives.
    pairwise = distance.cdist(
        np.asarray(dataset, dtype=float),
        np.asarray(sample, dtype=float),
        metric='euclidean',
    )
    # A NaN distance must never win; if every candidate is NaN the
    # argmin below falls back to index 0.
    pairwise = np.where(np.isnan(pairwise), np.inf, pairwise)
    NN = pairwise.argmin(axis=1).tolist()
    return [objects, NN]

def NNDist(k, n, d, extent):
    """Return ``(k / n) ** (1 / d) * extent``.

    Presumably the expected k-nearest-neighbour distance inside a bubble
    of ``n`` points in ``d`` dimensions with the given ``extent`` --
    confirm against the caller.
    """
    fraction = k / n
    root_order = 1 / d
    return (fraction ** root_order) * extent
   
def distance_bubbles(rep, n, extent, nnDist, i, j):
    """Return the distance between bubbles ``i`` and ``j``.

    The Euclidean distance between the two representatives is reduced by
    the sum of both extents. If that gap is positive, the result is the
    gap plus both ``nnDist`` values; otherwise it is the larger of the
    two ``nnDist`` values. ``n`` is accepted but not used.

    NOTE(review): ``rep`` is indexed by position (``iloc``) while
    ``extent`` and ``nnDist`` use plain ``[]`` -- confirm these agree for
    the containers passed in.
    """
    centre_gap = euclidean_distance(rep.iloc[i,], rep.iloc[j,])
    combined_extent = extent[i] + extent[j]
    surface_gap = centre_gap - combined_extent
    if not (surface_gap > 0.0):
        # Bubbles touch or overlap.
        return max(nnDist[i], nnDist[j])
    return surface_gap + (nnDist[i] + nnDist[j])
    
def compute_databubbles(file, numberOfBubbles=10, delimiter=',', random_state=None):
    """Summarise a dataset as a set of data bubbles.

    Rows are sampled as representatives, every row is assigned to its
    nearest representative, and per-bubble statistics are computed.

    Parameters
    ----------
    file : str
        Path of the delimited file, read with ``read_dataset``.
    numberOfBubbles : int, default 10
        Number of representatives to draw. Capped at the number of rows,
        because representatives are drawn without replacement.
    delimiter : str, default ','
        Field separator of the file.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. ``None`` uses
        NumPy's global random state.

    Returns
    -------
    list
        ``[rep, extent, n, nnDist]``, all indexed by bubble id ``NN``:
        ``rep`` is the per-bubble column means, ``extent`` the mean over
        columns of the per-bubble standard deviation, ``n`` the per-bubble
        count of the first column, and ``nnDist`` the result of
        ``NNDist(1, n, d, extent)`` with missing values set to 0.0.

    Raises
    ------
    ValueError
        If the dataset has no rows.
    """
    dataset = read_dataset(file, delimiter)
    total_rows = len(dataset)
    if total_rows == 0:
        raise ValueError("cannot build data bubbles from an empty dataset")

    # Draw distinct rows: sampling with replacement can pick the same row
    # twice, leaving a duplicate representative that never wins a point.
    bubble_count = min(numberOfBubbles, total_rows)
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    idx = rng.choice(total_rows, size=bubble_count, replace=False)
    sampleData = dataset.iloc[idx]
    # Taken before the 'NN' helper column is added to ``dataset``.
    dimensions = len(sampleData.columns)

    nearest_objects = compute_nearest_neighbors_distance(dataset, sampleData)
    dataset['NN'] = nearest_objects[1]

    grouped = dataset.groupby('NN')
    rep = grouped.mean()
    extent = grouped.std().mean(axis=1)
    n = grouped.count().iloc[:, 0]

    # Pass the per-bubble counts ``n``, not the number of bubbles.
    nnDist = NNDist(1, n, dimensions, extent)
    # A one-point bubble has an undefined std, hence NaN extent and nnDist.
    nnDist = nnDist.fillna(0.0)
    return [rep, extent, n, nnDist]

In [2]:
# Build 15 data bubbles from the iris CSV. The absolute path is specific
# to the author's machine; adjust it before re-running elsewhere.
bubbles_set = compute_databubbles(
    "/home/joelson/iris.csv",
    numberOfBubbles=15,
    delimiter=',',
)

[         5.1       3.5       1.4       0.2
 NN                                        
 0   4.400000  2.800000  1.275000  0.200000
 1   6.866667  3.100000  5.533333  2.166667
 2   6.385714  2.800000  5.485714  1.714286
 3   7.316667  2.983333  6.033333  1.916667
 4   4.877778  3.455556  1.505556  0.255556
 5   7.600000  3.733333  6.400000  2.233333
 6   6.416667  3.050000  4.758333  1.641667
 7   5.825926  2.725926  4.611111  1.548148
 8   5.328571  4.000000  1.342857  0.257143
 9   5.561538  2.469231  3.792308  1.115385
 10  5.125000  3.325000  1.505000  0.245000
 11  7.666667  2.800000  6.733333  2.133333
 12  5.066667  2.533333  3.400000  1.166667
 13  6.533333  3.133333  5.722222  2.300000
 14  6.800000  3.050000  5.150000  2.300000, NN
 0     0.156445
 1     0.106489
 2     0.203924
 3     0.192515
 4     0.186979
 5     0.256922
 6     0.269282
 7     0.308292
 8     0.243821
 9     0.227612
 10    0.194262
 11    0.140810
 12    0.242982
 13    0.180877
 14    0.070711
 dtype: 