In [1]:
import numpy as np
import time
import pandas as pd 
import matplotlib.pyplot as plt
import math 
import random
import scipy.io as spio
import collections
from sklearn.model_selection import train_test_split

columns = ['coluna 1', 'coluna 2']
train = pd.read_csv("cluster.dat", decimal=".", sep=' ', names=columns)
eps = 0.005
minpts = 10

In [2]:
def normalize(data):
  normData = data.copy()
  for i in range(data.shape[1]):
    normData.iloc[:,i] /= np.linalg.norm(data.iloc[:,i])
  return normData

In [3]:
def distance(i, j):
    return np.linalg.norm(i - j)

In [4]:
def ExpandClusters(data, points_labels, id_seed, neighbor_points, C, epsilon, min_pts, core_points):
    # coloca o id do cluster no ponto atual
    points_labels[id_seed] = C
   
    i = 0
    while i < len(neighbor_points):        
        next_position = neighbor_points[i]
        if points_labels[next_position] == -1:
           points_labels[next_position] = C
        
        # se o ponto vizinho ainda nao tinha cluster agora ele tem
        elif points_labels[next_position] == 0:
            points_labels[next_position] = C
            
            pn_neighbor = [i for i in range(len(data)) if np.linalg.norm(data[i,:]-data [next_position,:]) <= epsilon]
            #se o ponto tem filhos, adiciona eles na fila 
            if len(pn_neighbor) > min_pts:
                neighbor_points = neighbor_points + pn_neighbor
                core_points.append(next_position) # = np.append(core_points, next_position,dtype = int)
        i += 1
    return core_points

In [5]:
def effDBSCAN(data, epsilon, minP):
  C = 0
  point_label = np.zeros(len(data),dtype=int) 
  core_points = []
  for p in range(len(data)):
    if point_label[p] != 0: continue
    current_point = data[p]
    # armazena os pontos vizinhos ao ponto p.
    neighbor_points = [i for i in range(len(data)) if np.linalg.norm(data[i,:]-current_point) <= epsilon]
  
    # o ponto nao tem vizinhos o suficiente
    if len(neighbor_points) <= minP:
      point_label[p] = -1
    # tem vizinhos o suficiente para criar um novo clusters
    else:
      C += 1
      core_points.append(p) # = np.append(core_points, p)
      core_points = ExpandClusters(data, point_label, p, neighbor_points, C, epsilon, minP, core_points)

  return point_label, core_points, C


In [6]:
#Function to plot final result
def plotRes(train, point_labels, main_title):
  dicColors = {0:'black', 1:'orange', 2:'purple', 3:'red', 4:'blue', 5:'green', 
              6:'yellow', 7:'violet', 8:'brown'}    
  label_color = [dicColors[c%9] for c in point_labels] 
  x_label = 'x axis'
  y_label = 'y axis'
  title = main_title 
  plt.figure(figsize=(15,15))
  plt.scatter(train.iloc[:,0],train.iloc[:,1],c=label_color,alpha=0.3)
  plt.suptitle(title, fontsize=20)
  plt.suptitle(title, fontsize=20)  
  plt.suptitle(title, fontsize=20)  
  plt.savefig(title + '.png')
  plt.show()

In [7]:
def callPlot(train_data,pointlabel, main_title):
  cl = len(pointlabel)
  plotRes(train_data, pointlabel, main_title) 
  plt.show()
  print('number of cluster found: ' + str(cl-1))
  counter=collections.Counter(pointlabel)
  print(counter)
  outliers  = np.count_nonzero(pointlabel == 0)
  print('number of outliers found: '+str(outliers) +'\n')

In [8]:
# initialize complete DBSCAN
print('Set epsilon (normalized radius) = ' +str(eps)+ ', Min Points = '+str(minpts))
start_time = time.time()
norm_train = normalize(train).to_numpy()
point_labels, core_points, n_clusters = effDBSCAN(norm_train,eps,minpts)
print("--- %s seconds ---" % (time.time() - start_time))

Set epsilon (normalized radius) = 0.005, Min Points = 10
--- 1.9561505317687988 seconds ---


In [13]:

# Split and initialize training & validation & test data separately
norm_training, norm_test = train_test_split(norm_train, test_size=0.1, random_state=22)
# training and validation are alreapy in numpy.
norm_training, norm_validation = train_test_split(norm_train, test_size=0.1, random_state=10)
point_labels_train, core_points_train, n_clusters_train = effDBSCAN(norm_training,eps,minpts)

In [79]:
# callPlot(train,point_labels,
#      'TRIAL Clusters division applying method DBSCAN (epsilon ' +str(eps)+ ') - data from cluster.dat')
callPlot(norm_training,point_labels_train,
     'Training Clusters division applying method DBSCAN (epsilon ' +str(eps)+ ') - data from cluster.dat')

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [24]:
# checar depois: https://gist.github.com/AlexandreAbraham/5544803
def find_clusters_and_centroids(norm_dataset, point_labels, n_clusters):
    clusters = [[]]*(n_clusters+1)
    for i in range(0,len(norm_dataset),50):    
        # clusters[point_labels[0][i]] = np.append(clusters[point_labels[0][i]], i)
        clusters[point_labels[i]] = np.append(clusters[point_labels[i]], i)
    centroids = [[np.mean(norm_dataset[np.asarray(i).astype(int)], axis=0)] for i in clusters]
    return clusters, centroids

In [44]:
def dist_intra_clusters_index (norm_data, clusters, centroids, i):
  B = [[abs(norm_data[round(clusters[i][j])] - centroids[i][0]) 
          for j in range(len(clusters[i]))]
           ] # for i in range(len(clusters))]
  return B

# search for math.dist(p, q) later :)
def dist_intra_clusters(norm_data, clusters, centroids):

    centroids =np.asarray(centroids)
    B = []
    soma = 0
    for i in range(len(clusters)):
        for j in range(len(clusters[i])):
            if not math.isnan(clusters[i][j]) and not math.isnan(centroids[i][0][0]): 
                soma += np.abs(norm_data[round(clusters[i][j])] - centroids[i][0])
    B = soma  
    return B
  

In [22]:
def dist_centroids(centroids, c1, c2):
    return abs(centroids[c1][0] - centroids[c2][0])

def dist_inter_clusters(centroids):
    A = []
    centroids =np.asarray(centroids)
    soma = 0 
    for i in range(len(centroids)):
        for j in range(i+1, len(centroids)):
            if not math.isnan(centroids[i][0][0]) and not math.isnan(centroids[j][0][0]): 
                soma += np.abs(centroids[i][0] - centroids[j][0])
    A = soma
    return A 

In [26]:
clusters, centroids = find_clusters_and_centroids (norm_training, point_labels_train, n_clusters_train)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(


In [60]:
def distance(V):
    return math.sqrt(sum(v**2 for v in V))

def calc_silhouette (norm_dataset, clusters, centroids):
    a = distance (dist_inter_clusters(centroids))
    b = distance (dist_intra_clusters(norm_dataset, clusters, centroids))
    return (b - a)/max(a, b)
    

In [64]:
def inertia(V):
    return sum(v**2 for v in V)
def calc_elbow(norm_dataset, clusters, centroids):
    return inertia(dist_intra_clusters(norm_dataset, clusters, centroids))

In [75]:
def elbow(data,epsilon=[0.0005, 0.001, 0.003, 0.005,0.007,0.01, 0.03, 0.05, 0.07],minpts=5):
  somaDosQuadrados = []
  for eps in epsilon:
    point_labels_train, core_points_train, n_clusters_train = effDBSCAN(norm_training,eps,minpts)
    clusters_train, centroids_train = find_clusters_and_centroids(norm_training, point_labels_train, n_clusters_train)
    somaDosQuadrados.append(calc_elbow(norm_training, clusters_train, centroids_train))
    callPlot(training,point_labels_train,
            'Training Clusters division applying method DBSCAN (epsilon ' +str(eps)+ ') - data from cluster.dat')
  plt.plot(epsilon, somaDosQuadrados, 'x-')
  plt.xlabel('epsilon')
  plt.ylabel('Soma dos quadrados')
  plt.title('Elbow')
  plt.show()

In [76]:
elbow(norm_training)

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [50]:
def predict (core_points, norm_original_data, point_labels, norm_test_data):
    prediction_labels = [0]*len(norm_test_data)
    # print(prediction_labels)
    for p in range(len(norm_test_data)):
        for cp in core_points:
            if np.linalg.norm(norm_test_data[p,:] - norm_original_data.iloc[round(cp),:]):
                prediction_labels[p] = point_labels[cp]
    # prediction = [point_labels[cp]]
    return prediction_labels