/
dbscanpp.pyx
183 lines (145 loc) · 6.01 KB
/
dbscanpp.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import numpy as np
cimport numpy as np
import random
import math
from sklearn.neighbors import KDTree
from datetime import datetime
cdef extern from "k_centers.h":
void k_centers_cy(int m, int n, int d,
double * X,
double * closest_dist_sq,
int * result)
cdef k_centers_np(m, n, d,
np.ndarray[double, ndim=2, mode="c"] X,
np.ndarray[double, ndim=1, mode="c"] closest_dist_sq,
np.ndarray[np.int32_t, ndim=1, mode="c"] result):
k_centers_cy(m, n, d,
<double *> np.PyArray_DATA(X),
<double *> np.PyArray_DATA(closest_dist_sq),
<int *> np.PyArray_DATA(result))
cdef extern from "DBSCAN.h":
void DBSCAN_cy(int c, int n,
int * X_core,
int * neighbors,
int * num_neighbors,
int * result)
cdef DBSCAN_np(c, n,
np.ndarray[np.int32_t, ndim=1, mode="c"] X_core,
np.ndarray[np.int32_t, ndim=1, mode="c"] neighbors,
np.ndarray[np.int32_t, ndim=1, mode="c"] num_neighbors,
np.ndarray[np.int32_t, ndim=1, mode="c"] result):
DBSCAN_cy(c, n,
<int *> np.PyArray_DATA(X_core),
<int *> np.PyArray_DATA(neighbors),
<int *> np.PyArray_DATA(num_neighbors),
<int *> np.PyArray_DATA(result))
cdef extern from "cluster_remaining.h":
void cluster_remaining_cy(int n,
int * closest_point,
int * result)
cdef cluster_remaining_np(n,
np.ndarray[int, ndim=1, mode="c"] closest_point,
np.ndarray[np.int32_t, ndim=1, mode="c"] result):
cluster_remaining_cy(n,
<int *> np.PyArray_DATA(closest_point),
<int *> np.PyArray_DATA(result))
class DBSCANPP:
"""
Parameters
----------
p: The sample fraction, which determines m, the number of points to sample
eps_density: Radius for determining core points; points that have greater than
minPts in its eps_density-radii ball will be considered core points
eps_clustering: Radius for determining neighbors and edges in the density graph
minPts: Number of neighbors required for a point to be labeled a core point. Works
in conjunction with eps_density
"""
def __init__(self, p, eps_density,
eps_clustering, minPts):
self.p = p
self.eps_density = eps_density
self.eps_clustering = eps_clustering
self.minPts = minPts
def k_centers(self, m, n, d, X):
"""
Return m points from X that are far away from each other
Parameters
----------
m: Number of points to sample
n: Size of original dataset
d: Dimensions of original dataset
X: (m, d) dataset
Returns
----------
(m, ) list of indices
"""
n, d = X.shape
indices = np.empty(m, dtype=np.int32)
closest_dist_sq = np.empty(n, dtype=np.float64)
k_centers_np(m, n, d,
X,
closest_dist_sq,
indices)
return indices
def fit_predict(self, X, init="k-centers", cluster_outliers=True):
"""
Determines the clusters in three steps.
First step is to sample points from X using either the
k-centers greedy sampling technique or a uniform
sample technique. The next step is to run DBSCAN on the
sampled points using the k-NN densities. Finally, all the
remaining points are clustered to the closest cluster.
Parameters
----------
X: Data matrix. Each row should represent a datapoint in
Euclidean space
init: String. Either "k-centers" for the K-center greedy
sampling technique or "uniform" for a uniform random
sampling technique
cluster_outliers: Boolean. Whether we should cluster the
remaining points
Returns
----------
(n, ) cluster labels
"""
X = np.ascontiguousarray(X)
n, d = X.shape
m = int(self.p * math.pow(n, d/(d + 4.0)))
# Find a random subset of m points
if init == "uniform":
X_sampled_ind = np.random.choice(np.arange(n, dtype=np.int32), m, replace=False)
X_sampled_ind = np.sort(X_sampled_ind)
elif init == "k-centers":
X_sampled_ind = self.k_centers(m, n, d, X)
else:
raise ValueError("Initialization technique %s is not defined." % init)
X_sampled_pts = X[X_sampled_ind]
# Find the core points
X_tree = KDTree(X)
radii, indices = X_tree.query(X_sampled_pts, k=self.minPts)
X_core_ind = X_sampled_ind[radii[:,-1] <= self.eps_density]
X_core_pts = X[X_core_ind]
# Get the list of core neighbors for each core point
core_pts_tree = KDTree(X_core_pts)
neighbors = core_pts_tree.query_radius(X_core_pts, self.eps_clustering)
neighbors = np.asarray(np.concatenate(neighbors), dtype=np.int32)
neighbors_ct = core_pts_tree.query_radius(X_core_pts, self.eps_clustering, count_only=True)
num_neighbors = np.cumsum(neighbors_ct, dtype=np.int32)
# Cluster the core points
c = X_core_ind.shape[0]
result = np.full(n, -1, dtype=np.int32)
DBSCAN_np(c,
n,
X_core_ind,
neighbors,
num_neighbors,
result)
# Find the closest core point to every data point
dist_to_core_pt, closest_core_pt = core_pts_tree.query(X, k=1)
closest = X_core_ind[closest_core_pt[:,0]]
# Cluster the remaining points
cluster_remaining_np(n, closest, result)
# Set outliers
if not cluster_outliers:
result[dist_to_core_pt[:,0] > self.eps_density] = -1
return result