-
Notifications
You must be signed in to change notification settings - Fork 1
/
clusts.py
170 lines (126 loc) · 6.96 KB
/
clusts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import numpy as np
import pandas as pd
import scipy,sys,os,pdb
from sklearn import metrics
from multiprocessing import current_process
from functools import partial
from numpy import linalg as LA
from sklearn import mixture
from scipy.spatial.distance import cdist
from scipy.spatial.distance import correlation as cor
from scipy.optimize import linear_sum_assignment
from scipy.optimize import nnls
from numpy.random import SeedSequence
from numpy.random import Generator, PCG64DXSM, SeedSequence
import multiprocessing
class ForkedPdb(pdb.Pdb):
    """Pdb variant that stays usable inside a forked multiprocessing child.

    A forked worker inherits a stdin the debugger cannot read from, so each
    interaction temporarily rebinds ``sys.stdin`` to the controlling
    terminal (``/dev/stdin``) and restores the original afterwards.
    """

    def interaction(self, *args, **kwargs):
        saved_stdin = sys.stdin
        try:
            # Reattach to the terminal for the duration of this interaction.
            sys.stdin = open('/dev/stdin')
            pdb.Pdb.interaction(self, *args, **kwargs)
        finally:
            # Always restore, even if the debugger session raises.
            sys.stdin = saved_stdin
def cluster_converge_outerloop(Wall, Hall, totalprocess, dist="cosine",
                               gpu=False, cluster_rand_seq=None, n_cpu=-1):
    """Run several independent clustering attempts and keep the best one.

    Launches ``iterations`` parallel clustering runs via
    ``parallel_clustering`` and returns the run with the highest average
    silhouette coefficient.

    Parameters
    ----------
    Wall, Hall : factor matrices collected over NMF replicates.
    totalprocess : number of signatures/clusters to extract.
    dist : 'cosine' or 'correlation' distance for clustering.
    gpu : forwarded to the inner clustering routine.
    cluster_rand_seq : numpy SeedSequence used to seed worker RNGs.
    n_cpu : worker-process count, -1 means use all available cores.

    Returns
    -------
    (processAvg, exposureAvg, processSTE, exposureSTE,
     avgSilhouetteCoefficients, clusterSilhouetteCoefficients)
    """
    iterations = 50  # number of independent clustering attempts

    # Do the clustering attempts in parallel worker processes.
    result_list = parallel_clustering(Wall, Hall, totalprocess, iterations=iterations,
                                      n_cpu=n_cpu, dist=dist, gpu=gpu,
                                      cluster_rand_seq=cluster_rand_seq)

    # Index 4 of each result tuple is avgSilhouetteCoefficients.  Selecting
    # with max() fixes the original bug where a sentinel of -1 could leave
    # the output variables unbound (silhouette scores live in [-1, 1], so a
    # run scoring exactly -1 never beat the strict '<' comparison).
    best = max(result_list, key=lambda res: res[4])
    (processAvg, exposureAvg, processSTE, exposureSTE,
     avgSilhouetteCoefficients, clusterSilhouetteCoefficients) = best
    return processAvg, exposureAvg, processSTE, exposureSTE, avgSilhouetteCoefficients, clusterSilhouetteCoefficients
def parallel_clustering(Wall, Hall, totalProcesses, iterations=50, n_cpu=-1, dist= "cosine", gpu=False, cluster_rand_seq=None):
    """Run ``iterations`` clustering attempts in a multiprocessing pool.

    Parameters
    ----------
    Wall, Hall : factor matrices forwarded to each worker.
    totalProcesses : number of signatures/clusters to extract.
    iterations : how many independent attempts to run.
    n_cpu : pool size, -1 means let multiprocessing pick (all cores).
    dist : 'cosine' or 'correlation', forwarded to the workers.
    gpu : forwarded to the workers.
    cluster_rand_seq : numpy SeedSequence; when None a fresh unseeded
        SeedSequence is created (the original code crashed with
        AttributeError on the default).

    Returns
    -------
    list of per-attempt result tuples from ``cluster_converge_innerloop``.
    """
    if cluster_rand_seq is None:
        cluster_rand_seq = SeedSequence()

    # Spawn one child SeedSequence per attempt so each worker gets an
    # independent, reproducible random stream.
    child_seeds = cluster_rand_seq.spawn(iterations)
    iteration_generator_pairs = [[i, seed] for i, seed in enumerate(child_seeds)]

    pool_nmf = partial(cluster_converge_innerloop, Wall, Hall, totalProcesses,
                       dist=dist, gpu=gpu)

    # The context manager guarantees the pool is torn down even if map()
    # raises, fixing the original leak of worker processes on error.
    pool_size = None if n_cpu == -1 else n_cpu
    with multiprocessing.Pool(processes=pool_size) as pool:
        result_list = pool.map(pool_nmf, iteration_generator_pairs)
    return result_list
def cluster_converge_innerloop(Wall, Hall, totalprocess, iteration_generator_pair, iteration=1, dist="cosine", gpu=False):
    """One clustering attempt: refine via ``reclustering`` until the average
    silhouette coefficient stops changing, capped at 10 refinement rounds.

    ``iteration_generator_pair`` is ``[attempt_index, SeedSequence]`` as
    built by ``parallel_clustering``; only the SeedSequence is used here.
    Returns the tuple produced by the final ``reclustering`` call.
    """
    # Dedicated RNG seeded from this worker's SeedSequence.
    rng = Generator(PCG64DXSM(iteration_generator_pair[1]))

    # Random starting points shaped to match W (signatures) and H (exposures).
    processAvg = rng.random((Wall.shape[0], totalprocess))
    exposureAvg = rng.random((totalprocess, Hall.shape[1]))

    previous_score = 0
    refinement_round = 0
    while True:
        (processAvg, exposureAvg, processSTE, exposureSTE,
         avgSilhouetteCoefficients,
         clusterSilhouetteCoefficients) = reclustering(Wall, Hall, processAvg, exposureAvg,
                                                      dist=dist, gpu=gpu)
        # Stop once the score is unchanged, or after the 10th refinement
        # (i.e. at most 11 reclustering passes in total).
        if previous_score == avgSilhouetteCoefficients or refinement_round == 10:
            break
        previous_score = avgSilhouetteCoefficients
        refinement_round += 1

    return processAvg, exposureAvg, processSTE, exposureSTE, avgSilhouetteCoefficients, clusterSilhouetteCoefficients
def reclustering(tempWall=0, tempHall=0, processAvg=0, exposureAvg=0, dist="cosine",gpu=False):
    """Assign replicate signatures to reference clusters and summarize them.

    For each NMF replicate (a contiguous group of ``processes`` columns of
    ``tempWall``), optimally match its columns to the current reference
    signatures in ``processAvg`` via ``pairwise_cluster_raw``, then compute
    per-cluster means, standard errors, and silhouette scores.

    Parameters
    ----------
    tempWall : signature matrix with replicates stacked column-wise.
    tempHall : exposure matrix with replicates stacked row-wise.
    processAvg : current reference signatures (one column per cluster).
    exposureAvg : only supplies the shape of a single exposure matrix.
    dist : 'cosine' or 'correlation'.
    gpu : unused here; kept for interface compatibility.

    Returns
    -------
    (processAvg, exposureAvg, processSTE, exposureSTE,
     avgSilhouetteCoefficients, clusterSilhouetteCoefficients)

    Raises
    ------
    ValueError if ``dist`` is not a supported metric (the original code
    crashed with an unbound-variable NameError instead).
    """
    # Validate the metric up front so an unsupported value fails loudly
    # instead of leaving SilhouetteCoefficients unbound later.
    if dist == "cosine":
        metric = 'cosine'
    elif dist == "correlation":
        metric = 'correlation'
    else:
        raise ValueError("dist must be 'cosine' or 'correlation', got %r" % (dist,))

    iterations = int(tempWall.shape[1]/processAvg.shape[1])
    processes = processAvg.shape[1]
    idxIter = list(range(0, tempWall.shape[1], processes))

    # processes3D[c, :, i] = signature assigned to cluster c in replicate i;
    # exposure3D[c, i, :] = matching exposure row.
    processes3D = np.zeros([processAvg.shape[1], processAvg.shape[0], iterations])
    exposure3D = np.zeros([exposureAvg.shape[0], iterations, exposureAvg.shape[1]])

    for iteration_number in range(len(idxIter)):
        statidx = idxIter[iteration_number]
        loopidx = list(range(statidx, statidx+processes))
        # Optimal one-to-one matching between references and this replicate.
        idxPair = pairwise_cluster_raw(mat1=processAvg, mat2=tempWall[:, loopidx], dist=dist)
        for cluster_number, query_idx in idxPair:
            processes3D[cluster_number, :, iteration_number] = tempWall[:, statidx+query_idx]
            exposure3D[cluster_number, iteration_number, :] = tempHall[statidx+query_idx, :]

    # Flatten cluster members into one sample matrix with a label per row.
    # Frames are collected and concatenated once: the original pd.concat
    # inside the loop was O(n^2) in the number of clusters.
    frames = []
    labels = []
    for cluster_id in range(processes3D.shape[0]):
        cluster_vectors = pd.DataFrame(processes3D[cluster_id, :, :])
        frames.append(cluster_vectors.T)
        labels.extend([cluster_id] * cluster_vectors.shape[1])
    clusters = pd.concat(frames)

    try:
        SilhouetteCoefficients = metrics.silhouette_samples(clusters, labels, metric=metric)
    except Exception:
        # silhouette_samples can fail (e.g. degenerate clusters); fall back
        # to a neutral all-ones score, as the original best-effort did —
        # but without the bare except that swallowed KeyboardInterrupt too.
        SilhouetteCoefficients = np.ones((len(labels), 1))

    avgSilhouetteCoefficients = np.mean(SilhouetteCoefficients)

    # Per-cluster mean silhouette: rows were appended cluster-by-cluster in
    # order, so an even split recovers each cluster's scores.
    splitByCluster = np.array_split(SilhouetteCoefficients, processes3D.shape[0])
    clusterSilhouetteCoefficients = np.array([np.mean(chunk) for chunk in splitByCluster])

    processAvg = np.mean(processes3D, axis=2).T
    processSTE = scipy.stats.sem(processes3D, axis=2, ddof=1).T
    exposureAvg = np.mean(exposure3D, axis=1)
    exposureSTE = scipy.stats.sem(exposure3D, axis=1, ddof=1)

    return processAvg, exposureAvg, processSTE, exposureSTE, avgSilhouetteCoefficients, clusterSilhouetteCoefficients
def pairwise_cluster_raw(mat1=([0]), mat2=([0]), dist="cosine"): # the matrices (mat1 and mat2) are used to calculate the clusters and the lsts will be used to store the members of clusters
    """Optimally pair the columns of ``mat1`` with the columns of ``mat2``.

    Builds the column-wise pairwise distance matrix and solves the linear
    (Hungarian) assignment problem on it, minimizing total distance.

    Parameters
    ----------
    mat1, mat2 : 2-D arrays whose columns are the items to pair.
    dist : 'cosine' or 'correlation'.

    Returns
    -------
    list of ``[mat1_column_index, mat2_column_index]`` pairs.

    Raises
    ------
    ValueError for an unsupported ``dist`` (the original code hit an
    UnboundLocalError on ``con_mat`` instead).
    """
    if dist == "cosine":
        con_mat = cdist(mat1.T, mat2.T, "cosine")
    elif dist == "correlation":
        con_mat = cdist(mat1.T, mat2.T, "correlation")
    else:
        raise ValueError("dist must be 'cosine' or 'correlation', got %r" % (dist,))

    row_ind, col_ind = linear_sum_assignment(con_mat)
    # linear_sum_assignment returns parallel index arrays; zip into pairs.
    return [[i, j] for i, j in zip(row_ind, col_ind)]