forked from jeremander/Gplus
-
Notifications
You must be signed in to change notification settings - Fork 0
/
factor_attr_mat.py
executable file
·197 lines (184 loc) · 11.1 KB
/
factor_attr_mat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
"""Computes eigenvalues and eigenvectors of the PMI similarity matrices for a given attribute type. Saves the results of this along with kMeans clustering of the attributes, and the assignment of graph nodes to clusters."""
import pickle
import time
import numpy as np
import pandas as pd
import optparse
from scipy.sparse import coo_matrix, diags
from sklearn.cluster import KMeans
from gplus import *
def generate_cluster_report(attr_analyzer, attr_type, cluster_labels, topN = 30):
    """Build a plain-text report describing each attribute cluster.

    Given the AttributeAnalyzer, the attribute type, and one cluster label per
    attribute vocab index, lists the top N members of every cluster together
    with each attribute's frequency and prevalence (relative frequency) in the
    data set. Clusters appear in decreasing order of total attribute
    occurrences. Passing topN = None lists every attribute in each cluster."""
    freq_dict = attr_analyzer.attr_freqs_by_type[attr_type]
    grand_total = sum(freq_dict.values())
    pfa = attr_analyzer.pairwise_freq_analyzers[attr_type]
    _, attr_vocab = get_attr_indices(pfa, attr_analyzer.attributed_nodes)
    labels_seen = set(cluster_labels)
    # bucket (attribute, frequency, prevalence) triples by cluster label
    members = {lab: [] for lab in labels_seen}
    for idx, lab in enumerate(cluster_labels):
        word = attr_vocab[idx]
        if word.startswith('*???*'):  # skip placeholder vocab entries
            continue
        f = freq_dict[word]
        members[lab].append((word, f, f / grand_total))
    # order each cluster's members by decreasing frequency
    for lab in labels_seen:
        members[lab].sort(key = lambda triple : triple[1], reverse = True)
    # total number of attribute occurrences per cluster
    cluster_totals = {lab: sum(triple[1] for triple in members[lab]) for lab in labels_seen}
    # per-cluster summary: DataFrame of the top N members plus aggregate stats
    info = {}
    for lab in labels_seen:
        frame = pd.DataFrame(members[lab], columns = ['attribute', 'frequency', 'prevalence'])
        info[lab] = {
            'df': frame if (topN is None) else frame[:topN],
            'size': len(members[lab]),
            'totalFreq': cluster_totals[lab],
            'totalPrevalence': sum(frame['prevalence']),
        }
    # render clusters in decreasing order of total occurrences
    ordering = sorted(cluster_totals.items(), key = lambda pair : pair[1], reverse = True)
    num_attrs = len(attr_vocab)
    chunks = []
    for (lab, _total) in ordering:
        entry = info[lab]
        banner_width = 12 + len(str(lab))
        chunks.append('#' * banner_width + '\n')
        chunks.append('# ' + 'CLUSTER ' + str(lab) + ' #\n')
        chunks.append('#' * banner_width + '\n\n')
        chunks.append('attribute prevalence = %6d / %6d = %f\n' % (entry['size'], num_attrs, entry['size'] / num_attrs))
        chunks.append('occurrence prevalence = %6d / %6d = %f\n\n' % (entry['totalFreq'], grand_total, entry['totalPrevalence']))
        chunks.append(entry['df'].to_string(index = False) + '\n\n\n')
    return ''.join(chunks)
# Artifacts saved by main():
#   - matrix or LinearOperator for the similarity matrix
#   - eigenvalues and scree plot
#   - embedded vectors corresponding to attributes
#   - kMeans clusters corresponding to attributes
#   - report of the top clusters
#   - mapping from graph nodes to clusters of the given attribute type
def main():
    """Command-line driver.

    Loads (or computes) an eigendecomposition-based embedding of the attribute
    PMI similarity matrix for one attribute type, clusters the embedded
    attribute vectors with kMeans, writes a cluster report, and assigns every
    graph node to a cluster, saving each artifact to disk along the way.
    Uses a three-level fallback: load clustering + embedding from disk;
    else load just the embedding and recluster; else compute everything."""
    p = optparse.OptionParser()
    p.add_option('--attr_type', '-a', type = str, help = 'attribute type')
    p.add_option('-p', type = str, help = 'PMI type (PMIs, NPMI1s, or NPMI2s)')
    p.add_option('-e', type = str, help = 'embedding (adj, normlap, regnormlap)')
    p.add_option('-s', action = 'store_true', default = False, help = 'normalize in sphere')
    p.add_option('-d', type = float, help = 'smoothing parameter')
    p.add_option('-k', type = int, help = 'number of eigenvalues')
    p.add_option('-c', type = int, help = 'number of kmeans clusters')
    p.add_option('-t', type = float, default = None, help = 'tolerance for eigsh')
    p.add_option('-v', action = 'store_true', default = False, help = 'save scree plot')
    opts, args = p.parse_args()
    attr_type = opts.attr_type
    sim = opts.p
    embedding = opts.e
    assert (embedding in ['adj', 'normlap', 'regnormlap'])
    sphere = opts.s
    delta = opts.d
    k = opts.k
    nclusts = opts.c
    tol = opts.t
    save_plot = opts.v
    topN = 50  # for the report
    # smoothing delta is only meaningful for raw PMIs; normalized variants require delta == 0
    assert (((sim == 'PMIs') or (delta == 0)) and (sim in ['PMIs', 'NPMI1s', 'NPMI2s']))
    data_folder = 'gplus0_lcc/data/PMI/'
    report_folder = 'gplus0_lcc/reports/PMI/'
    plot_folder = 'gplus0_lcc/plots/PMI/'
    # prefix1 identifies the embedding; prefix2 additionally identifies the clustering run
    file_prefix1 = ('%s_%s_%s_delta' % (attr_type, sim, embedding)) + str(delta) + ('_k%d' % k)
    file_prefix2 = ('%s_%s_%s_delta' % (attr_type, sim, embedding)) + str(delta) + ('_k%d%s_c%d' % (k, '_normalized' if sphere else '', nclusts))
    print_flush("\nLoading AttributeAnalyzer...")
    a = AttributeAnalyzer()
    a.load_pairwise_freq_analyzer(attr_type)
    a.make_attrs_by_node_by_type()
    attrs_by_node = a.attrs_by_node_by_type[attr_type]
    pfa = a.pairwise_freq_analyzers[attr_type]
    n = pfa.num_vocab
    tol = (1.0 / n) if (tol is None) else tol  # use 1/n instead of machine precision as default tolerance
    attr_indices, attr_vocab = get_attr_indices(pfa, a.attributed_nodes)
    try:
        # fast path: clustering AND embedding artifacts already on disk
        print_flush("\nLoading labels from '%s%s_labels.csv'..." % (data_folder, file_prefix2))
        labels = np.loadtxt('%s%s_labels.csv' % (data_folder, file_prefix2), dtype = int)
        print_flush("\nLoading cluster centers from '%s%s_cluster_centers.csv'..." % (data_folder, file_prefix2))
        cluster_centers = np.loadtxt('%s%s_cluster_centers.csv' % (data_folder, file_prefix2), delimiter = ',')
        print_flush("\nLoading eigenvalues from '%s%s_eigvals.csv'..." % (data_folder, file_prefix1))
        eigvals = np.loadtxt('%s%s_eigvals.csv' % (data_folder, file_prefix1), delimiter = ',')
        print_flush("\nLoading embedded features from '%s%s_features.pickle'..." % (data_folder, file_prefix1))
        features = pickle.load(open('%s%s_features.pickle' % (data_folder, file_prefix1), 'rb'))
        if sphere:
            # saved features are unnormalized; re-project onto the unit sphere
            for i in range(len(attr_indices)):
                features[i] = normalize(features[i])
    except FileNotFoundError:
        print_flush("Failed to load.")
        try:
            # medium path: embedding exists on disk but clustering does not
            print_flush("\nLoading eigenvalues from '%s%s_eigvals.csv'..." % (data_folder, file_prefix1))
            eigvals = np.loadtxt('%s%s_eigvals.csv' % (data_folder, file_prefix1), delimiter = ',')
            print_flush("\nLoading embedded features from '%s%s_features.pickle'..." % (data_folder, file_prefix1))
            features = pickle.load(open('%s%s_features.pickle' % (data_folder, file_prefix1), 'rb'))
        except FileNotFoundError:
            print_flush("Failed to load.")
            # slow path: compute the embedding from scratch and save it
            print_flush("\nComputing similarity matrix (%s)..." % sim)
            sim_op = pfa.to_sparse_PMI_operator(sim, delta)
            matrix_type = 'adjacency' if (embedding == 'adj') else ('normalized Laplacian' if (embedding == 'normlap') else 'regularized normalized Laplacian')
            print_flush("\nComputing eigenvectors of %s matrix (k = %d)..." % (matrix_type, k))
            if (embedding == 'adj'):
                (eigvals, features) = timeit(eigsh)(sim_op, k = k, tol = tol)
                features = np.sqrt(np.abs(eigvals)) * features  # scale the feature columns by the sqrt of the eigenvalues
            elif (embedding == 'normlap'):
                normlap = SparseNormalizedLaplacian(sim_op)
                (eigvals, features) = timeit(eigsh)(normlap, k = k, tol = tol)
            elif (embedding == 'regnormlap'):
                regnormlap = SparseRegularizedNormalizedLaplacian(sim_op)
                (eigvals, features) = timeit(eigsh)(regnormlap, k = k, tol = tol)
            features = features[attr_indices, :]  # free up memory by deleting embeddings of nodes with no attributes
            np.savetxt('%s%s_eigvals.csv' % (data_folder, file_prefix1), eigvals, delimiter = ',')
            pickle.dump(features, open('%s%s_features.pickle' % (data_folder, file_prefix1), 'wb'))
        if sphere:  # normalize the features to have unit norm (better for kMeans)
            for i in range(len(attr_indices)):
                features[i] = normalize(features[i])
        km = KMeans(nclusts)
        print_flush("\nClustering attribute feature vectors into %d clusters using kMeans..." % nclusts)
        labels = timeit(km.fit_predict)(features)
        # save the cluster labels
        np.savetxt('%s%s_labels.csv' % (data_folder, file_prefix2), np.array(labels, dtype = int), delimiter = ',', fmt = '%d')
        # save the cluster centers
        cluster_centers = km.cluster_centers_
        np.savetxt('%s%s_cluster_centers.csv' % (data_folder, file_prefix2), cluster_centers, delimiter = ',')
        # save the attribute cluster report
        with open('%s%s_cluster_report.txt' % (report_folder, file_prefix2), 'w') as f:
            f.write(generate_cluster_report(a, attr_type, labels, topN))
    if save_plot:
        print_flush("\nSaving scree plot to '%s%s_screeplot.png'..." % (plot_folder, file_prefix1))
        scree_plot(eigvals, show = False, filename = '%s%s_screeplot.png' % (plot_folder, file_prefix1))
    print_flush("\nAssigning cluster labels to each node...")
    indices_by_vocab = dict((v, i) for (i, v) in enumerate(attr_vocab))
    # when features live on the unit sphere, the kMeans centers must be renormalized too
    centers = [normalize(center) for center in cluster_centers] if sphere else cluster_centers
    def assign_cluster(node):
        """Assigns -1 to a node with no attribute present. Otherwise, takes the cluster whose center is closest to the mean of the attribute vectors. Uses cosine distance if sphere = True, otherwise Euclidean distance."""
        if (node not in attrs_by_node):
            return -1
        else:
            attrs = list(attrs_by_node[node])
            if (len(attrs) == 1):
                # single attribute: reuse its kMeans label directly
                return labels[indices_by_vocab[attrs[0]]]
            else:
                # multiple attributes: average their feature vectors, then find the nearest center
                vec = np.zeros(k, dtype = float)
                for attr in attrs:
                    vec += features[indices_by_vocab[attr]]
                vec /= len(attrs)
                if sphere:
                    vec = normalize(vec)
                    sims = [np.dot(vec, center) for center in centers]  # cosine similarity
                else:
                    sims = [-np.linalg.norm(vec - center) for center in centers]  # negated Euclidean distance
                max_index, max_sim = -1, -float('inf')
                # NOTE(review): loop variable `sim` shadows the PMI-type `sim` from the
                # enclosing scope; harmless here since the outer name is not used afterwards.
                for (i, sim) in enumerate(sims):
                    if (sim > max_sim):
                        max_index = i
                        max_sim = sim
                return max_index
    # save file with the list of cluster labels for each node
    clusters_by_node = [assign_cluster(i) for i in range(a.num_vertices)]
    np.savetxt('%s%s_node_labels.csv' % (data_folder, file_prefix2), np.array(clusters_by_node, dtype = int), delimiter = ',', fmt = '%d')
    print_flush("\nDone!")
# Script entry point: run main() only when executed directly, not when imported.
if __name__ == "__main__":
    main()