-
Notifications
You must be signed in to change notification settings - Fork 93
/
a1.py
394 lines (331 loc) · 14 KB
/
a1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
# coding: utf-8
# # CS579: Assignment 1
#
# In this assignment, we'll implement community detection algorithms using Facebook "like" data.
#
# The file `edges.txt.gz` indicates like relationships between facebook users. This was collected using snowball sampling: beginning with the user "Bill Gates", I crawled all the people he "likes", then, for each newly discovered user, I crawled all the people they liked.
#
# We'll cluster the resulting graph into communities.
#
# Complete the methods below that are indicated by `TODO`. I've provided some sample output to help guide your implementation.
# You should not use any imports not listed here:
from collections import Counter, defaultdict, deque
import copy
from itertools import combinations
import math
import networkx as nx
from numpy.linalg import eigh
import numpy as np
import urllib.request
## Community Detection
def example_graph():
"""
Create the example graph from class. Used for testing.
Do not modify.
"""
g = nx.Graph()
g.add_edges_from([('A', 'B'), ('A', 'C'), ('B', 'C'), ('B', 'D'), ('D', 'E'), ('D', 'F'), ('D', 'G'), ('E', 'F'), ('G', 'F')])
return g
def bfs(graph, root, max_depth):
"""
Perform breadth-first search to compute the shortest paths from a root node to all
other nodes in the graph. To reduce running time, the max_depth parameter ends
the search after the specified depth.
E.g., if max_depth=2, only paths of length 2 or less will be considered.
This means that nodes greather than max_depth distance from the root will not
appear in the result.
You may use these two classes to help with this implementation:
https://docs.python.org/3.5/library/collections.html#collections.defaultdict
https://docs.python.org/3.5/library/collections.html#collections.deque
Params:
graph.......A networkx Graph
root........The root node in the search graph (a string). We are computing
shortest paths from this node to all others.
max_depth...An integer representing the maximum depth to search.
Returns:
node2distances...dict from each node to the length of the shortest path from
the root node
node2num_paths...dict from each node to the number of shortest paths from the
root node to this node.
node2parents.....dict from each node to the list of its parents in the search
tree
In the doctests below, we first try with max_depth=5, then max_depth=2.
>>> node2distances, node2num_paths, node2parents = bfs(example_graph(), 'E', 5)
>>> sorted(node2distances.items())
[('A', 3), ('B', 2), ('C', 3), ('D', 1), ('E', 0), ('F', 1), ('G', 2)]
>>> sorted(node2num_paths.items())
[('A', 1), ('B', 1), ('C', 1), ('D', 1), ('E', 1), ('F', 1), ('G', 2)]
>>> sorted((node, sorted(parents)) for node, parents in node2parents.items())
[('A', ['B']), ('B', ['D']), ('C', ['B']), ('D', ['E']), ('F', ['E']), ('G', ['D', 'F'])]
>>> node2distances, node2num_paths, node2parents = bfs(example_graph(), 'E', 2)
>>> sorted(node2distances.items())
[('B', 2), ('D', 1), ('E', 0), ('F', 1), ('G', 2)]
>>> sorted(node2num_paths.items())
[('B', 1), ('D', 1), ('E', 1), ('F', 1), ('G', 2)]
>>> sorted((node, sorted(parents)) for node, parents in node2parents.items())
[('B', ['D']), ('D', ['E']), ('F', ['E']), ('G', ['D', 'F'])]
"""
###TODO
pass
def complexity_of_bfs(V, E, K):
"""
If V is the number of vertices in a graph, E is the number of
edges, and K is the max_depth of our approximate breadth-first
search algorithm, then what is the *worst-case* run-time of
this algorithm? As usual in complexity analysis, you can ignore
any constant factors. E.g., if you think the answer is 2V * E + 3log(K),
you would return V * E + math.log(K)
>>> v = complexity_of_bfs(13, 23, 7)
>>> type(v) == int or type(v) == float
True
"""
###TODO
pass
def bottom_up(root, node2distances, node2num_paths, node2parents):
"""
Compute the final step of the Girvan-Newman algorithm.
See p 352 From your text:
https://github.com/iit-cs579/main/blob/master/read/lru-10.pdf
The third and final step is to calculate for each edge e the sum
over all nodes Y of the fraction of shortest paths from the root
X to Y that go through e. This calculation involves computing this
sum for both nodes and edges, from the bottom. Each node other
than the root is given a credit of 1, representing the shortest
path to that node. This credit may be divided among nodes and
edges above, since there could be several different shortest paths
to the node. The rules for the calculation are as follows: ...
Params:
root.............The root node in the search graph (a string). We are computing
shortest paths from this node to all others.
node2distances...dict from each node to the length of the shortest path from
the root node
node2num_paths...dict from each node to the number of shortest paths from the
root node that pass through this node.
node2parents.....dict from each node to the list of its parents in the search
tree
Returns:
A dict mapping edges to credit value. Each key is a tuple of two strings
representing an edge (e.g., ('A', 'B')). Make sure each of these tuples
are sorted alphabetically (so, it's ('A', 'B'), not ('B', 'A')).
Any edges excluded from the results in bfs should also be exluded here.
>>> node2distances, node2num_paths, node2parents = bfs(example_graph(), 'E', 5)
>>> result = bottom_up('E', node2distances, node2num_paths, node2parents)
>>> sorted(result.items())
[(('A', 'B'), 1.0), (('B', 'C'), 1.0), (('B', 'D'), 3.0), (('D', 'E'), 4.5), (('D', 'G'), 0.5), (('E', 'F'), 1.5), (('F', 'G'), 0.5)]
"""
###TODO
pass
def approximate_betweenness(graph, max_depth):
"""
Compute the approximate betweenness of each edge, using max_depth to reduce
computation time in breadth-first search.
You should call the bfs and bottom_up functions defined above for each node
in the graph, and sum together the results. Be sure to divide by 2 at the
end to get the final betweenness.
Params:
graph.......A networkx Graph
max_depth...An integer representing the maximum depth to search.
Returns:
A dict mapping edges to betweenness. Each key is a tuple of two strings
representing an edge (e.g., ('A', 'B')). Make sure each of these tuples
are sorted alphabetically (so, it's ('A', 'B'), not ('B', 'A')).
>>> sorted(approximate_betweenness(example_graph(), 2).items())
[(('A', 'B'), 2.0), (('A', 'C'), 1.0), (('B', 'C'), 2.0), (('B', 'D'), 6.0), (('D', 'E'), 2.5), (('D', 'F'), 2.0), (('D', 'G'), 2.5), (('E', 'F'), 1.5), (('F', 'G'), 1.5)]
"""
###TODO
pass
def get_components(graph):
"""
A helper function you may use below.
Returns the list of all connected components in the given graph.
"""
return [graph.subgraph(c).copy() for c in nx.connected_components(graph)]
def partition_girvan_newman(graph, max_depth):
"""
Use your approximate_betweenness implementation to partition a graph.
Unlike in class, here you will not implement this recursively. Instead,
just remove edges until more than one component is created, then return
those components.
That is, compute the approximate betweenness of all edges, and remove
them until multiple components are created.
You only need to compute the betweenness once.
If there are ties in edge betweenness, break by edge name (e.g.,
(('A', 'B'), 1.0) comes before (('B', 'C'), 1.0)).
Note: the original graph variable should not be modified. Instead,
make a copy of the original graph prior to removing edges.
See the Graph.copy method https://networkx.github.io/documentation/stable/reference/classes/generated/networkx.Graph.copy.html
Params:
graph.......A networkx Graph
max_depth...An integer representing the maximum depth to search.
Returns:
A list of networkx Graph objects, one per partition.
>>> components = partition_girvan_newman(example_graph(), 5)
>>> components = sorted(components, key=lambda x: sorted(x.nodes())[0])
>>> sorted(components[0].nodes())
['A', 'B', 'C']
>>> sorted(components[1].nodes())
['D', 'E', 'F', 'G']
"""
###TODO
pass
def get_subgraph(graph, min_degree):
"""Return a subgraph containing nodes whose degree is
greater than or equal to min_degree.
We'll use this in the main method to prune the original graph.
Params:
graph........a networkx graph
min_degree...degree threshold
Returns:
a networkx graph, filtered as defined above.
>>> subgraph = get_subgraph(example_graph(), 3)
>>> sorted(subgraph.nodes())
['B', 'D', 'F']
>>> len(subgraph.edges())
2
"""
###TODO
pass
""""
Compute the normalized cut for each discovered cluster.
I've broken this down into the three next methods.
"""
def volume(nodes, graph):
"""
Compute the volume for a list of nodes, which
is the number of edges in `graph` with at least one end in
nodes.
Params:
nodes...a list of strings for the nodes to compute the volume of.
graph...a networkx graph
>>> volume(['A', 'B', 'C'], example_graph())
4
"""
###TODO
pass
def cut(S, T, graph):
"""
Compute the cut-set of the cut (S,T), which is
the set of edges that have one endpoint in S and
the other in T.
Params:
S.......set of nodes in first subset
T.......set of nodes in second subset
graph...networkx graph
Returns:
An int representing the cut-set.
>>> cut(['A', 'B', 'C'], ['D', 'E', 'F', 'G'], example_graph())
1
"""
###TODO
pass
def norm_cut(S, T, graph):
"""
The normalized cut value for the cut S/T. (See lec06.)
Params:
S.......set of nodes in first subset
T.......set of nodes in second subset
graph...networkx graph
Returns:
An float representing the normalized cut value
"""
###TODO
pass
def score_max_depths(graph, max_depths):
"""
In order to assess the quality of the approximate partitioning method
we've developed, we will run it with different values for max_depth
and see how it affects the norm_cut score of the resulting partitions.
Recall that smaller norm_cut scores correspond to better partitions.
Params:
graph........a networkx Graph
max_depths...a list of ints for the max_depth values to be passed
to calls to partition_girvan_newman
Returns:
A list of (int, float) tuples representing the max_depth and the
norm_cut value obtained by the partitions returned by
partition_girvan_newman. See Log.txt for an example.
"""
###TODO
pass
"""
Next, use eigenvalue decomposition to partition a graph.
"""
def get_second_eigenvector(graph):
"""
1. Create the Laplacian matrix.
2. Obtain its eigenvector matrix using the eigh function.
3. Return the second column eigenvector
Returns:
a 1d numpy array containing the second eigenvector
>>> np.round(get_second_eigenvector(example_graph()), 2)
array([ 0.49, 0.3 , 0.49, -0.21, -0.36, -0.36, -0.36])
"""
###TODO
pass
def partition_by_eigenvector(graph):
"""
Using the get_second_eigenvector function above, partition the graph into
two components using a splitting threshold of 0. That is, nodes
whose corresponding value in the second eigenvector is >= 0 are in one cluster,
and the rest are in the other cluster.
Returns:
A list of two networkx Graph objects, one per partition.
Sort these in ascending order of partition size.
>>> graph = example_graph()
>>> result = partition_by_eigenvector(graph)
>>> sorted(result[0].nodes())
['A', 'B', 'C']
>>> sorted(result[1].nodes())
['D', 'E', 'F', 'G']
>>> round(norm_cut(result[0].nodes(), result[1].nodes(), graph), 2)
0.42
"""
###TODO
pass
"""
Next, we'll download a real dataset to see how our algorithm performs.
"""
def download_data():
"""
Download the data. Done for you.
"""
urllib.request.urlretrieve('http://cs.iit.edu/~culotta/cs579/a1/edges.txt.gz', 'edges.txt.gz')
def read_graph():
""" Read 'edges.txt.gz' into a networkx **undirected** graph.
Done for you.
Returns:
A networkx undirected graph.
"""
return nx.read_edgelist('edges.txt.gz', delimiter='\t')
def main():
"""
FYI: This takes ~10-15 seconds to run on my laptop.
"""
download_data()
graph = read_graph()
print('full graph has %d nodes and %d edges' %
(graph.order(), graph.number_of_edges()))
subgraph = get_subgraph(graph, 2)
print('subgraph has %d nodes and %d edges' %
(subgraph.order(), subgraph.number_of_edges()))
print('\n\ncomputing norm_cut scores by max_depth...\nmax_depth\tnorm_cut_score')
for max_depth, score in score_max_depths(subgraph, range(1,5)):
print('%d\t\t%.3f' % (max_depth, score))
print('\n\ngetting result with max_depth=3')
clusters = partition_girvan_newman(subgraph, 3)
print('%d clusters' % len(clusters))
print('first partition: cluster 1 has %d nodes and cluster 2 has %d nodes' %
(clusters[0].order(), clusters[1].order()))
print('smaller cluster nodes:')
print(sorted(sorted(clusters, key=lambda x: x.order())[0].nodes()))
print('\n\npartitioning by eigenvector...')
clusters2 = partition_by_eigenvector(subgraph)
print('cluster 1 has %d nodes and cluster 2 has %d nodes' %
(clusters2[0].order(), clusters2[1].order()))
print('norm_cut score=%.3f' % norm_cut(clusters2[0].nodes(),
clusters2[1].nodes(),
subgraph))
print('10 nodes from smaller cluster:')
print(sorted(clusters2[0].nodes())[:10])
if __name__ == '__main__':
main()