-
Notifications
You must be signed in to change notification settings - Fork 2
/
Marvel.R
381 lines (296 loc) · 14.7 KB
/
Marvel.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
library(readr) # fast reading of csv files
library(tidyverse) # tidy data analysis
library(tidygraph) # tidy graph analysis
library(ggraph) # for plotting
library(dplyr)
library(ggplot2)
library(igraph)
#Importing all csv Files from a Folder
marvel <- read_csv("D:/Courses/02_Socal network Analytics/Harsh Study Material/Final Project/marvel.csv")
marvel
marvel_nodes <- read_csv("D:/Courses/02_Socal network Analytics/Harsh Study Material/Final Project/marvel_nodes.csv")
marvel_nodes
marvel$Source <- marvel_nodes$Label[match(marvel$Source, marvel_nodes$Id)]
marvel$Target <- marvel_nodes$Label[match(marvel$Target, marvel_nodes$Id)]
marvel
marvel <- marvel[c("Source","Target","Type","Id","Weight")]
marvel
marvel_ch <- marvel %>%
select(-Type) %>%
gather(x, name, Source:Target) %>%
group_by(name) %>%
summarise(sum_weight = sum(Weight)) %>%
ungroup()
marvel_ch
plot.igraph(graph.data.frame(marvel))
marvel_ch_l <- marvel_ch %>%arrange(desc(sum_weight)) %>%top_n(50, sum_weight)
marvel_ch_l
plot.igraph(graph.data.frame(marvel_ch_l,directed = FALSE))
# creating conversation table between top 100 characters
marvel_conversation <- marvel %>%filter(Source %in% marvel_ch_l$name & Target %in% marvel_ch_l$name)
marvel_conversation
gm <- graph.data.frame(marvel_conversation,directed = FALSE)
plot.igraph(gm)
#The first step is to convert our edge table into a tbl_graph object structure.
#Here, we use the as_tbl_graph() function from tidygraph; it can take many different types of input data, like data.frame, matrix, dendrogram, igraph, etc.
#creating table for graph manipulation
as_tbl_graph(marvel_conversation, directed = FALSE)
#Using the activate() function that allows us to remove multiple edges.
as_tbl_graph(marvel_conversation, directed = FALSE) %>% activate(edges) %>% filter(!edge_is_multiple())
#Node ranking
#Often, especially when visualising networks with certain layouts, the order in which the nodes appear will have a huge influence on the insight you can get out (e.g. matrix plots and arc diagrams).
?node_rank
#Minimize hamiltonian path length using a travelling salesperson solver.
#or we can use any other seriation algorithm
as_tbl_graph(marvel_conversation, directed = FALSE) %>%
activate(nodes) %>%
mutate(n_rank_trv = node_rank_traveller()) %>%
arrange(n_rank_trv)
########################################################
#Centrality
########################################################
# ?centrality
#Centrality describes the number of edges that are in- or outgoing to/from nodes.
#High centrality networks have few nodes with many connections, low centrality networks have many nodes with similar numbers of edges.
#The centrality of a node measures the importance of it in the network.
#using Betweeness Centrality
as_tbl_graph(marvel_conversation, directed = FALSE) %>%
activate(nodes) %>%
mutate(neighbors = centrality_betweenness()) %>%
arrange(-neighbors)
#using Degree Centrality
as_tbl_graph(marvel_conversation, directed = FALSE) %>%
activate(nodes) %>%
mutate(neighbors = centrality_degree()) %>%
arrange(-neighbors)
########################################################
#Grouping and clustering
########################################################
#Group nodes by minimizing description length using.
#We can use ?group_graph for an overview about all possible ways to cluster and group nodes.
as_tbl_graph(marvel_conversation, directed = FALSE) %>%
activate(nodes) %>%
mutate(group = group_infomap()) %>%
arrange(-group)
########################################################
#Querying node types
########################################################
#I am trying out node_is_center() (does the node have the minimal eccentricity in the graph)
#and node_is_keyplayer() to identify the top 10 key-players in the network.
#install.packages('influenceR')
as_tbl_graph(marvel_conversation, directed = FALSE) %>%
activate(nodes) %>%
mutate(center = node_is_center(),
keyplayer = node_is_keyplayer(k = 10))
########################################################
#Node pairs
########################################################
as_tbl_graph(marvel_conversation, directed = FALSE) %>%
activate(nodes) %>%
mutate(dist_to_center = node_distance_to(node_is_center()))
########################################################
#Edge betweenness
########################################################
#Similarly to node metrics, we can calculate all kinds of edge metrics.
#Betweenness, for example, describes the shortest paths between nodes.
#More about what you can do with edges can be found with ?edge_types and in the tidygraph manual.
as_tbl_graph(marvel_conversation, directed = FALSE) %>%
activate(edges) %>%
mutate(centrality_e = centrality_edge_betweenness())
########################################################
#Combining everything in true tidy verse fashion:
########################################################
marvel_conversation_graph <- as_tbl_graph(marvel_conversation, directed = FALSE) %>%
mutate(n_rank_trv = node_rank_traveller(),
neighbors = centrality_degree(),
group = group_infomap(),
center = node_is_center(),
dist_to_center = node_distance_to(node_is_center()),
keyplayer = node_is_keyplayer(k = 10)) %>%
activate(edges) %>%
filter(!edge_is_multiple()) %>%
mutate(centrality_e = centrality_edge_betweenness())
########################################################
#We can also convert our active node or edge table back to a tibble:
########################################################
marvel_conversation_graph %>%
activate(nodes) %>% # %N>%
as.tibble()
marvel_conversation_graph %>%
activate(edges) %>% # %E>%
as.tibble()
########################################################
#Plotting with ggraph
########################################################
#define a layout, here I am using a Fruchterman-Reingold algorithm.
layout <- create_layout(marvel_conversation_graph, layout = "fr")
#The rest works like any ggplot2 function call, just that we use special geoms for our network,
#like geom_edge_density() to draw a shadow where the edge density is higher, geom_edge_link() to connect edges
#with a straight line, geom_node_point() to draw node points and geom_node_text() to draw the labels.
ggraph(layout) +
geom_edge_density(aes(fill = Weight)) +
geom_edge_link(aes(width = Weight), alpha = 0.2) +
geom_node_point(aes(color = factor(group)), size = 10) +
geom_node_text(aes(label = name), size = 8, repel = TRUE) +
scale_color_brewer(palette = "Set1") +
theme_graph() +
labs(title = "Marvel Universe Network",
subtitle = "Nodes are colored by group")
########################################################
#For the next graphs, I want specific colors form the RColorBrewer palette "Set1":
########################################################
cols <- RColorBrewer::brewer.pal(3, "Set1")
ggraph(layout) +
geom_edge_density(aes(fill = Weight)) +
geom_edge_link(aes(width = Weight), alpha = 0.2) +
geom_node_point(aes(color = factor(center), size = dist_to_center)) +
geom_node_text(aes(label = name), size = 8, repel = TRUE) +
scale_colour_manual(values = c(cols[2], cols[1])) +
theme_graph() +
labs(title = "Marvel Universe",
subtitle = "Nodes are colored by centeredness")
library(tidyverse) # tidy data analysis
library(tidygraph) # tidy graph analysis
library(igraph)
#Importing all csv Files from a Folder
#Importing all csv Files from a Folder
marvel <- read_csv("D:/Courses/02_Socal network Analytics/Harsh Study Material/Final Project/marvel.csv")
marvel
marvel_nodes <- read_csv("D:/Courses/02_Socal network Analytics/Harsh Study Material/Final Project/marvel_nodes.csv")
marvel_nodes
marvel$Source <- marvel_nodes$Label[match(marvel$Source, marvel_nodes$Id)]
marvel$Target <- marvel_nodes$Label[match(marvel$Target, marvel_nodes$Id)]
marvel <- marvel[c("Source","Target","Type","Id","Weight")]
marvel_ch <- marvel %>%
select(-Type) %>%
gather(x, name, Source:Target) %>%
group_by(name) %>%
summarise(sum_weight = sum(Weight)) %>%
ungroup()
marvel_ch_l <- marvel_ch %>%arrange(desc(sum_weight)) %>%top_n(100, sum_weight)
# creating conversation table between top 100 characters
marvel_conversation <- marvel %>%filter(Source %in% marvel_ch_l$name & Target %in% marvel_ch_l$name)
#correct one input mistake
marvel_conversation %>%
group_by(Target)%>%
summarise(Weight2 = sum(Weight)) %>%
arrange(desc(Weight2))
marvel_conversation[which(marvel_conversation$Target == ',"CAPTAIN AMERICA'),]$Target <- "CAPTAIN AMERICA"
#Sort the dataframe by name
m <- marvel_conversation %>% arrange(Source, desc(Source))
#Calculate the total weight for each node
m2 <- m %>% group_by(Source) %>% summarise(Weight1 = sum(Weight))
m3 <- m %>% group_by(Target) %>% summarise(Weight2 = sum(Weight))
m4 <- cbind(m2, m3)
m4 <- m4 %>% group_by(Source) %>% summarise(Weight = Weight1 + Weight2)
#weight factor to attribute of node
mw <- m4$Weight
V(g)$Weight = mw
#We might wanna cut some edges
#max * 0.025 is the magic number we used here
#######********########
g12 <- delete_edges(g, E(g)[Weight<max(E(g)$Weight)*0.025])
#######********########
# degree and degree distribution
dm <- degree(g12)
V(g12)$degree <- dm
deg.dist <- degree_distribution(g12, cumulative=T, mode="all")
plot( x=0:max(degree(g12)), y=deg.dist, pch=19, cex=1.2, col="orange",
xlab="Degree", ylab="Cumulative Frequency")
hist(dm, breaks=1:vcount(g12)-1, main="Histogram of node degree",
col = "orange", border = "black")
#compute betweenness
bm <- betweenness(g12)
hist(bm, breaks = vcount(g12), main = "Histogram of node betweenness",
col = "orange", border = "black")
V(g12)$be <- bm
V(g12)$be[1]
#build communities for our heroes
g13 <- simplify(g12)
community = fastgreedy.community(g13)
max(community$membership)
#the node of g12 and g13 has no different, so lets give community's attribute to g12 nodes
V(g12)$com <- community$membership
V(g12)$color <- community$membership
#also we can use other method to color our node, like weight, degree or betweenness
node_color = ifelse(V(g12)$Weight >= quantile(V(g12)$Weight)[[4]], 'red', 'gray50')
node_color_d = ifelse(V(g12)$degree >= quantile(V(g12)$degree)[[4]], 'orange', 'royalblue')
node_color_b = ifelse(V(g12)$be >= quantile(V(g12)$be)[[4]], 'magenta', 'ivory4')
#and set nodes' size to weight, degree and betweenness as well
node_size = V(g12)$Weight * 10/ max(V(g12)$Weight) + 1
node_size_d = V(g12)$degree * 10/ max(V(g12)$degree) + 1
node_size_b = V(g12)$be * 10/ max(V(g12)$be) + 1
#don't forget the label size
label_size = V(g12)$Weight * 2/ max(V(g12)$Weight) + 1
label_size_d = V(g12)$degree * 2/ max(V(g12)$degree) + 1
label_size_b = V(g12)$be * 2/ max(V(g12)$be) + 1
#plot community
set.seed(1)
plot(g12, vertex.size = node_size,
mark.groups = by(seq_along(community$membership), community$membership, invisible),
layout=layout.fruchterman.reingold,vertex.label=V(g12)$name, vertex.label.color = "black",
vertex.label.cex = label_size,edge.color = "gray30")
#plot degree
set.seed(1)
plot(g13, vertex.color = node_color_d, layout = layout.fruchterman.reingold,
vertex.label = V(g13)$name, vertex.label.color = "black", vertex.label.cex = label_size_d,
vertex.size = node_size_d, edge.color = "gray30")
#plot weight
set.seed(1)
plot(g13, vertex.color = node_color , layout = layout.fruchterman.reingold, vertex.label = V(g13)$name,
vertex.label.color = "black", vertex.label.cex = label_size,
edge.color = "gray30", vertex.size = node_size)
#plot betweenness
set.seed(1)
plot(g13, vertex.color = node_color_b , layout = layout.fruchterman.reingold, vertex.label = V(g13)$name,
vertex.label.color = "black", vertex.label.cex = label_size_b,
edge.color = "gray30", vertex.size = node_size_b)
# tie strength between two main characters
sort(degree(g12), decreasing = T)[1:10]
edge_connectivity(graph = g, target = 83, source = 40)
neighbors(graph = g12, v = which(V(g)$name == "SPIDER-MAN/PETER PAR"))
neighbors(graph = g12, v = which(V(g)$name == "IRON MAN/TONY STARK"))
length(intersect(neighbors(graph = g12, v = 83), neighbors(graph = g12, v = 40))) / length(union(neighbors(graph = g12, v = 83), neighbors(graph = g12, v = 40)))
#edge prediction
shortest_paths_sim = -distances(g12)
adamic_adar_sim = similarity.invlogweighted(g12)
jaccard_sim = similarity.jaccard(g12)
# name the rows and cols of the similarity matrices using node names
node_names = as.vector(V(g12)$name)
rownames(shortest_paths_sim) = node_names
colnames(shortest_paths_sim) = node_names
rownames(adamic_adar_sim) = node_names
colnames(adamic_adar_sim) = node_names
rownames(jaccard_sim) = node_names
colnames(jaccard_sim) = node_names
recommend_links = function(node_name, sim_matrix, n_recommend = 10){
similarity_to_node = sim_matrix[toString(node_name), ]
ranked_nodes_by_sim =
names(similarity_to_node[order(similarity_to_node, decreasing = T)])
neighbours_in_train = c(toString(node_name), as.vector(neighbors(g13, toString(node_name))$name))
recommend_ids = setdiff(ranked_nodes_by_sim, neighbours_in_train)[1:n_recommend]
return(recommend_ids)
}
my_answer_s = lapply(V(g13)$name, recommend_links, sim_matrix = shortest_paths_sim, n_recommend = 10)
my_answer_j = lapply(V(g13)$name, recommend_links, sim_matrix = jaccard_sim, n_recommend = 10)
my_answer_a = lapply(V(g13)$name, recommend_links, sim_matrix = adamic_adar_sim, n_recommend = 10)
my_answer_s[[which(V(g13)$name == "SPIDER-MAN/PETER PAR")]]
my_answer_j[[which(V(g13)$name == "SPIDER-MAN/PETER PAR")]]
my_answer_a[[which(V(g13)$name == "SPIDER-MAN/PETER PAR")]]
my_answer_s[[which(V(g13)$name == "THOR/DR. DONALD BLAK")]]
my_answer_j[[which(V(g13)$name == "THOR/DR. DONALD BLAK")]]
my_answer_a[[which(V(g13)$name == "THOR/DR. DONALD BLAK")]]
my_answer_s[[which(V(g13)$name == "BEAST/HENRY &HANK& P")]]
my_answer_j[[which(V(g13)$name == "BEAST/HENRY &HANK& P")]]
my_answer_a[[which(V(g13)$name == "BEAST/HENRY &HANK& P")]]
# 2-degree network of some characters
sort(bm, decreasing = T)[1:10]
n_m = function(hn, hc){
nm0 <- graph.neighborhood(g12, order = 1,nodes = which(V(g12)$name == hn))[[1]]
plot(nm0, layout = layout.fruchterman.reingold, vertex.label = V(nm0)$name,
edge.color = "gray30", vertex.color = ifelse(V(nm0)$name == hn, hc, V(nm0)$com),
vertex.label.cex = 5, vertex.label.color = "black")
}
n_m("BEAST/HENRY &HANK& P", "lightblue")
n_m("SPIDER-MAN/PETER PAR", "red")
n_m("THOR/DR. DONALD BLAK", "steelblue")