# Network Graph Visualization of Reddit Subreddits

This notebook is based on the one created by Max Woolf (@minimaxir); see it on GitHub: https://github.com/minimaxir/reddit-graph

In [85]:
# Point R at the Ghostscript executable through the R_GSCMD environment variable.
# NOTE(review): this is a machine-specific Windows path (Ghostscript 9.26);
# adjust it to match the local installation.
Sys.setenv(R_GSCMD = "C:/Program Files/gs/gs9.26/bin/gswin64c.exe")

In [86]:
source("Rstart.R")

library(sna)
library(ggnetwork)
library(svglite)
library(igraph)
library(intergraph)   # convert igraph to network
library(rsvg)   # convert svg to pdf

sessionInfo()

R version 3.5.1 (2018-07-02)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 17763)

Matrix products: default

locale:
[1] LC_COLLATE=Spanish_Spain.1252  LC_CTYPE=Spanish_Spain.1252   
[3] LC_MONETARY=Spanish_Spain.1252 LC_NUMERIC=C                  
[5] LC_TIME=Spanish_Spain.1252    

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] gdtools_0.1.7        bindrcpp_0.2.2       rsvg_1.3            
 [4] intergraph_2.0-2     igraph_1.2.2         svglite_1.2.1       
 [7] ggnetwork_0.5.1      sna_2.4              network_1.13.0.1    
[10] statnet.common_4.1.4 stringr_1.3.1        digest_0.6.15       
[13] RColorBrewer_1.1-2   scales_0.5.0         extrafont_0.17      
[16] ggplot2_3.0.0        dplyr_0.7.6          readr_1.3.0         

loaded via a namespace (and not attached):
 [1] pbdZMQ_0.3-3     tidyselect_0.2.4 repr_0.15.0      purrr_0.2.5     
 [5] lattice_0.20-3

Load edgelist into R and preprocess.

In [87]:
# Edge list with the columns Subreddit1, Subreddit2 and Weight.
# (An earlier variant read "subreddit_edges.csv" with Source/Target columns.)
file_name <- "edge_list.csv"

# Read the CSV, then order the rows by both endpoint names.
df <- read_csv(file_name)
df <- arrange(df, Subreddit1, Subreddit2)
df %>% head() %>% print()

Parsed with column specification:
cols(
  Subreddit1 = col_character(),
  Subreddit2 = col_character(),
  Weight = col_double()
)


# A tibble: 6 x 3
  Subreddit1 Subreddit2 Weight
  <chr>      <chr>       <dbl>
1 2007scape  funny        1113
2 2007scape  gaming       1070
3 2007scape  gifs          963
4 2007scape  news          973
5 2007scape  pics         1209
6 2007scape  politics     1212


In [88]:
#defaults <- c("announcements","art","askreddit","askscience","aww","blog",
 #            "books","creepy","dataisbeautiful","diy","documentaries","earthporn",
  #           "explainlikeimfive","fitness","food","funny","futurology","gadgets",
   #          "gaming","getmotivated","gifs","history","iama","internetisbeautiful",
    #         "jokes","lifeprotips","listentothis","mildlyinteresting","movies","music",
     #        "news","nosleep","nottheonion","oldschoolcool","personalfinance",
      #       "philosophy","photoshopbattles","pics","science","showerthoughts",
       #      "space","sports","television","tifu","todayilearned","twoxchromosomes","upliftingnews",
        #     "videos","worldnews","writingprompts")

# Subreddits treated as "default" when flagging nodes and edges.
defaults <- c(
  "AskReddit", "news", "funny", "AdviceAnimals", "Showerthoughts",
  "The_Donald", "CFB", "gaming", "pics", "gifs", "Overwatch", "WTF",
  "Futurology", "politics", "aww", "BlackPeopleTwitter", "movies",
  "Jokes", "CringeAnarchy", "Games", "LifeProTips", "AskMen",
  "OldSchoolCool", "Documentaries", "EnoughTrumpSpam", "nfl", "IAmA",
  "Android", "nba", "MMA"
)

# Flag every edge that touches at least one default subreddit.
# `%in%` never returns NA, so the OR of the two tests is already a plain
# TRUE/FALSE vector and needs no ifelse() wrapper.
# (With the Source/Target column naming, test those columns instead.)
df <- df %>%
  mutate(connectDefault = Subreddit1 %in% defaults | Subreddit2 %in% defaults)
print(tail(df))

# A tibble: 6 x 4
  Subreddit1 Subreddit2    Weight connectDefault
  <chr>      <chr>          <dbl> <lgl>         
1 WTF        trees           6223 TRUE          
2 WTF        unitedkingdom   1321 TRUE          
3 WTF        videos         20654 TRUE          
4 WTF        worldnews      19447 TRUE          
5 WTF        wow             2645 TRUE          
6 WTF        xboxone         2424 TRUE          


In [89]:
# Build an undirected igraph graph from the edge list. The first two columns
# are the edge endpoints; the remaining columns (Weight, connectDefault)
# become edge attributes.
# graph_from_data_frame() is the current name of the deprecated
# graph.data.frame().
net <- graph_from_data_frame(df, directed = FALSE)

print(net)

IGRAPH 1c895e6 UN-- 146 3993 -- 
+ attr: name (v/c), Weight (e/n), connectDefault (e/l)
+ edges from 1c895e6 (vertex names):
 [1] 2007scape--funny              2007scape--gaming            
 [3] 2007scape--gifs               2007scape--news              
 [5] 2007scape--pics               2007scape--politics          
 [7] 2007scape--The_Donald         2007scape--todayilearned     
 [9] 2007scape--videos             2007scape--worldnews         
[11] 4chan    --AdviceAnimals      4chan    --Android           
[13] 4chan    --AskMen             4chan    --atheism           
[15] 4chan    --aww                4chan    --BlackPeopleTwitter
+ ... omitted several edges


Calculate degree, and remove nodes with only 1 or 2 neighbors for graphing simplicity.

In [90]:
# Store each vertex's degree as a vertex attribute.
# centr_degree() is the current name of the deprecated centralization.degree();
# its `res` element holds the per-vertex degree scores.
V(net)$degree <- centr_degree(net)$res

# Drop vertices with a degree below 3. Inside V(net)[...], `degree` refers to
# the vertex attribute assigned above.
# delete_vertices() is the current name of the deprecated delete.vertices().
net <- igraph::delete_vertices(net, V(net)[degree < 3])

print(net)

IGRAPH 1c8ee4b UN-- 139 3983 -- 
+ attr: name (v/c), degree (v/n), Weight (e/n), connectDefault (e/l)
+ edges from 1c8ee4b (vertex names):
 [1] 2007scape--funny              2007scape--gaming            
 [3] 2007scape--gifs               2007scape--news              
 [5] 2007scape--pics               2007scape--politics          
 [7] 2007scape--The_Donald         2007scape--todayilearned     
 [9] 2007scape--videos             2007scape--worldnews         
[11] 4chan    --AdviceAnimals      4chan    --Android           
[13] 4chan    --AskMen             4chan    --atheism           
[15] 4chan    --aww                4chan    --BlackPeopleTwitter
+ ... omitted several edges


Add more summary statistics to the nodes.

In [91]:
# Community id per vertex, from weighted walktrap clustering.
V(net)$group <- net %>%
  cluster_walktrap(weights = E(net)$Weight) %>%
  membership()

# Weighted eigenvector centrality per vertex.
V(net)$centrality <- eigen_centrality(net, weights = E(net)$Weight)$vector

# TRUE for vertices whose name appears in the `defaults` list.
V(net)$defaultnode <- is.element(V(net)$name, defaults)

# Preview the vertex attributes collected so far.
data.frame(V(net)$name, V(net)$degree, V(net)$centrality, V(net)$group, V(net)$defaultnode) %>%
  head() %>%
  print()

    V.net..name V.net..degree V.net..centrality V.net..group V.net..defaultnode
1     2007scape            10       0.018819632            1              FALSE
2         4chan            71       0.189046072            2              FALSE
3 AdviceAnimals           119       0.640145222            1               TRUE
4           Amd             4       0.005795301            3              FALSE
5       Android            69       0.182583128            2               TRUE
6         anime            24       0.048642025            2              FALSE


## Trying to color by topic

## Adding colors

Long string of code to generate color palette and assign to nodes and edges. Generate a color for a group from solid ColorBrewer colors.

In [92]:
# Candidate colours: shades 6 to 9 of four 9-colour ColorBrewer palettes.
color_pool <- c(
  brewer.pal(9, "Blues")[6:9],
  brewer.pal(9, "Reds")[6:9],
  brewer.pal(9, "Greens")[6:9],
  brewer.pal(9, "Purples")[6:9]
)

# One colour per community id. Colours are sampled with replacement, so two
# groups may share a colour; the seed keeps the assignment reproducible.
n_colors <- max(V(net)$group)
set.seed(42)
palette <- data.frame(
  group = seq_len(n_colors),
  colors = sample(color_pool, n_colors, replace = TRUE),
  stringsAsFactors = FALSE
)

# Look up each vertex's colour by its group id (the row index in `palette`).
V(net)$colornode <- palette$colors[V(net)$group]

print(head(palette))

  group  colors
1     1 #54278F
2     2 #54278F
3     3 #EF3B2C
4     4 #6A51A3
5     5 #006D2C
6     6 #41AB5D


Prepare data frames for merging (to find the edges whose two endpoints are in the same group).

In [93]:
# http://stackoverflow.com/questions/21243965/igraph-get-edge-from-to-value

# One row per edge, with the endpoint vertex names in columns X1 and X2.
# as_edgelist() and as_tibble() replace the deprecated get.edgelist() and
# tbl_df().
df_edges <- as_tibble(
  data.frame(as_edgelist(net), stringsAsFactors = FALSE)
)

# One row per vertex, with its colour and community id.
df_vertices <- as_tibble(
  data.frame(
    name = V(net)$name,
    color = V(net)$colornode,
    group = V(net)$group,
    stringsAsFactors = FALSE
  )
)

print(head(df_edges))
print(head(df_vertices))

# A tibble: 6 x 2
  X1        X2      
  <chr>     <chr>   
1 2007scape funny   
2 2007scape gaming  
3 2007scape gifs    
4 2007scape news    
5 2007scape pics    
6 2007scape politics
# A tibble: 6 x 3
  name          color   group
  <chr>         <chr>   <dbl>
1 2007scape     #54278F     1
2 4chan         #54278F     2
3 AdviceAnimals #54278F     1
4 Amd           #EF3B2C     3
5 Android       #54278F     2
6 anime         #54278F     2


In [94]:
# Colour used for edges whose endpoints belong to different groups.
default_edge_color <- "#cccccc"

# Attach colour and group for the first endpoint, then for the second; the
# second join suffixes the clashing columns with .x (X1) and .y (X2).
df_edges <- left_join(df_edges, df_vertices, by = c("X1" = "name"))
df_edges <- left_join(df_edges, df_vertices, by = c("X2" = "name"))

# An edge inside one group takes that group's colour; any other edge is grey.
E(net)$coloredge <- with(
  df_edges,
  ifelse(group.x == group.y, color.x, default_edge_color)
)

df_edges %>% head() %>% print()

# A tibble: 6 x 6
  X1        X2       color.x group.x color.y group.y
  <chr>     <chr>    <chr>     <dbl> <chr>     <dbl>
1 2007scape funny    #54278F       1 #54278F       1
2 2007scape gaming   #54278F       1 #54278F       1
3 2007scape gifs     #54278F       1 #54278F       1
4 2007scape news     #54278F       1 #54278F       1
5 2007scape pics     #54278F       1 #54278F       1
6 2007scape politics #54278F       1 #54278F       1


Build the network layout. 50,000 iterations is enough for layout convergence.

In [95]:
# Compute a Fruchterman-Reingold layout and flatten the graph into a data
# frame that holds node rows and edge rows with x/y/xend/yend coordinates.
df_net <- ggnetwork(
  net,
  layout = "fruchtermanreingold",
  weights = "Weight",
  niter = 50000
)

# Save the layout to disk, without row names.
write.csv(df_net, "df_net.csv", row.names = FALSE)
print(head(df_net))

           x          y  centrality colornode defaultnode degree group  na.x
1 0.92167078 0.45221211 0.018819632   #54278F       FALSE     10     1 FALSE
2 0.00000000 0.40214640 0.189046072   #54278F       FALSE     71     2 FALSE
3 0.38156908 0.29568797 0.640145222   #54278F        TRUE    119     1 FALSE
4 0.66238421 1.00000000 0.005795301   #EF3B2C       FALSE      4     3 FALSE
5 0.01925663 0.48094565 0.182583128   #54278F        TRUE     69     2 FALSE
6 0.58855029 0.07645285 0.048642025   #54278F       FALSE     24     2 FALSE
   vertex.names       xend       yend coloredge connectDefault na.y Weight
1     2007scape 0.92167078 0.45221211      <NA>             NA   NA     NA
2         4chan 0.00000000 0.40214640      <NA>             NA   NA     NA
3 AdviceAnimals 0.38156908 0.29568797      <NA>             NA   NA     NA
4           Amd 0.66238421 1.00000000      <NA>             NA   NA     NA
5       Android 0.01925663 0.48094565      <NA>             NA   NA     NA
6         a

In [96]:
# Keep only the rows that belong to default subreddits.
# The column is spelled out in full: `df_net$default` only worked through `$`
# partial matching on data frames and returns NULL on a tibble.
df_net_defaults <- df_net[which(df_net$defaultnode), ]
print(head(df_net_defaults))

            x          y centrality colornode defaultnode degree group  na.x
3  0.38156908 0.29568797 0.64014522   #54278F        TRUE    119     1 FALSE
5  0.01925663 0.48094565 0.18258313   #54278F        TRUE     69     2 FALSE
8  0.17349885 0.68256275 0.10854552   #54278F        TRUE     50     2 FALSE
13 0.32020879 0.31295952 0.55023134   #54278F        TRUE    109     2 FALSE
17 0.22926710 0.31936543 0.35741572   #54278F        TRUE    100     2 FALSE
25 0.29637013 0.07384898 0.09730195   #54278F        TRUE     45     2 FALSE
         vertex.names       xend       yend coloredge connectDefault na.y
3       AdviceAnimals 0.38156908 0.29568797      <NA>             NA   NA
5             Android 0.01925663 0.48094565      <NA>             NA   NA
8              AskMen 0.17349885 0.68256275      <NA>             NA   NA
13                aww 0.32020879 0.31295952      <NA>             NA   NA
17 BlackPeopleTwitter 0.22926710 0.31936543      <NA>             NA   NA
25               

We will color the nodes whether or not they are a default subreddit (orange if default, blue otherwise) and color the lines accordingly (orange if either end is a default subreddit, blue otherwise).

In [97]:
# Manual scale values follow the order FALSE, TRUE: blue for non-default,
# orange for default.
default_colors <- c("#3498db", "#e67e22")
default_labels <- c("Not Default", "Default")

# Build the plot before opening the device, so an error during construction
# cannot leave an SVG device open.
plot_defaults <-
  ggplot(df_net, aes(x = x, y = y, xend = xend, yend = yend, size = centrality)) +
  geom_edges(aes(color = connectDefault), size = 0.05) +
  geom_nodes(aes(fill = defaultnode), shape = 21, stroke = 0.2, color = "black") +
  geom_nodelabel_repel(
    data = df_net, aes(color = defaultnode, label = vertex.names),
    fontface = "bold", size = 0.5, box.padding = unit(0.05, "lines"),
    label.padding = unit(0.1, "lines"), segment.size = 0.1, label.size = 0.2
  ) +
  scale_color_manual(values = default_colors, labels = default_labels, guide = "none") +
  scale_fill_manual(values = default_colors, labels = default_labels) +
  ggtitle("Network Graph of Reddit Subreddits") +
  scale_size(range = c(0.1, 4)) +
  theme_blank()

# Print explicitly: a ggplot object is only drawn when printed, and the
# implicit top-level print does not happen when the script is run via source().
svglite("subreddit-1.svg", width = 10, height = 8)
print(plot_defaults)
dev.off()

# Convert the SVG to PDF.
rsvg_pdf("subreddit-1.svg", "subreddit-1.pdf")

"Ignoring unknown parameters: segment.color"

Color by group: if an edge links two nodes of the same group, the edge takes that group's color; otherwise, the edge is colored gray.

In [98]:
# Build the plot before opening the device, so an error during construction
# cannot leave an SVG device open. Node and edge colours are literal hex
# values stored in the data, hence the identity scales.
plot_groups <-
  ggplot(df_net, aes(x = x, y = y, xend = xend, yend = yend, size = centrality)) +
  geom_edges(aes(color = coloredge), size = 0.05) +
  geom_nodes(aes(fill = colornode), shape = 21, stroke = 0.2, color = "black") +
  geom_nodelabel_repel(
    data = df_net, aes(color = colornode, label = vertex.names),
    fontface = "bold", size = 0.5,
    box.padding = unit(0.05, "lines"), label.padding = unit(0.1, "lines"),
    segment.size = 0.1, label.size = 0.2
  ) +
  scale_color_identity("colornode", guide = "none") +
  scale_fill_identity("colornode", guide = "none") +
  scale_size(range = c(0.2, 3), guide = "none") +
  ggtitle("Network Graph of Reddit Subreddits") +
  theme_blank()

# Print explicitly: a ggplot object is only drawn when printed, and the
# implicit top-level print does not happen when the script is run via source().
svglite("subreddit-2.svg", width = 10, height = 8)
print(plot_groups)
dev.off()

# Convert the SVG to PDF.
rsvg_pdf("subreddit-2.svg", "subreddit-2.pdf")

"Ignoring unknown parameters: segment.color"

For the subgroups, use a function that prints a visualization of the subgraph for *each* group. Since the groups are in random order, sort out the important ones later.

NB: Since the output is a PNG with fixed dimensions and not an SVG, the style parameters of the graph aesthetics must be changed.

In [99]:
#' Render and save the subgraph of a single group.
#'
#' Takes the rows of the global `df_net` whose `group` equals `group_number`,
#' draws only the edges whose colour differs from the global
#' `default_edge_color`, and writes the result as a 4 x 3 inch PNG at 300 dpi
#' named `group-NNN.png` inside `out_dir`.
#'
#' @param group_number Group id to plot.
#' @param out_dir Directory that receives the PNG; created when missing.
#' @return Whatever `ggsave()` returns.
subreddit_graph_subset <- function(group_number, out_dir = "subreddit-groups") {
  df_network <- df_net[which(df_net$group == group_number), ]

  # which() also drops rows whose coloredge is NA (the node rows), so only
  # edges that carry a non-default colour remain.
  colored_edges <- df_network[which(df_network$coloredge != default_edge_color), ]

  group_plot <-
    ggplot(df_network, aes(x = x, y = y, xend = xend, yend = yend, size = centrality)) +
    geom_edges(data = colored_edges, aes(color = coloredge), size = 0.05) +
    geom_nodes(aes(fill = colornode), shape = 21, stroke = 0.5, color = "black") +
    geom_nodelabel_repel(
      data = df_network, aes(color = colornode, label = vertex.names),
      fontface = "bold", family = "Open Sans Condensed", size = 1.5,
      box.padding = unit(0.10, "lines"), label.padding = unit(0.1, "lines"),
      segment.size = 0.1, label.size = 0.5, label.r = unit(0.15, "lines")
    ) +
    scale_color_identity("colornode", guide = "none") +
    scale_fill_identity("colornode", guide = "none") +
    scale_size(range = c(0.2, 6), guide = "none") +
    ggtitle(sprintf("Network Subgraph of Group %s Subreddits", group_number)) +
    theme_blank(base_size = 7, base_family = "Source Sans Pro")

  # Make sure the target directory exists before saving.
  dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
  out_file <- file.path(out_dir, sprintf("group-%03d.png", group_number))

  ggsave(out_file, group_plot, width = 4, height = 3, dpi = 300)
}

In [100]:
# Render one subgraph PNG per group id, from 1 up to the largest group id.
x <- lapply(seq_len(max(V(net)$group)), subreddit_graph_subset)

"font family not found in Windows font database"