# 1. Load in libraries and define functions

In [14]:
library(arrow)
library(lubridate)
library(data.table)
library(ggplot2)
library(stringr)
library(aricode)
library(dplyr)
library(bit64)
library(scales)
library(Matrix)
library(vsp)
library(tidyr)
library(Hmisc)
library(patchwork)
theme_set(theme_bw(20))
options(arrow.skip_nul = TRUE)

In [3]:
gen_vals <- function(y_or_z,k){
    return(paste0(y_or_z,str_pad(1:k,floor(log10(k)) + 1, pad="0"))) 
}
    
run_vsp <- function(data, nodes_name, items_name, y_mg_df=NULL, 
                    z_mg_df=NULL, weight_col=NULL, 
                    k=25,center=F){
    
    node_df <- data.table(node=unique(data[,get(nodes_name)]),
                    node_idv=1:length(unique(data[,get(nodes_name)])))
    item_df <- data.table(item=unique(data[,get(items_name)]),
                    item_idv = 1:length(unique(data[,get(items_name)])))

    node_item_net <- merge(data,node_df,by.x=nodes_name, by.y="node")
    node_item_net <- merge(node_item_net,item_df,by.x=items_name,by.y="item")
    
    mat <- sparseMatrix(i = node_item_net$node_idv,
                        j = node_item_net$item_idv,
                        x = ifelse(is.null(weight_col), 1, node_item_net[,get(weight_col)])
                        )

    fa <- vsp(mat, rank = k, center=center)
    
    y <- data.table(get_varimax_y(fa))
    y$item_idv <- as.integer(sub("col","",y$id))
    ymg <- merge(item_df, y, by="item_idv")
    setnames(ymg, "item",items_name)
    ymg$grp <- unname(apply(ymg[,gen_vals("y",k),with=F], 1, which.max))
    if(!is.null(y_mg_df)){
        ymg <- merge(ymg, 
             y_mg_df,
             by=items_name)
    }
    
    z <- data.table(get_varimax_z(fa))
    z$node_idv <- as.integer(sub("row","",z$id))
    zmg <- merge(node_df, z, by="node_idv")
    setnames(zmg, "node",nodes_name)
    zmg$grp <- unname(apply(zmg[,gen_vals("z",k),with=F], 1, which.max))
    if(!is.null(z_mg_df)){
        zmg <- merge(zmg, 
             z_mg_df,
             by=nodes_name)
    }
    return(list(vsp_res=fa,
                ymg = ymg,
                zmg = zmg
                )
           )

}


# 2. Clustering on RT - UID to identify Bernie Supporters 

In other words, run VSP on the who retweets whom network

In [4]:
user_data <- rbindlist(lapply(Sys.glob("/data/dnc2020/user_descript/part-*.parquet"),read_parquet))
user_data <- user_data[!duplicated(uid)]

“Stripping '\0' (nul) from character vector”
“Stripping '\0' (nul) from character vector”
“Stripping '\0' (nul) from character vector”
“Stripping '\0' (nul) from character vector”
“Stripping '\0' (nul) from character vector”
“Stripping '\0' (nul) from character vector”
“Stripping '\0' (nul) from character vector”
“Stripping '\0' (nul) from character vector”
“Stripping '\0' (nul) from character vector”
“Stripping '\0' (nul) from character vector”
“Stripping '\0' (nul) from character vector”
“Stripping '\0' (nul) from character vector”
“Stripping '\0' (nul) from character vector”


In [7]:
rt_uid_net <-rbindlist(lapply(Sys.glob("data/trimmed_rtnet/part-*.parquet"), read_parquet))

In [None]:
clust_len <- c(5,10,15,20,25,30, 35,40,45,50,55,60)

# get all clusterings
reslist <- list()
for(k in clust_len){
    print(k)
    res <- run_vsp(rt_uid_net,
               nodes_name="uid",
               items_name="rt_uid",
               weight_col = "rt_times",
               z_mg_df = user_data,
               k=k,
               center=F)
    res$ymg <- merge(res$ymg, user_data, by.x="rt_uid", by.y="uid")
    reslist[[k]] <- res
}


In [730]:
# compute AMIs/ARIs
ami_dat <- data.frame()
for(i in 1:(length(clust_len)-1)){
    for(j in (i+1):length(clust_len)){
        r1 <- reslist[[clust_len[i]]]$zmg[,.(uid, grp)]
        r2 <- reslist[[clust_len[j]]]$zmg[,.(uid, grp)]
        mg <- merge(r1,r2, by='uid')
        ami_dat <- rbind(ami_dat, data.frame(i=clust_len[i],j=clust_len[j], 
                                             ami=AMI(mg$grp.x, mg$grp.y),
                                             ari=ARI(mg$grp.x,mg$grp.y),
                                             nmi = NMI(mg$grp.x,mg$grp.y)))
        ami_dat <- rbind(ami_dat, data.frame(i=clust_len[j],j=clust_len[i], 
                                             ami=AMI(mg$grp.x, mg$grp.y),
                                             ari=ARI(mg$grp.x,mg$grp.y),
                                             nmi = NMI(mg$grp.x,mg$grp.y)))
    }
}

“You are trying to join data.tables where 'y' argument is 0 columns data.table.”


ERROR: Error in merge.data.table(r1, r2, by = "uid"): Elements listed in `by` must be valid column names in x and y


In [738]:
options(repr.plot.width = 5, repr.plot.height = 5)

ami_dat <- data.table(ami_dat)

res <- ami_dat[, as.list(smean.cl.boot(ami,conf.int = .66)), by=i]


In [737]:

pl <- ggplot(res, aes(factor(i), Mean, ymin=Lower, ymax=Upper)) +
geom_pointrange() + 
ylab("Adjusted Mutual Information") +
xlab("Setting for k in VSP")
ggsave("ami_whortwho.pdf", h=5,w=5)

In [84]:
library(stringr)
library(writexl)
kval = 35
file_name <- paste0("data/results_new_",kval,".xlsx")

sheets <- list()

for(grp_num in 1:kval){
    grp_str <- paste0("y",str_pad(grp_num, width = 2, "left","0"))
    influencers <- reslist[[k]]$ymg[grp == grp_num][order(-get(grp_str))][1:30][,.(rt_uid,username, description)]
    influencers$utype <- "influencer"
    setnames(influencers, "rt_uid", "uid")
    
    grp_str <- paste0("z",str_pad(grp_num, width = 2, "left","0"))
    ord <- reslist[[k]]$zmg[grp == grp_num][order(-get(grp_str))][1:30][,.(uid,username, description)]
    ord$utype <- "ordinary"
    
    dat <- rbind(influencers, ord)
    dat$url <- paste0("https://twitter.com/intent/user?user_id=", dat$uid)
    dat$uid <- NULL
    sheets[[paste("Group", grp_num)]] = dat
}
    
write_xlsx(sheets, file_name)


In [19]:
save(reslist, file="data/rtnet_clustering.rdata")

# 3. Perform qualitative analysis

Our analysis of AMI plots suggested that 35 clusters was an appropriate number for analysis.  Moreover, our qualitative analysis, described in the paper, surfaces groups 3, 5, 11, 19, and 27 as the groups that contained predominantly Bernie supporters. We therefore shift to focusing on these clusters of Twitter users, with `k=35` for VSP, in the code below.

# 4. Write out Bernie supporting accounts

In [8]:
nrow(reslist[[35]]$zmg[grp %in% c(3,5,11,19,27)])

In [16]:
write.table(reslist[[35]]$zmg[grp %in% c(3,5,11,19,27),"uid",with=F], 
            "/data/dnc2020/bernie_vsp.csv", row.names=F,  quote=F)

In [59]:
write.table(reslist[[35]]$zmg[,.(uid,grp)], 
            "/data/dnc2020/bernie_clustering.csv", 
            row.names=F,  
            quote=F)

In [14]:
write.table(d, "data/bernie_vsp_yz.csv", row.names=F,  quote=F)

# 5. Clustering on Who Retweets What



In [None]:
# Reload, in case you want to start here
load("data/rtnet_clustering.rdata")



In [6]:
### Note::: This data gets written out in Step 1 of the script
# 04_post_vsp_dataprocessing.ipynb. Sorry, there wasn't a clean
# way to represent this back and forth without making things
# more confusing (IMO).
rt_text <-rbindlist(lapply(Sys.glob("/data/dnc2020/tweets_clustering/part-*.parquet"), 
                           read_parquet,col_select=c("rt_id","text","created_at")))
rt_text <- rt_text[!duplicated(rt_id)]
rt_text <- rt_text[!is.na(rt_id)]
dim(rt_text)

In [7]:
bernie_tweets <-rbindlist(lapply(Sys.glob("/data/dnc2020/tweets_clustering/part-*.parquet"), 
                     read_parquet,
                     col_select=c("uid","created_at","rt_id")))
bernie_tweets[, period := ifelse(created_at < as.numeric(as.POSIXct("2020-03-02 11:59:59 EST")),
                                  "Pre Super Tuesday",
           ifelse(created_at < as.numeric(as.POSIXct("2020-08-16 11:59:59 EST")),
                                  "Pre Convention", "Post Convention"))]
bernie_tweets[, dt := as.Date(as_datetime(created_at))]
bernie_tweets <- bernie_tweets[!is.na(rt_id)]
bernie_tweets[, uid:= as.integer64(uid)]

In [8]:
length(unique(bernie_tweets$uid))

In [9]:
rt_cnt_period <- bernie_tweets[,.N,by=.(rt_id,period)]
rt_cnt_period[N > 100, .N, by=period]

period,N
<chr>,<int>
Pre Convention,23737
Post Convention,8849
Pre Super Tuesday,15893


In [10]:
rt_cnt <- bernie_tweets[,.N,by=.(rt_id)]
rtmin <- bernie_tweets[rt_id %in% rt_cnt[N > 100,]$rt_id]
rtm <- rtmin[, length(unique(rt_id)), by=.(uid)]

In [11]:
pre_u <- rtm[V1 > 10]$uid
pre_rt <- rt_cnt[N > 250]$rt_id

In [12]:
length(pre_u)

In [13]:
length(pre_rt)

In [14]:
bernie_min <- bernie_tweets[rt_id %in% pre_rt & uid %in% pre_u]

In [None]:
clust_len <- c(5,8,10,12,15,18,20,23,25,28,30)

# get all clusterings
bernie_reslist <- list()
for(k in clust_len){
    print(k)
    res <- run_vsp(bernie_min,
               nodes_name="uid",
               items_name="rt_id",
               y_mg_df = rt_text,
               z_mg_df = user_data,
               k=k,
               center=T)
    bernie_reslist[[k]] <- res
}


In [34]:
save(bernie_reslist, file= "data/rt_tweet_clustering.rdata")

In [740]:
# compute AMIs/ARIs
clust_len <- c(5,8,10,12,15,18,20,23,25,28,30)

ami_dat <- data.frame()
for(i in 1:(length(clust_len)-1)){
    for(j in (i+1):length(clust_len)){
        r1 <- bernie_reslist[[clust_len[i]]]$zmg[,.(uid, grp)]
        r2 <- bernie_reslist[[clust_len[j]]]$zmg[,.(uid, grp)]
        mg <- merge(r1,r2, by='uid')
        ami_dat <- rbind(ami_dat, data.frame(i=clust_len[i],j=clust_len[j], 
                                             ami=AMI(mg$grp.x, mg$grp.y),
                                             ari=ARI(mg$grp.x,mg$grp.y),
                                             nmi = NMI(mg$grp.x,mg$grp.y)))
        ami_dat <- rbind(ami_dat, data.frame(i=clust_len[j],j=clust_len[i], 
                                             ami=AMI(mg$grp.x, mg$grp.y),
                                             ari=ARI(mg$grp.x,mg$grp.y),
                                             nmi = NMI(mg$grp.x,mg$grp.y)))
    }
}

In [741]:
options(repr.plot.width = 5, repr.plot.height = 5)

library(Hmisc)
ami_dat <- data.table(ami_dat)

res <- ami_dat[, as.list(smean.cl.boot(ami,conf.int = .66)), by=i]
pl <- ggplot(res, aes(factor(i), Mean, ymin=Lower, ymax=Upper)) +
geom_pointrange() + ylab("Adjusted Mutual Information") + 
xlab("Setting for k in VSP")
ggsave("plots/ami_whortswhat.pdf",pl,h=5,w=5)

In [47]:
kval = 20
data <- bernie_reslist[[kval]]$ymg[grp %in% which(table(bernie_reslist[[kval]]$zmg$grp) > 1000)]

In [34]:
library(data.table)
library(stringr)
kval = 20
data <- bernie_reslist[[kval]]$ymg[grp %in% which(table(bernie_reslist[[kval]]$zmg$grp) > 1000)]
data[, date_val := as.Date(as_datetime(created_at))]
full_dat <- data.table()
for(grp_num in sort(unique(data$grp))){
    grp_str <- paste0("y",str_pad(grp_num, width = 2, "left","0"))
    dat <- data[grp == grp_num][order(-get(grp_str))][1:20][,.(rt_id, text,date_val)]
    dat$url <- paste0("https://twitter.com/_kenny_joseph/status/", dat$rt_id)
    dat$cluster_number <- grp_num
    full_dat <- rbind(full_dat,dat)
}



In [None]:
# Qualitative analysis was then performed again
write.csv(full_dat[sample(1:nrow(full_dat)), ], "data/for_qualitative_coding.csv", row.names=F)