In [1]:
# Bootstrap pacman if it is not available yet, then let pacman install
# (when needed) and attach every package this notebook relies on.
if (!require('pacman')) {
    install.packages('pacman')
}
pacman::p_load(dplyr, irr, readr, scales, tidyr)

Loading required package: pacman



In [2]:
# Read the manually tagged usernames; column types are guessed by readr.
data.raw <- read_csv(
    file = './data/raw/manual_scored_twitter_usernames.csv'
)

[1mRows: [22m[34m703[39m [1mColumns: [22m[34m6[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (4): by, classification, id, username
[34mdttm[39m (2): created_at, updated_at

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [3]:
# Drop the practice-round votes: only votes dated strictly after 2022-03-05
# are kept (the comparison is `>`, so votes cast on 2022-03-05 itself are
# discarded as well).
# Rows with a missing username were recorded by accident and are dropped too.
data.truncated <- data.raw %>%
    mutate(created_at_date = as.Date(created_at)) %>%
    filter(
        created_at_date > as.Date('2022-03-05'),
        !is.na(username)
    )

In [4]:
# One table per rater. After filtering down to a single rater, spread() turns
# the (now constant) `by` column into a column named after that rater which
# holds the usernames they scored; all other columns are carried along.
juan <- data.truncated %>%
    filter(by == 'Juan Pablo') %>%
    spread(by, username)
ellie <- data.truncated %>%
    filter(by == 'ellie') %>%
    spread(by, username)
elizabeth <- data.truncated %>%
    filter(by == 'Elizabeth') %>%
    spread(by, username)

# Line the three raters up side by side: one row per username, one
# classification column per rater. merge() only keeps usernames found on both
# sides, so the result covers just the usernames scored by all three raters.
data.merged <- juan %>%
    merge(ellie, by.x = 'Juan Pablo', by.y = 'ellie') %>%
    # both inputs carry `classification`, so merge() suffixes them:
    # .x comes from juan, .y from ellie
    select(
        username = `Juan Pablo`,
        juan_pablo = classification.x,
        ellie = classification.y
    ) %>%
    merge(elizabeth, by.x = 'username', by.y = 'Elizabeth') %>%
    select(
        username,
        juan_pablo,
        ellie,
        elizabeth = classification
    )

In [5]:
# Inter-rater agreement (Light's kappa) across the three raters; `username`
# is an identifier rather than a rating, so it is dropped before the call.
data.merged %>%
    select(-username) %>%
    kappam.light()

 Light's Kappa for m Raters

 Subjects = 100 
   Raters = 3 
    Kappa = -0.0332 

        z = -0.0562 
  p-value = 0.955 

In [6]:
# Tally the votes per username: how many of them were 'bot' and how many
# votes were cast in total.
data.votes <- data.truncated %>%
    group_by(username) %>%
    summarise(
        bot_vote_count = sum(ifelse(classification == 'bot', 1, 0)),
        total_votes = n()
    )

In [7]:
# Majority vote per username. A username is flagged as a bot when at least
# half of its votes say 'bot', so an even split counts as a bot.
# Flagged usernames are sorted to the top.
data <- data.votes %>%
    mutate(think_is_bot = bot_vote_count * 2 >= total_votes) %>%
    arrange(desc(think_is_bot))

In [9]:
# Count the usernames on each side of the majority vote and print each count
# together with its share of all usernames.
total_usernames <- nrow(data)
count_think_is_bot <- nrow(filter(data, think_is_bot))
count_think_is_human <- nrow(filter(data, !think_is_bot))

cat('count think_is_bot:', count_think_is_bot, '\n')
cat('percent think_is_bot:', percent(count_think_is_bot / total_usernames), '\n')
cat('count think_is_human:', count_think_is_human, '\n')
cat('percent think_is_human:', percent(count_think_is_human / total_usernames), '\n')

count think_is_bot: 16 
percent think_is_bot: 11% 
count think_is_human: 134 
percent think_is_human: 89% 


In [12]:
# Write the per-username vote summary to the generated-data directory.
write_csv(
    data,
    './data/generated/coalesced_manual_scored_twitter_usernames.csv'
)