In [1]:
# YoungSoroka.2012.Rdata を評価データとして使うための前処理

In [2]:
library(quanteda)
library(tidyverse)
library(lubridate)

Package version: 4.3.1
Unicode version: 14.0
ICU version: 70.1

Parallel computing: 8 of 8 threads used.

See https://quanteda.io for tutorials and examples.

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.6
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 4.0.1     [32m✔[39m [34mtibble   [39m 3.3.0
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.2.0     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [3]:
# Restore the objects stored in the Young & Soroka (2012) workspace file,
# which is expected in the current working directory. The next cell reads an
# object named `YS`; presumably it is created by this call (TODO confirm with
# ls() after loading).
load("./YoungSoroka.2012.Rdata")

In [4]:
# Build the evaluation set from `YS`: keep one topic, parse the publication
# date and turn the three coder labels (code_1..code_3) into a 5-point scale.
ys <- YS %>%
  as_tibble() %>%
  filter(topic == "Economy") %>%   # alternative subset: topic == "Foreign"
  mutate(
    # Join the date parts with an explicit separator. Without one, unpadded
    # components are ambiguous (1991/1/12 and 1991/11/2 both give "1991112").
    date = ymd(str_c(year, month, day, sep = "-")),
    # Per-coder indicators; the comparison already yields TRUE/FALSE/NA.
    is_pos_1 = code_1 == "Positive",
    is_neg_1 = code_1 == "Negative",
    is_pos_2 = code_2 == "Positive",
    is_neg_2 = code_2 == "Negative",
    is_pos_3 = code_3 == "Positive",
    is_neg_3 = code_3 == "Negative",
    # Number of coders voting positive / negative (0..3, NA if a code is NA).
    n_pos = is_pos_1 + is_pos_2 + is_pos_3,
    n_neg = is_neg_1 + is_neg_2 + is_neg_3,
    # 5 = all positive, 4 = two positive, 3 = no majority either way,
    # 2 = two negative, 1 = all negative. Missing counts stay NA, as they
    # did with the nested if_else() form this replaces.
    scale = case_when(
      is.na(n_pos) | is.na(n_neg) ~ NA_real_,
      n_pos == 3 ~ 5,
      n_pos == 2 ~ 4,
      n_neg == 2 ~ 2,
      n_neg == 3 ~ 1,
      TRUE ~ 3
    )
  )

# Wrap the filtered articles in a quanteda corpus; the text lives in `body`,
# the remaining columns of `ys` travel along as document variables.
corpus <- corpus(ys, text_field = "body")

# Tokenise whole documents. (For sentence-level units, corpus_reshape(corpus,
# "sentences") could be applied first; it is not used here.)
toks <- tokens(
  corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE
)
# English stopwords from the "marimo" list.
toks <- tokens_remove(toks, stopwords("en", source = "marimo"))
# Length filter: presumably drops tokens shorter than two characters.
toks <- tokens_remove(toks, min_nchar = 2)
# Timestamp / time-zone / domain-like tokens matched by pattern.
toks <- tokens_remove(toks, c("*-time", "*-timeUpdated", "GMT", "BST", "*.com"))
# Lower-case whatever is left.
toks <- tokens_tolower(toks)

# Document-feature matrix over the cleaned tokens. No trimming is applied;
# dfm_remove(pattern = ""), dfm_trim(min_termfreq = 3) and
# dfm_trim(min_docfreq = 3) are possible extra steps that are currently off.
mx <- dfm(toks)
mx

D <- ndoc(mx)   # number of documents
V <- nfeat(mx)  # number of features (vocabulary size)

# Dense copy as a tibble: `Id` (row name), `doc_id`, then one count column
# per feature.
df <- mx %>%
  convert(to = "data.frame") %>%
  rownames_to_column("Id") %>%
  as_tibble()
df

# Vocabulary = every column after the two identifier columns. Negative
# indexing is used because 3:length(cols) would count backwards and return
# c(NA, "doc_id") if there were no feature columns at all.
cols <- colnames(df)
vocab <- cols[-(1:2)]


Document-feature matrix of: 450 documents, 13,140 features (98.71% sparse) and 19 docvars.
       features
docs    rate americans bought new single family homes dropped percent
  text1    6         1      1   6      2      2     2       1       8
  text2    1         0      0   2      0      0     0       0       0
  text3    0         0      0   1      0      0     0       0       0
  text4    0         0      0   4      0      0     0       0       3
  text5    0         0      0   1      0      0     0       0       2
  text6    1         0      0   3      0      0     0       0       0
       features
docs    government
  text1          1
  text2          1
  text3          0
  text4          0
  text5          1
  text6          0
[ reached max_ndoc ... 444 more documents, reached max_nfeat ... 13,130 more features ]

Id,doc_id,rate,americans,bought,new,single,family,homes,dropped,⋯,del,diller's,iac,interactivecorp,shrugging,recounting,xnational,cochrane,xkey,skirts
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,text1,6,1,1,6,2,2,2,1,⋯,0,0,0,0,0,0,0,0,0,0
2,text2,1,0,0,2,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
3,text3,0,0,0,1,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
4,text4,0,0,0,4,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
5,text5,0,0,0,1,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
6,text6,1,0,0,3,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
7,text7,0,0,0,1,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
8,text8,0,0,0,6,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
9,text9,1,0,0,2,0,0,0,2,⋯,0,0,0,0,0,0,0,0,0,0
10,text10,0,0,0,11,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [8]:
# Lookup table from word to its 1-based position in `vocab`.
# seq_along() gives an empty table for an empty vocabulary, whereas
# 1:length(vocab) would produce c(1, 0).
Word2Id <- setNames(seq_along(vocab), vocab)

# ---- Word tables, per-document offsets and dense bag-of-words --------------
# Reads `df` (columns: Id, doc_id, then one count column per feature), `D`
# and `V` from the preceding cell and builds:
#   w.1      one row per word occurrence          (Doc, Word)
#   w.2      one row per distinct (Doc, Word)     (Doc, Word, Freq)
#   offset.* M x 2 matrices: first/last row of each document in w.1 / w.2
#   bow      M x V matrix of counts
# Everything is derived from one count matrix instead of looking up each of
# the D x V cells of the tibble individually and rbind()-ing inside a loop.

# Dense D x V count matrix taken from the feature columns of `df`.
counts <- unname(as.matrix(df[, -(1:2)]))
stopifnot(nrow(counts) == D, ncol(counts) == V)

# A document without tokens has no rows in w.1 / w.2 and therefore no valid
# offset range, so fail early with a readable message.
empty_docs <- which(rowSums(counts) == 0)
if (length(empty_docs) > 0) {
  stop("Documents without any tokens: ", paste(empty_docs, collapse = ", "),
       ". Drop them before building the word tables.")
}

M <- D

# Non-zero cells of the transposed (V x D) matrix. which() walks it column by
# column, so rows come out ordered by document and, within a document, by
# ascending word id. Column 1 of `nz` is the word id, column 2 the document.
nz <- which(t(counts) > 0, arr.ind = TRUE)

w.2 <- data.frame(
  Doc  = as.integer(nz[, 2]),
  Word = as.integer(nz[, 1]),
  Freq = as.integer(counts[nz[, 2:1, drop = FALSE]])
)

# Expand every (Doc, Word) pair to one row per occurrence.
w.1 <- w.2[rep(seq_len(nrow(w.2)), times = w.2$Freq), c("Doc", "Word")]
rownames(w.1) <- NULL

# Word ids aligned with the rows of w.2, stored as plain integers.
W <- w.2$Word

N.1 <- nrow(w.1)  # total word instances
N.2 <- nrow(w.2)  # distinct (document, word) pairs

# First and last row index of each document's block; valid because rows are
# grouped by document in ascending order and no document is empty.
doc_offsets <- function(doc_ids, n_docs) {
  per_doc <- tabulate(doc_ids, nbins = n_docs)
  last <- cumsum(per_doc)
  matrix(c(last - per_doc + 1L, last), ncol = 2)
}
offset.1 <- doc_offsets(w.1$Doc, M)
offset.2 <- doc_offsets(w.2$Doc, M)

bow <- counts  # data type 3: bag-of-words

# Keep temporaries out of the workspace that is saved later.
rm(counts, nz, empty_docs, doc_offsets)

# Corpus-wide relative frequency of every feature.
freq <- featfreq(mx)
named_p_v <- freq / sum(freq)
# Same values without names (replaces growing a vector element by element).
p_v <- unname(named_p_v)

# The frequencies must line up with `vocab`, which was taken from the column
# names of `df`. Enforce this before saving so that a misaligned workspace is
# never written.
stopifnot(identical(names(freq), vocab))

# Persist the whole workspace for later use.
save.image("./YS.RData")

 400