In [1]:
%load_ext rpy2.ipython

In [2]:
# Preprocessing so that YoungSoroka.2012.Rdata can be used as evaluation data

In [3]:
%%R
library(quanteda)
library(tidyverse)
library(lubridate)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.6
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.1     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.2.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


Package version: 4.3.1
Unicode version: 14.0
ICU version: 70.1
Parallel computing: 8 of 8 threads used.
See https://quanteda.io for tutorials and examples.


In [4]:
# Restore the objects stored in the .Rdata file into the embedded R session.
# The value echoed below the cell is the name of the restored object (`YS`).
%R load("./YoungSoroka.2012.Rdata")

array(['YS'], dtype='<U2')

In [5]:
%%R
# Build the evaluation data from the loaded `YS` object:
#   1. derive a 1-5 sentiment scale from the three coder labels,
#   2. tokenize the article bodies,
#   3. turn the document-feature matrix into a tibble for export.

# --- 1. Coder labels -> sentiment scale --------------------------------------
ys <- YS %>%
  as_tibble() %>%
  # Optional: uncomment one of the filters to restrict to a single topic.
  #filter(topic == 'Economy') %>%
  #filter(topic == 'Foreign') %>%
  mutate(
    # Join the date parts with an explicit separator. Without one, un-padded
    # months/days (e.g. 2005, 1, 15 -> "2005115") are ambiguous to parse.
    date = ymd(str_c(year, month, day, sep = "-")),
    # Per-coder flags; the comparison already yields TRUE/FALSE/NA.
    is_pos_1 = code_1 == 'Positive',
    is_neg_1 = code_1 == 'Negative',
    is_pos_2 = code_2 == 'Positive',
    is_neg_2 = code_2 == 'Negative',
    is_pos_3 = code_3 == 'Positive',
    is_neg_3 = code_3 == 'Negative',
    # Number of coders (0-3) that chose each polarity.
    n_pos = is_pos_1 + is_pos_2 + is_pos_3,
    n_neg = is_neg_1 + is_neg_2 + is_neg_3,
    # 5 = three positive, 4 = two positive, 2 = two negative,
    # 1 = three negative, 3 = everything else.
    scale = case_when(
      # Keep missing labels missing instead of letting them fall through to 3.
      is.na(n_pos) | is.na(n_neg) ~ NA_real_,
      n_pos == 3 ~ 5,
      n_pos == 2 ~ 4,
      n_neg == 2 ~ 2,
      n_neg == 3 ~ 1,
      TRUE ~ 3
    )
  )

# --- 2. Tokenization ----------------------------------------------------------
corpus <- corpus(ys, text_field = 'body')

toks <- corpus %>%
  #  corpus_reshape("sentences") %>%
  tokens(remove_punct = TRUE, remove_symbols = TRUE,
         remove_numbers = TRUE, remove_url = TRUE) %>%
  tokens_remove(stopwords("en", source = "marimo")) %>%
  # NOTE(review): presumably drops tokens shorter than 2 characters -- confirm
  # against the quanteda documentation for `min_nchar`.
  tokens_remove(min_nchar = 2) %>%
  # Glob patterns for timestamp / site-name residue.
  tokens_remove(c("*-time", "*-timeUpdated", "GMT", "BST", "*.com")) %>%
  tokens_tolower()

# --- 3. Document-feature matrix -> tibble -------------------------------------
mx <- toks %>%
  dfm()
# Optional trimming steps, currently disabled:
#  dfm_remove(pattern = "") %>%
#  dfm_trim(min_termfreq = 3) %>%
#  dfm_trim(min_docfreq = 3)
mx

D <- ndoc(mx)   # number of documents
V <- nfeat(mx)  # number of features (vocabulary size)

df <- convert(mx, to = "data.frame")
df <- as_tibble(rownames_to_column(df, "Id"))
df

# Every column except the two leading id columns ("Id", "doc_id") is a
# vocabulary term. A negative index stays correct when there are no feature
# columns, whereas 3:length(cols) would count backwards (3:2).
cols <- colnames(df)
vocab <- cols[-(1:2)]

In [6]:
%%R
# Attach the topic and the derived sentiment scale from `ys` right after the
# `doc_id` column.
# add_column() refuses to add a column whose name already exists, so drop any
# copies left by a previous run first; this makes the cell safe to re-run.
# Assumes `df` has one row per row of `ys`, in the same order -- TODO confirm.
df <- df %>%
  select(-any_of(c("ys_topic", "ys_scale"))) %>%
  add_column(ys_topic = ys$topic, .after = "doc_id") %>%
  add_column(ys_scale = ys$scale, .after = "ys_topic")

In [7]:
# Copy the R objects `toks`, `vocab` and `df` into the Python namespace.
%R -o toks,vocab,df

In [8]:
# Unpack the objects exported from R into plain Python lists.
doc_names = toks.keys()
r_toks = [toks[name] for name in doc_names]  # one entry per key, in key order
r_vocab = [*vocab]

In [9]:
# Show the exported table: id columns, topic, scale, then one column per term.
df

Unnamed: 0,Id,doc_id,ys_topic,ys_scale,rate,americans,bought,new,single,family,...,yiddish,peretz,ela,chaim,cunzer,uncle's,masterwork,handwritten,thesis,dissertation
1,1,text1,Economy,2.0,6.0,1.0,1.0,6.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,text2,Economy,3.0,1.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,text3,Economy,3.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,text4,Economy,3.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,text5,Economy,2.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
896,896,text896,Foreign,1.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
897,897,text897,Foreign,2.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
898,898,text898,Foreign,3.0,0.0,4.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
899,899,text899,Foreign,3.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Sanity check: number of token entries, the first entry, and the first ten
# vocabulary terms.
# NOTE(review): the integers look like 1-based ids into the vocabulary -- confirm.
len(r_toks), r_toks[0], r_vocab[:10]

(900,
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,  4, 17, 30, 31, 32,
         1,  9, 33, 34, 35,  1,  9, 33, 36, 37, 35, 14, 29, 38, 39, 30, 40,
        41, 42, 43, 30, 44, 23, 45, 46, 30, 47,  4,  5,  6, 17, 48, 46, 30,
        47, 49, 50,  9,  9,  9, 39,  4, 17, 30, 51, 29, 37, 52, 53, 29, 54,
        55,  9, 56, 57, 30, 51,  1,  4,  7, 55, 58, 59, 60, 61, 62, 14, 17,
        30, 63, 64, 35,  1, 13, 14, 65,  9, 30, 66, 13, 14, 67, 68, 69, 70,
        71, 17, 30, 72, 73, 74, 75, 23, 76, 30, 73, 74, 49, 77, 35,  1, 30,
        75, 78, 79,  4, 80, 81, 82], dtype=int32),
 ['rate',
  'americans',
  'bought',
  'new',
  'single',
  'family',
  'homes',
  'dropped',
  'percent',
  'government'])

In [11]:
import pandas as pd

# Destination of the exported table, relative to the notebook's working directory.
OUTPUT_CSV = 'YS.2012.csv'

# Write the table without the pandas index.
df.to_csv(OUTPUT_CSV, index=False)