-
Notifications
You must be signed in to change notification settings - Fork 54
/
demo.R
153 lines (123 loc) · 5.42 KB
/
demo.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
## Demonstration of quanteda's capabilities
##
## Ken Benoit <kbenoit@lse.ac.uk>
## Paul Nulty <p.nulty@lse.ac.uk>
library("quanteda")
# Open the package index for quanteda.
help(package = "quanteda")

## Build a corpus from the character vector of UK immigration texts ----

# Look at the raw input first: a summary, then its structure.
summary(data_char_ukimmig2010)
str(data_char_ukimmig2010)

# Turn the texts into a corpus, attaching a corpus-level note.
immigCorpus <- corpus(
  data_char_ukimmig2010,
  metacorpus = list(notes = "Created as part of a demo.")
)

# Attach document variables: the element names of the input vector supply
# `party`, and every document gets the same `year`.
docvars(immigCorpus) <- data.frame(
  party = names(data_char_ukimmig2010),
  year = 2010
)
summary(immigCorpus)
# Keyword-in-context searches over the immigration corpus, both with a
# context window of 3.
kwic(immigCorpus, pattern = "deport", window = 3)

# The second search wraps a two-word glob pattern in phrase().
kwic(immigCorpus, pattern = phrase("illegal immig*"), window = 3)
# Build a document-feature matrix from the documents whose `party` is "BNP"
# and draw a word cloud of it with the default settings.
immigDfm <- dfm(corpus_subset(immigCorpus, party == "BNP"))
textplot_wordcloud(immigDfm)

# Rebuild the matrix for the same subset, this time removing English
# stopwords, the word "will", and punctuation.
immigDfm <- dfm(
  corpus_subset(immigCorpus, party == "BNP"),
  remove = c(stopwords("english"), "will"),
  remove_punct = TRUE
)

# Redraw the cloud with five colours sampled at random from colors()[2:128];
# the sample is not seeded, so the palette differs from run to run.
textplot_wordcloud(
  immigDfm,
  random.color = TRUE,
  rot.per = 0.25,
  colors = sample(colors()[2:128], size = 5)
)
# Reshape the corpus so that its units are sentences, then summarise it
# with n = 20.
immigCorpusSent <- corpus_reshape(immigCorpus, to = "sentences")
summary(immigCorpusSent, n = 20)
## Tokenisation examples ----

# A short social-media style string used for the word-level examples.
txt <- "#TextAnalysis is MY <3 4U @myhandle gr8 #stuff :-)"

# The same text under three settings: remove_twitter left at its default,
# then explicitly FALSE, then explicitly TRUE.
tokens(txt, remove_punct = TRUE)
tokens(txt, remove_punct = TRUE, remove_twitter = FALSE)
tokens(txt, remove_punct = TRUE, remove_twitter = TRUE)

# Lower-case the text before tokenising; the outer parentheses make the
# assignment print its value.
(toks <- tokens(
  char_tolower(txt),
  remove_punct = TRUE,
  remove_twitter = TRUE
))

# Sentence-level tokens of the third immigration text.
(sents <- tokens(data_char_ukimmig2010[3], what = "sentence"))

# Character-level tokens of the fifth immigration text; show the first 100
# elements of its (single) document.
tokens(data_char_ukimmig2010[5], what = "character")[[1]][seq_len(100)]
## Descriptive statistics on the inaugural address corpus ----

# Overview of the corpus, then a document-feature matrix built from it.
summary(data_corpus_inaugural)
presDfm <- dfm(data_corpus_inaugural)
head(presDfm)
docnames(presDfm)

# Rebuild the matrix grouped by the "President" document variable and
# inspect the resulting document names.
presDfm <- dfm(
  data_corpus_inaugural,
  groups = "President",
  verbose = TRUE
)
presDfm
docnames(presDfm)

# Group by two document variables at once: "President" and "FirstName".
presDfm <- dfm(
  data_corpus_inaugural,
  groups = c("President", "FirstName"),
  verbose = TRUE
)
docnames(presDfm)
## show some selection capabilities on Irish budget corpus
# The Irish budget corpus is loaded from the quanteda.corpora package, which
# must be installed first, e.g. with
#   devtools::install_github("quanteda/quanteda.corpora")
data(data_corpus_irishbudgets, package = "quanteda.corpora")
summary(data_corpus_irishbudgets, 10)

# Keep only the documents whose `number` is "01" and whose `debate` is
# "BUDGET", then build a document-feature matrix from that subset.
ieFinMin <- corpus_subset(
  data_corpus_irishbudgets,
  number == "01" & debate == "BUDGET"
)
summary(ieFinMin)
dfmFM <- dfm(ieFinMin)

# Plot the "C" lexical diversity measure (Herdan's C) of each selected
# document against the year.
# NOTE(review): the x values are hard-coded as 2008:2012, which assumes the
# subset holds exactly five documents in year order -- confirm against the
# summary above (plot() errors if the lengths differ).
plot(
  2008:2012, textstat_lexdiv(dfmFM, "C")[["C"]],
  xlab = "Year", ylab = "Herdan's C", type = "b",
  main = "World's Crudest Lexical Diversity Plot"
)
# plot some readability statistics
# data_corpus_sotu is loaded from the quanteda.corpora package.
data(data_corpus_sotu, package = "quanteda.corpora")

# One Flesch-Kincaid score per document, plus the calendar year extracted
# from each document's "Date" variable.
stat <- textstat_readability(data_corpus_sotu, "Flesch.Kincaid")
year <- lubridate::year(docvars(data_corpus_sotu, "Date"))

# library() stops with an error if ggplot2 is not installed; require() would
# only return FALSE with a warning and let the script fail later at ggplot().
library("ggplot2")

# Colours handed to scale_colour_manual() for the `party` colour aesthetic.
partyColours <- c("blue", "blue", "black", "black", "red", "red")

# Scores, years and all document variables are bound column-wise into one
# data frame so that `party` is available to the colour aesthetic.
p <- ggplot(data = cbind(stat, year, docvars(data_corpus_sotu)),
            aes(x = year, y = Flesch.Kincaid)) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black")) +
  geom_smooth(alpha = 0.2, linetype = 1, color = "grey70",
              method = "loess", span = .34) +
  xlab("") +
  ylab("Flesch-Kincaid Readability") +
  geom_point(aes(colour = party)) +
  scale_colour_manual(values = partyColours) +
  geom_line(alpha = 0.3, size = 1) +
  # ggtitle("Text Complexity in State of the Union Addresses") +
  theme(plot.title = element_text(lineheight = .8, face = "bold"))
print(p)
## Presidential Inaugural Address Corpus ----

# Document-feature matrix of the inaugural addresses without English
# stopwords.
presDfm <- dfm(data_corpus_inaugural, remove = stopwords("english"))

# Document similarities: first for "1985-Reagan" with the default method,
# returned as a list.
as.list(
  textstat_simil(presDfm, selection = "1985-Reagan", margin = "documents")
)

# Then for the two Obama addresses, once with the "cosine" method and once
# with "ejaccard".
textstat_simil(
  presDfm,
  selection = c("2009-Obama", "2013-Obama"),
  method = "cosine"
)
textstat_simil(
  presDfm,
  selection = c("2009-Obama", "2013-Obama"),
  method = "ejaccard"
)

# Term similarities: compare the features "fair", "health" and "terror"
# using cosine similarity over the feature margin.
featsim <- textstat_simil(
  presDfm,
  selection = c("fair", "health", "terror"),
  margin = "features",
  method = "cosine"
)

# Show the head of each selected feature's list of similarities.
lapply(as.list(featsim), head)
## Mining collocations ----

# Forming n-grams from a single example sentence.
txt <- "Hey @kenbenoit #textasdata: The quick, brown fox jumped over the lazy dog!"

# Lower-cased tokens without punctuation; the parentheses print the result.
(toks1 <- tokens(char_tolower(txt), remove_punct = TRUE))

# n-grams requested directly from tokens(): size 2, then sizes 1 and 3.
tokens(char_tolower(txt), remove_punct = TRUE, ngrams = 2)
tokens(char_tolower(txt), remove_punct = TRUE, ngrams = c(1, 3))

# The same kind of result from the lower-level function, applied to the
# tokens made above, for sizes 1, 3 and 5.
tokens_ngrams(toks1, n = c(1, 3, 5))

# Skip-grams: vary the n-gram size and the set of allowed skips, joining the
# parts with a space.
toks <- tokens("insurgents killed in ongoing fighting")
tokens_skipgrams(toks, n = 2, skip = 0:1, concatenator = " ")
tokens_skipgrams(toks, n = 2, skip = 0:2, concatenator = " ")
tokens_skipgrams(toks, n = 3, skip = 0:2, concatenator = " ")
# Mine two-word collocations from the inaugural addresses. English stopwords
# and tokens matching the regex "\\p{P}" are removed with padding = TRUE
# (presumably so that removed tokens leave a gap rather than making their
# neighbours adjacent -- confirm against the quanteda documentation), and
# the tokens are lower-cased before scoring.
collocs2 <- tokens(data_corpus_inaugural) %>%
  tokens_remove(stopwords("english"), padding = TRUE) %>%
  tokens_remove("\\p{P}", valuetype = "regex", padding = TRUE) %>%
  tokens_tolower() %>%
  textstat_collocations(size = 2)
head(collocs2, n = 20)

# Mine three-word collocations with the same removal steps; this chain has
# no explicit lower-casing step.
collocs3 <- tokens(data_corpus_inaugural) %>%
  tokens_remove(stopwords("english"), padding = TRUE) %>%
  tokens_remove("\\p{P}", valuetype = "regex", padding = TRUE) %>%
  textstat_collocations(size = 3)
head(collocs3, n = 20)