-
Notifications
You must be signed in to change notification settings - Fork 6
/
basictextanalysis.R
136 lines (111 loc) · 5.07 KB
/
basictextanalysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# ---
# title: "Basic Text Analysis Using R (workshop material)"
# author: "Justin Ho"
# last updated: "27/05/2019"
# ---
# Installing the packages
# Only packages that are not available yet are installed, so re-running the
# script does not download and reinstall everything from CRAN each time.
# local() keeps the helper variables out of the global workspace.
local({
  required <- c("quanteda", "magrittr", "dplyr", "wordcloud", "ggplot2")
  # requireNamespace() returns FALSE (instead of erroring) when a package
  # cannot be loaded, which is exactly the "is it missing?" test needed here
  available <- vapply(required, requireNamespace, logical(1), quietly = TRUE)
  if (!all(available)) {
    install.packages(required[!available])
  }
})
# Loading the necessary packages
library(quanteda)
library(magrittr)
# Read the SNP posts and build a corpus from the "post_message" column
snp <- corpus(
  read.csv("SNP_corpus.csv", stringsAsFactors = FALSE),
  text_field = "post_message"
)

# Optional sanity checks: it is always a good idea to look at a portion of
# the data before analysing it
snp[seq_len(10)]          # the first 10 documents
ndoc(snp)                 # how many documents there are
docnames(snp)             # the document names
nchar(snp[seq_len(10)])   # character counts of the first 10 documents
ntoken(snp[seq_len(10)])  # token counts of the first 10 documents
# Token counts of the first 10 documents once punctuation is removed
ntoken(snp[seq_len(10)], remove_punct = TRUE)
# Defining custom stopwords (used further down, when stop words are removed)
customstopwords <- c("s", "http", "stopword")

# Creating the DFM
# Tokenise first, dropping punctuation, numbers and URLs
snptokens <- tokens(snp, remove_punct = TRUE, remove_numbers = TRUE,
                    remove_url = TRUE, verbose = TRUE)
# Trim the DFM with thresholds of 10 on term frequency and 5 on document
# frequency. `min_docfreq` is spelled out in full: the previous `min_doc`
# only worked through R's partial argument matching. `stem = FALSE` is no
# longer passed because it is dfm()'s default and the argument is deprecated
# in recent quanteda releases.
snpdfm <- dfm(snptokens) %>%
  dfm_trim(min_termfreq = 10, min_docfreq = 5)

# Inspecting the results
topfeatures(snpdfm, 30)

# A very slow way to plot a wordcloud, use with caution
# textplot_wordcloud(snpdfm)
# Plotting the term frequencies
library(ggplot2)
# No need to follow every detail here: to plot a different dfm, just replace
# "snpdfm" below
snpfeatures <- topfeatures(snpdfm, 100) # the top 100 words, kept in a new object
# Data frame for ggplot: one row per term together with its frequency
topDf <- data.frame(
  term = names(snpfeatures),
  frequency = unname(snpfeatures)
)
# Order the terms by descending frequency so the x axis follows that order
topDf$term <- reorder(topDf$term, -topDf$frequency)
ggplot(topDf, aes(x = term, y = frequency)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Doing it again, removing stop words this time!
# Stop words are dropped with dfm_remove() instead of dfm()'s `remove`
# argument, which (like `stem`) is deprecated in recent quanteda releases.
# `min_docfreq` is written out in full rather than relying on partial
# matching of `min_doc`.
snpdfm <- dfm(snptokens) %>%
  dfm_remove(c(stopwords("english"), customstopwords)) %>%
  dfm_trim(min_termfreq = 10, min_docfreq = 5)

# Inspecting the results again
topfeatures(snpdfm, 30)
# textplot_wordcloud(snpdfm,
# min.freq = 1,
# colors = c("#1B9E77","#D95F02","#7570B3","#E7298A"))

# Plotting it again
snpfeatures <- topfeatures(snpdfm, 100)
# Data frame for ggplot: one row per term together with its frequency
topDf <- data.frame(term = names(snpfeatures), frequency = unname(snpfeatures))
# Order the terms by descending frequency so the x axis follows that order
topDf$term <- with(topDf, reorder(term, -frequency))
ggplot(topDf) +
  geom_point(aes(x = term, y = frequency)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# kwic() is given a tokens object: calling it directly on a corpus is
# deprecated in recent quanteda releases. Tokenising once here (with
# punctuation kept, so the context reads like the original text) also avoids
# re-tokenising the whole corpus for every keyword.
snpkwictokens <- tokens(snp)
# Wait? What happens with "shared"?
kwic(snpkwictokens, "shared", window = 3)
# Keyword in Context
kwic(snpkwictokens, "brexit", window = 4)
kwic(snpkwictokens, "eu", window = 4)
# =================================== Keyness Analysis =======================================
# Loading the UKIP corpus
ukip <- read.csv("ukip_corpus.csv", stringsAsFactors = FALSE) %>%
  corpus(text_field = "post_message")
cat(ukip[1:3])

# Same preprocessing as for the SNP corpus: drop punctuation, numbers and URLs
ukiptokens <- tokens(ukip, remove_punct = TRUE, remove_numbers = TRUE,
                     remove_url = TRUE, verbose = TRUE)
# Stop words are dropped with dfm_remove() instead of dfm()'s `remove`
# argument, which is deprecated in recent quanteda releases; `min_docfreq` is
# written out in full rather than relying on partial matching of `min_doc`.
# NOTE: `customstopwords` must already be defined earlier in the script.
ukipdfm <- dfm(ukiptokens) %>%
  dfm_remove(c(stopwords("english"), customstopwords)) %>%
  dfm_trim(min_termfreq = 10, min_docfreq = 5)
topfeatures(ukipdfm)

# plotting it
ukipfeatures <- topfeatures(ukipdfm, 100)
# Data frame for ggplot: one row per term together with its frequency
topDf <- data.frame(term = names(ukipfeatures), frequency = unname(ukipfeatures))
# Order the terms by descending frequency so the x axis follows that order
topDf$term <- with(topDf, reorder(term, -frequency))
ggplot(topDf) +
  geom_point(aes(x = term, y = frequency)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Estimating Keyness
# The two DFMs are stacked with the SNP documents first, so the leading rows
# (one per SNP document) are the target and the UKIP rows are the reference
kwds <- textstat_keyness(
  rbind(snpdfm, ukipdfm),
  target = seq_len(ndoc(snpdfm))
)
head(kwds, n = 20)
tail(kwds, n = 20)
textplot_keyness(kwds)
# A user-defined function to plot a comparison cloud from a keyness result.
#
# Only terms with a p-value <= 0.05 are kept. Their chi-squared scores are
# split by sign into two columns: positive scores go to the column labelled
# `a`, negative scores to the column labelled `b` (the other column is set to
# 0 for that term). The two columns are then drawn with
# wordcloud::comparison.cloud().
#
# Arguments:
#   x           Keyness result providing `feature`, `chi2` and `p` columns.
#   a, b        Labels of the two groups shown in the cloud.
#   acol, bcol  Colours used for group `a` and group `b`.
#   w, h        Width and height handed to png() when a file is written.
#   maxword     Maximum number of words to plot.
#   png         If TRUE the cloud is written to "<expression passed as x>.png"
#               in the working directory; otherwise it is drawn on the
#               current graphics device.
#
# Returns (invisibly) the two-column data frame that was plotted.
# Stops with an error when no term reaches p <= 0.05.
keyness_cloud <- function(x, a = "A", b = "B", acol = "#00C094", bcol = "#F8766D", w = 600, h = 600, maxword = 500, png = TRUE) {
  # library() fails immediately if a package is missing, whereas require()
  # only returns FALSE and lets the function break later with an obscure error
  library(wordcloud)
  library(dplyr)

  # Captured up front; a long expression can deparse to several strings, so
  # collapse them to get exactly one file name
  file_stem <- paste(deparse(substitute(x)), collapse = "")

  # Fixed seed so the word layout is the same on every call
  set.seed(1024)

  # Select all words with p-value <= 0.05 (filtering on the column itself)
  kwdssig <- data.frame(term = x$feature, chi2 = x$chi2, p = x$p) %>%
    filter(p <= 0.05) %>%
    select(term, chi2)
  if (nrow(kwdssig) == 0) {
    # Checked before any device is opened so no empty PNG is left behind
    stop("no terms with p <= 0.05 to plot", call. = FALSE)
  }

  # One row per term; positive scores in the first column, negative scores in
  # the second, zeros elsewhere
  cloud_scores <- data.frame(
    pmax(kwdssig$chi2, 0),
    pmin(kwdssig$chi2, 0),
    row.names = as.character(kwdssig$term)
  )
  colnames(cloud_scores) <- c(a, b)

  if (png) {
    # The argument `png` hides the device function's name, hence the explicit
    # grDevices:: prefix. on.exit() closes the device even if plotting fails.
    grDevices::png(paste0(file_stem, ".png"), width = w, height = h)
    on.exit(grDevices::dev.off(), add = TRUE)
  }
  comparison.cloud(cloud_scores, random.order = FALSE, colors = c(acol, bcol),
                   scale = c(6, .6), title.size = 3, max.words = maxword)
  invisible(cloud_scores)
}
# Draw the comparison cloud for the keyness result `kwds`: SNP is the target
# corpus (goldenrod1), UKIP is the reference corpus (blueviolet), and the
# plot is saved to a PNG file
keyness_cloud(
  kwds,
  a = "SNP", b = "UKIP",
  acol = "goldenrod1", bcol = "blueviolet",
  png = TRUE
)