-
Notifications
You must be signed in to change notification settings - Fork 54
/
social_media_example.R
119 lines (89 loc) · 3.99 KB
/
social_media_example.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
## Code for examples from Day 10, Social Media
library(quanteda)
##
## Facebook
##
# Access token: https://developers.facebook.com/tools/explorer/
# Placeholder: replace the string below with your own access token.
token <- "YOUR_TOKEN"
library(Rfacebook)

## Scraping the 20 most recent posts from each of the Obama and Trump FB pages
obama <- getPage("barackobama", token = token, n = 20)
trump <- getPage("DonaldTrump", token = token, n = 20)

# creating corpus object: the post text becomes the documents and every
# other column of the scraped data becomes a document variable
fbdata <- rbind(obama, trump)
# A logical mask (rather than -which(...)) still keeps all columns if
# "message" is ever absent, and drop = FALSE guarantees a data.frame.
fbcorpus <- corpus(fbdata$message,
                   docvars = fbdata[, names(fbdata) != "message", drop = FALSE])
summary(fbcorpus)

# viewing the dfm using word clouds: stopwords and link fragments removed,
# features stemmed, documents grouped by `from_name`, then compared
dfm(fbcorpus,
    remove = c(stopwords("english"), "ofa.bo", "http"),
    stem = TRUE,
    groups = "from_name") %>%
  textplot_wordcloud(comparison = TRUE)
##
## Twitter
##
## data for authentication
## get your keys at apps.twitter.com
library(twitteR)
# Placeholders: replace each "XXX" with the credentials of your own app.
key <- "XXX"
cons_secret <- "XXX"
# Named access_token so it does not overwrite the Facebook `token` above.
access_token <- "XXX"
access_secret <- "XXX"
# authorize the Twitter access
setup_twitter_oauth(key, cons_secret, access_token, access_secret)
## downloading the most recent tweets for an account (3,200 requested),
## retweets included
tw <- userTimeline("realDonaldTrump", n = 3200, includeRts = TRUE)
twDf <- twListToDF(tw)
twCorpus <- corpus(twDf)
head(texts(twCorpus))
table(twCorpus[["isRetweet"]], useNA = "ifany")

## document-feature matrix of all tweets, followed by its word cloud
twDfm <- dfm(
  twCorpus,
  remove = c("amp", "rt", "https", "t.co", "will", "@realdonaldtrump",
             stopwords("english"))
)
textplot_wordcloud(twDfm, min.freq = 20, random.order = FALSE)

## hashtags only, without lower-casing
twDfmHT <- dfm(twCorpus, keep = "#*", toLower = FALSE)
textplot_wordcloud(twDfmHT, min.freq = 5, random.order = FALSE)

## usernames only, leaving out the account's own handle
twDfmUN <- dfm_remove(dfm(twCorpus, keep = "@*"), "@realdonaldtrump")
textplot_wordcloud(twDfmUN, min.freq = 5, random.order = FALSE)
## compare candidates: fetch a second timeline, combine both corpora and
## build one dfm grouped by `screenName`
twCruz <- userTimeline("tedcruz", n = 3200, includeRts = TRUE)
twCruzDf <- twListToDF(twCruz)
twCruzCorpus <- corpus(twCruzDf)
candDfm <- dfm(
  twCorpus + twCruzCorpus,
  groups = "screenName",
  remove = c(
    "amp", "rt", "https", "t.co", "will",
    "@tedcruz", "@realdonaldtrump",
    stopwords("english")
  )
)
# comparison cloud across the groups of the dfm
textplot_wordcloud(candDfm, comparison = TRUE)
## analyze time of tweets
# from which device? Classify each tweet by the client named in its
# statusSource; anything that matches none of the three patterns becomes NA.
device <- docvars(twCorpus, "statusSource")
device <- ifelse(grepl("Android", device), "Android",
                 ifelse(grepl("iPhone", device), "iPhone",
                        ifelse(grepl("Web Client", device), "Web Client", NA)))
table(device, useNA = "ifany")

# time of day - remove the day
# Read the hour and minute fields straight from the timestamp and express
# them as a decimal hour, instead of subtracting a re-parsed midnight.
# NOTE(review): hours are in whatever time zone the `created` values
# carry -- confirm this is the zone you want to plot.
created.lt <- as.POSIXlt(docvars(twCorpus, "created"))
time.hour <- created.lt$hour + created.lt$min / 60

# one row per tweet: posting device and decimal time of day
trumpDf <- data.frame(device, timeofDay = time.hour)

# plot: one histogram of posting times per device on a 2 x 2 grid
par(mfrow = c(2, 2))
deviceCols <- c("Android" = "grey50", "iPhone" = "grey60",
                "Web Client" = "grey70")
for (dev in names(deviceCols)) {
  # which() drops the NA comparisons produced by unclassified devices
  devHours <- time.hour[which(device == dev)]
  if (length(devHours) == 0) {
    # hist() cannot draw an empty vector; skip devices with no tweets
    next
  }
  hist(devHours, breaks = 48, ylim = c(0, 100),
       col = deviceCols[[dev]], xlab = "24-hr time format", main = dev,
       xaxt = "n")
  axis(1, at = 0:24, cex = .7, labels = paste(0:24, "00", sep = ":"), las = 2)
}
###
### searching using REST API
###
# request 1,000 tweets matching "brexit" posted since 2016-01-01
searchTweets <- searchTwitter("brexit", n = 1000, since = "2016-01-01")
searchTweetsDf <- twListToDF(searchTweets)
# The tweet text becomes the documents; every other column becomes a
# document variable. A logical mask (rather than -which(...)) still keeps
# all columns if "text" is ever absent, and drop = FALSE guarantees a
# data.frame.
searchTweetsCorpus <- corpus(
  searchTweetsDf$text,
  docvars = searchTweetsDf[, names(searchTweetsDf) != "text", drop = FALSE]
)
searchTweetsDfm <- dfm(searchTweetsCorpus, verbose = FALSE,
                       remove = c("amp", "rt", "https", "t.co", "will",
                                  stopwords("english")))
# 30 most frequent features overall, then among hashtags only
topfeatures(searchTweetsDfm, 30)
topfeatures(dfm_select(searchTweetsDfm, "#*"), 30)