Permalink
Browse files

initial import and tweaks from version from talk

  • Loading branch information...
0 parents commit f36f71a6217d04fab302ecc8b86ad95886f33f3d Jeffrey Breen committed Jul 11, 2011
@@ -0,0 +1,9 @@
+source("/Users/jbreen/Documents/workspace-cambridge/talk-201106-twitter-mining/R/0_start.R")  # NOTE(review): absolute, machine-specific path; this hunk reads like captured interactive history rather than project code -- confirm it should be versioned
+source("/Users/jbreen/Documents/workspace-cambridge/talk-201106-twitter-mining/R/load.R")
+source("/Users/jbreen/Documents/workspace-cambridge/talk-201106-twitter-mining/R/code.R")  # code.R is sourced five times in total in this capture
+source("/Users/jbreen/Documents/workspace-cambridge/talk-201106-twitter-mining/R/code.R")
+source("/Users/jbreen/Documents/workspace-cambridge/talk-201106-twitter-mining/R/code.R")
+?flush  # open the help page for the topic `flush`
+??flush  # search the installed help pages for "flush"
+source("/Users/jbreen/Documents/workspace-cambridge/talk-201106-twitter-mining/R/code.R")
+source("/Users/jbreen/Documents/workspace-cambridge/talk-201106-twitter-mining/R/code.R")
@@ -0,0 +1,20 @@
+# Project bootstrap.
+# Assumes R was started in the project's home directory, or that setwd()
+# has already been used to get there.
+
+projectDir <- getwd()
+
+# Sub-directories used by the project, resolved relative to the project root:
+codeDir   <- file.path(projectDir, "R")
+dataDir   <- file.path(projectDir, "data")
+outputDir <- file.path(projectDir, "output")
+
+# Switch tested with `if (VERBOSE)` before progress text is printed.
+VERBOSE <- TRUE
+
+if (VERBOSE) {
+  print("Loading libraries and functions for project")
+}
+
+library(twitteR)
+library(plyr)
+library(ggplot2)
+
+# Bring our score.sentiment() function into the session:
+source(file.path(codeDir, "sentiment.R"))
@@ -0,0 +1,34 @@
+
+# Load everything the analysis needs from disk: the saved tweets for each
+# airline, the Hu & Liu opinion lexicon, and the ACSI airline scores.
+# Relies on dataDir and VERBOSE already being defined in the session.
+
+# one '<airline>.tweets.RData' file is expected per airline:
+tweet.files <- file.path(dataDir,
+  paste(c('american', 'delta', 'jetblue', 'southwest', 'united', 'us'), 'tweets.RData', sep='.'))
+acsi.file <- file.path(dataDir, 'acsi.df.RData')
+
+# Check *every* saved file up front (not just the first one), so that an
+# incomplete scrape fails here with a clear message instead of part-way
+# through the load() calls below:
+missing.files <- c(tweet.files, acsi.file)
+missing.files <- missing.files[ !file.exists(missing.files) ]
+
+if (length(missing.files) > 0)
+{
+  stop("Scraped data not found on disk -- source('R/scrape.R') to scrape Twitter first (missing: ",
+       paste(basename(missing.files), collapse=', '), ")")
+}
+
+if (VERBOSE)
+  print("Loading tweets from disk:")
+
+# load() returns the names of the objects it restored; print them as a record.
+# A plain for loop (rather than an apply-style function) is used so load()
+# keeps restoring objects into the environment this script is evaluated in.
+for (tweet.file in tweet.files)
+  print( load(tweet.file) )
+
+
+if (VERBOSE)
+  print("Loading Hu & Liu opinion lexicon")
+
+# in the lexicon files everything after a ';' is treated as a comment and skipped
+hu.liu.pos <- scan(file.path(dataDir, 'opinion-lexicon-English', 'positive-words.txt'), what='character', comment.char=';')
+hu.liu.neg <- scan(file.path(dataDir, 'opinion-lexicon-English', 'negative-words.txt'), what='character', comment.char=';')
+
+# add a few twitter and industry favorites
+pos.words <- c(hu.liu.pos, 'upgrade')
+neg.words <- c(hu.liu.neg, 'wtf', 'wait', 'waiting', 'epicfail', 'mechanical')
+
+
+if (VERBOSE)
+  print("Loading ACSI airline scores")
+
+print( load(acsi.file) )
@@ -0,0 +1,72 @@
+
+# Score every airline's tweets, plot the score distributions, and compare the
+# Twitter-derived numbers with the ACSI scores.
+# Relies on objects already present in the session: the *.tweets objects,
+# pos.words, neg.words, acsi.df, outputDir, VERBOSE and score.sentiment().
+if (VERBOSE)
+{
+  print("Extracting text from tweets & calculating sentiment scores")
+  flush.console()
+}
+
+# pull the text out of each tweet object via its getText() method
+american.text  <- laply(american.tweets, function(t) t$getText() )
+delta.text     <- laply(delta.tweets, function(t) t$getText() )
+jetblue.text   <- laply(jetblue.tweets, function(t) t$getText() )
+southwest.text <- laply(southwest.tweets, function(t) t$getText() )
+united.text    <- laply(united.tweets, function(t) t$getText() )
+us.text        <- laply(us.tweets, function(t) t$getText() )
+
+# one data.frame of (score, text) per airline
+american.scores  <- score.sentiment(american.text, pos.words, neg.words, .progress='text')
+delta.scores     <- score.sentiment(delta.text, pos.words, neg.words, .progress='text')
+jetblue.scores   <- score.sentiment(jetblue.text, pos.words, neg.words, .progress='text')
+southwest.scores <- score.sentiment(southwest.text, pos.words, neg.words, .progress='text')
+united.scores    <- score.sentiment(united.text, pos.words, neg.words, .progress='text')
+us.scores        <- score.sentiment(us.text, pos.words, neg.words, .progress='text')
+
+# label each result with the airline's name and its two-character code;
+# 'code' is the key used for the merge with acsi.df further down
+american.scores$airline  <- 'American'
+american.scores$code     <- 'AA'
+delta.scores$airline     <- 'Delta'
+delta.scores$code        <- 'DL'
+jetblue.scores$airline   <- 'JetBlue'
+jetblue.scores$code      <- 'B6'
+southwest.scores$airline <- 'Southwest'
+southwest.scores$code    <- 'WN'
+united.scores$airline    <- 'United'
+united.scores$code       <- 'UA'
+us.scores$airline        <- 'US Airways'
+us.scores$code           <- 'US'
+
+all.scores <- rbind( american.scores, delta.scores, jetblue.scores,
+                     southwest.scores, united.scores, us.scores )
+
+if (VERBOSE)
+  print("Plotting score distributions")
+
+# geom_histogram() is used instead of geom_bar(binwidth=1): current ggplot2
+# releases only bin through the histogram geom
+g.hist <- ggplot(data=all.scores) + # ggplot works on data.frames, always
+  geom_histogram(mapping=aes(x=score, fill=airline), binwidth=1) +
+  facet_grid(airline~.) + # make a separate plot for each airline
+  theme_bw() + scale_fill_brewer() # plain display, nice colors
+
+print(g.hist)
+# NOTE(review): 'historam' looks like a typo for 'histogram'; the file name is
+# left unchanged in case something else reads this file -- confirm before renaming
+ggsave(file.path(outputDir, 'score_historam.pdf'), g.hist, width=6, height=8)
+
+
+if (VERBOSE)
+  print("Comparing Twitter & ACSI data")
+
+# flag the strongly positive (score >= 2) and strongly negative (score <= -2) tweets
+all.scores$very.pos <- as.numeric( all.scores$score >= 2 )
+all.scores$very.neg <- as.numeric( all.scores$score <= -2 )
+
+twitter.df <- ddply(all.scores, c('airline', 'code'), summarise, pos.count=sum( very.pos ), neg.count=sum( very.neg ) )
+
+twitter.df$all.count <- twitter.df$pos.count + twitter.df$neg.count
+
+# Twitter score = strongly positive tweets as a rounded percentage of all
+# strongly positive + strongly negative tweets
+twitter.df$score <- round( 100 * twitter.df$pos.count / twitter.df$all.count )
+
+# both inputs carry 'airline' and 'score' columns, hence the suffixes
+compare.df <- merge(twitter.df, acsi.df, by='code', suffixes=c('.twitter', '.acsi'))
+# compare.df = subset(compare.df, all.count > 100)
+
+# theme() replaces opts(), which has been removed from ggplot2.
+# NOTE(review): recent ggplot2 releases may warn about a numeric
+# legend.position and suggest legend.position.inside -- confirm against the
+# installed version.
+g.fit <- ggplot( compare.df ) +
+  geom_point(aes(x=score.twitter, y=score.acsi, color=airline.twitter), size=5) +
+  geom_smooth(aes(x=score.twitter, y=score.acsi, group=1), se=FALSE, method="lm") +
+  theme_bw() +
+  theme(legend.position=c(0.2, 0.85))
+
+print(g.fit)
+ggsave(file.path(outputDir, 'twitter_acsi_comparison.pdf'), g.fit, width=7, height=7)
@@ -0,0 +1,45 @@
+# Scrape the raw inputs for the analysis and save them under dataDir:
+# tweets mentioning each airline's Twitter handle (n=1500 requested per
+# search) and the ACSI airline scores table.
+# Relies on dataDir and VERBOSE already being defined in the session.
+if (VERBOSE)
+  print("Searching Twitter for airline tweets and saving to disk")
+
+# library() stops right here if the package is missing; require() would only
+# warn and let the script fail later with "could not find function"
+library(twitteR)
+
+# each result is saved under its own object name, which is the name load()
+# will restore it under later
+american.tweets <- searchTwitter('@americanair', n=1500)
+save(american.tweets, file=file.path(dataDir, 'american.tweets.RData' ), ascii=TRUE)
+
+delta.tweets <- searchTwitter('@delta', n=1500)
+save(delta.tweets, file=file.path(dataDir, 'delta.tweets.RData' ), ascii=TRUE)
+
+jetblue.tweets <- searchTwitter('@jetblue', n=1500)
+save(jetblue.tweets, file=file.path(dataDir, 'jetblue.tweets.RData' ), ascii=TRUE)
+
+southwest.tweets <- searchTwitter('@southwestair', n=1500)
+save(southwest.tweets, file=file.path(dataDir, 'southwest.tweets.RData' ), ascii=TRUE)
+
+united.tweets <- searchTwitter('@united', n=1500)
+save(united.tweets, file=file.path(dataDir, 'united.tweets.RData' ), ascii=TRUE)
+
+us.tweets <- searchTwitter('@usairways', n=1500)
+save(us.tweets, file=file.path(dataDir, 'us.tweets.RData' ), ascii=TRUE)
+
+
+if (VERBOSE)
+  print("Scraping ACSI airline scores and saving to disk")
+
+library(XML)
+
+# this assumes 2011 scores which just went live in June 2011
+acsi.url <- 'http://www.theacsi.org/index.php?option=com_content&view=article&id=147&catid=&Itemid=212&i=Airlines'
+
+# we want the first table (which=1) on the page, which has column headers (header=TRUE)
+acsi.raw.df <- readHTMLTable(acsi.url, header=TRUE, which=1, stringsAsFactors=FALSE)
+
+# codes for later matching, in the row order the table is expected to have
+acsi.codes <- c('WN', NA, NA, 'CO', 'AA', 'UA', 'US', 'DL', 'NW')
+
+# The column index (19) and the codes above are tied to one specific table
+# layout. Fail loudly if the page no longer matches it, rather than picking
+# up the wrong column or recycling the codes onto the wrong rows.
+if (!is.data.frame(acsi.raw.df) || ncol(acsi.raw.df) < 19 || nrow(acsi.raw.df) != length(acsi.codes))
+  stop("ACSI table layout is not as expected -- need a table with at least 19 columns and ",
+       length(acsi.codes), " rows")
+
+acsi.df <- acsi.raw.df[,c(1,19)]
+
+# change the column names ("11" -> "score" since we're only looking at most recent)
+colnames(acsi.df) <- c('airline', 'score')
+
+# add the codes, and make sure score is treated as a number (not a string)
+acsi.df$code <- acsi.codes
+acsi.df$score <- as.numeric(acsi.df$score)
+
+save(acsi.raw.df, file=file.path(dataDir, 'acsi.raw.df.RData'), ascii=TRUE)
+save(acsi.df, file=file.path(dataDir, 'acsi.df.RData'), ascii=TRUE)
@@ -0,0 +1,87 @@
+
+# Score each sentence as (number of positive words) - (number of negative words).
+#
+# Arguments:
+#   sentences  vector of text to score, one score per element
+#   pos.words  vector of positive terms
+#   neg.words  vector of negative terms
+#   .progress  passed through to plyr's laply() to choose a progress bar
+#
+# Returns a data.frame with columns 'score' and 'text' (the input sentences,
+# unmodified).
+score.sentiment <- function(sentences, pos.words, neg.words, .progress='none')
+{
+  # library() stops immediately if a package is missing; require() would only
+  # warn and let the call fail later with "could not find function"
+  library(plyr)
+  library(stringr)
+
+  # we got a vector of sentences. plyr will handle a list or a vector as an "l" for us
+  # we want a simple array of scores back, so we use "l" + "a" + "ply" = laply:
+  scores <- laply(sentences, function(sentence, pos.words, neg.words) {
+
+    # drop any bytes that are not valid UTF-8 first: a single malformed
+    # string would otherwise make tolower() stop the whole run with an
+    # "invalid multibyte string" error
+    sentence <- iconv(enc2utf8(as.character(sentence)), 'UTF-8', 'UTF-8', sub='')
+
+    # clean up sentences with R's regex-driven global substitute, gsub():
+    sentence <- gsub('[[:punct:]]', '', sentence)
+    sentence <- gsub('[[:cntrl:]]', '', sentence)
+    sentence <- gsub('\\d+', '', sentence)
+    # and convert to lower case:
+    sentence <- tolower(sentence)
+
+    # split into words. str_split is in the stringr package; it returns a
+    # list, which unlist() flattens to a plain character vector
+    words <- unlist(str_split(sentence, '\\s+'))
+
+    # %in% gives TRUE/FALSE per word (it is match() with the NAs already
+    # turned into FALSE), and sum() treats TRUE/FALSE as 1/0:
+    sum(words %in% pos.words) - sum(words %in% neg.words)
+  }, pos.words, neg.words, .progress=.progress )
+
+  data.frame(score=scores, text=sentences)
+}
+
+# rule of thumb: if you're writing loops in R, you're probably doing something wrong
+#
+# Explicit-loop version of the sentiment scorer: each sentence scores +1 for
+# every word found in pos.words and -1 for every word found in neg.words.
+#
+# Arguments:
+#   sentences  vector of text to score, one score per element
+#   pos.words  vector of positive terms
+#   neg.words  vector of negative terms
+#
+# Returns a data.frame with columns 'score' and 'text' (the input sentences,
+# unmodified).
+score.sentiment.loopy <- function(sentences, pos.words, neg.words)
+{
+  # library() stops immediately if the package is missing; require() would
+  # only warn and let the call fail later with "could not find function"
+  library(stringr)
+
+  # preallocate one slot per sentence instead of growing the vector with c()
+  scores <- numeric(length(sentences))
+
+  # seq_along() loops zero times for empty input; 1:length() would have
+  # counted 1, 0 and then failed on the non-existent elements
+  for (i in seq_along(sentences))
+  {
+    sentence <- sentences[i]
+
+    # clean up sentences with R's regex-driven global substitute, gsub():
+    sentence <- gsub('[[:punct:]]', '', sentence)
+    sentence <- gsub('[[:cntrl:]]', '', sentence)
+    sentence <- gsub('\\d+', '', sentence)
+    # and convert to lower case:
+    sentence <- tolower(sentence)
+
+    # split into words. str_split is in the stringr package
+    word.list <- str_split(sentence, '\\s+')
+    # sometimes a list() is one level of hierarchy too much
+    words <- unlist(word.list)
+
+    # reset our score
+    score <- 0
+
+    # iterate over the words themselves rather than over 1:length(words)
+    for (word in words)
+    {
+      # %in% is the binary operator for match() see ?`%in%` or ?match
+      if (word %in% pos.words)
+        score <- score + 1
+
+      if (word %in% neg.words)
+        score <- score - 1
+    }
+
+    scores[i] <- score
+  }
+
+  scores.df <- data.frame(score=scores, text=sentences)
+
+  return(scores.df)
+}
@@ -0,0 +1,116 @@
+RDA2
+A
+2
+134400
+131840
+1026
+1
+9
+7
+acsi.df
+787
+3
+16
+9
+9
+9
+Southwest
+9
+10
+All\040Others
+9
+8
+Airlines
+9
+11
+Continental
+9
+8
+American
+9
+6
+United
+9
+10
+US\040Airways
+9
+5
+Delta
+9
+18
+Northwest\040Airlines
+14
+9
+81
+76
+65
+64
+63
+61
+61
+56
+NA
+16
+9
+9
+2
+WN
+9
+-1
+9
+-1
+9
+2
+CO
+9
+2
+AA
+9
+2
+UA
+9
+2
+US
+9
+2
+DL
+9
+2
+NW
+1026
+1
+9
+5
+names
+16
+3
+9
+7
+airline
+9
+5
+score
+9
+4
+code
+1026
+1
+9
+9
+row.names
+13
+2
+NA
+-9
+1026
+1
+9
+5
+class
+16
+1
+9
+10
+data.frame
+254
+254
Oops, something went wrong.

0 comments on commit f36f71a

Please sign in to comment.