# Script to extract event tweets and their sentiments from the Twitter API

This script first loads <b>eventbritedatapastevents.csv</b>, which contains all events held in the 31 LGAs over the past 28 days. For each event in this list, a request is then made to the Twitter API to retrieve the tweets relevant to the event, based on the event name and/or description. Note that the search string for each event has to be prepared manually, as different events have their own name/description formats. Up to 1000 tweets are requested within a 400-mile radius of a latitude/longitude point in Victoria, as event participants can come from anywhere in Victoria, not necessarily just the locality in which the event was held. Note that, due to the restrictions of the Twitter Public API, only tweets posted in the last 7 days will be returned.

Afterwards, the tweet messages are cleaned and sentiment analysis is carried out using `get_sentiments('nrc')`, which maps each token/word in the tweet messages to the 10 sentiment categories, namely <b>anger, anticipation, disgust, fear, joy, negative, positive, sadness, surprise, trust</b>. Once done, this script outputs the tweet messages together with their sentiments, and another dataframe containing the percentages of sentiments for each event, for further use in Microsoft Power BI visualisation.

### Output Documents
1. <b>TWEETS.csv</b> - contains relevant tweet messages on the past events in 31 LGAs and their sentiments
2. <b>SENTIMENTS.csv</b> - contains percentages of sentiments for each past event in 31 LGAs

### Note
- Manual input is required in <b>Setup Twitter Connection</b>: supply the `consumer_key`, `consumer_secret`, `access_token` and `access_secret` obtained from the Twitter Developers website using your own account
- Before loading in `eventbritedatapastevents.csv`, ensure that manual input of `search_string` has been done in Microsoft Excel in order to search for the relevant tweets pertaining to each event.
- Ensure that `eventbritedatapastevents.csv` is in the same directory as this script so the dataset can be loaded in accordingly

### Setup Twitter Connection

In [None]:
# import necessary library for Twitter API, data manipulation and text cleaning
library("twitteR")
library("ROAuth")
library("dplyr")
library("tidytext")
library("stringr")
library("tidyr")

# Set up Twitter Connection
# Either type the four credentials in below, or leave them blank and provide
# them through the TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET,
# TWITTER_ACCESS_TOKEN and TWITTER_ACCESS_SECRET environment variables so the
# secrets never have to be saved in this file.
consumer_key <- ""                               ############### INPUT TWITTER DEVELOPER API TOKENS HERE ###############
consumer_secret <- ""
access_token <- ""
access_secret <- ""

# fall back on the environment for any credential that was left blank above
if (!nzchar(consumer_key)) {
    consumer_key <- Sys.getenv("TWITTER_CONSUMER_KEY")
}
if (!nzchar(consumer_secret)) {
    consumer_secret <- Sys.getenv("TWITTER_CONSUMER_SECRET")
}
if (!nzchar(access_token)) {
    access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
}
if (!nzchar(access_secret)) {
    access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET")
}

# stop with a clear message instead of sending blank credentials to Twitter
twitter_credentials <- c(
    consumer_key = consumer_key,
    consumer_secret = consumer_secret,
    access_token = access_token,
    access_secret = access_secret
)
missing_credentials <- names(twitter_credentials)[!nzchar(twitter_credentials)]
if (length(missing_credentials) > 0) {
    stop(
        "Missing Twitter API credential(s): ",
        paste(missing_credentials, collapse = ", "),
        call. = FALSE
    )
}

setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)

### Section 1 - Extract tweets and populate sentiments

In [None]:
# read in list of past 28 days events in the 31 LGAs
events <- read.csv("./eventbritedatapastevents.csv", header = TRUE)

# Warnings are silenced only while the extraction runs; the previous setting
# is restored once the loop has finished.
old_options <- options(warn = -1)

# the 10 sentiment categories reported for every event
all_sen_list <- c("anger", "anticipation", "disgust", "fear", "joy",
                  "negative", "positive", "sadness", "surprise", "trust")

# load the lexicon once instead of on every iteration of the loop
nrc_lexicon <- get_sentiments("nrc")

# Clean raw tweet text before tokenising.
#   text: vector of tweet messages.
# Returns a character vector of the same length with "$", @mentions,
# punctuation and URL remnants removed and whitespace normalised.
clean_tweet_text <- function(text) {
    text <- as.character(text)
    text <- gsub("\\$", "", text)
    text <- gsub("@\\w+", "", text)
    text <- gsub("[[:punct:]]", "", text)
    # punctuation is already gone, so a URL is now a single "http..." token;
    # it has to be removed from the text itself, not from statusSource
    text <- gsub("http\\w+", "", text)
    # collapse runs of whitespace into ONE space; replacing them with "" would
    # glue neighbouring words together and hide them from the lexicon
    text <- gsub("[ \t]{2,}", " ", text)
    trimws(text)
}

# Percentage of an event's tweets that contain at least one word of each
# sentiment.
#   tweet_sentiments: one row per (tweet, word, sentiment) with columns
#                     `id` and `sentiment`; must have at least one row.
#   sentiments:       sentiment categories to report, in output column order.
# Returns a one-row data frame with one numeric column per sentiment;
# sentiments absent from the event are reported as 0.
sentiment_percentages <- function(tweet_sentiments, sentiments = all_sen_list) {
    # tweets are counted by id: `created` only has one-second resolution, so
    # two tweets posted in the same second would be counted once and could
    # push a percentage above 100
    n_tweets <- length(unique(tweet_sentiments$id))
    percentages <- vapply(
        sentiments,
        function(sentiment_name) {
            ids <- tweet_sentiments$id[tweet_sentiments$sentiment == sentiment_name]
            length(unique(ids)) / n_tweets * 100
        },
        numeric(1)
    )
    as.data.frame(as.list(percentages), stringsAsFactors = FALSE)
}

# Stack the per-event data frames collected in a list, skipping events that
# produced nothing. Base rbind is kept so column types are coerced exactly as
# they were when the frames were appended one at a time.
combine_rows <- function(frames) {
    frames <- Filter(Negate(is.null), frames)
    if (length(frames) == 0) {
        return(data.frame())
    }
    do.call(rbind, frames)
}

# one slot per event; a slot stays NULL when the event yields no usable tweets
tweets_by_event <- vector("list", nrow(events))
sentiments_by_event <- vector("list", nrow(events))

# loop through each of the past event
for (i in seq_len(nrow(events))) {
    # request from Twitter API based on search_string, no of tweets 1000,
    # lat and lon of Victoria with 400 miles distance
    tweets <- searchTwitter(
        as.character(events$search_string[i]),
        n = 1000,
        geocode = "-37.4713,144.7852,400mi",
        lang = "en"
    )
    # nothing returned for this event
    if (length(tweets) == 0) {
        next
    }

    # convert to dataframe and do text cleaning on the tweets text
    df <- twListToDF(tweets)
    df$text <- clean_tweet_text(df$text)

    # one row per token, then keep only tokens found in the NRC lexicon
    text_df <- data.frame(id = df$id, text = df$text, stringsAsFactors = FALSE) %>%
        unnest_tokens(word, text)
    word_sentiments <- inner_join(text_df, nrc_lexicon, by = "word")

    # attach the tweet details; tweets without any sentiment word drop out
    event_tweets <- inner_join(df, word_sentiments, by = "id")
    # applies to every event, including the first one: without this check a
    # first event with tweets but no sentiment words stopped the script
    if (nrow(event_tweets) == 0) {
        next
    }

    # add in event name and LGA
    event_tweets$event <- as.character(events$name[i])
    event_tweets$LGA <- as.character(events$LGA[i])

    # % of this event's tweets carrying each sentiment, plus event name, LGA,
    # event date and culture category
    event_sentiments <- sentiment_percentages(event_tweets)
    event_sentiments$event <- as.character(events$name[i])
    event_sentiments$LGA <- as.character(events$LGA[i])
    event_sentiments$start_date <- as.character(events$start_date[i])
    event_sentiments$category <- as.character(events$category[i])

    tweets_by_event[[i]] <- event_tweets
    sentiments_by_event[[i]] <- event_sentiments
}

# events for which no relevant tweet (with a recognised sentiment) was found
no_tweets_events <- as.character(events$name)[
    vapply(tweets_by_event, is.null, logical(1))
]

# combine once after the loop; both objects exist even if no event had tweets
TWEETS <- combine_rows(tweets_by_event)
SENTIMENTS <- combine_rows(sentiments_by_event)

options(old_options)
print("Done!")

### Section 2 - Output files

In [None]:
# output the 2 dataframes into .csv files
# Fail early with an actionable message when Section 1 has not produced the
# dataframes (it was not run, or it stopped before creating them).
if (!exists("TWEETS") || !exists("SENTIMENTS")) {
    stop(
        "TWEETS/SENTIMENTS not found - run Section 1 first and check that at ",
        "least one event returned tweets.",
        call. = FALSE
    )
}
write.csv(TWEETS, file = "./TWEETS.csv", row.names = FALSE)
write.csv(SENTIMENTS, file = "./SENTIMENTS.csv", row.names = FALSE)

# ------------------------------------- END OF SCRIPT --------------------------------------#