# Script to extract trending topics and Top 10 hashtags from Twitter API
This script requests tweets from the Twitter API for each of the 31 LGAs based on their localities/suburbs, which are stored in the csv file <b>filtered_locality.csv</b> together with their latitudes and longitudes. For each LGA, the search query is built by concatenating all suburbs under that LGA with `OR` (e.g. for Glen Eira City Council: 'caulfield OR glenhuntly OR Murrumbeena OR Carnegie OR Bentleigh OR Gardenvale OR Elsternwick OR BrightonEast OR McKinnon OR Ormond'). Up to 1000 tweets are requested per query, restricted to within 10 miles of the stored latitude and longitude. Note that due to the restrictions of the Twitter Public API, only tweets posted in the last 7 days will be returned.

Afterwards, the tweet text is cleaned to obtain the trending topics/keywords in each LGA, ranked by frequency, as well as the top 10 hashtags in each LGA, also ranked by frequency. The two resulting dataframes are then stored in csv files for further use in Microsoft Power BI visualisation.

### Output Documents
1. <b>TRENDING.csv</b> - contains 100 trending topics/keywords for each of the 31 LGA, their frequency count and the corresponding LGA
2. <b>HASHTAGS.csv</b> - contains top 10 hashtags for each of the 31 LGA, their frequency count and the corresponding LGA

### Note
- Manual input required in <b>Setup Twitter Connection</b> with the `consumer_key`, `consumer_secret`, `access_token` and `access_secret` obtained from Twitter Developers Website with individual account
- Ensure that `filtered_locality.csv` is in the same directory as this script file in order to ensure the script can read in the data

### Setup Twitter Connection

In [1]:
# import necessary library for Twitter API, data manipulation and text cleaning
library("twitteR")
library("ROAuth")
library("dplyr")
library("tidytext")
library("stringr")

# Set up Twitter Connection
############### INPUT TWITTER DEVELOPER API TOKENS HERE ###############
# The four values below are left blank on purpose; fill them in with the
# tokens issued to your own Twitter developer account before running.
consumer_key <- ""
consumer_secret <- ""
access_token <- ""
access_secret <- ""

# Authenticate this R session against the Twitter API.
setup_twitter_oauth(
    consumer_key = consumer_key,
    consumer_secret = consumer_secret,
    access_token = access_token,
    access_secret = access_secret
)


Attaching package: 'dplyr'

The following objects are masked from 'package:twitteR':

    id, location

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



[1] "Using direct authentication"


### Section 1 - Extract tweets and populate trending topics/hashtags

In [2]:
# read in filtered_locality which contains the 31 LGAs and their locality/suburbs
locality <- read.csv("./filtered_locality.csv", header = TRUE)

# ---- Settings ----
# we need to split the suburbs search to 20 each time due to Twitter limitations
suburbs_per_query <- 20
# maximum number of tweets requested per search
tweets_per_query <- 1000
# radius around the LGA's latitude/longitude to search in
search_radius <- "10mi"
# number of hashtags / keywords kept for each LGA
top_n_hashtags <- 10
top_n_words <- 100

# ---- Helpers ----

# Build a Twitter search string such as "caulfield OR glenhuntly" from a
# vector of suburb names. Spaces inside a suburb name are removed so that
# each suburb is a single search term.
build_search_string <- function(suburbs) {
    paste(gsub(" ", "", as.character(suburbs)), collapse = " OR ")
}

# Search Twitter for all suburbs of one LGA and return the tweets as one
# data frame, de-duplicated on tweet id.
#   suburbs    - vector of suburb names belonging to the LGA
#   geocode    - "latitude,longitude,radius" string passed to searchTwitter
#   chunk_size - maximum number of suburbs combined into a single search
#   n          - maximum number of tweets requested per search
# Returns NULL when none of the searches returned any tweet.
fetch_region_tweets <- function(suburbs, geocode, chunk_size = 20, n = 1000) {
    suburbs <- as.character(suburbs)
    # consecutive groups of at most chunk_size suburbs: 1..20, 21..40, ...
    chunk_id <- ceiling(seq_along(suburbs) / chunk_size)
    frames <- lapply(split(suburbs, chunk_id), function(chunk) {
        tweets <- searchTwitter(build_search_string(chunk), n = n,
                                geocode = geocode, lang = "en")
        # an empty result is skipped instead of being passed to twListToDF
        if (length(tweets) == 0) {
            return(NULL)
        }
        twListToDF(tweets)
    })
    frames <- Filter(Negate(is.null), unname(frames))
    if (length(frames) == 0) {
        return(NULL)
    }
    tweets_df <- do.call(rbind, frames)
    # the same tweet can match several searches; keep one row per tweet id
    tweets_df[!duplicated(tweets_df$id), ]
}

# Count the hashtags found in the raw tweet text and return the most
# frequent ones as a data frame with columns hashvec, n and LGA.
# Fewer than top_n rows are returned when fewer distinct hashtags exist.
top_hashtags <- function(text, region, top_n = 10) {
    # extract all hashtags in tweets text using regex, lower-cased
    tags <- tolower(unlist(str_extract_all(text, "#[a-zA-Z0-9]{1,}")))
    tags <- tags[!is.na(tags)]
    counts <- data.frame(hashvec = tags, stringsAsFactors = FALSE) %>%
        dplyr::count(hashvec, sort = TRUE)
    counts <- head(counts, top_n)
    # rep() keeps this valid even when there are no hashtags at all
    counts$LGA <- rep(region, nrow(counts))
    counts
}

# Strip symbols, mentions, punctuation, links and retweet markers from the
# tweet text so that only plain words remain.
clean_tweet_text <- function(text) {
    text <- as.character(text)
    text <- gsub("\\$", "", text)
    text <- gsub("@\\w+", "", text)
    text <- gsub("[[:punct:]]", "", text)
    # punctuation is already gone, so a URL is now one "http..." word
    text <- gsub("http\\w+", "", text)
    # collapse runs of whitespace to ONE space; deleting them would glue
    # the neighbouring words together
    text <- gsub("[ \t]{2,}", " ", text)
    # remove the retweet marker only as a whole word, not inside words
    text <- gsub("\\bRT\\b", "", text)
    text <- gsub("href", "", text)
    trimws(text)
}

# Tokenise the cleaned tweet text, drop stop words and return the most
# frequent words as a data frame with columns word, n and LGA.
# Fewer than top_n rows are returned when fewer distinct words exist.
top_words <- function(text, region, top_n = 100) {
    # split the text into individual tokens
    text_df <- dplyr::tibble(text = clean_tweet_text(text)) %>%
        unnest_tokens(word, text)
    # remove stopwords, count the words frequency, sort it by descending order
    counts <- text_df %>%
        anti_join(stop_words, by = "word") %>%
        dplyr::count(word, sort = TRUE)
    counts <- head(counts, top_n)
    counts$LGA <- rep(region, nrow(counts))
    counts
}

# ---- Main loop ----

# silence warnings while querying; the previous setting is restored below
old_options <- options(warn = -1)
# per-LGA results are collected in lists and combined once after the loop
hashtag_parts <- list()
trending_parts <- list()
# loop through the 31 LGA/Municipality Name
for (region in as.character(unique(locality$Municipality.Name))) {
    # filter the locality dataframe to this region
    temp <- locality %>% filter(Municipality.Name == region)
    # store latitude, longitude and set distance to search to 10 miles;
    # the second suburb's coordinates are used, falling back to the first
    # when the LGA has a single suburb
    centre_row <- min(2, nrow(temp))
    geocode <- paste(temp$latitude[centre_row], temp$longitude[centre_row],
                     search_radius, sep = ",")
    # request data from Twitter API using the suburb names in the first column
    df <- fetch_region_tweets(temp[[1]], geocode,
                              chunk_size = suburbs_per_query,
                              n = tweets_per_query)
    if (is.null(df)) {
        message("No tweets found for ", region, "; skipping")
        next
    }
    # hashtags are taken from the raw text, keywords from the cleaned text
    hashtag_parts[[length(hashtag_parts) + 1]] <-
        top_hashtags(df$text, region, top_n_hashtags)
    trending_parts[[length(trending_parts) + 1]] <-
        top_words(df$text, region, top_n_words)
}
options(old_options)

# combine the per-LGA results into the two output dataframes
HASHTAGS <- dplyr::bind_rows(hashtag_parts)
TRENDING <- dplyr::bind_rows(trending_parts)
print("Done!")

Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"


[1] "Done!"


### Section 2 - Output files

In [3]:
# Write each result dataframe to its csv file (no row names), keyed by path.
output_files <- list(
    "./TRENDING.csv" = TRENDING,
    "./HASHTAGS.csv" = HASHTAGS
)
for (output_path in names(output_files)) {
    write.csv(output_files[[output_path]], file = output_path, row.names = FALSE)
}

# ------------------------------------- END OF SCRIPT --------------------------------------#