In [52]:
import string
import pandas as pd
import numpy as np
from polyglot.detect import Detector
import re
import warnings
warnings.filterwarnings(action = "ignore")
import icu
import nltk

## Punctuation, URL and text removal/replacements, followed by stopword removal

In [89]:
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
#nltk.download('averaged_perceptron_tagger')

# Stopword vocabulary: NLTK's English list extended with two extra entries,
# the word "zapier" and the single-character ellipsis "…".
en_stopwords = set(stopwords.words('english')) | {"zapier", "…"}

# Judging by its name, this lists stopwords that should be kept in the analysis.
# NOTE(review): nothing in the visible cells reads this list — confirm whether
# it was meant to be subtracted from `en_stopwords`.
# Entries currently commented out: "on", "to", "and", "with", "you".
stop_words_to_put_back = [
    "after", "any", "both", "between", "do", "once", "these", "off",
    "through", "what", "when", "while", "which", "where",
]

# Text-cleaning helpers: the patterns are compiled once at definition time
# instead of on every call to clean_text().

# URLs: anything starting with "http" or "www", plus any whitespace-free run
# containing ".com" (a bare "example.com" with nothing after it is included).
# https://stackoverflow.com/questions/24399820/expression-to-remove-url-links-from-twitter-tweet/24399874
_URL_RE = re.compile(r"http\S+|www\S+|\S+\.com\S*")

# Contractions expanded before punctuation is stripped. The word boundaries let
# a contraction match at the end of a tweet or before punctuation/newlines, and
# stop it from matching inside a longer word (e.g. "bandit's").
_CONTRACTION_RULES = [
    (re.compile(r"\bi'm\b"), "i am"),
    (re.compile(r"\bi'll\b"), "i will"),
    (re.compile(r"\blet's\b"), "let us"),
    (re.compile(r"\bit's\b"), "it is"),
    (re.compile(r"\bthere's\b"), "there is"),
    (re.compile(r"\bdon't\b"), "do not"),
]

# Comma-formatted numbers (optionally followed by "+") and plain digit runs.
_DIGITS_RE = re.compile(r"\d*[,]+\d*[+]?|\d+")
# Twitter hashtags and handles.
_HASHTAG_RE = re.compile(r"#\S+")
_HANDLE_RE = re.compile(r"@\S+")
# ASCII punctuation plus carriage returns and tabs; "\n" is deliberately not
# in the class, so newlines survive this step.
_PUNCT_RE = re.compile('[' + re.escape(string.punctuation) + r'\r\t]')


def _is_word_char(char):
    """Return True when ``char`` is matched by the regex class ``\\w``."""
    return re.match(r"\w", char) is not None


def _stopword_pattern(words):
    """Compile one regex that matches any entry of ``words``.

    Entries that start and end with a word character are matched as whole
    words (``\\b...\\b``). Entries such as "…" cannot be anchored with ``\\b``
    (there is no word boundary next to a non-word character), so they are
    matched wherever they occur. Every entry is regex-escaped. Returns None
    when ``words`` holds no usable entry.
    """
    whole_words, symbols = [], []
    # Longest first, so a longer symbol wins over a shorter one that prefixes it;
    # the secondary sort key only makes the pattern deterministic.
    for word in sorted(words, key=lambda w: (-len(w), w)):
        if not word:
            continue
        if _is_word_char(word[0]) and _is_word_char(word[-1]):
            whole_words.append(re.escape(word))
        else:
            symbols.append(re.escape(word))

    alternatives = []
    if whole_words:
        alternatives.append(r"\b(?:%s)\b" % "|".join(whole_words))
    alternatives.extend(symbols)
    if not alternatives:
        return None
    return re.compile("|".join(alternatives))


def clean_text(text):
    """Lower-case, clean and tokenize ``text``.

    Steps, in order:
      1. convert ``text`` to ``str`` and lower-case it;
      2. replace URLs with a space;
      3. expand a fixed set of contractions ("i'm" -> "i am", ...);
      4. replace numbers, hashtags, handles and punctuation with a space
         (newlines are kept);
      5. replace every entry of the module-level ``en_stopwords`` with a space;
      6. tokenize the result with ``word_tokenize``.

    Parameters
    ----------
    text : any
        Input text; non-string values are converted with ``str``.

    Returns
    -------
    list
        The tokens produced by ``word_tokenize``.
    """
    cleaned = _URL_RE.sub(" ", str(text).lower())

    for contraction_re, expansion in _CONTRACTION_RULES:
        cleaned = contraction_re.sub(expansion, cleaned)

    # Order matters: digits go first, punctuation last (hashtags and handles
    # are recognised by their leading "#" / "@", which is punctuation).
    for removal_re in (_DIGITS_RE, _HASHTAG_RE, _HANDLE_RE, _PUNCT_RE):
        cleaned = removal_re.sub(" ", cleaned)

    # Built per call so that later edits to `en_stopwords` are picked up.
    stopword_re = _stopword_pattern(en_stopwords)
    if stopword_re is not None:
        cleaned = stopword_re.sub(" ", cleaned)

    return word_tokenize(cleaned)

## Unigram Frequency Distribution Dictionary

In [190]:
def freq_distribution(dat):
    """Build the unigram frequency distribution of the English tweets in ``dat``.

    Parameters
    ----------
    dat : pandas.DataFrame
        Must contain a ``"text"`` column. The frame is not modified; all work
        is done on a copy.

    Returns
    -------
    fd : nltk.probability.FreqDist
        Token counts over the cleaned text.
    df : pandas.DataFrame
        One row per token with columns ``sorting_order`` (0 = most frequent),
        ``Word``, ``Count`` and ``Prob`` (``Count`` divided by the total count),
        sorted by ``Count`` in descending order.
    """
    from nltk.probability import FreqDist

    # Copy first: the original wrote a "num_words" column into the caller's frame.
    tweets = dat.copy()

    # Coerce every cell to str so that missing or numeric values cannot break
    # the character-level processing below.
    tweets["text"] = tweets["text"].map(str)

    # Drop texts with fewer than two words, as a pre-filter for language detection.
    tweets["num_words"] = tweets["text"].map(lambda t: len(re.findall(r'\w+', t)))
    tweets = tweets[tweets["num_words"] > 1].reset_index(drop=True)

    # Strip non-ASCII characters (otherwise language detection throws an error).
    tweets["text"] = tweets["text"].map(
        lambda t: t.encode("ascii", "ignore").decode("ascii")
    )

    # Keep only the tweets whose detected language displays as "English".
    # quiet=True is passed to the detector for every tweet, as before.
    tweets["language"] = tweets["text"].map(
        lambda t: icu.Locale.getDisplayName(Detector(t, quiet=True).language.locale)
    )
    tweets = tweets[tweets["language"] == "English"].reset_index(drop=True)

    # Join all tweets into one string separated by "\n", so the tweets stay on
    # separate lines (avoids a false signal between the last word of one tweet
    # and the first word of the next, e.g. for bigrams).
    corpus = "\n".join(tweets["text"])

    clean_tokens = clean_text(corpus)
    fd = FreqDist(clean_tokens)

    df = pd.DataFrame(list(fd.items()), columns=["Word", "Count"])
    df["Prob"] = df["Count"] / df["Count"].sum()

    # Sort by frequency, then expose the resulting rank as a "sorting_order" column.
    df = (
        df.sort_values(by="Count", ascending=False)
          .reset_index(drop=True)
          .rename_axis("sorting_order")
          .reset_index()
    )

    return fd, df

## Altair plot for Freq dist


In [191]:
import altair as alt
import selenium

def freq_dist_plot(df, year = "2019", max_rows = 50):
    """Bar chart of the first ``max_rows`` rows of a unigram frequency table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``Word``, ``Count`` and ``sorting_order`` columns, as
        returned by ``freq_distribution``.
    year : str or int
        Label appended to the chart title.
    max_rows : int
        Number of leading rows of ``df`` to plot.

    Returns
    -------
    altair.Chart
        The configured bar chart.
    """
    top_words = df[:max_rows]

    # Order the bars by the rank column that freq_distribution() creates. It is
    # called "sorting_order"; the previous field name "sort_order" does not
    # exist in that table.
    rank_sort = alt.EncodingSortField(field="sorting_order", op="mean", order="ascending")

    plot = alt.Chart(top_words).mark_bar().encode(
                x = alt.X("Word:N", sort=rank_sort),
                y = alt.Y("Count:Q"),
                tooltip = ["Word","Count"]
            ).properties(
                width = 1200,
                # str() lets callers pass the year as an int as well.
                title = "Unigrams Frequency Distribution in "+str(year)
            ).configure_axisX(
                labelAngle = -45
            )

    return plot

## WordCloud Generation

In [192]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def unigram_word_cloud(fd, min_filter = 50, background_color = "white"):
    """Build a word cloud from the most common entries of ``fd``.

    Parameters
    ----------
    fd : frequency dictionary exposing ``most_common(n)``
        Source of the (word, count) pairs.
    min_filter : int
        Passed to ``fd.most_common``, i.e. the number of top entries kept.
    background_color : str
        Background colour handed to ``WordCloud``.

    Returns
    -------
    WordCloud
        The 3000x1500 cloud after ``generate_from_frequencies``. Nothing is
        drawn here; the caller decides how to show or save it.
    """
    cloud = WordCloud(
        width = 3000,
        height = 1500,
        background_color = background_color,
    )

    # Restrict the cloud to the top entries only.
    top_frequencies = dict(fd.most_common(min_filter))
    cloud.generate_from_frequencies(top_frequencies)

    return cloud

# Read in all the data from all years

In [193]:
# Load the scraped data, one CSV per year (read newest first).
dat_2019, dat_2018, dat_2017, dat_2016 = [
    pd.read_csv(csv_path)
    for csv_path in (
        "./output_scrape/output_2019.csv",
        "./output_scrape/output_2018.csv",
        "./output_scrape/output_2017.csv",
        "./output_scrape/output_2016.csv",
    )
]

In [194]:
# Frequency distribution (FreqDist + summary table) for each year, newest first.
(fd_2019, df_2019), (fd_2018, df_2018), (fd_2017, df_2017), (fd_2016, df_2016) = [
    freq_distribution(yearly_data)
    for yearly_data in (dat_2019, dat_2018, dat_2017, dat_2016)
]



## Saving the Dataframes into Excel

In [196]:
# Years and their frequency tables, listed in the SAME (chronological) order so
# that every sheet holds the data of the year it is named after. Previously the
# tables were listed newest-first against oldest-first labels, which wrote the
# 2019 data to the "2016" sheet and so on.
list_year = ["2016", "2017", "2018", "2019"]
list_df = [df_2016, df_2017, df_2018, df_2019]

# Only words counted more than `min_freq` times are saved.
min_freq = 5

with pd.ExcelWriter("./results/unigrams/compiled_unigrams.xlsx") as writer:
    for sheet_year, year_df in zip(list_year, list_df):
        year_df[year_df["Count"] > min_freq].to_excel(writer, sheet_name = sheet_year)

# Saving WordCloud plots

In [203]:
# Years and their frequency distributions, listed in the SAME (chronological)
# order so that each image is named after the year it actually shows.
# Previously the distributions were listed newest-first against oldest-first
# labels, so the 2019 cloud was saved under the 2016 file name and so on.
list_year = ["2016", "2017", "2018", "2019"]
list_fd = [fd_2016, fd_2017, fd_2018, fd_2019]

# Number of most common words drawn in each cloud.
min_filter = 50

for cloud_year, year_fd in zip(list_year, list_fd):
    word_cloud = unigram_word_cloud(year_fd, min_filter = min_filter)
    word_cloud.to_file("./results/unigrams/wordcloud_img/wordcloud_"+str(min_filter)+"words_"+cloud_year+".png")