In [1]:
import string
import pandas as pd
import numpy as np
from polyglot.detect import Detector
import re
import warnings
warnings.filterwarnings(action = "ignore")
import icu
import nltk

## Punctuation, URL and text removal/replacements but without stopwords removal

In [2]:
import re
import string
from nltk.tokenize import word_tokenize

# Function for cleaning text by removing punctuation only

# All patterns are compiled once and written as raw strings so that "\S", "\d"
# and "\." reach the regex engine without relying on Python's deprecated
# handling of unknown escape sequences in ordinary string literals.

# URLs: tokens starting with "http" or "www", or tokens containing ".com"
# followed by at least one more non-whitespace character.
# https://stackoverflow.com/questions/24399820/expression-to-remove-url-links-from-twitter-tweet/24399874
_URL_RE = re.compile(r"http\S+|www\S+|\S+\.com\S+")

# Contractions and their expansions. Word boundaries (rather than a required
# trailing space) let a contraction match at the end of the text, before a
# newline, or before punctuation.
_CONTRACTIONS = (
    (re.compile(r"\bi'm\b"), "i am"),
    (re.compile(r"\bi'll\b"), "i will"),
    (re.compile(r"\blet's\b"), "let us"),
    (re.compile(r"\bit's\b"), "it is"),
    (re.compile(r"\bthere's\b"), "there is"),
    (re.compile(r"\bdon't\b"), "do not"),
)

# Digit runs, including comma-formatted numbers with an optional trailing "+".
_NUMBER_RE = re.compile(r"\d*[,]+\d*[+]?|\d+")

# Twitter hashtags and handles, up to the next whitespace.
_HASHTAG_RE = re.compile(r"#\S+")
_HANDLE_RE = re.compile(r"@\S+")

# Every ASCII punctuation character plus carriage return and tab; "\n" is not
# part of this class.
_PUNCT_RE = re.compile("[" + re.escape(string.punctuation) + r"\r\t]")


def clean_text(text):
    """Lower-case ``text``, strip noise and return its word tokens.

    Steps, in order: lower-case; blank out URLs; expand a fixed set of
    contractions ("i'm", "i'll", "let's", "it's", "there's", "don't"); blank
    out numbers, hashtags and handles; replace punctuation, "\\r" and "\\t"
    with spaces; tokenize with ``nltk.tokenize.word_tokenize``.

    Stop words are NOT removed here.

    Parameters
    ----------
    text : object
        Text to clean. It is converted with ``str()`` first, so non-string
        input is accepted.

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize`` on the cleaned text.
    """
    cleaned = _URL_RE.sub(" ", str(text).lower())

    for pattern, expansion in _CONTRACTIONS:
        cleaned = pattern.sub(expansion, cleaned)

    # Numbers are removed before hashtags, so "#2019" first loses its digits
    # and the leftover "#" is then removed as punctuation.
    cleaned = _NUMBER_RE.sub(" ", cleaned)
    cleaned = _HASHTAG_RE.sub(" ", cleaned)
    cleaned = _HANDLE_RE.sub(" ", cleaned)
    cleaned = _PUNCT_RE.sub(" ", cleaned)

    # The text is already lower-case: it was lowered up front and every
    # replacement above is lower-case.
    return word_tokenize(cleaned)

## Text Cleaning Function

In [3]:
from nltk.corpus import stopwords
#nltk.download('averaged_perceptron_tagger')

# Start from NLTK's English stop-word list and extend it with "zapier" and
# the single-character ellipsis "…".
en_stopwords = set(stopwords.words('english')) | {"zapier", "…"}

# Words taken back OUT of the stop-word set, i.e. they will not be filtered.
# ("on", "to", "and", "with", "you" were considered as well but stay stop words.)
stop_words_to_put_back = [
    "after", "any", "both", "between", "do", "once", "these", "off",
    "through", "what", "when", "while", "which", "where",
]
for kept_word in stop_words_to_put_back:
    # set.remove raises KeyError if the word is not in the set.
    en_stopwords.remove(kept_word)

# Create function for filtering dataframe
def rightTypes(ngram):
    """Return True when ``ngram`` has no stop word and no conjunction.

    Parameters
    ----------
    ngram : sequence of str
        The words of one n-gram (e.g. a bigram tuple).

    Returns
    -------
    bool
        False if any word is in ``en_stopwords`` or is tagged as a
        coordinating conjunction by ``nltk.pos_tag``; True otherwise.
    """
    # remove stop words
    if any(word in en_stopwords for word in ngram):
        return False

    # no conjunctions
    # nltk.pos_tag uses the Penn Treebank tagset by default, where a
    # coordinating conjunction is "CC" ("CONJ" only exists in the universal
    # tagset). This must be a real tuple: a bare string would turn the `in`
    # test below into a substring check.
    non_acceptable_types = ("CC",)
    tags = nltk.pos_tag(ngram)

    # Check every position so n-grams of any length are handled.
    return not any(tag in non_acceptable_types for _, tag in tags)

## WIP 
Function for bigrams comparison table across different methods

In [115]:
def _strip_non_ascii(text):
    """Drop every character whose code point is 128 or above."""
    return "".join(ch for ch in text if ord(ch) < 128)


def _is_english(text):
    """Return True when polyglot's detected language for ``text`` displays as "English"."""
    detector = Detector(text, quiet = True)
    return icu.Locale.getDisplayName(detector.language.locale) == "English"


def specific_word_bigram(dat, specific_word, min_filter = 10, max_rows = 200):
    """Rank the bigrams containing ``specific_word`` under four association measures.

    The "text" column of ``dat`` is stripped of non-ASCII characters, reduced
    to rows with more than one word, filtered to English, cleaned with
    ``clean_text`` and fed to an NLTK bigram collocation finder.

    Parameters
    ----------
    dat : pandas.DataFrame
        Must contain a "text" column. The frame is not modified.
    specific_word : str
        Only bigrams that contain this token are kept.
    min_filter : int, default 10
        Minimum number of occurrences a bigram needs to be kept.
    max_rows : int, default 200
        Maximum number of bigrams returned per scoring method.

    Returns
    -------
    pandas.DataFrame
        Columns "PMI", "T-test", "Chi-Square" and "Likelihood Ratio"; each
        holds the best-scoring bigrams (as tuples) for that measure, best
        first.
    """
    # Work on a derived Series so the caller's DataFrame is left untouched.
    # str() makes non-string cells (e.g. NaN) safe to iterate; non-ASCII
    # characters are removed because they otherwise break language detection.
    texts = dat["text"].map(str).map(_strip_non_ascii)

    # Keep only texts with more than one word so the language detector has
    # something to work with. Counting AFTER the ASCII strip guarantees that
    # the text actually handed to the detector meets this requirement.
    word_counts = texts.map(lambda text: len(re.findall(r'\w+', text)))
    texts = texts[word_counts > 1]

    # Filter for only english tweets
    texts = texts[texts.map(_is_english).astype(bool)]

    # Clean and tokenize each tweet on its own. from_documents treats every
    # token list as a separate document, so no bigram is formed between the
    # last word of one tweet and the first word of the next. (Joining the
    # tweets with "\n" does not achieve this: the tokenizer discards newlines.)
    documents = [clean_text(text) for text in texts]

    # Bigrams measures and Bigrams Finder
    bigrams_measures = nltk.collocations.BigramAssocMeasures()
    bigramFinder = nltk.collocations.BigramCollocationFinder.from_documents(documents)

    # apply_ngram_filter REMOVES every bigram for which the function returns
    # True, so this keeps only bigrams that contain specific_word.
    bigramFinder.apply_ngram_filter(lambda *words: specific_word not in words)

    # Min bigram occurences
    bigramFinder.apply_freq_filter(min_filter)

    # Different methods for scoring bigrams
    rankings = {
        "PMI": bigramFinder.nbest(bigrams_measures.pmi, max_rows),
        "T-test": bigramFinder.nbest(bigrams_measures.student_t, max_rows),
        "Chi-Square": bigramFinder.nbest(bigrams_measures.chi_sq, max_rows),
        "Likelihood Ratio": bigramFinder.nbest(bigrams_measures.likelihood_ratio, max_rows),
    }

    # One object-dtype column of bigram tuples per method; this also yields a
    # well-formed empty frame when no bigram survives the filters.
    return pd.DataFrame(
        {method: pd.Series(bigrams, dtype = object) for method, bigrams in rankings.items()}
    )

# Read in all the data from all years

In [116]:
# Load the scraped output for each year, newest first.
dat_2019, dat_2018, dat_2017, dat_2016 = map(
    pd.read_csv,
    (
        "./output_scrape/output_2019.csv",
        "./output_scrape/output_2018.csv",
        "./output_scrape/output_2017.csv",
        "./output_scrape/output_2016.csv",
    ),
)

In [117]:
# Build the "integration" bigram ranking table for every year, newest first.
(
    specific_bigrams_2019,
    specific_bigrams_2018,
    specific_bigrams_2017,
    specific_bigrams_2016,
) = (
    specific_word_bigram(yearly_frame, specific_word = "integration", max_rows = 200)
    for yearly_frame in (dat_2019, dat_2018, dat_2017, dat_2016)
)

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

## Saving the bigrams comparison into Excel

In [119]:
# One sheet per year, oldest first; titles and tables are paired by position.
list_sheet_titles = ["2016", "2017", "2018", "2019"]
list_dat_comparisons = [specific_bigrams_2016, specific_bigrams_2017, specific_bigrams_2018, specific_bigrams_2019]

specific_word = "integration"

# The workbook name embeds the word the bigrams were filtered on.
output_path = "./results/bigrams/specific_word_("+specific_word+")_bigrams_ranking.xlsx"

with pd.ExcelWriter(output_path) as writer:
    for sheet_title, comparison in zip(list_sheet_titles, list_dat_comparisons):
        comparison.to_excel(writer, sheet_name = sheet_title)