In [1]:
import string
import pandas as pd
import numpy as np
from polyglot.detect import Detector
import re
import warnings
warnings.filterwarnings(action = "ignore")
import icu
import nltk

## Punctuation, URL and text removal/replacements but without stopwords removal

In [2]:
import re
import string
from nltk.tokenize import word_tokenize

# Function for cleaning text by removing punctuation only
def clean_text(text):
    # remove all urls as identified by "http" and "www". 
    # "\S+" removes all non-whitespace characters as part of the url until the end of url as defined by whitespace
    # https://stackoverflow.com/questions/24399820/expression-to-remove-url-links-from-twitter-tweet/24399874
    regex_url = re.compile(r"http\S+"+"|"
                           +"www\S+"+"|"
                          +"\S+\.com\S+")
    dirty_text = regex_url.sub(" ", str(text).lower())
    
    # Replace "i'm" with "i am"
    regex_im = re.compile(r"i'm ")
    dirty_text = regex_im.sub("i am ", str(dirty_text))
    
    # Replace "i'll" with "i will"
    regex_im = re.compile(r"i'll ")
    dirty_text = regex_im.sub("i will ", str(dirty_text))
    
    # Replace "let's" with "let us"
    regex_lets = re.compile(r"let's ")
    dirty_text = regex_lets.sub("let us ", str(dirty_text))
    
    # Replace "it's" with "it is"
    regex_its = re.compile(r"it's ")
    dirty_text = regex_its.sub("it is ", str(dirty_text))
    
    # Replace "there's" with "there is"
    regex_theres = re.compile(r"there's ")
    dirty_text = regex_theres.sub("there is ", str(dirty_text))
    
    # Replace "don't" with "do not"
    regex_dont = re.compile(r"don't ")
    dirty_text = regex_dont.sub("do not ", str(dirty_text))
    
    # Replace digits and digits with one instance of comma formatting
    regex_digits = re.compile(r"\d*[,]+\d*[+]?"+"|"+
                              "\d+")
    dirty_text = regex_digits.sub(" ", str(dirty_text))
    
    # Replace twitter tags
    regex_twittertags = re.compile(r"#\S+")
    dirty_text = regex_twittertags.sub(" ", str(dirty_text))
    
     # Replace twitter handles
    regex_twitterhandles = re.compile(r"@\S+")
    dirty_text = regex_twitterhandles.sub(" ", str(dirty_text))
    
    # remove punc except new lines "\n"
    regex_punct = re.compile('['+ re.escape(string.punctuation) + '\\r\\t]')
    nopunct = regex_punct.sub(" ", str(dirty_text))
    
    tokens = word_tokenize(nopunct.lower())
    
    return tokens

## Text Cleaning Function

In [3]:
from nltk.corpus import stopwords
#nltk.download('averaged_perceptron_tagger')

#get english stopwords
en_stopwords = set(stopwords.words('english'))
# add "zapier" and "..." to stopwords
en_stopwords.add("zapier")
en_stopwords.add("…")

stop_words_to_put_back = ["after", "any", "both", "between", "do","once","these","off",
                          #"on", "to", "and", "with","you",
                          "through","what", "when", "while","which", "where"]
for i in stop_words_to_put_back:
    en_stopwords.remove(i)

# Create function for filtering dataframe
def rightTypes(ngram):
    # remove stop words
    for word in ngram:
        if word in en_stopwords:
            return False
    
    # no conjunctions
    non_acceptable_types = ('CONJ')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in non_acceptable_types or tags[1][1] in non_acceptable_types:
        return False
    else:
        return True

## Function for bigrams comparison table across different methods

In [14]:
def create_bigrams_compare(dat, min_filter = 10, table_rows = 50): 

    # Count words in text. Preprocessing for filtering out text of word count = 1 so language detector can work
    dat["num_words"] = dat["text"].apply(lambda x: len(re.findall(r'\w+', str(x))))

    # Drop all rows where text is only 1 word
    dat = dat[dat["num_words"] >1].reset_index(drop = True)
    
    #function to remove non-ascii characters (otherwise it will throw error in langauge detection)
    def removeNonAscii(s): return "".join(i for i in s if ord(i)<128)
    
    dat["text"] = dat["text"].map(removeNonAscii)

    # Detect language
    from polyglot.detect import Detector
    dat["detector"] = dat["text"].apply(lambda x: Detector(x, quiet = True))

    import icu
    dat["language"] = dat["detector"].apply(lambda x: icu.Locale.getDisplayName(x.language.locale))
    dat["confidence"] = dat["detector"].apply(lambda x: x.language.confidence)

    # Filter for only english tweets
    dat = dat[dat["language"] == "English"].reset_index(drop = True)

    # Combine all list of tokens into one list, but segment between with new line escape "\n" to separate out tweet 
    # (otherwise introduce false signal between last word of each tweet and first word of the next tweet for bigrams)
    list_str = list(dat["text"])
    list_str_tot = "\n".join(list_str)

    # Perform cleaning function
    clean_tokens = clean_text(list_str_tot)
    
    import nltk

    # Bigrams finder
    bigrams = nltk.collocations.BigramAssocMeasures()
    bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(clean_tokens)

    # Frequency
    bigram_freq = bigramFinder.ngram_fd.items()
    bigramFreqTable = pd.DataFrame(list(bigram_freq), columns = ["bigram", "freq"]).sort_values(by="freq", ascending = False)

    clean_bigramFreqTable = bigramFreqTable[bigramFreqTable["bigram"].map(lambda x: rightTypes(x))].reset_index(drop = True)

    # PMI
    bigrams = nltk.collocations.BigramAssocMeasures()
    bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(clean_tokens)

    # Filter out bigram frequencies
    bigramFinder.apply_freq_filter(min_filter)

    bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)

    clean_bigramPMITable = bigramPMITable[bigramPMITable["bigram"].map(lambda x: rightTypes(x))].reset_index(drop = True)

    # T-test
    bigramTtestTable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.student_t)), columns=['bigram','t']).sort_values(by='t', ascending=False)

    clean_bigramTtestTable = bigramTtestTable[bigramTtestTable["bigram"].map(lambda x: rightTypes(x))].reset_index(drop = True)

    # Chi Square
    bigramChiTable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.chi_sq)), columns=['bigram','chi-sq']).sort_values(by='chi-sq', ascending=False)

    clean_bigramChiTable = bigramChiTable[bigramChiTable["bigram"].map(lambda x: rightTypes(x))].reset_index(drop = True)

    # Likelihood Ratio
    bigramLikeTable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.likelihood_ratio)), columns=['bigram','likelihood ratio']).sort_values(by='likelihood ratio', ascending=False)

    clean_bigramLikeTable = bigramLikeTable[bigramLikeTable["bigram"].map(lambda x: rightTypes(x))].reset_index(drop = True)

    # Bigrams Comparison
    bigramsCompare = pd.DataFrame([clean_bigramFreqTable["bigram"][:table_rows].values, clean_bigramPMITable["bigram"][:table_rows].values, clean_bigramTtestTable["bigram"][:table_rows].values, clean_bigramChiTable["bigram"][:table_rows].values, clean_bigramLikeTable["bigram"][:table_rows].values]).T

    bigramsCompare.columns = ['Frequency', 'PMI', 'T-test', 'Chi-Sq Test', 'Likelihood Ratio Test']

    return bigramsCompare

# Read in all the data from all years

In [15]:
dat_2019 = pd.read_csv("./output_scrape/output_2019.csv")
dat_2018 = pd.read_csv("./output_scrape/output_2018.csv")
dat_2017 = pd.read_csv("./output_scrape/output_2017.csv")
dat_2016 = pd.read_csv("./output_scrape/output_2016.csv")

In [16]:
bigrams_compare_2019 = create_bigrams_compare(dat_2019, table_rows = 200)
bigrams_compare_2018 = create_bigrams_compare(dat_2018, table_rows = 200)
bigrams_compare_2017 = create_bigrams_compare(dat_2017, table_rows = 200)
bigrams_compare_2016 = create_bigrams_compare(dat_2016, table_rows = 200)

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

In [17]:
bigrams_compare_2016

Unnamed: 0,Frequency,PMI,T-test,Chi-Sq Test,Likelihood Ratio Test
0,"(launches, digest)","(numerous, improvements)","(launches, digest)","(derek, bolen)","(messages, where)"
1,"(messages, where)","(derek, bolen)","(messages, where)","(numerous, improvements)","(launches, digest)"
2,"(app, new)","(dated, sep)","(app, new)","(dick, dale)","(stress, free)"
3,"(powerful, tools)","(greatest, hits)","(powerful, tools)","(cutting, expenses)","(chrome, extension)"
4,"(marketing, stack)","(sap, jam)","(marketing, stack)","(inbound, unboxed)","(powerful, tools)"
5,"(stress, free)","(paper, notebooks)","(stress, free)","(maximum, volume)","(marketing, stack)"
6,"(do, lists)","(maximum, volume)","(do, lists)","(volume, flex)","(do, lists)"
7,"(lists, work)","(volume, flex)","(lists, work)","(cross, posting)","(lists, work)"
8,"(chrome, extension)","(background, noise)","(chrome, extension)","(feeling, awkward)","(ultimate, guide)"
9,"(remote, work)","(curation, machine)","(remote, work)","(background, noise)","(life, easier)"


In [18]:
bigrams_compare_2017

Unnamed: 0,Frequency,PMI,T-test,Chi-Sq Test,Likelihood Ratio Test
0,"(life, easier)","(activedemand, supercharges)","(life, easier)","(endurance, athletes)","(life, easier)"
1,"(team, accounts)","(endurance, athletes)","(team, accounts)","(esight, eyewear)","(launches, month)"
2,"(month, team)","(employer, spotlight)","(launches, month)","(fitting, square)","(team, accounts)"
3,"(launches, month)","(esight, eyewear)","(month, team)","(silicon, valley)","(month, team)"
4,"(easier, w)","(fitting, square)","(easier, w)","(square, pegs)","(easier, w)"
5,"(these, tools)","(square, pegs)","(these, tools)","(shave, club)","(rss, feed)"
6,"(w, these)","(western, hemisphere)","(w, these)","(net, promoter)","(shave, club)"
7,"(app, workflows)","(hole, fillers)","(lead, gen)","(brute, forced)","(lead, gen)"
8,"(lead, gen)","(mon, jun)","(app, workflows)","(christopher, peters)","(monthly, shave)"
9,"(rss, feed)","(intense, pace)","(rss, feed)","(imposter, syndrome)","(wade, foster)"


In [19]:
bigrams_compare_2018

Unnamed: 0,Frequency,PMI,T-test,Chi-Sq Test,Likelihood Ratio Test
0,"(new, integration)","(warehouse, architect)","(new, integration)","(mlsp, funnelizer)","(new, integration)"
1,"(marketing, tools)","(wi, fi)","(marketing, tools)","(wi, fi)","(revenue, leaks)"
2,"(revenue, leaks)","(consultant, converts)","(revenue, leaks)","(saleswings, unleashes)","(biggest, revenue)"
3,"(biggest, revenue)","(emily, breuninger)","(biggest, revenue)","(akun, bersihin)","(social, media)"
4,"(live, without)","(translate, indexing)","(live, without)","(benerin, akun)","(live, without)"
5,"(leaks, via)","(spiro, loves)","(leaks, via)","(jasa, upfoll)","(million, customers)"
6,"(app, integrations)","(rescue, shelters)","(google, sheets)","(axis, lms)","(saleswings, unleashes)"
7,"(google, sheets)","(cognitive, bias)","(app, integrations)","(uncanny, automator)","(google, sheets)"
8,"(million, customers)","(keyword, extraction)","(million, customers)","(revenue, leaks)","(segment, workato)"
9,"(social, media)","(neil, swear)","(social, media)","(warehouse, architect)","(achieve, unicorn)"


In [20]:
bigrams_compare_2019

Unnamed: 0,Frequency,PMI,T-test,Chi-Sq Test,Likelihood Ratio Test
0,"(kph, mph)","(dell, boomi)","(kph, mph)","(kph, mph)","(kph, mph)"
1,"(metres, feet)","(jenny, bloom)","(metres, feet)","(mlsp, funnelizer)","(metres, feet)"
2,"(elevation, gain)","(storyteller, pairs)","(km, miles)","(metres, feet)","(km, miles)"
3,"(average, speed)","(jessica, greene)","(elevation, gain)","(km, miles)","(elevation, gain)"
4,"(km, miles)","(marcus, blankenship)","(average, speed)","(elevation, gain)","(average, speed)"
5,"(feet, link)","(highlight, reel)","(feet, link)","(traction, conf)","(feet, link)"
6,"(done, km)","(keyword, extraction)","(done, km)","(axis, lms)","(done, km)"
7,"(ride, done)","(decision, matrix)","(ride, done)","(marcus, blankenship)","(ride, done)"
8,"(google, sheets)","(reel, delivered)","(google, sheets)","(average, speed)","(google, sheets)"
9,"(new, integration)","(mlsp, funnelizer)","(social, media)","(san, francisco)","(social, media)"


## Saving the bigrams comparison into Excel

In [21]:
list_sheet_titles = ["2016", "2017", "2018", "2019"]
list_dat_comparisons = [bigrams_compare_2016, bigrams_compare_2017, bigrams_compare_2018, bigrams_compare_2019]


with pd.ExcelWriter("./results/bigrams/compiled_bigrams.xlsx") as writer:
    for i in range(len(list_sheet_titles)):
    
        list_dat_comparisons[i].to_excel(writer, sheet_name = list_sheet_titles[i])