In [155]:
import pandas as pd
pd.set_option('display.max_colwidth', None) # will display full text in row
from collections import defaultdict
import numpy as np
import os, re, string
from time import time
from clean_text import clean_text
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Get Covid Tweet dataset

In [95]:
# Download data from https://www.kaggle.com/datatattle/covid-19-nlp-text-classification?select=Corona_NLP_train.csv
train = pd.read_csv('../datasets/Corona_NLP_train.csv', encoding='ISO-8859-1')
test = pd.read_csv('../datasets/Corona_NLP_test.csv', encoding='ISO-8859-1')
print(train.shape)
print(test.shape)

(41157, 6)
(3798, 6)


### Combine train and test, since we're using an unsupervised model

In [96]:
data = pd.concat([train, test], axis=0)
del train, test

In [97]:
# Restrict to useful columns
data = data[['Location', 'TweetAt', 'OriginalTweet']].rename(columns={'TweetAt': 'Date', 'OriginalTweet': 'Tweet'})

In [98]:
# Shuffle data for good measure
data = data.sample(frac=0.01, random_state=123).reset_index(drop=True)

In [99]:
data.head()

Unnamed: 0,Location,Date,Tweet
0,"London, UK",22-03-2020,"QPAY cuts POS prices by 50% to help Qatari SMEs fight CoVid-19: QPAY International, a member of the NEXXO Network, the leading financial technology (Fintech) company in Qatar servicing over 15,000 Qatari SmallÂ ... https://t.co/ufT6VKKXNd #fintech"
1,"Chicago, IL",18-03-2020,"According to a Mariano's employee who just got out of work and on my bus, a fight broke out in the grocery store, glass was broken. Stay safe y'all. #coronavirus #COVID19"
2,London,08-04-2020,Just had a text that reads: \r\r\n\r\r\nCONGRATULATIONS. YOU ARE NOW CLEAR TO LEAVE YOUR HOME AT ANY TIME AND LICK SUPERMARKET TROLLEY HANDLES. \r\r\n\r\r\nREGARDS BORIS JOHNSON\r\r\n\r\r\nPretty sure itÂs from my first wife. #LOCKDOWN #coronavirus
3,The World,25-03-2020,ArenÂt the prices of some vital products @costco extremely high? @USDAFoodSafety @USATODAY @LACountyDCBA @MayorOfLA @LANow #food #socal #coronavirus #COVID2019
4,"Auckland, New Zealand",22-03-2020,Security guards at the supermarket. WTF is wrong with people that requires guards to protect the poor checkout staff #Covid_19


In [100]:
data.shape

(450, 3)

## Clean data

### Fix tweets
I remove html, URLs, punctuation, hashtags, emoticons, convert contractions, remove stop words, lematize words, and convert everything to lowercase.

In [156]:
# Standardizd (somewhat) version of word `coronavirus`
def standardize_words(string):
    string = string.replace('COVID-19', 'coronavirus')
    string = string.replace("COVID 19" , 'coronavirus')
    string = string.replace("Covid-19", 'coronavirus')
    string = string.replace('COVID?19', 'coronavirus')
    string = string.replace('covid', 'coronavirus')
    string = string.replace('COVID', 'coronavirus')
    string = string.replace('Covid_19', 'coronavirus')
    string = string.replace('COVID2019', 'coronavirus')
    string = string.replace('coronavirus19', 'coronavirus')
    string = string.replace('COVID', 'coronavirus')
    string = string.replace('covid', 'coronavirus')
    string = string.replace('tp', 'toilet paper')
    string = string.replace('TP', 'toilet paper')
    return string

In [157]:
processed = pd.DataFrame(data['Tweet'].apply(lambda x: standardize_words(x)).rename('clean'))

In [158]:
start = time()
processed = pd.DataFrame(clean_text().run(processed['clean'], no_stop_words=True, 
                                          remove_punctuation=True, lemmatize=True).rename('clean'))
print('Total time:', round(time() - start, 0), 'seconds')

Total time: 3.0 seconds


In [159]:
processed.head()

Unnamed: 0,clean
0,qpay cut po price fifty help qatari smes fight covid nineteen qpay international member nexxo network lead financial technology fintech company qatar service zero qatari small httoilet paper
1,accord mariano employee get work bus fight broke grocery store glass broken stay safe
2,text read congratulation clear leave home time lick supermarket trolley handle regard boris johnson pretty sure first wife
3,price vital product extremely high
4,security guard supermarket wtf wrong people require guard protect poor checkout staff


#### Distribution of word frequencies in corpus

In [82]:
def count_word_freq(series):
    '''
    Counts word frequency across all documents (rows) in a pd.Series.
    :param series: pd.Series
    :returns: pd.DataFrame, where index is each unique word in corpus and column is count of
        the occurrence of that word across all documents.
    '''
    temp = [i.split() for i in series] 
    freq = defaultdict(int)  # Get freq of each word across all documents
    for indiv_doc in temp:
        for token in indiv_doc:
            freq[token] += 1
        
    word_freq = pd.DataFrame.from_dict(freq, orient='index')\
        .reset_index().rename(columns={'index': 'word', 0: 'freq'})
    return word_freq

In [104]:
word_freq = count_word_freq(processed['clean'])

In [106]:
print(word_freq['freq'].describe())

count    2516.000000
mean        2.955087
std         6.839888
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max       141.000000
Name: freq, dtype: float64


In [107]:
print(f"Number of words that appear only once in corpus: {len(word_freq[word_freq['freq']==1])}")

Number of words that appear only once in corpus: 1529


In [161]:
print('Some examples of rare words:')
word_freq[word_freq['freq']==1]['word'].head(10)

Some examples of rare words:


2                po
7              smes
11    international
13            nexxo
14          network
17       technology
18          fintech
20            qatar
25          mariano
30            broke
Name: word, dtype: object

### Fix date

In [None]:
processed['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%Y', errors='coerce').dt.strftime("%Y-%m-%d")

In [None]:
print(processed['Date'].isnull().mean())

In [None]:
# Date range
for i in sorted(processed['Date'].unique()):
    print(i)

### Fix location

In [None]:
print(data['Location'].nunique())
print(data['Location'].isnull().mean())

In [None]:
data['Location'].head(10)

In [None]:
foo = data['Location'].str.replace("[^a-zA-Z ]", '', regex=True)
foo = foo.replace(np.NaN, '') # convert missings to empty strings
foo = foo.apply(lambda x: ' '.join(x.split())) # remove excess whitespace from some rows

In [None]:
states = pd.read_csv('../datasets/US States.csv')
print(states.head(5))

In [None]:
states_dict = dict(zip(states['Abbreviation'].tolist(), states['State'].tolist()))

In [None]:
# Lookup function to replace state abbreviations with state names
def lookup_replace(col, dict_map):
    '''
    '''
    new = []
    for i in range(len(col)):
        tmp = []
        try:
            for word in col.iloc[i].split():
                if word in dict_map.keys():
                    name = dict_map[word]
                else:
                    name = word
                tmp.append(name)
        except AttributeError:
            tmp = np.NaN
        new.append(tmp)
    return pd.Series(new).apply(lambda x: ' '.join(x))

In [None]:
foo = lookup_replace(foo, states_dict)

In [None]:
foo.sample(n=20)

## Keywords using pretrained BERT model and cosine similarity

[Credit](https://towardsdatascience.com/keyword-extraction-with-bert-724efca412ea)

In [40]:
%%capture
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [127]:
def bert_keywords(col, n_keywords=5, ngram_range=(1, 1)):
    '''
    Uses pretrained BERT model from sentence-transformer to generate nth keywords per document.
    :param col:, pd.Series, input (cleaned) text, one document per row
    :param n_keywords: int, number of top keywords per document to return.
    :param ngram_range: tuple, ngram range, e.g. (1,1) is monogram, (2,2) is bigram.
    :returns: list of lists, where each sublist contains top keywords per document
    '''
    keyword_list = []
    for i in range(len(processed)):
        if (i % 200==0 and i > 0):
            print(i)
        try:
            count = CountVectorizer(ngram_range=ngram_range).fit(processed['clean'].iloc[i:i+1].tolist()) # pd.Series per row/document
            candidates = count.get_feature_names()
            doc_embedding = model.encode(processed['clean'].iloc[i:i+1].tolist())
            candidate_embeddings = model.encode(candidates)

            # Get keywords using cosine similarity
            top_n = n_keywords
            distances = cosine_similarity(doc_embedding, candidate_embeddings)
            keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
        except ValueError: # empty string, only stop words or not intelligible
            keywords = [np.NaN]
        keyword_list.append(keywords)
    return keyword_list

In [153]:
start = time()
keywords = bert_keywords(processed['clean'])
print('Total time:', round(time() - start, 0), 'seconds')

200
400
Total time: 57.0 seconds


In [114]:
processed['bert_keywords'] = pd.Series(keywords)

In [123]:
processed.tail()

Unnamed: 0,clean,bert_keywords
445,bi bulletin five deal recent turbulence experienced emerge market economy bi org,"[market, emerge, five, turbulence, economy]"
446,go supermarket emerald busiest see live area much cheese except specialty cheese pasta tin tomato flour tp tissue run kitty litter,"[flour, tomato, supermarket, pasta, cheese]"
447,challenge uncertain time crucial vermonter protect exploitative scam consumer abuse learn coronavirus related scam currently circulation respond,"[exploitative, abuse, vermonter, scam, coronavirus]"
448,coronavirus antidote home test toilet paper sanitizers mask movie myth amp tip gt gt gt coronavirus,"[sanitizers, antidote, movie, toilet, coronavirus]"
449,forget thing stock tea live without morning cuppa coronavirus,"[stock, without, forget, coronavirus, tea]"


In [124]:
start = time()
keywords = bert_keywords(processed['clean'], ngram_range=(2, 2))
print('Total time:', round(time() - start, 0), 'seconds')

200
400
Total time: 77.0 seconds


## Keyword comparison to TF-IDF

[Term frequency-inverse document frequency](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) is a statistic reflecting a given word's importance to a particular document in a collection of documents (i.e. a corpus). This statistic is bounded between 0 and 1, with higher scores indicating a given word is comparably rarer (i.e. more salient) in a particular document. This metric is commonly used to extract keywords about a text.

In [143]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [144]:
start = time()
vectors = vectorizer.fit_transform(processed['clean'].tolist()) # scipy.sparse.csr.csr_matrix
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
print('Total time:', round(time() - start, 0), 'seconds')

Total time: 0.0 seconds


In [145]:
# Lookup dict for each document's words
# Note - sklearn's TfidfVectorizer smoothes the IDF function, resulting in fewer words per document than orig
start = time()
tfidf_dict = {}
tfidf_values = []
for doc in range(len(denselist)):
    positions = [idx for idx, val in enumerate(denselist[doc]) if val > 0] # get index if word in document, i.e. > 0 tf-idf
    values = [val for val in denselist[doc] if val > 0] # tf-idf values in doc
    words = [feature_names[i] for i in positions] # words themselves
    tfidf_dict[doc] = dict(zip(words, values))
    tfidf_values += values # TF-IDF values for all words in all documents
print('Total time:', round(time() - start, 0), 'seconds')

Total time: 0.0 seconds


In [152]:
# Top 5 most important words according to TF-IDF of first document
dict(sorted(tfidf_dict[0].items(), key=lambda item: item[1], reverse=True)[:5])

{'qatari': 0.40592678911401797,
 'qpay': 0.40592678911401797,
 'fintech': 0.20296339455700899,
 'international': 0.20296339455700899,
 'network': 0.20296339455700899}

#### Restrict to top *n* keywords

In [148]:
def top_n_keywords(dictionary, n=5):
    '''
    Restricts to top n most important words per document, returning this as pd.Series
    :param dictionary: dict, tf-idf dictionary where each subdictionary pertains to an individual document
    :param n: int, number of top keywords to limit to
    :returns: pd.Series
    '''
    key_wds = []
    for i in range(len(dictionary)):
        key_wds.append(list(dict(sorted(dictionary[i].items(), key=lambda item: item[1], reverse=True)[:n]).keys()))
    return pd.Series(key_wds)

In [150]:
processed['tfidf_keywords'] = top_n_keywords(tfidf_dict, n=5)

In [154]:
processed.head()

Unnamed: 0,clean,bert_keywords,tfidf_keywords
0,qpay cut po price fifty help qatari smes fight covid nineteen qpay international member nexxo network lead financial technology fintech company qatar service zero qatari small,"[financial, qatar, qatari, fifty, fintech]","[qatari, qpay, fintech, international, network]"
1,accord mariano employee get work bus fight broke grocery store glass broken stay safe,"[broken, fight, broke, grocery, bus]","[broke, broken, mariano, bus, glass]"
2,text read congratulation clear leave home time lick supermarket trolley handle regard boris johnson pretty sure first wife,"[home, congratulation, trolley, supermarket, wife]","[congratulation, pretty, text, boris, clear]"
3,price vital product extremely high,"[product, price, extremely, vital, high]","[extremely, vital, product, high, price]"
4,security guard supermarket wtf wrong people require guard protect poor checkout staff,"[poor, guard, security, checkout, supermarket]","[guard, checkout, wtf, poor, require]"
