In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None) # will display full text in row
from collections import defaultdict
import numpy as np
import os, re, string
from time import time
import tensorflow as tf
import nltk
#nltk.download(['punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet'], quiet=True)
from clean_text import clean_text
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

## Get Covid Tweet dataset

In [2]:
# Download data from https://www.kaggle.com/datatattle/covid-19-nlp-text-classification?select=Corona_NLP_train.csv
train = pd.read_csv('../datasets/Corona_NLP_train.csv', encoding='ISO-8859-1')
test = pd.read_csv('../datasets/Corona_NLP_test.csv', encoding='ISO-8859-1')
print(train.shape)
print(test.shape)

(41157, 6)
(3798, 6)


### Combine train and test, since we're using an unsupervised model

In [3]:
data = pd.concat([train, test], axis=0)
del train, test

In [4]:
# Restrict to useful columns
data = data[['Location', 'TweetAt', 'OriginalTweet']].rename(columns={'TweetAt': 'Date', 'OriginalTweet': 'Tweet'})

In [5]:
# Shuffle data for good measure
data = data.sample(frac=1.0, random_state=999).reset_index(drop=True)

In [None]:
data.head()

In [None]:
data.shape

## Clean data

### Fix tweets
I remove html, URLs, punctuation, hashtags, emoticons, convert contractions, lemmatize words, and convert everything to lowercase

In [6]:
# Standardizd (somewhat) version of word `coronavirus`
def standardize_covid(string):
    string = string.replace('COVID-19', 'coronavirus')
    string = string.replace("COVID 19" , 'coronavirus')
    string = string.replace("Covid-19", 'coronavirus')
    string = string.replace('COVID?19', 'coronavirus')
    string = string.replace('covid', 'coronavirus')
    string = string.replace('COVID', 'coronavirus')
    string = string.replace('Covid_19', 'coronavirus')
    string = string.replace('COVID2019', 'coronavirus')
    string = string.replace('coronavirus19', 'coronavirus')
    string = string.replace('COVID', 'coronavirus')
    string = string.replace('covid', 'coronavirus')
    return string

In [7]:
processed = pd.DataFrame(data['Tweet'].apply(lambda x: standardize_covid(x)).rename('clean'))

In [8]:
start = time()
processed = pd.DataFrame(clean_text().run(processed['clean'], no_stop_words=False, 
                                          remove_punctuation=True, lemmatize=True).rename('clean'))
print('Total time:', round(time() - start, 0), 'seconds')

Total time: 179.0 seconds


In [9]:
processed.head()

Unnamed: 0,clean
0,yeah imagine that economy in much well shape not fudge data to hide ongoing degrowrth oil price well correlate with crude price and of course no pogrom against muslim in kashmir and elsewhere and respond to covid one
1,side effect of hedge be cause massive cost for airline which be now oblige to pay oil price accord to pre agree contract despite not need almost any oil at all agree price be always way high than current price
2,thank you to those on the frontline of coronavirus the cleaner driver supermarket assistant and so many many more
3,distillery have switch portion of their production from alcohol to hand sanitizer to help with
4,i just come from the supermarket a line wspace betw customer wait to enter store the new normal our life compartmentalize human solidarity kill by psychology of social distance break the distance and you get curse oh it will stay


#### Remove words only present once in corpus (e.g. misspellings)

In [10]:
def count_word_freq(series):
    '''
    Counts word frequency across all documents (rows) in a pd.Series.
    :param series: pd.Series
    :returns: pd.DataFrame, where index is each unique word in corpus and column is count of
        the occurrence of that word across all documents.
    '''
    temp = [i.split() for i in series] 
    freq = defaultdict(int)  # Get freq of each word across all documents
    for indiv_doc in temp:
        for token in indiv_doc:
            freq[token] += 1
        
    word_freq = pd.DataFrame.from_dict(freq, orient='index')\
        .reset_index().rename(columns={'index': 'word', 0: 'freq'})
    return word_freq

In [11]:
word_freq = count_word_freq(processed['clean'])

In [12]:
print(word_freq['freq'].describe())

count    31378.000000
mean        40.534292
std        619.113152
min          1.000000
25%          1.000000
50%          1.000000
75%          5.000000
max      49109.000000
Name: freq, dtype: float64


In [13]:
print(f"Number of words that appear only once in corpus: {len(word_freq[word_freq['freq']==1])}")

Number of words that appear only once in corpus: 16731


In [14]:
print('Some examples of rare words:')
word_freq[word_freq['freq']==1]['word'].head(20)

Some examples of rare words:


14               degrowrth
24                  pogrom
90                  wspace
91                    betw
100       compartmentalize
171           asiegercares
270                colleys
304               usatoday
307            mecklenburg
334          centurytowers
341                  recul
345          consommateurs
360              coincides
384             wondrously
447          lockdownghana
487               tastiest
488             healthiest
639                  penal
647    restartingrebooting
668           aboutmissing
Name: word, dtype: object

In [15]:
processed['clean'] = clean_text().remove_infreq_words(processed['clean'], 1)

In [16]:
count_word_freq(processed['clean']).describe()

Unnamed: 0,freq
count,14647.0
mean,85.693589
std,904.070711
min,2.0
25%,2.0
50%,5.0
75%,18.0
max,49109.0


### Fix date

In [17]:
processed['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%Y', errors='coerce').dt.strftime("%Y-%m-%d")

In [18]:
print(processed['Date'].isnull().mean())

0.0


In [19]:
# Date range
for i in sorted(processed['Date'].unique()):
    print(i)

2020-03-02
2020-03-03
2020-03-04
2020-03-05
2020-03-06
2020-03-07
2020-03-08
2020-03-09
2020-03-10
2020-03-11
2020-03-12
2020-03-13
2020-03-14
2020-03-15
2020-03-16
2020-03-17
2020-03-18
2020-03-19
2020-03-20
2020-03-21
2020-03-22
2020-03-23
2020-03-24
2020-03-25
2020-03-26
2020-03-27
2020-03-28
2020-03-29
2020-03-30
2020-03-31
2020-04-01
2020-04-02
2020-04-03
2020-04-04
2020-04-05
2020-04-06
2020-04-07
2020-04-08
2020-04-09
2020-04-10
2020-04-11
2020-04-12
2020-04-13
2020-04-14


### Fix location

In [20]:
print(data['Location'].nunique())
print(data['Location'].isnull().mean())

13127
0.20963185407629853


In [32]:
data['Location'].head(10)

0    New Delhi, India
1                   ?
2                 NaN
3            LES, NYC
4                 NYC
5                 NaN
6                 NaN
7            VA ?? MI
8    Hyderabad, India
9      Lagos, Nigeria
Name: Location, dtype: object

In [22]:
foo = data['Location'].str.replace("[^a-zA-Z ]", '', regex=True)
foo = foo.replace(np.NaN, '') # convert missings to empty strings
foo = foo.apply(lambda x: ' '.join(x.split())) # remove excess whitespace from some rows

In [23]:
states = pd.read_csv('../datasets/US States.csv')
print(states.head(5))

            State Abbreviation
0         ALABAMA           AL
1          ALASKA           AK
2  AMERICAN SAMOA           AS
3         ARIZONA           AZ
4        ARKANSAS           AR


In [24]:
states_dict = dict(zip(states['Abbreviation'].tolist(), states['State'].tolist()))

In [25]:
# Lookup function to replace state abbreviations with state names
def lookup_replace(col, dict_map):
    '''
    '''
    new = []
    for i in range(len(col)):
        tmp = []
        try:
            for word in col.iloc[i].split():
                if word in dict_map.keys():
                    name = dict_map[word]
                else:
                    name = word
                tmp.append(name)
        except AttributeError:
            tmp = np.NaN
        new.append(tmp)
    return pd.Series(new).apply(lambda x: ' '.join(x))

In [27]:
foo = lookup_replace(foo, states_dict)

In [31]:
foo.sample(n=20)

21716                          
22832                          
9239             United Kingdom
41709        Brisbane Australia
14625             Merton London
16608               Los Angeles
26339                 London UK
14640              Izmir Turkey
7951     Lower mainland Toronto
18864                    London
6559        Nashville TENNESSEE
7084      Sacramento CALIFORNIA
31427                          
16566                Appalachia
43380                          
41403             Beijing China
3083            Phoenix ARIZONA
15739                          
37662      Greer SOUTH CAROLINA
26643                          
dtype: object

## TF-IDF

[Term frequency-inverse document frequency](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) is a statistic reflecting a given word's importance to a particular document in a collection of documents (i.e. a corpus). This statistic is bounded between 0 and 1, with higher scores indicating a given word is comparably rarer (i.e. more salient) in a particular document. This metric is commonly used to extract keywords about a text.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [None]:
start = time()
vectors = vectorizer.fit_transform(processed['clean'].tolist()) # scipy.sparse.csr.csr_matrix
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
print('Total time:', round(time() - start, 0), 'seconds')

In [None]:
# Lookup dict for each document's words
# Note - sklearn's TfidfVectorizer smoothes the IDF function, resulting in fewer words per document than orig
start = time()
tfidf_dict = {}
tfidf_values = []
for doc in range(len(denselist)):
    positions = [idx for idx, val in enumerate(denselist[doc]) if val > 0] # get index if word in document, i.e. > 0 tf-idf
    values = [val for val in denselist[doc] if val > 0] # tf-idf values in doc
    words = [feature_names[i] for i in positions] # words themselves
    tfidf_dict[doc] = dict(zip(words, values))
    tfidf_values += values # TF-IDF values for all words in all documents
print('Total time:', round(time() - start, 0), 'seconds')

In [None]:
tfidf_dict[0]

In [None]:
# Top 5 most important words according to TF-IDF of first document
dict(sorted(tfidf_dict[0].items(), key=lambda item: item[1], reverse=True)[:5])

In [None]:
len(tfidf_values)

### Distribution of TF-IDF values

In [None]:
sns.kdeplot(tfidf_values)
plt.xlabel('TF-IDF Value')
plt.title("Distribution of TF-IDF Values in Coronavirus Dataset")
plt.show()

In [None]:
pd.Series(tfidf_values).describe()

### Restrict to top *n* keywords

In [None]:
def top_n_keywords(dictionary, n=10):
    '''
    Restricts to top n most important words per document, returning this as pd.Series
    :param dictionary: dict, tf-idf dictionary where each subdictionary pertains to an individual document
    :param n: int, number of top keywords to limit to
    :returns: pd.Series
    '''
    key_wds = []
    for i in range(len(dictionary)):
        key_wds.append(list(dict(sorted(dictionary[i].items(), key=lambda item: item[1], reverse=True)[:n]).keys()))
    return pd.Series(key_wds)

In [None]:
processed['tfidf_top10'] = top_n_keywords(tfidf_dict, n=10)

In [None]:
def top_n_values(dictionary, n=10):
    '''
    Gets TF-IDF values among top n most important words
    :dictionary: dict, tf-idf dictionary where each 
    '''
    vals = []
    for i in range(len(dictionary)):
        vals += list(dict(sorted(dictionary[i].items(), key=lambda item: item[1], reverse=True)[:n]).values())
    return pd.Series(vals)

In [None]:
top_n_values(tfidf_dict, n=10).describe()

### Filter top keywords using minimum TF-IDF score

As indicated above, some of the top 10 keywords have low TF-IDF scores, meaning few of the words in a given review are meaningful according to this metric. To remove these, the function below defines keywords based on a percentile of the TF-IDF distribution

In [None]:
def filter_top_keywords(dictionary, filter_val=0.2, max_keywords=10):
    filtered_words = []
    for i in range(len(dictionary)):
        words = [key for key, val in dictionary[i].items() if val > filter_val]
        if len(words) == 0: # Take top words if none meet threshold
            words = list(dict(sorted(dictionary[i].items(), key=lambda item: item[1], reverse=True)[:1]).keys())
        words = words[:max_keywords]
        filtered_words.append(words)
    return pd.Series(filtered_words)

In [None]:
filter_val = np.percentile(pd.Series(tfidf_values), 90)
processed['tfidf_90pct'] = filter_top_keywords(tfidf_dict, filter_val=filter_val)

In [None]:
processed['tfidf_90pct'].apply(lambda x: len(x)).describe()