# Text Analytics | BAIS:6100
# Module 4: Keyword Analysis and Visualization

Instructor: Kang-Pyo Lee 

Topics to be covered:
- Popular keyword ranking
- Word clouds

## Loading Data

In [None]:
# Hashtag to analyze; it is substituted into the monthly tweet file names when the data is loaded.
hashtag = "covid19"

In [None]:
# Maximum number of tweets kept per month; months with more rows are randomly sampled down to N.
N = 500

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', 150)

# Months to load, most recent first; each month is stored in its own tab-separated file.
months = ["202012", "202011", "202010", "202009", "202008", "202007",
          "202006", "202005", "202004", "202003", "202002", "202001"]

# Collect the monthly frames in a list and concatenate them once after the loop.
# Calling pd.concat inside the loop re-copies all previously loaded rows on every
# iteration, and starting from an empty DataFrame() triggers a deprecation warning
# in recent pandas versions.
monthly_frames = []
for month in months:
    # quoting=3 is csv.QUOTE_NONE: quote characters inside tweets are kept as plain text.
    dftmp = pd.read_csv("classdata/tweets/tweets_{}_{}.csv".format(hashtag, month), sep="\t", quoting=3)

    ##############################################
    # Create a random sample of N rows.
    ##############################################
    if len(dftmp) > N:
        dftmp = dftmp.sample(n=N)
    ##############################################

    monthly_frames.append(dftmp)
    print("{}: {:,}".format(month, len(dftmp)))

df = pd.concat(monthly_frames)

print("Total number of tweets in df: {:,}\n".format(len(df)))

# Force both columns to str so that later string operations (name splitting,
# tokenization) can be applied to every row.
df.user_name = df.user_name.astype(str)
df.text = df.text.astype(str)

df

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

## Adding New Columns

In [None]:
import gender_guesser.detector as gender
import nltk
from textblob import TextBlob

In [None]:
# Split each tweet into tokens, then attach a part-of-speech tag to every token.
# Each entry of "tagged_words" is a list of (token, tag) pairs.
df["words"] = df["text"].apply(nltk.word_tokenize)
df["tagged_words"] = df["words"].apply(nltk.pos_tag)

def predict_gender(detector, name):
    """Guess a user's gender from the first word of their display name.

    Args:
        detector: Object exposing ``get_gender(first_name)``; its result is
            used whenever the first word is not a recognized title.
        name: The user's display name as a string.

    Returns:
        "unknown" when the name contains no words, "female" or "male" when the
        first word starts with a courtesy title, and otherwise the label
        returned by ``detector.get_gender`` with "mostly_female" and
        "mostly_male" collapsed to "female" and "male".
    """
    name_parts = name.split()
    if not name_parts:
        return "unknown"

    first_name = name_parts[0]

    # Test the female titles first: "Mrs" also starts with "Mr", so checking
    # "Mr" first would label every "Mrs ..." user as male.
    if first_name.startswith(("Ms", "Mrs", "Miss")):
        return "female"
    if first_name.startswith("Mr"):
        return "male"

    user_gender = detector.get_gender(first_name)

    # Collapse the detector's "mostly_*" labels into the plain labels.
    if user_gender == "mostly_female":
        return "female"
    if user_gender == "mostly_male":
        return "male"

    return user_gender

# One detector instance is shared by every row; it is created with case_sensitive=False.
d = gender.Detector(case_sensitive=False)
df["user_gender"] = df["user_name"].map(lambda user_name: predict_gender(d, user_name))

In [None]:
df

## Popular Keyword Rankings

In [None]:
df[["tagged_words"]]

In [None]:
from collections import Counter

collections.Counter: https://docs.python.org/3/library/collections.html#collections.Counter

In [None]:
counter = Counter()          # Will hold, for every word, the number of tweets containing it

for tagged_tweet in df.tagged_words:
    # Lowercase every token and collect the tokens in a set, so a word that
    # occurs several times in one tweet is still counted only once for it.
    counter.update({token[0].lower() for token in tagged_tweet})

Note that you converted each word to lowercase so that case variants of the same word (e.g., *Vaccine* and *vaccine*) are counted together.

Note also that you used a set to remove duplicate words in a tweet. This allows you to count each word in a tweet as just 1, even if it appears more than once in the tweet. 

In [None]:
from IPython.display import Image
Image("classdata/images/word_count.png")

In [None]:
counter.most_common(50)   # Show the top-n most popular words in counter

collections.Counter.most_common: https://docs.python.org/3/library/collections.html#collections.Counter.most_common

Stopwords are words that are filtered out before processing of natural language text. Stopwords are generally the most common words in a language. There is no single universal list of stopwords used by all natural language processing tools, and indeed not all tools even use such a list. Some tools avoid removing stop words to support phrase search.

Stopwords on Wikipedia: https://en.wikipedia.org/wiki/Stop_words

In [None]:
from nltk.corpus import stopwords

In [None]:
# NLTK's built-in English stopword list; preview the first 30 entries.
global_stopwords = stopwords.words("english")
global_stopwords[:30]

Because stopwords carry little meaning on their own, they are not useful for keyword analysis and are filtered out before counting.

In [None]:
# Build a set once: membership tests on a set are O(1), whereas testing against
# the stopword list scans the whole list for every single token.
global_stopword_set = set(global_stopwords)

counter = Counter()

for l in df.tagged_words:
    word_set = set()

    for t in l:
        word = t[0].lower()

        if word not in global_stopword_set:   # Check if the word is a stopword
            word_set.add(word)

    counter.update(word_set)

counter.most_common(30)

In [None]:
import string
string.punctuation

In [None]:
# Corpus-specific stopwords: every punctuation character, tokenizer artifacts
# (quote marks, contraction fragments), Twitter boilerplate, and the search terms.
local_stopwords = list(string.punctuation) + [
    '’', '``', '…', '...', "''", '‘', '“', '”',
    "'m", "'re", "'s", "'ve", 'amp', 'https', "n't", 'rt',
    'covid19', 'coronavirus',
]
local_stopwords

Note that the words *covid19* and *coronavirus* were included in the local stopwords because they are search terms: nearly every tweet in the data set contains at least one of them, which makes them uninformative for keyword analysis.

In [None]:
# Combine the global and local stopwords into one set, once, before the loop.
# Concatenating the two lists inside the loop would rebuild the combined list
# for every token and then scan it linearly.
all_stopwords = set(global_stopwords + local_stopwords)

counter = Counter()
for l in df.tagged_words:
    word_set = set()

    for t in l:
        word = t[0].lower()

        # Check if the word is either a global or a local stopword
        if word not in all_stopwords:
            word_set.add(word)

    counter.update(word_set)

counter.most_common(30)

In [None]:
counter

In [None]:
counter["vaccine"]

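The word '*vaccine*' appears in 179 tweets in the corpus. (Each tweet is counted once per word, and the exact number will vary because the tweets are randomly sampled.)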

In [None]:
[(word, count) for word, count in counter.items() if count > 100]

### Popular Adjectives 

In [None]:
# Build the combined stopword set once instead of concatenating the two lists
# for every token.
adjective_stopwords = set(global_stopwords + local_stopwords)

counter2 = Counter()
for l in df.tagged_words:
    word_set = set()

    for t in l:
        word = t[0].lower()
        tag = t[1]

        # Keep only adjectives (tags beginning with "JJ") that are not stopwords.
        # `and` short-circuits, so the stopword lookup is skipped for other tags.
        if tag.startswith("JJ") and word not in adjective_stopwords:
            word_set.add(word)

    counter2.update(word_set)

counter2.most_common(30)

### Popular Verbs

In [None]:
# Build the combined stopword set once instead of concatenating the two lists
# for every token.
verb_stopwords = set(global_stopwords + local_stopwords)

counter3 = Counter()
for l in df.tagged_words:
    word_set = set()

    for t in l:
        word = t[0].lower()
        tag = t[1]

        # Keep only verbs (tags beginning with "VB") that are not stopwords.
        # `and` short-circuits, so the stopword lookup is skipped for other tags.
        if tag.startswith("VB") and word not in verb_stopwords:
            word_set.add(word)

    counter3.update(word_set)

counter3.most_common(30)

### Popular Keywords among Women

In [None]:
df.user_gender == "female"

In [None]:
# Build the combined stopword set once instead of concatenating the two lists
# for every token.
women_stopwords = set(global_stopwords + local_stopwords)

counter4 = Counter()
# Only tweets whose author was predicted to be female.
for l in df[df.user_gender == "female"].tagged_words:
    word_set = set()

    for t in l:
        word = t[0].lower()
        if word not in women_stopwords:
            word_set.add(word)

    counter4.update(word_set)

counter4.most_common(30)

### Popular Keywords among Men

In [None]:
# Build the combined stopword set once instead of concatenating the two lists
# for every token.
men_stopwords = set(global_stopwords + local_stopwords)

counter5 = Counter()
# Only tweets whose author was predicted to be male.
for l in df[df.user_gender == "male"].tagged_words:
    word_set = set()

    for t in l:
        word = t[0].lower()
        if word not in men_stopwords:
            word_set.add(word)

    counter5.update(word_set)

counter5.most_common(30)

## Putting Everything Together into a Single Function

In [None]:
def get_counter(dataframe, stopwords=(), target_tag=None):
    """Count, for each word, the number of tweets in which it appears.

    Args:
        dataframe: Object with a ``tagged_words`` attribute that iterates over
            tweets, each tweet being a sequence of (word, tag) pairs.
        stopwords: Iterable of lowercase words to exclude from the counts.
            Defaults to no stopwords.
        target_tag: If given, only words whose tag starts with this prefix
            (e.g. "JJ" or "VB") are counted; if None, all tags are accepted.

    Returns:
        A ``Counter`` mapping each lowercased word to the number of tweets
        containing it. A word repeated within one tweet counts once.
    """
    # Build the lookup set once: set membership is O(1), whereas a list would
    # be scanned linearly for every token.
    stopword_set = set(stopwords or ())
    counter = Counter()

    for tagged_tweet in dataframe.tagged_words:
        word_set = set()

        for token in tagged_tweet:
            word = token[0].lower()
            tag = token[1]

            # Check if the word is a stopword.
            if word in stopword_set:
                continue

            # Check the tag (every tag is accepted when no target is given).
            if target_tag is None or tag.startswith(target_tag):
                word_set.add(word)

        counter.update(word_set)     # Once per tweet, so be aware of the indentation!

    return counter

## Word Clouds

In [None]:
# ! pip install --user --upgrade wordcloud

In [None]:
from wordcloud import WordCloud 

WordCloud for Python: https://amueller.github.io/word_cloud/

In [None]:
# Tweet-level word counts over all tweets, excluding global and local stopwords.
counter_overall = get_counter(df, stopwords=global_stopwords + local_stopwords)
counter_overall.most_common(30)

In [None]:
# Render the overall word frequencies as a word cloud, save it as a PNG,
# and show the saved image in the notebook.
overall_image_path = "outcome/wordcloud_overall.png"

wc = WordCloud(width=800, height=500, max_words=100, background_color="white")
wc.generate_from_frequencies(counter_overall)
wc.to_file(overall_image_path)
Image(filename=overall_image_path)

The frequency of each word is shown by its font size: the larger the font, the more frequently the word occurs. This format is useful for quickly perceiving the most prominent words and their relative prominence.

In [None]:
def draw_wordcloud(counter, image_file_name, max_words=100):
    """Draw a word cloud from word frequencies, save it, and display it.

    Args:
        counter: Mapping of word to frequency (e.g. a ``Counter``).
        image_file_name: Path the rendered image is written to.
        max_words: Maximum number of words drawn in the cloud.
    """
    cloud = WordCloud(width=800, height=500, max_words=max_words, background_color="white")
    cloud.generate_from_frequencies(counter)
    cloud.to_file(image_file_name)

    # Show the file that was just written inside the notebook.
    saved_image = Image(filename=image_file_name)
    display(saved_image)

In [None]:
draw_wordcloud(counter_overall, "outcome/wordcloud_overall.png", 100)

In [None]:
# Tweet-level counts of non-stopword tokens whose tag starts with "JJ".
counter_adj = get_counter(df, stopwords=global_stopwords + local_stopwords, target_tag="JJ")
counter_adj.most_common(30)

In [None]:
draw_wordcloud(counter_adj, "outcome/wordcloud_adj.png", 100)

In [None]:
# Tweet-level counts of non-stopword tokens whose tag starts with "VB".
counter_verb = get_counter(df, stopwords=global_stopwords + local_stopwords, target_tag="VB")
counter_verb.most_common(30)

In [None]:
draw_wordcloud(counter_verb, "outcome/wordcloud_verb.png", 100)

In [None]:
# Tweet-level word counts restricted to users predicted to be female.
is_female = df.user_gender == "female"
counter_women = get_counter(df[is_female], stopwords=global_stopwords + local_stopwords)
counter_women.most_common(30)

In [None]:
draw_wordcloud(counter_women, "outcome/wordcloud_women.png", 100)

In [None]:
# Tweet-level word counts restricted to users predicted to be male.
is_male = df.user_gender == "male"
counter_men = get_counter(df[is_male], stopwords=global_stopwords + local_stopwords)
counter_men.most_common(30)

In [None]:
draw_wordcloud(counter_men, "outcome/wordcloud_men.png", 100)

In [None]:
# The 100 most common words for each group, as sets so they can be compared.
set_women = {word for word, _ in counter_women.most_common(100)}
set_men = {word for word, _ in counter_men.most_common(100)}

In [None]:
set_women.difference(set_men)   # Words that appear only in the women's top-100 list

In [None]:
set_men.difference(set_women)   # Words that appear only in the men's top-100 list

## Exercises - Keyword Analysis and Visualization