# Text Analytics | BAIS:6100
# Module 9: Text Clustering and Topic Modeling

Instructor: Kang-Pyo Lee 

In [None]:
# ! pip install --user --upgrade pyldavis

## Loading Data

In [None]:
hashtag = "covid19"

In [None]:
N = 500

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', 150)

months = ["202012", "202011", "202010", "202009", "202008", "202007", 
          "202006", "202005", "202004", "202003", "202002", "202001"]

# Collect one (optionally down-sampled) frame per month and concatenate once at
# the end. Growing a DataFrame with pd.concat inside the loop re-copies every
# previously loaded row on each iteration.
monthly_frames = []
for month in months:
    # Tab-separated files; quoting=3 is csv.QUOTE_NONE, so quote characters in
    # the tweet text are kept as ordinary characters.
    dftmp = pd.read_csv("classdata/tweets/tweets_{}_{}.csv".format(hashtag, month), sep="\t", quoting=3)
    
    ##############################################
    # Create a random sample of N rows.
    # A fixed random_state makes the sample identical on every re-run.
    ##############################################
    if len(dftmp) > N:
        dftmp = dftmp.sample(n=N, random_state=0)
    ##############################################
    
    monthly_frames.append(dftmp)
    print("{}: {:,}".format(month, len(dftmp)))

# The original row labels of each monthly file are kept (no ignore_index).
df = pd.concat(monthly_frames)

print("Total number of tweets in df: {:,}\n".format(len(df)))

df.user_name = df.user_name.astype(str)
df.text = df.text.astype(str)

df

In [None]:
df.text.value_counts()

Most of the duplicates in a tweet dataset are retweets. 

In [None]:
df = df.drop_duplicates(["text"], keep="first").copy()     # Add .copy() to avoid the SettingWithCopyWarning

In [None]:
df.text.value_counts()

In [None]:
df.info()

## Setting the Goal

Our goal is to build a <b>clustering</b> model that is able to find clusters of similar tweets. 
- Feature variables: words in tweet texts
- Records          : documents (tweets)

In [None]:
from IPython.display import Image
Image("classdata/images/clustering.png")

## Preparing Data for Modeling

In [None]:
from nltk.corpus import stopwords
import string 

global_stopwords = stopwords.words("english")
local_stopwords = [c for c in string.punctuation] +\
                  ['’', '``', '…', '...', "''", '‘', '“', '”', "'m", "'re", "'s", "'ve", 'amp', 'https', "n't", 'rt', 
                   'covid19', 'coronavirus', 'covid19…', 'covid']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(use_idf=True, norm="l2", stop_words=global_stopwords+local_stopwords, max_df=0.7)
X = vectorizer.fit_transform(df.text)

The words in the document-term matrix are used as features of the model and the documents are used as individual records of the model. Recall that there is no `y`, or the outcome variable, in unsupervised learning. Note also that we do not split the dataset into a training set and a test set in unsupervised learning.  

In [None]:
X.shape

In the run shown here, there are 5,140 documents, or records, and 16,538 words, or features. The exact counts depend on the random sample drawn when loading the data.

## K-Means Clustering

### Step 1. Choose the number of clusters

In [None]:
k = 5

### Step 2. Initialize a model object for k-means clustering

In [None]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=k, random_state=0)
kmeans

sklearn.cluster.KMeans: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

### Step 3. Fit the model using the input data

In [None]:
%time kmeans.fit(X)

### Step 4. Examine the clustering outcome

The focus of this step should be on identifying the characteristics of each cluster. 

In [None]:
kmeans.cluster_centers_

In [None]:
kmeans.cluster_centers_.shape

Each row of the `kmeans.cluster_centers_` array holds the coordinates of one centroid in a multi-dimensional space with about 16,500 dimensions (one dimension per word in the vocabulary).

In [None]:
kmeans.labels_

In [None]:
df["cluster"] = kmeans.labels_

In [None]:
df[["text", "cluster"]]

In [None]:
df.cluster.value_counts()

Note that k-means clustering neither names the clusters nor gives any additional information about them. It just yields numeric cluster labels. It is up to you to identify what each cluster represents.

In [None]:
counts = df.cluster.value_counts()
counts

In [None]:
counts.max(), counts.idxmax()    # the largest cluster

In [None]:
counts.min(), counts.idxmin()    # the smallest cluster 

In [None]:
df[df.cluster == counts.idxmax()].sample(10, random_state=0)[["text", "cluster"]]     # the largest cluster

In [None]:
# The smallest cluster may hold fewer than 10 tweets, in which case
# .sample(10) raises ValueError; cap the sample size at the cluster size.
smallest_cluster = df[df.cluster == counts.idxmin()]     # the smallest cluster
smallest_cluster.sample(min(10, len(smallest_cluster)), random_state=0)[["text", "cluster"]]

In [None]:
import nltk
# Split every tweet into tokens, then attach a part-of-speech tag to each token.
df["words"] = df.text.apply(nltk.word_tokenize)
df["tagged_words"] = df.words.apply(nltk.pos_tag)

from collections import Counter

def get_counter(dataframe, stopwords=None):
    """Count, for every word, the number of documents in which it occurs.

    Parameters
    ----------
    dataframe : object exposing ``tagged_words``
        Iterating ``dataframe.tagged_words`` must yield one sequence per
        document; each item of that sequence is indexable and its first
        element is the token string (e.g. ``(word, tag)`` pairs).
    stopwords : iterable of str, optional
        Words to leave out of the counts. Tokens are lower-cased before the
        comparison. ``None`` (the default) excludes nothing.

    Returns
    -------
    collections.Counter
        Maps each lower-cased word to the number of documents containing it.
    """
    # Build the lookup set once: membership tests against a list are linear.
    excluded = set(stopwords) if stopwords is not None else set()
    counter = Counter()

    for tagged_words in dataframe.tagged_words:
        # A set, so that a word repeated inside one document counts only once.
        doc_words = set()

        for token in tagged_words:
            word = token[0].lower()

            if word not in excluded:
                doc_words.add(word)

        counter.update(doc_words)

    return counter

In [None]:
counter_max = get_counter(df[df.cluster == counts.idxmax()], global_stopwords+local_stopwords)
counter_max.most_common(30)

In [None]:
counter_min = get_counter(df[df.cluster == counts.idxmin()], global_stopwords+local_stopwords)
counter_min.most_common(30)

## LDA Topic Modeling

## Setting the Goal

Our goal is to build a <b>topic</b> model that is able to find abstract topics in the tweets. 
- Feature variables: words in tweet texts
- Records          : documents (tweets)

In [None]:
# Rebuild the stop-word lists for the topic model. Compared with the lists used
# in the k-means section, the only difference is the extra entry 'co'
# (presumably the token left over from t.co links -- TODO confirm).
global_stopwords = stopwords.words("english")
local_stopwords = [c for c in string.punctuation] +\
                  ['’', '``', '…', '...', "''", '‘', '“', '”', "'m", "'re", "'s", "'ve", 'amp', 'https', "n't", 'rt', 
                   'covid19', 'coronavirus', 'covid19…', 'covid', 'co']

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(use_idf=True, norm="l2", stop_words=global_stopwords+local_stopwords, max_df=0.7)
X = vectorizer.fit_transform(df.text)

### Step 1. Choose the number of topics

In [None]:
num_topics = 5

### Step 2. Initialize a model object for LDA topic modeling

In [None]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

lda = LDA(n_components=num_topics, random_state=0)     # LDA uses randomness to get a probability distribution
lda

sklearn.decomposition.LatentDirichletAllocation: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html

### Step 3. Fit the model using the input data

In [None]:
%time lda.fit(X)

### Step 4. Examine the output of topic modeling

In [None]:
lda.components_

In [None]:
lda.components_.shape

In [None]:
def show_topics(model, feature_names, num_top_words):
    """Print the highest-scoring words of every topic in ``model``.

    For each row of ``model.components_`` the ``num_top_words`` largest scores
    are printed in descending order as ``"score * word"`` terms joined by
    ``" + "``, under a ``***Topic <index>:`` header and followed by a blank line.

    ``feature_names`` is indexed with the column positions of
    ``model.components_``.
    """
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it to rank the largest scores first.
        ranked = weights.argsort()[::-1]

        terms = []
        for col in ranked[:num_top_words]:
            terms.append("{:.2f} * {}".format(weights[col], feature_names[col]))

        print("***Topic {}:".format(topic_no))
        print(" + ".join(terms))
        print()

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. Fall back to the old method on versions that predate it.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, "get_feature_names_out")
                 else vectorizer.get_feature_names())
show_topics(lda, feature_names, 10)

Note that, just like k-means clustering, LDA topic modeling does not name the topics. It is up to you to identify each topic with its contributing words.  

### Topic Model Visualization

In [None]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

pyLDAvis: https://github.com/bmabey/pyLDAvis

In [None]:
pyLDAvis.sklearn.prepare(lda, X, vectorizer)

- Saliency: a measure of how much the term tells you about the topic.
- Relevance: a weighted average of the log probability of the word given the topic and the log of that same probability divided by the word's overall probability in the corpus (its lift); the weight is controlled by the λ slider.
- Bubble size: the importance of the topics, relative to the data.

## Exercises - Text Clustering and Topic Modeling