# Text Analytics | BAIS:6100
# Module 9: Text Clustering and Topic Modeling - Exercises

Instructor: Kang-Pyo Lee 

## Loading Data

Twitter hashtag options:
- ai
- bitcoin
- blacklivesmatter
- bts
- covid19
- fakenews
- innovation
- mentalhealth
- metoo
- startup

Choose a Twitter hashtag you're interested in and save it in the `hashtag` variable below.

In [None]:
# Your answer here
hashtag = "mentalhealth"

In [None]:
N = 500

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', 150)

months = ["202012", "202011", "202010", "202009", "202008", "202007", 
          "202006", "202005", "202004", "202003", "202002", "202001"]

# Load one tab-separated file per month, down-sample each month to at most N
# rows, and combine the monthly samples into a single DataFrame.
monthly_frames = []
for month in months:
    # quoting=3 is csv.QUOTE_NONE: quote characters are read as ordinary text.
    dftmp = pd.read_csv("classdata/tweets/tweets_{}_{}.csv".format(hashtag, month), sep="\t", quoting=3)

    # Cap each month at N randomly chosen rows; smaller months are kept whole.
    if len(dftmp) > N:
        dftmp = dftmp.sample(n=N)

    monthly_frames.append(dftmp)
    print("{}: {:,}".format(month, len(dftmp)))

# Concatenate once after the loop: growing df with pd.concat on every
# iteration re-copies all previously loaded rows each time.
df = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()

print("Total number of tweets in df: {:,}\n".format(len(df)))

# Cast both columns to str so every entry is handled as a Python string.
for column in ("user_name", "text"):
    df[column] = df[column].astype(str)

df

Check if there are any duplicates in the `text` column.

In [None]:
df.text.value_counts()

Remove duplicates in the `text` column, if any. 

In [None]:
df = df.drop_duplicates(["text"], keep="first").copy()

In [None]:
df.text.value_counts()

In [None]:
df.info()

## Preparing Data for Modeling

In [None]:
from nltk.corpus import stopwords
import string 

global_stopwords = stopwords.words("english")

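Define your own local stopwords: corpus-specific tokens (punctuation, tokenizer fragments such as "n't", and Twitter artifacts such as "rt") to exclude in addition to the global English stopwords.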

In [None]:
# Corpus-specific stopwords: every character of string.punctuation, followed
# by curly-quote/ellipsis variants, contraction fragments ("n't", "'s", ...),
# and the tokens 'amp', 'https', and 'rt'.
local_stopwords = list(string.punctuation) + [
    '’', '``', '…', '...', "''", '‘', '“', '”',
    "'m", "'re", "'s", "'ve", 'amp', 'https', "n't", 'rt',
]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure the TF-IDF vectorizer: drop both stopword lists, ignore terms that
# occur in more than 70% of the documents, and L2-normalize each row.
vectorizer = TfidfVectorizer(
    stop_words=global_stopwords + local_stopwords,
    max_df=0.7,
    use_idf=True,
    norm="l2",
)

# Learn the vocabulary from the tweet texts and build the document-term matrix.
X = vectorizer.fit_transform(df.text)

In [None]:
X.shape

## K-Means Clustering

### Step 1. Choose the number of clusters

Set the number of clusters you would like to get. 

In [None]:
# Your answer here
k = 5

### Step 2. Initialize a model object for k-means clustering

In [None]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=k, random_state=0)
kmeans

### Step 3. Fit the model using the input data

In [None]:
%time kmeans.fit(X)

### Step 4. Examine the clustering outcome

In [None]:
kmeans.labels_

In [None]:
df["cluster"] = kmeans.labels_

In [None]:
df[["text", "cluster"]]

Check how many tweets were assigned to each cluster.

In [None]:
df.cluster.value_counts()

Check 10 random texts from the largest cluster. (Replace the cluster number as needed.)

In [None]:
# Sample at most 10 rows so the cell still works when the cluster holds fewer
# than 10 tweets (DataFrame.sample raises if asked for more rows than exist).
cluster_rows = df[df.cluster == 0]
cluster_rows.sample(min(10, len(cluster_rows)), random_state=0)[["text", "cluster"]]

Check 10 random texts from the second largest cluster. (Replace the cluster number as needed.)

In [None]:
# Sample at most 10 rows so the cell still works when the cluster holds fewer
# than 10 tweets (DataFrame.sample raises if asked for more rows than exist).
cluster_rows = df[df.cluster == 4]
cluster_rows.sample(min(10, len(cluster_rows)), random_state=0)[["text", "cluster"]]

In [None]:
import nltk
# Split every tweet into tokens, then attach a part-of-speech tag to each token.
df["words"] = df.text.apply(nltk.word_tokenize)
df["tagged_words"] = df.words.apply(nltk.pos_tag)

from collections import Counter

def get_counter(dataframe, stopwords=()):
    """Count, for each word, how many rows of *dataframe* contain it.

    Args:
        dataframe: Object with a ``tagged_words`` attribute that iterates over
            one sequence of ``(word, tag)`` pairs per row.
        stopwords: Iterable of words to leave out of the counts. Each word is
            lowercased before it is compared against this collection.

    Returns:
        A ``Counter`` mapping each lowercased, non-stopword word to the number
        of rows in which it appears at least once.
    """
    # Build the set once so every membership test is O(1) instead of a scan
    # over the whole stopword list for every token.
    excluded = set(stopwords)
    counter = Counter()

    for tagged_row in dataframe.tagged_words:
        # A set per row means repeated words inside one row count only once.
        row_words = {pair[0].lower() for pair in tagged_row}
        counter.update(row_words - excluded)

    return counter

Check the top-30 most common keywords in the largest cluster. (Replace the cluster number as needed.)

In [None]:
counter0 = get_counter(df[df.cluster == 0], global_stopwords+local_stopwords)
counter0.most_common(30)

Check the top-30 most common keywords in the second largest cluster. (Replace the cluster number as needed.)

In [None]:
counter4 = get_counter(df[df.cluster == 4], global_stopwords+local_stopwords)
counter4.most_common(30)

## LDA Topic Modeling

### Step 1. Choose the number of topics

Set the number of topics you would like to get. 

In [None]:
# Your answer here
num_topics = 5

### Step 2. Initialize a model object for LDA topic modeling

In [None]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

lda = LDA(n_components=num_topics, random_state=0)
lda

### Step 3. Fit the model using the input data

In [None]:
%time lda.fit(X)

### Step 4. Examine the output of topic modeling

In [None]:
lda.components_

In [None]:
lda.components_.shape

In [None]:
def show_topics(model, feature_names, num_top_words):
    """Print the highest-scoring words of every topic in *model*.

    Args:
        model: Object whose ``components_`` yields one score array per topic.
        feature_names: Sequence mapping a column index to its word.
        num_top_words: How many top-scoring words to print for each topic.
    """
    for number, scores in enumerate(model.components_):
        print("***Topic {}:".format(number))

        # Column indices ordered from highest to lowest score, cut to the top N.
        best_columns = scores.argsort()[::-1][:num_top_words]
        terms = []
        for column in best_columns:
            terms.append("{:.2f} * {}".format(scores[column], feature_names[column]))

        print(" + ".join(terms))
        print()

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Print the 10 highest-scoring words of each LDA topic.
show_topics(lda, feature_names, 10)

### Topic Model Visualization

In [None]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [None]:
# Build the pyLDAvis visualization from the fitted LDA model, the
# document-term matrix X, and the vectorizer that produced X.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# pyLDAvis.lda_model rather than pyLDAvis.sklearn — confirm against the
# installed version.
pyLDAvis.sklearn.prepare(lda, X, vectorizer)