In [None]:
# KUC, NLP

2. LDiA (Latent Dirichlet Allocation)

In [None]:
import pandas as pd
import numpy as np

from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Read the raw corpus from disk; only the `spam` and `text` columns are kept.
sms = pd.read_csv(
    'sms-spam.csv',
    usecols=['spam', 'text'],
)
# Quick look at the first rows to confirm the file parsed as expected.
print(sms.head())

In [None]:
# Re-index the frame with labels of the form "sms<N>", followed by one "!" per
# unit of the row's `spam` value, so flagged rows stand out in printed output.
index = ['sms{}{}'.format(row, '!' * flag) for row, flag in enumerate(sms['spam'])]
sms = pd.DataFrame(sms.values, columns=sms.columns, index=index)

# Boolean view of the label column, taken before the column is cast to int.
mask = sms['spam'].astype(bool).values
sms['spam'] = sms['spam'].astype(int)

print(sms.head())

In [None]:
# TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.tokenize.casual import casual_tokenize

# Settings shared by the vectorizer and the topic models in later cells.
n_top_words = 20    # terms passed to plot_top_words for each topic
n_components = 10   # number of topics requested from LatentDirichletAllocation
n_features = 1000   # vocabulary cap handed to the vectorizer
n_samples = 4837    # not referenced in the visible cells; presumably the corpus size — TODO confirm

tfidf_vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=n_features,
    min_df=2,      # ignore terms seen in fewer than 2 messages
    max_df=0.95,   # ignore terms seen in more than 95% of messages
)
tfidf = tfidf_vectorizer.fit_transform(sms['text'])
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()


In [None]:
# Fit a first LDiA model on the TF-IDF matrix. `topics` is whatever
# fit_transform returns for the fitted documents (the per-document topic
# mixture), and `topic_terms` is the fitted topic-by-term weight matrix.
# A fixed seed keeps the topics reproducible between runs, matching the
# second model fitted later in this notebook.
lda_model = LatentDirichletAllocation(n_components=n_components, random_state=0)
topics = lda_model.fit_transform(tfidf)

top_terms = 20
# NOTE(review): TOTAL_TOPICS is not referenced in the visible cells and does
# not match n_components (10) — confirm whether it is still needed.
TOTAL_TOPICS = 20
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and is the call already used for `tfidf_feature_names`.
vocabulary = np.asarray(tfidf_vectorizer.get_feature_names_out())
topic_terms = lda_model.components_

In [None]:
# Show the dimensions of the topic-term matrix, then its leading rows.
# NOTE(review): the slice is over rows; if each row is one topic, `top_terms`
# limits topics here rather than terms — confirm intent.
print(topic_terms.shape, topic_terms[:top_terms], sep="\n")

In [None]:
# For every row of `topic_terms`, order the column indices by descending
# absolute weight and keep the first `top_terms` of them.
topic_key_term_idxs = np.negative(np.abs(topic_terms)).argsort(axis=1)[:, :top_terms]
# Translate those column indices into the corresponding vocabulary entries.
topic_keyterms = np.take(vocabulary, topic_key_term_idxs)

In [None]:
# Print the full key-term table, followed by the first row on its own.
print(topic_keyterms, topic_keyterms[0], sep="\n")

In [None]:
# topics = [', '.join(topic) for topic in topic_keyterms]

In [None]:
# `topics` is the array returned by lda_model.fit_transform(tfidf) above;
# expected to be (documents, n_components) per scikit-learn's docs — TODO confirm.
print(topics.shape)

In [None]:
# Do not truncate cell contents when printing. `None` is the supported
# "no limit" value: the old -1 sentinel was deprecated in pandas 1.0 and is
# rejected with a ValueError by pandas 2.x.
pd.set_option('display.max_colwidth', None)
# Wrap the fit_transform output in a DataFrame for a tabular preview.
topics_df = pd.DataFrame(topics)
print(topics_df.head(5))


In [None]:
# Second LDiA fit, used for the bar-chart visualisation in the cells below.
# It uses the "online" learning method, stops after 5 iterations, and fixes
# the random seed.
lda = LatentDirichletAllocation(
    random_state=0,
    learning_method="online",
    learning_offset=50.0,
    max_iter=5,
    n_components=n_components,
)

lda.fit(tfidf)

In [None]:
import matplotlib.pyplot as plt
# plot_top_words, see sklearn documentation: https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

def plot_top_words(model, feature_names, n_top_words, title, output_path="topics.png"):
    """Draw one horizontal bar chart per topic showing its highest-weighted terms.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Each row of ``components_`` is plotted as one topic; the row's values
        are used as the term weights.
    feature_names : sequence of str
        Term for each column of ``components_`` (indexed by column position).
    n_top_words : int
        Number of highest-weighted terms to show for every topic.
    title : str
        Title placed above the whole figure.
    output_path : str, optional
        File the figure is written to via ``plt.savefig``. Defaults to
        ``"topics.png"``, the path that was previously hard-coded.

    Notes
    -----
    The grid always has five columns and grows by rows to fit every topic, so
    models with more than ten topics no longer index past the end of the axes
    array. Ten topics still produce the original 2x5, 30x15-inch layout.
    Panels left over on the last row are hidden.
    """
    n_cols = 5
    n_topics = len(model.components_)
    # Ceiling division; keep at least one row so plt.subplots gets a valid grid.
    n_rows = max(1, -(-n_topics // n_cols))

    # squeeze=False guarantees a 2-D axes array even for a single row.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(30, 7.5 * n_rows), sharex=True, squeeze=False
    )
    axes = axes.flatten()

    for topic_idx, topic in enumerate(model.components_):
        # Reverse slice of the ascending argsort: the n_top_words largest
        # weights, biggest first.
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        # Put the heaviest term at the top of each panel.
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)
        for side in "top right left".split():
            ax.spines[side].set_visible(False)

    # Hide panels that have no topic to show.
    for unused_ax in axes[n_topics:]:
        unused_ax.set_visible(False)

    # The title does not depend on the topic, so set it once.
    fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    # plt.show()
    plt.savefig(output_path)

In [None]:
%matplotlib inline
plot_top_words(lda, tfidf_feature_names, n_top_words, "Topics in LDA model")