# `pyLDAvis.sklearn`

pyLDAvis now also supports LDA application from scikit-learn. Let's take a look into this in more detail. We will be using the 20 newsgroups dataset as provided by scikit-learn.

In [2]:
from __future__ import print_function
# low alpha = few topics per document
# low beta = words only for 1 topic
# https://medium.com/nanonets/topic-modeling-with-lsa-psla-lda-and-lda2vec-555ff65b0b05
import pyLDAvis
import pyLDAvis.sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
pyLDAvis.enable_notebook()

  """
  """


## Load 20 newsgroups dataset

First, the 20 newsgroups dataset available in sklearn is loaded. As always, the headers, footers and quotes are removed.

In [4]:
newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
docs_raw = newsgroups.data
print(len(docs_raw))

11314


## Convert to document-term matrix

Next, the raw documents are converted into document-term matrix, possibly as raw counts or in TF-IDF form.

In [5]:
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)
dtm_tf = tf_vectorizer.fit_transform(docs_raw)
print(dtm_tf.shape)

(11314, 9144)


In [6]:
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)
print(dtm_tfidf.shape)



(11314, 9144)


## Fit Latent Dirichlet Allocation models

Finally, the LDA models are fitted.

In [7]:
# for TF DTM
lda_tf = LatentDirichletAllocation(n_topics=20, random_state=0)
lda_tf.fit(dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_topics=20, random_state=0)
lda_tfidf.fit(dtm_tfidf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=20, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

## Visualizing the models with pyLDAvis

In [9]:
pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)

### Using different MDS functions

With `sklearn` installed, other MDS functions, such as MMDS and TSNE can be used for plotting if the default PCoA is not satisfactory.

In [11]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='tsne')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
documents = ["doc1.txt", "doc2.txt", "doc3.txt"] 
  
# raw documents to tf-idf matrix: 
vectorizer = TfidfVectorizer(stop_words='english', 
                             use_idf=True, 
                             smooth_idf=True)
# SVD to reduce dimensionality: 
svd_model = TruncatedSVD(n_components=100,         
                         algorithm='randomized',
                         n_iter=10)
# pipeline of tf-idf + SVD, fit to and applied to documents:
svd_transformer = Pipeline([('tfidf', tf_vectorizer), 
                            ('svd', svd_model)])
svd_matrix = svd_transformer.fit_transform(docs_raw)

# svd_matrix can later be used to compare documents, compare words, or compare queries with documents

In [17]:
svd_matrix

array([[ 7.22793083e-04,  5.86043875e-01,  6.30905604e-01, ...,
         1.72305042e-01, -5.23484426e-01,  2.27769338e-01],
       [ 8.50516193e-04,  4.98409644e-01,  1.62308762e-01, ...,
         3.23844682e-01, -7.18724706e-02, -4.81802300e-03],
       [ 4.34847845e-03,  3.25515266e+00,  2.38313937e+00, ...,
         4.95448367e-01, -4.99324166e-01,  5.49694378e-01],
       ...,
       [ 8.81933419e-04,  4.51032147e-01,  4.29552436e-01, ...,
         9.30565858e-02, -2.52169676e-01,  2.08274530e-01],
       [ 3.58620559e-04,  3.09080822e-01,  2.17120843e-01, ...,
        -9.09955907e-02,  9.32525086e-02,  1.23445240e-01],
       [ 4.49867895e-04,  2.76532422e-01,  9.49148300e-02, ...,
         2.62067265e-01,  6.91775115e-02, -1.20577848e-01]])

In [18]:
import umap

  _defmatcher = re.compile('def\s+(\w+)\(.*')
  re_longest_white_prefix = re.compile('^\s*')
  """
  """


In [19]:
X_topics = svd_matrix
embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(X_topics)

plt.figure(figsize=(7,5))
plt.scatter(embedding[:, 0], embedding[:, 1], 
c = dataset.target,
s = 10, # size
edgecolor='none'
)
plt.show()

NameError: name 'plt' is not defined

In [1]:
import matplotlib.pyplot as plt
plt.show()