<a href="https://colab.research.google.com/github/kleczekr/tolkenizer/blob/master/foxbook_topic_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Latent Dirichlet Allocation

In [None]:
# Scikit-Learn solution:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

class SklearnTopicModels(object):
  '''
  Topic model built as a Scikit-Learn Pipeline with three steps:
  'norm' (TextNormalizer), 'vect' (CountVectorizer) and 'model'
  (LatentDirichletAllocation).

  TextNormalizer and identity are not defined in this block; they are
  expected to be available in the enclosing namespace.
  '''

  def __init__(self, n_topics=50):
    '''
    n_topics is the desired number of topics
    '''
    self.n_topics = n_topics
    self.model = Pipeline([
        ('norm', TextNormalizer()),
        # documents reach the vectorizer already tokenized, so tokenizing,
        # preprocessing and lowercasing are all switched off here
        ('vect', CountVectorizer(tokenizer=identity,
                                 preprocessor=None,
                                 lowercase=False)),
        # scikit-learn renamed the `n_topics` keyword to `n_components`
        # (deprecated in 0.19, removed in 0.21)
        ('model', LatentDirichletAllocation(n_components=self.n_topics))
    ])

  def fit_transform(self, documents):
    '''
    fits every step of the pipeline on documents.

    returns the fitted pipeline itself; the transformed matrix produced
    by the pipeline is discarded.
    '''
    self.model.fit_transform(documents)
    return self.model

  def get_topics(self, n=25):
    '''
    n is the number of top terms to show for each topic

    returns a dict mapping the topic index (0-based) to the list of that
    topic's n highest-weighted terms, strongest first. if the vocabulary
    holds fewer than n terms, every term is returned.
    '''
    vectorizer = self.model.named_steps['vect']
    model = self.model.steps[-1][1]

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations
    if hasattr(vectorizer, 'get_feature_names_out'):
      names = vectorizer.get_feature_names_out()
    else:
      names = vectorizer.get_feature_names()

    topics = dict()

    for idx, topic in enumerate(model.components_):
      # argsort is ascending: reverse it, then keep exactly the first n.
      # (the previous slice [:-(n - 1):-1] yielded only n - 2 terms)
      features = topic.argsort()[::-1][:n]
      tokens = [names[i] for i in features]
      topics[idx] = tokens

    return topics

if __name__ == '__main__':
  # Read the pickled corpus, fit the Scikit-Learn topic model on its
  # documents, then print the top terms of every topic (numbered from 1).
  corpus = PickledCorpusReader('corpus/')
  lda = SklearnTopicModels()

  documents = corpus.docs()
  lda.fit_transform(documents)

  topics = lda.get_topics()
  for topic in topics:
    terms = topics[topic]
    print('Topic #{}:'.format(topic+1))
    print(terms)

In [None]:
# gensim implementation
class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):
  '''
  Scikit-Learn style transformer around a gensim Dictionary (the lexicon)
  and a gensim TfidfModel. Both are written to disk after fitting and
  re-loaded from disk, when present, on construction.
  '''

  def __init__(self, dirpath='.', tofull=False):
    '''
    pass in a directory that holds the lexicon in corpus.dict and the
    tf-idf model in tfidf.model.

    set tofull=True if the next thing is a Scikit-Learn estimator
    otherwise keep False if the next thing is a Gensim model.
    '''
    # kept under the parameter's own name so that BaseEstimator.get_params()
    # (used by clone() and repr()) can read it back
    self.dirpath = dirpath
    self._lexicon_path = os.path.join(dirpath, 'corpus.dict')
    self._tfidf_path = os.path.join(dirpath, 'tfidf.model')

    self.lexicon = None
    self.tfidf = None
    self.tofull = tofull

    self.load()

  def load(self):
    '''
    restores the lexicon and/or the tf-idf model from disk; whichever
    file does not exist leaves its attribute untouched.
    '''
    if os.path.exists(self._lexicon_path):
      self.lexicon = Dictionary.load(self._lexicon_path)

    if os.path.exists(self._tfidf_path):
      # load() is a classmethod: call it on the class instead of on a
      # throwaway instance (the old code also misspelled the class name)
      self.tfidf = TfidfModel.load(self._tfidf_path)

  def save(self):
    '''
    writes the lexicon and the tf-idf model to their paths under dirpath.
    '''
    self.lexicon.save(self._lexicon_path)
    self.tfidf.save(self._tfidf_path)

  def fit(self, documents, labels=None):
    '''
    builds the lexicon and the tf-idf model from documents, saves both
    to disk and returns self. labels is accepted but unused.

    documents is iterated twice (once for the lexicon, once for the
    bag-of-words vectors), so it must be re-iterable rather than a
    one-shot generator.
    '''
    self.lexicon = Dictionary(documents)
    self.tfidf = TfidfModel([
                             self.lexicon.doc2bow(doc)
                             for doc in documents],
                            id2word = self.lexicon)
    self.save()
    return self

  def transform(self, documents):
    '''
    returns a list with one tf-idf vector per document: a dense array of
    lexicon length when tofull is True, otherwise gensim's sparse form.
    '''
    def generator():
      for document in documents:
        vec = self.tfidf[self.lexicon.doc2bow(document)]
        if self.tofull:
          # sparse2full needs the output length: one slot per lexicon term
          yield sparse2full(vec, len(self.lexicon))
        else:
          yield vec
    return list(generator())

from gensim.sklearn_api import ldamodel

class GensimTopicModels(object):
  '''
  Pipeline-backed topic model: a 'norm' step, a 'vect' step and a gensim
  LDA transformer registered as 'model'.
  '''

  def __init__(self, n_topics = 50):
    '''
    n_topics is the desired number of topics
    '''
    self.n_topics = n_topics

    # steps are created in pipeline order and handed over as one list
    steps = []
    steps.append(('norm', TextNormalizer()))
    steps.append(('vect', GensimTfidfVectorizer()))
    steps.append(('model', ldamodel.LdaTransformer(num_topics = self.n_topics)))
    self.model = Pipeline(steps)

  def fit(self, documents):
    '''
    fits the pipeline on documents and hands back the fitted pipeline.
    '''
    pipeline = self.model
    pipeline.fit(documents)
    return pipeline

if __name__ == '__main__':
  # Fit the gensim topic model on the first document of every file in
  # the pickled corpus.
  corpus = PickledCorpusReader('../corpus')
  gensim_lda = GensimTopicModels()

  docs = []
  for fileid in corpus.fileids():
    file_docs = list(corpus.docs(fileids = fileid))
    docs.append(file_docs[0])

  gensim_lda.fit(docs)

def get_topics(vectorized_corpus, model):
  '''
  for every document in vectorized_corpus, looks up model[doc] (a
  sequence of pairs) and keeps the first element of the pair whose second
  element is largest. returns those picks as a list, one per document.
  '''
  from operator import itemgetter

  by_weight = itemgetter(1)
  topics = []
  for doc in vectorized_corpus:
    best_pair = max(model[doc], key=by_weight)
    topics.append(best_pair[0])
  return topics

# Pull the fitted pieces out of the pipeline, rebuild the bag-of-words
# corpus from the normalized documents, and print each document next to
# the topic chosen for it by get_topics.
steps = gensim_lda.model.named_steps
lda = steps['model'].gensim_model

corpus = []
for doc in steps['norm'].transform(docs):
  corpus.append(steps['vect'].lexicon.doc2bow(doc))

topics = get_topics(corpus, lda)

for topic, doc in zip(topics, docs):
  print('Topic: {}'.format(topic))
  print(doc)

In [1]:
# Visualizing the gensim solution:
import pyLDAvis
import pyLDAvis.gensim

# Collect what pyLDAvis needs from the fitted pipeline: the gensim LDA
# model, the bag-of-words corpus and the lexicon that produced it.
lda = gensim_lda.model.named_steps['model'].gensim_model

lexicon = gensim_lda.model.named_steps['vect'].lexicon

corpus = [
          lexicon.doc2bow(doc)
          for doc in gensim_lda.model.named_steps['norm'].transform(docs)
]

# pass the fitted model `lda`; the name `model` used here before is not
# defined anywhere at this level and raised a NameError
data = pyLDAvis.gensim.prepare(lda, corpus, lexicon)
pyLDAvis.display(data)

### Latent Semantic Analysis

In [None]:
# Scikit-Learn option:
class SklearnTopicModels(object):
  '''
  Scikit-Learn Pipeline ('norm' -> 'tfidf' -> 'model') whose final
  estimator is either TruncatedSVD (LSA) or LatentDirichletAllocation.
  '''

  def __init__(self, n_topics = 50, estimator='LDA'):
    '''
    n_topics is the desired number of topics
    to use latent semantic analysis, set estimator to 'LSA',
    otherwise, defaults to latent dirichlet allocation (lda)
    '''
    self.n_topics = n_topics

    # NOTE(review): TruncatedSVD is not imported in the visible cells;
    # confirm it is brought in from sklearn.decomposition elsewhere.
    if estimator == 'LSA':
      self.estimator = TruncatedSVD(n_components=self.n_topics)
    else:
      # any value other than 'LSA' selects LDA. scikit-learn renamed the
      # `n_topics` keyword to `n_components` (removed in 0.21)
      self.estimator = LatentDirichletAllocation(n_components=self.n_topics)

    self.model = Pipeline([
        ('norm', TextNormalizer()),
        # the step is registered as 'tfidf' although it is a plain
        # CountVectorizer; the name is kept so lookups by step name still work
        ('tfidf', CountVectorizer(tokenizer=identity,
                                  preprocessor=None,
                                  lowercase=False)),
        ('model', self.estimator)
    ])

In [None]:
# the gensim way
from gensim.sklearn_api import lsimodel, ldamodel

class GensimTopicModels(object):
  '''
  Pipeline ('norm' -> 'vect' -> 'model') whose final step is a gensim
  LSI or LDA transformer from gensim.sklearn_api.
  '''

  def __init__(self, n_topics=50, estimator='LDA'):
    '''
    n_topics is the desired number of topics
    to use latent semantic analysis, set estimator to 'LSA'
    otherwise defaults to latent dirichlet allocation
    '''
    self.n_topics = n_topics

    if estimator == 'LSA':
      # keyword was misspelled `num_topiics`, which raises a TypeError
      self.estimator = lsimodel.LsiTransformer(num_topics=self.n_topics)
    else:
      # any value other than 'LSA' selects LDA
      self.estimator = ldamodel.LdaTransformer(num_topics=self.n_topics)

    self.model = Pipeline([
        # class name was misspelled `TextNoralizer`, which raises a NameError
        ('norm', TextNormalizer()),
        ('vect', GensimTfidfVectorizer()),
        ('model', self.estimator)
    ])

### Non-Negative Matrix Factorization

In [None]:
from sklearn.decomposition import NMF

class SklearnTopicModels(object):
  '''
  Scikit-Learn Pipeline ('norm' -> 'tfidf' -> 'model') whose final
  estimator is TruncatedSVD (LSA), NMF or LatentDirichletAllocation.
  '''

  def __init__(self, n_topics=50, estimator='LDA'):
    '''
    n_topics is the desired number of topics
    to use latent semantic analysis, set estimator to 'LSA'
    to use non-negative matrix factorization, set estimator to 'NMF'
    otherwise, defaults to latent dirichlet allocation
    '''
    self.n_topics = n_topics

    # NOTE(review): TruncatedSVD is not imported in the visible cells;
    # confirm it is brought in from sklearn.decomposition elsewhere.
    if estimator == 'LSA':
      self.estimator = TruncatedSVD(n_components=self.n_topics)
    elif estimator == 'NMF':
      self.estimator = NMF(n_components=self.n_topics)
    else:
      # any other value selects LDA. scikit-learn renamed the `n_topics`
      # keyword to `n_components` (removed in 0.21)
      self.estimator = LatentDirichletAllocation(n_components=self.n_topics)

    self.model = Pipeline([
        ('norm', TextNormalizer()),
        # the step is registered as 'tfidf' although it is a plain
        # CountVectorizer; the name is kept so lookups by step name still work
        ('tfidf', CountVectorizer(tokenizer=identity,
                                  preprocessor=None,
                                  lowercase=False)),
        ('model', self.estimator)
    ])